In [17]:
%matplotlib inline
import warnings
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from requests import get
from bs4 import BeautifulSoup # for web scrapping

# map functions
import os 
import folium
import rasterio as rio
import earthpy as et
from folium import plugins
from rasterio.warp import calculate_default_transform, reproject, Resampling
from IPython.display import IFrame

# to ignore the warnings and make the notebook more presentable
warnings.filterwarnings('ignore') 

# Defining Functions

This function takes a list of URLs, each corresponding to a specific page of the website. Each page contains approximately 10 different links, each leading to information on a specific complaint. The objective is to collect the links from all pages combined.

In [2]:
def Get_Links_from_Page(Pages):
    """Download each listing page and collect every anchor tag found on it.

    Parameters
    ----------
    Pages : iterable of str
        URLs of the listing pages to fetch.

    Returns
    -------
    list
        One entry per page, in input order. Each entry is the result of
        ``find_all('a')`` on that page's parsed HTML, i.e. *all* anchors on
        the page; no filtering is done here.
    """
    list_links = []

    for Page in Pages:
        # progress trace: one line per page requested
        print(Page)

        # A single GET is all that is needed. The previous version also sent a
        # POST with dummy form data to every page and discarded the response.
        response1 = get(Page)

        # parse the returned HTML so the anchors can be queried
        soup_obj1 = BeautifulSoup(response1.text, 'html.parser')

        # keep every <a> tag of this page together as one entry
        list_links.append(soup_obj1.find_all('a'))

    return list_links

This function takes a list of links and, for each one, grabs the information of interest and stores it in a dataframe. The task is complicated because some links contain '/biz/' and others contain '/incident/'. The two kinds of links cannot be scraped the same way because the information is organised differently. For this reason, there is an if-condition in the Create_df_ function.

In [3]:
def _clean_labelled_text(text, label):
    """Remove double spaces, newlines and every occurrence of ``label`` from ``text``."""
    return text.replace('  ', '').replace('\n', '').replace(label, '')


def _parse_report_counts(container):
    """Return ``(total reports, total sick persons)`` from the summary row inside ``container``."""
    summary = container.find(class_ = 'row justify-content-start text-muted').get_text()
    # The two-name unpacking raises ValueError unless the row holds exactly
    # two integer tokens (same behaviour as before).
    reports, sick = [int(token) for token in summary.split() if token.isdigit()]
    return reports, sick


def _parse_lat_lon(container, map_column_class):
    """Return ``(latitude, longitude)`` as strings, read from the map image URL inside ``container``."""
    src = container.find(class_ = map_column_class).find(class_ = 'img-fluid lazyload')['data-src']
    end = '&zoom='
    # The coordinates sit between the start marker and the last '&zoom='.
    # NOTE(review): '=en¢er=' is kept byte-for-byte from the original; it is
    # presumably how '&center=' appears once the HTML parser has decoded
    # '&cent' as an entity - confirm. The literal '&center=' form is tried as
    # a fallback in case the URL arrives undecoded.
    for start in ('=en¢er=', '=en&center='):
        begin = src.find(start)
        if begin != -1:
            loc = src[begin + len(start):src.rfind(end)]
            lat, lon = [part.strip() for part in loc.split(',')]
            return lat, lon
    raise ValueError('could not locate map coordinates in: ' + src)


def Create_df_(Links):
    """Scrape every complaint link and return the collected fields as a DataFrame.

    Links containing '/biz/' and all other links (the '/incident/' kind) are
    laid out differently on the website, so each kind has its own parsing
    branch. Each link is printed as it is processed.

    Parameters
    ----------
    Links : iterable of str
        URLs of the individual complaint pages.

    Returns
    -------
    pandas.DataFrame
        One row per link, with columns Name, Address, Zip,
        Latest_Report_Date, Total_all_time_reports,
        Total_all_time_sick_persons, Latitude and Longitude. Address is the
        list of comma-separated title parts after the name; Latitude and
        Longitude are strings. Zip is the raw scraped text and still needs
        cleaning.

    Raises
    ------
    ValueError
        If the report counts or the map coordinates cannot be parsed.
    """
    columns = ['Name','Address','Zip', 'Latest_Report_Date', 'Total_all_time_reports', 'Total_all_time_sick_persons','Latitude','Longitude']
    # Rows are accumulated as dicts and turned into a DataFrame once at the
    # end: DataFrame.append no longer exists in pandas 2.x, and growing a frame
    # row by row is quadratic.
    records = []

    for link in Links:

        response1 = get(link)
        soup = BeautifulSoup(response1.text,'html.parser')
        print(link)

        if '/biz/' in link:
            # business page: everything lives inside one container
            post = soup.find(class_ = 'col-12 single-post single-incident')

            reports, sick = _parse_report_counts(post)

            # the second 'text-muted my-2' element carries the latest report date
            date_text = post.find_all(class_ = 'text-muted my-2')[1].get_text()
            latest_report_date = _clean_labelled_text(date_text, 'Latest report:')

            lat, lon = _parse_lat_lon(post, 'col-12 col-md-12 col-lg-5 mt-3 mt-md-0')

            # the title is "<name>, <address parts...>"
            name_address = post.find(class_ = 'h1 post-title').get_text().split(",")
            name = name_address[0]
            address = name_address[1:]
            # first six characters of the fourth title part
            zip_code = name_address[3][:6]

        else:
            # incident page: the date is on one of the first ten 'text-muted'
            # tags whose markup mentions 'date' (the last match wins). Slicing
            # avoids an IndexError on pages with fewer than ten such tags, and
            # starting from None stops the previous link's date leaking in.
            latest_report_date = None
            for tag in soup.find_all(class_ = 'text-muted')[:10]:
                if 'date' in str(tag):
                    latest_report_date = _clean_labelled_text(tag.get_text(), 'Reported:')

            post = soup.find(class_ = 'col-12 page-content mt-4 location-post single-post card py-3')

            reports, sick = _parse_report_counts(post)

            lat, lon = _parse_lat_lon(post, 'col-12 col-md-12 col-lg-4 mt-3 mt-md-0')

            name_address = post.find(class_ = 'h1 post-title').get_text().split(",")
            name = name_address[0]
            address = name_address[1:]

            # the zip code is the third comma-separated part of the third 'my-2' element
            zip_text = soup.find_all(class_ = 'my-2')[2].get_text()
            zip_code = zip_text.replace('\n', '').replace('  ', '').split(',')[2]

        records.append({'Name': name, 'Address': address,'Zip': zip_code,
                        'Latest_Report_Date' : latest_report_date,
                        'Total_all_time_reports' : reports,
                        'Total_all_time_sick_persons' : sick,
                        'Latitude' : lat, 'Longitude' : lon})

    return pd.DataFrame(records, columns = columns)


# Webscraping

There are many links on the website, spread over many different pages. From each page, we hope to extract approximately 10 links that each lead to a complaint. However, web scraping gives us all the links on the page, most of which are useless to us. In the following, we must choose carefully how to filter these links.

>First, we can easily build the links for the first 12 pages by changing one number in the same link. 

In [4]:
# Build the URLs of the first 12 listing pages (page numbers 1 through 12);
# only the page number changes from one URL to the next.
Pages = [
    'https://iwaspoisoned.com/location/united-states/illinois/chicago?page=' + str(page_number) + '#emailscroll'
    for page_number in range(1, 13)
]
Pages

['https://iwaspoisoned.com/location/united-states/illinois/chicago?page=1#emailscroll',
 'https://iwaspoisoned.com/location/united-states/illinois/chicago?page=2#emailscroll',
 'https://iwaspoisoned.com/location/united-states/illinois/chicago?page=3#emailscroll',
 'https://iwaspoisoned.com/location/united-states/illinois/chicago?page=4#emailscroll',
 'https://iwaspoisoned.com/location/united-states/illinois/chicago?page=5#emailscroll',
 'https://iwaspoisoned.com/location/united-states/illinois/chicago?page=6#emailscroll',
 'https://iwaspoisoned.com/location/united-states/illinois/chicago?page=7#emailscroll',
 'https://iwaspoisoned.com/location/united-states/illinois/chicago?page=8#emailscroll',
 'https://iwaspoisoned.com/location/united-states/illinois/chicago?page=9#emailscroll',
 'https://iwaspoisoned.com/location/united-states/illinois/chicago?page=10#emailscroll',
 'https://iwaspoisoned.com/location/united-states/illinois/chicago?page=11#emailscroll',
 'https://iwaspoisoned.com/loc

>We must now use our Get_Links_from_Page function to extract all the links per page.

In [5]:
# Fetch every listing page. list_links holds one collection of <a> tags per page;
# these are ALL anchors on the page - narrowing them down to complaint links happens in a later cell.
list_links = Get_Links_from_Page(Pages)

https://iwaspoisoned.com/location/united-states/illinois/chicago?page=1#emailscroll
https://iwaspoisoned.com/location/united-states/illinois/chicago?page=2#emailscroll
https://iwaspoisoned.com/location/united-states/illinois/chicago?page=3#emailscroll
https://iwaspoisoned.com/location/united-states/illinois/chicago?page=4#emailscroll
https://iwaspoisoned.com/location/united-states/illinois/chicago?page=5#emailscroll
https://iwaspoisoned.com/location/united-states/illinois/chicago?page=6#emailscroll
https://iwaspoisoned.com/location/united-states/illinois/chicago?page=7#emailscroll
https://iwaspoisoned.com/location/united-states/illinois/chicago?page=8#emailscroll
https://iwaspoisoned.com/location/united-states/illinois/chicago?page=9#emailscroll
https://iwaspoisoned.com/location/united-states/illinois/chicago?page=10#emailscroll
https://iwaspoisoned.com/location/united-states/illinois/chicago?page=11#emailscroll
https://iwaspoisoned.com/location/united-states/illinois/chicago?page=12#e

>Next, we must filter our list of links. The ones we are interested in are similar to :

>'https://iwaspoisoned.com/biz/a-j-krazy-kitchen-7547-west-irving-park-road-chicago-60634-illinois-united-states#emailscroll'. 

>Some links have 'biz' and some have 'incident'. The common characteristic is '-chicago-', so we can filter by keeping only the links with '-chicago-' in them.  

In [6]:
# Collect the href of every anchor that points at a Chicago complaint page.
List = []
for page_anchors in list_links:
    for anchor in page_anchors:
        url = anchor.get('href')
        # Anchors without an href attribute yield None; skip them instead of
        # failing on the substring test. Complaint URLs all contain '-chicago-'.
        if url and '-chicago-' in url:
            List.append(url)
# Remove duplicates while keeping first-seen order, so a re-run gives the same
# ordering (a set would not guarantee that).
List = list(dict.fromkeys(List))

In [7]:
# Display the filtered, de-duplicated complaint URLs.
List

['https://iwaspoisoned.com/incident/swiss-air-11601-west-touhy-avenue-chicago-il-usa-279958#emailscroll',
 'https://iwaspoisoned.com/biz/subwayrestaurants-4036-north-narragansett-avenue-chicago-60634-illinois-united-states#emailscroll',
 'https://iwaspoisoned.com/biz/del-seoul-2568-north-clark-street-chicago-60614-illinois-united-states#emailscroll',
 'https://iwaspoisoned.com/biz/doc-b-s-restaurant-bar-gold-coast-100-east-walton-street-chicago-60611-illinois-united-states#emailscroll',
 'https://iwaspoisoned.com/biz/northside-bar-grill-1635-north-damen-avenue-chicago-60647-illinois-united-states#emailscroll',
 'https://iwaspoisoned.com/incident/sweetgreen-west-randolph-street-chicago-il-usa-261399#emailscroll',
 'https://iwaspoisoned.com/biz/rocks-lakeview-3463-north-broadway-chicago-60657-illinois-united-states#emailscroll',
 'https://iwaspoisoned.com/biz/big-bowl-chicago-60666-illinois-united-states-1#emailscroll',
 'https://iwaspoisoned.com/biz/kabobi-persian-and-mediterranean-gril

>Now that we have the list of links, we may extract the information from each link.

In [8]:
# Scrape each complaint URL in List and gather the parsed fields into one DataFrame.
# This issues one HTTP request per link, and each URL is printed as it is processed.
df = Create_df_(List)

https://iwaspoisoned.com/incident/swiss-air-11601-west-touhy-avenue-chicago-il-usa-279958#emailscroll
https://iwaspoisoned.com/biz/subwayrestaurants-4036-north-narragansett-avenue-chicago-60634-illinois-united-states#emailscroll
https://iwaspoisoned.com/biz/del-seoul-2568-north-clark-street-chicago-60614-illinois-united-states#emailscroll
https://iwaspoisoned.com/biz/doc-b-s-restaurant-bar-gold-coast-100-east-walton-street-chicago-60611-illinois-united-states#emailscroll
https://iwaspoisoned.com/biz/northside-bar-grill-1635-north-damen-avenue-chicago-60647-illinois-united-states#emailscroll
https://iwaspoisoned.com/incident/sweetgreen-west-randolph-street-chicago-il-usa-261399#emailscroll
https://iwaspoisoned.com/biz/rocks-lakeview-3463-north-broadway-chicago-60657-illinois-united-states#emailscroll
https://iwaspoisoned.com/biz/big-bowl-chicago-60666-illinois-united-states-1#emailscroll
https://iwaspoisoned.com/biz/kabobi-persian-and-mediterranean-grill-4748-north-kedzie-avenue-chicago

https://iwaspoisoned.com/biz/marcello-s-father-and-son-restaurant-2475-north-milwaukee-avenue-chicago-60647-illinois-united-states#emailscroll
https://iwaspoisoned.com/biz/chop-shop-2033-west-north-avenue-chicago-60647-illinois-united-states#emailscroll
https://iwaspoisoned.com/incident/roti-modern-mediterranean-1012-west-randolph-street-chicago-il-usa-261443#emailscroll
https://iwaspoisoned.com/biz/domino-s-pizza-143-west-division-street-chicago-60610-illinois-united-states#emailscroll
https://iwaspoisoned.com/biz/walmart-supercenter-8331-south-stewart-avenue-chicago-60620-illinois-united-states#emailscroll
https://iwaspoisoned.com/biz/i-57-rib-house-1524-west-115th-street-chicago-60643-illinois-united-states#emailscroll
https://iwaspoisoned.com/biz/taqueria-los-comales-6035-south-pulaski-road-chicago-60629-illinois-united-states#emailscroll
https://iwaspoisoned.com/biz/mekato-s-colombian-bakery-5423-north-lincoln-avenue-chicago-60625-illinois-united-states#emailscroll
https://iwaspoi

In [9]:
# Display the scraped complaints table (Zip is still the raw scraped text at this point).
df

Unnamed: 0,Name,Address,Zip,Latest_Report_Date,Total_all_time_reports,Total_all_time_sick_persons,Latitude,Longitude
0,SWISS AIR,"[ 11601 West Touhy Avenue, Chicago, IL, USA ]",60631Illinois,Nov 22 2019 at 1:02 AM,1,0,42.0113823,-87.81998920000001
1,SUBWAY®Restaurants,"[ 4036 North Narragansett Avenue, Chicago, 6...",60634,Nov 8 2019 at 12:51 AM,2,1,41.9539562,-87.78699640000002
2,Del Seoul,"[ 2568 North Clark Street, Chicago, 60614 Il...",60614,Oct 14 2019 at 9:26 AM,1,2,41.92944199999999,-87.64301499999999
3,Doc B's Restaurant + Bar (Gold Coast),"[ 100 East Walton Street, Chicago, 60611 Ill...",60611,Nov 9 2019 at 10:52 PM,1,1,41.9001175,-87.6249128
4,Northside Bar & Grill,"[ 1635 North Damen Avenue, Chicago, 60647 Il...",60647,Oct 22 2019 at 7:50 AM,1,0,41.911534,-87.6772479
...,...,...,...,...,...,...,...,...
101,Giordano's,"[ 1040 West Belmont Avenue, Chicago, 60657 I...",60657,Nov 2 2019 at 6:44 AM,3,3,41.94009459999999,-87.65569479999999
102,Cherubs,"[ 2524 West Fullerton Avenue, Chicago, 60647...",60647,Oct 22 2019 at 2:09 AM,1,0,41.9250066,-87.6911629
103,The Shrimp Shack,"[ 6601 West Archer Avenue, Chicago, 60638 Il...",60638,Oct 14 2019 at 7:31 AM,1,0,41.7920687,-87.78686010000001
104,Poke Poké,"[ 802 West Belmont Avenue, Chicago, IL, USA ]",60657Illinois,Oct 8 2019 at 1:46 PM,2,1,41.94008649999999,-87.64953559999998


>Let's clean the Zip Codes:

In [10]:
# Skip the first character of each raw Zip (presumably a leading space left by
# the comma split - confirm) and keep the next five, i.e. positions 1 to 5.
new_zip = [raw_zip[1:6] for raw_zip in df.Zip.values]

In [11]:
# Overwrite only the Zip column with the trimmed values.
# DataFrame.replace(old_values, new_values) would search every column of the
# frame and go through an old->new value mapping; positional assignment to the
# one column is direct and cannot touch other fields.
df = df.assign(Zip=new_zip)

In [12]:
# Inspect the trimmed Zip values to spot entries that are not zip codes.
df.Zip.values

array(['60631', '60634', '60614', '60611', '60647', '60607', '60657',
       'Unite', '60625', '60614', '60639', '60614', '60666', '60638',
       '60647', '60656', '60605', '60638', '60614', '60625', '60622',
       '60645', '60608', '60642', '60634', '60647', '60614', '60640',
       '60603', '60639', '60630', '60622', '60652', '60623', '60639',
       '60607', '60639', '60707', '60666', '60634', '60647', '60643',
       '60621', '60630', '60707', '60611', '60609', '60660', '60647',
       '60639', '60607', '60647', '60622', '60647', '60630', 'Unite',
       '60646', '60639', '60622', '60646', '60614', '60655', '60640',
       'Unite', '60647', '60632', '60647', '60611', '60647', '60629',
       '60647', '60647', '60607', '60610', '60620', '60643', '60629',
       '60625', '60629', '60646', '60656', '60647', '60614', '60656',
       '60647', '60622', '60637', 'Illin', '60647', '60625', '60647',
       '60657', '60707', '60659', '60647', '60625', '60613', 'Unite',
       '60607', '606

>4 out of 105 do not have a zipcode. This is not important for mapping the location because we have their longitude and latitude. For now we shall replace the aberrant values with NaN.

In [13]:
# Keep a Zip only when it is a string starting with '6'; everything else becomes NaN.
# Using isinstance/startswith rather than x[0] means an empty string, or a NaN
# left behind by an earlier run of this cell, is mapped to NaN instead of raising.
new_zip_ = [
    zip_code if isinstance(zip_code, str) and zip_code.startswith('6') else np.nan
    for zip_code in df.Zip.values
]

In [20]:
# Store the validated zip codes (NaN where the scraped value was not a zip code).
# Assigning the column directly replaces the frame-wide DataFrame.replace call,
# which searched every column for the old Zip values.
df = df.assign(Zip=new_zip_)

In [15]:
# Save the cleaned complaints table to 'Food_Poisoning.csv' in the working directory.
# `index` is left at its default, so the row index is written as the first (unnamed) column.
df.to_csv('Food_Poisoning.csv')

# Mapping

We would like to create a map that shows the location of each complaint. Marker colours are based on the sum of reports and sick persons for a location: blue when the sum is < 2, orange when it is exactly 2, and red when it is > 2.

In [23]:
# Base folium map centred on [41.714168, -87.655291] (latitude first, then longitude), used here for Chicago.
# The complaint markers are added to `m` further down in this cell.
m = folium.Map(location=[41.714168, -87.655291])
# helper that drops one coloured marker for a facility onto a folium map
def Adding_Marker(map_, longitude, latitude, popup, colour):
    '''
    Place a single marker for a facility on ``map_``.

    Note the argument order: longitude comes before latitude in the
    signature, but the marker location is built as [latitude, longitude].

    map_: folium.folium.Map
        map that receives the marker

    longitude: coordinate of the facility (second element of the location)

    latitude: coordinate of the facility (first element of the location)

    popup: str
        text of the pop-up label, e.g. facility name and number of sick persons

    colour: str
        colour passed to the marker icon
    '''
    marker_icon = folium.Icon(color=colour)
    marker = folium.Marker(
        location=[latitude, longitude],
        popup=popup,
        icon=marker_icon,
    )
    marker.add_to(map_)

# Add one marker per complaint row. The row count comes from the frame itself:
# the previous hard-coded range(106) ran past the end of the 105-row table.
for i in range(len(df)):
    n_sick = df.Total_all_time_sick_persons.values[i]
    n_reports = df.Total_all_time_reports.values[i]
    popup = str(df.Name.values[i]) + '\n'+'#Sick Persons :'+ str(n_sick) + '\n' +'#Reports :'+ str(n_reports)
    # colour encodes the combined number of reports and sick persons
    total = n_sick + n_reports
    if total < 2:
        colour = 'blue'
    elif total == 2:
        colour = 'orange'
    else:
        colour = 'red'
    Adding_Marker(m, df.Longitude.values[i], df.Latitude.values[i], popup, colour)

# Display a saved HTML copy of the map in a 700x600 inline frame.
# NOTE(review): the save call below is commented out and targets "complaint_map.html",
# while the IFrame reads 'maps/complaint_map.html'. This cell therefore does not regenerate
# the file it displays - confirm the saved file is up to date with `m`.
#m.save("complaint_map.html")
IFrame(src = 'maps/complaint_map.html', width = 700, height = 600)