Web Scraping with Pandas and Beautifulsoup

In [65]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [66]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
res=requests.get(url)


In [67]:
soup = BeautifulSoup(res.content,'lxml')

In [68]:
table = soup.find_all('table')[0] 

In [69]:
df = pd.read_html(str(table))[0]

Renaming columns

In [70]:
df=df.rename(columns={'Postcode':'PostalCode'})

In [71]:
df=df.rename(columns={'Neighbourhood':'Neighborhood'})

In [72]:
df = df[~df['Borough'].isin(['Not assigned'])]

In [73]:
df = df.groupby(['PostalCode','Borough',])['Neighborhood'].apply(', '.join).reset_index()

Change one value based on another value in pandas


In [74]:
df['Neighborhood']=df.apply(lambda x: x['Borough'] if x['Neighborhood'] == 'Not assigned' else x['Neighborhood'],axis=1)

In [76]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [77]:
df.shape

(103, 3)

In [78]:
df1=pd.read_csv('http://cocl.us/Geospatial_data')

In [79]:
df1

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [80]:
df1=df1.rename(columns={'Postal Code':'PostalCode'})

In [15]:
df1

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


Merge Two dataframe based on PostalCode column

In [81]:
df2=pd.merge(df, df1, on='PostalCode')

In [18]:
df2

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [82]:
import matplotlib 

In [83]:
import sklearn

In [84]:
downtown_toronto_grouped = df2.groupby(['Neighborhood']).mean().reset_index()




In [85]:
downtown_toronto_grouped

Unnamed: 0,Neighborhood,Latitude,Longitude
0,"Adelaide, King, Richmond",43.650571,-79.384568
1,Agincourt,43.794200,-79.262029
2,"Agincourt North, L'Amoreaux East, Milliken, St...",43.815252,-79.284577
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437
4,"Alderwood, Long Branch",43.602414,-79.543484
5,"Bathurst Manor, Downsview North, Wilson Heights",43.754328,-79.442259
6,Bayview Village,43.786947,-79.385975
7,"Bedford Park, Lawrence Manor East",43.733283,-79.419750
8,Berczy Park,43.644771,-79.373306
9,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [86]:
## Cluster Neighborhoods
## Run k-means to cluster the neighborhood into 5 clusters.

# set number of clusters
kclusters = 5

toronto_grouped_clustering = downtown_toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 3, 3, 0, 0, 4, 4, 4, 2, 1], dtype=int32)

In [50]:
toronto_merged = df2.head(103)
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood


toronto_merged

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,2
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,3
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,3
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,4
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029,4
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577,4
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476,2
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848,1


In [87]:
toronto_merged_Kmeans_cluster = toronto_merged[toronto_merged['Borough'].str.contains('Toronto')]

In [88]:
toronto_merged_Kmeans_cluster

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0
43,M4M,East Toronto,Studio District,43.659526,-79.340923,4
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,2
47,M4S,Central Toronto,Davisville,43.704324,-79.38879,4
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,3
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049,2


In [61]:
!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    altair-3.1.0               |           py36_0         724 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be 