In [245]:
import numpy as np
import pandas as pd
import json
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

In [246]:
url = "https://pl.wikipedia.org/wiki/Podzia%C5%82_administracyjny_Warszawy"
table = pd.read_html(url, header=0)[0]
table.rename(columns={"Dzielnica": "District", 
                          "Liczba mieszkańców(1.01.2019)[1]": "Population",
                          "Powierzchnia[km²][1]": "Area"}, inplace=True)
table.head()

Unnamed: 0,District,Population,Gęstość zaludnienia[osób/km²](1.01.2019)[1],Area
0,Mokotów,217 683,6146,3542
1,Praga-Południe,179 836,8036,2238
2,Ursynów,150 668,3441,4379
3,Wola,140 958,7319,1926
4,Bielany,131 910,4079,3234


In [247]:
df = table.filter(items=['District', 'Population', 'Area'])
df['Area'] = df['Area']/100
df['Radius'] = np.sqrt(df['Area'])/2.5*1000
df['Radius'] = df['Radius'].round(0)
df.head()

Unnamed: 0,District,Population,Area,Radius
0,Mokotów,217 683,35.42,2381.0
1,Praga-Południe,179 836,22.38,1892.0
2,Ursynów,150 668,43.79,2647.0
3,Wola,140 958,19.26,1755.0
4,Bielany,131 910,32.34,2275.0


In [248]:
geolocator = Nominatim(user_agent="wwa_explorer")
for d in df['District']:
    location = geolocator.geocode(d+' , Warsaw')
    df.loc[df['District']==d, 'Latitude'] = location.latitude
    df.loc[df['District']==d, 'Longitude'] = location.longitude

In [249]:
df.head()

Unnamed: 0,District,Population,Area,Radius,Latitude,Longitude
0,Mokotów,217 683,35.42,2381.0,52.193987,21.045781
1,Praga-Południe,179 836,22.38,1892.0,52.237396,21.071258
2,Ursynów,150 668,43.79,2647.0,52.141039,21.032321
3,Wola,140 958,19.26,1755.0,52.236238,20.954781
4,Bielany,131 910,32.34,2275.0,52.285043,20.943949


In [250]:
map_to = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, district, rad in zip(df['Latitude'], df['Longitude'], df['District'], df['Radius']):
    label = '{}'.format(district)
    label = folium.Popup(label, parse_html=True)
    folium.Circle(
        [lat, lng],
        radius=rad,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_to)  
    
map_to

In [251]:
CLIENT_ID = 'A3RTG2OICPJV1JDY0UO24PM0DSO3VZXSSHM33BNV5NCGWP1X'
CLIENT_SECRET = '3UJK5I5X00K4ZUSJ41WIRYHNMIVBZ0EECLS1RHULT1DWLMIZ'
VERSION = '20200701'

In [252]:
def getNearbyVenues(names, latitudes, longitudes, radius, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng, rad in zip(names, latitudes, longitudes, radius):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&section=food&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            rad, 
            LIMIT)
        
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['District', 
                  'Latitude', 
                  'Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [253]:
df_venues = getNearbyVenues(names=df['District'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude'],
                                   radius=df['Radius']
                                  )

Mokotów
Praga-Południe
Ursynów
Wola
Bielany
Targówek
Bemowo
Śródmieście
Białołęka
Ochota
Wawer
Praga-Północ
Ursus
Żoliborz
Włochy
Wilanów
Wesoła
Rembertów


In [254]:
df_venues.head()

Unnamed: 0,District,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Mokotów,52.193987,21.045781,Green Caffè Nero,52.198116,21.047283,Café
1,Mokotów,52.193987,21.045781,Stary Dom,52.195544,21.024004,Polish Restaurant
2,Mokotów,52.193987,21.045781,NABO Cafe,52.189653,21.068752,Scandinavian Restaurant
3,Mokotów,52.193987,21.045781,La Civetta,52.189742,21.056466,Italian Restaurant
4,Mokotów,52.193987,21.045781,MEZZE hummus & falafel,52.203548,21.022705,Falafel Restaurant


In [255]:
df_res = df_venues[df_venues['Venue Category'].str.contains(' Restaurant', regex=False)].reset_index(drop=True)
df_res['Cousine'] = df_res['Venue Category'].str.rsplit(' ', 1).str[0]
df_res.head()

Unnamed: 0,District,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Cousine
0,Mokotów,52.193987,21.045781,Stary Dom,52.195544,21.024004,Polish Restaurant,Polish
1,Mokotów,52.193987,21.045781,NABO Cafe,52.189653,21.068752,Scandinavian Restaurant,Scandinavian
2,Mokotów,52.193987,21.045781,La Civetta,52.189742,21.056466,Italian Restaurant,Italian
3,Mokotów,52.193987,21.045781,MEZZE hummus & falafel,52.203548,21.022705,Falafel Restaurant,Falafel
4,Mokotów,52.193987,21.045781,Boston Port,52.197249,21.024606,Seafood Restaurant,Seafood


In [256]:
df_res['Cousine'].unique()

array(['Polish', 'Scandinavian', 'Italian', 'Falafel', 'Seafood', 'Asian',
       'Eastern European', 'Korean', 'Kebab', 'Ramen', 'Ukrainian',
       'Thai', 'Vegetarian / Vegan', 'Mediterranean', 'French',
       'Middle Eastern', 'Sushi', 'Modern European', 'Chinese', 'Indian',
       'Vietnamese', 'Turkish', 'Greek', 'Dim Sum', 'Hungarian',
       'Spanish', 'Bulgarian', 'Mexican', 'Fast Food', 'American',
       'Cantonese', 'German', 'Dumpling', 'Japanese', 'New American',
       'Doner', 'Hawaiian', 'Udon', 'Kosher', 'Caucasian', 'Lebanese',
       'Israeli', 'Comfort Food', 'African', 'Caribbean'], dtype=object)

In [257]:
df_res.loc[df_res['Cousine']=='Falafel', 'Cousine'] = 'Vegetarian / Vegan'
df_res.loc[df_res['Cousine']=='Kebab', 'Cousine'] = 'Turkish'
df_res.loc[df_res['Cousine']=='Ramen', 'Cousine'] = 'Japanese'
df_res.loc[df_res['Cousine']=='Sushi', 'Cousine'] = 'Japanese'
df_res.loc[df_res['Cousine']=='Dim Sum', 'Cousine'] = 'Chinese'
df_res.loc[df_res['Cousine']=='New American', 'Cousine'] = 'American'
df_res.loc[df_res['Cousine']=='Doner', 'Cousine'] = 'Turkish'
df_res.loc[df_res['Cousine']=='Udon', 'Cousine'] = 'Japanese'
df_res.loc[df_res['Cousine']=='Kosher', 'Cousine'] = 'Israeli'
df_res.loc[df_res['Cousine']=='Dumpling', 'Cousine'] = 'Polish'
df_res.loc[df_res['Cousine']=='Fast Food', 'Cousine'] = 'American'
df_res.loc[df_res['Cousine']=='Comfort Food', 'Cousine'] = 'Polish'

In [258]:
df_res['Cousine'].unique()

array(['Polish', 'Scandinavian', 'Italian', 'Vegetarian / Vegan',
       'Seafood', 'Asian', 'Eastern European', 'Korean', 'Turkish',
       'Japanese', 'Ukrainian', 'Thai', 'Mediterranean', 'French',
       'Middle Eastern', 'Modern European', 'Chinese', 'Indian',
       'Vietnamese', 'Greek', 'Hungarian', 'Spanish', 'Bulgarian',
       'Mexican', 'American', 'Cantonese', 'German', 'Hawaiian',
       'Israeli', 'Caucasian', 'Lebanese', 'African', 'Caribbean'],
      dtype=object)

In [259]:
df_oh = pd.get_dummies(df_res[['Cousine']], prefix="", prefix_sep="")

df_oh['District'] = df_res['District'] 

fixed_columns = [df_oh.columns[-1]] + list(df_oh.columns[:-1])
df_oh = df_oh[fixed_columns]

df_oh

Unnamed: 0,District,African,American,Asian,Bulgarian,Cantonese,Caribbean,Caucasian,Chinese,Eastern European,...,Modern European,Polish,Scandinavian,Seafood,Spanish,Thai,Turkish,Ukrainian,Vegetarian / Vegan,Vietnamese
0,Mokotów,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,Mokotów,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,Mokotów,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Mokotów,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,Mokotów,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,Mokotów,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Mokotów,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7,Mokotów,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,Mokotów,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Mokotów,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [260]:
df_g = df_oh.groupby('District').mean().reset_index()
df_g

Unnamed: 0,District,African,American,Asian,Bulgarian,Cantonese,Caribbean,Caucasian,Chinese,Eastern European,...,Modern European,Polish,Scandinavian,Seafood,Spanish,Thai,Turkish,Ukrainian,Vegetarian / Vegan,Vietnamese
0,Bemowo,0.0,0.272727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0
1,Białołęka,0.0,0.25,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0,0.0625,0.125
2,Bielany,0.0,0.105263,0.315789,0.0,0.0,0.0,0.0,0.052632,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.105263
3,Mokotów,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.041667,0.041667,...,0.020833,0.104167,0.020833,0.041667,0.0,0.0625,0.0625,0.020833,0.041667,0.020833
4,Ochota,0.0,0.275862,0.068966,0.0,0.0,0.0,0.0,0.034483,0.0,...,0.0,0.068966,0.0,0.034483,0.0,0.034483,0.034483,0.0,0.034483,0.068966
5,Praga-Południe,0.0,0.020408,0.081633,0.020408,0.0,0.0,0.0,0.040816,0.040816,...,0.020408,0.040816,0.0,0.0,0.020408,0.020408,0.061224,0.020408,0.040816,0.020408
6,Praga-Północ,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.166667,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0
7,Targówek,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0
8,Ursus,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2
9,Ursynów,0.0,0.081633,0.020408,0.0,0.020408,0.0,0.0,0.142857,0.081633,...,0.020408,0.020408,0.0,0.0,0.0,0.020408,0.040816,0.0,0.061224,0.020408


In [261]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [262]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['District']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Cuisine'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Cuisine'.format(ind+1))

# create a new dataframe
df_venues_sorted = pd.DataFrame(columns=columns)
df_venues_sorted['District'] = df_g['District']

for ind in np.arange(df_g.shape[0]):
    df_venues_sorted.iloc[ind, 1:] = return_most_common_venues(df_g.iloc[ind, :], num_top_venues)

df_venues_sorted

Unnamed: 0,District,1st Most Common Cuisine,2nd Most Common Cuisine,3rd Most Common Cuisine,4th Most Common Cuisine,5th Most Common Cuisine
0,Bemowo,Italian,American,Indian,Japanese,Seafood
1,Białołęka,American,Italian,Vietnamese,Turkish,Mediterranean
2,Bielany,Asian,Vietnamese,American,Indian,Japanese
3,Mokotów,Italian,Polish,Japanese,Middle Eastern,Asian
4,Ochota,American,Italian,Japanese,Polish,Asian
5,Praga-Południe,Italian,Asian,Japanese,Turkish,Greek
6,Praga-Północ,American,Turkish,Eastern European,Chinese,Vietnamese
7,Targówek,Eastern European,Vegetarian / Vegan,American,Japanese,Hungarian
8,Ursus,Vietnamese,American,Japanese,Mediterranean,Mexican
9,Ursynów,Chinese,Japanese,Italian,Indian,American


In [263]:
# set number of clusters
kclusters = 4

df_grouped_clustering = df_g.drop('District', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([2, 2, 2, 0, 2, 0, 1, 3, 2, 0, 2, 0, 2, 1, 0, 0])

In [264]:
# add clustering labels
df_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [265]:
df_merged = df

# merge data to add latitude/longitude for each neighborhood
df_merged = df_merged.join(df_venues_sorted.set_index('District'), how='inner', on='District')

df_merged.head() # check the last columns!

Unnamed: 0,District,Population,Area,Radius,Latitude,Longitude,Cluster Labels,1st Most Common Cuisine,2nd Most Common Cuisine,3rd Most Common Cuisine,4th Most Common Cuisine,5th Most Common Cuisine
0,Mokotów,217 683,35.42,2381.0,52.193987,21.045781,0,Italian,Polish,Japanese,Middle Eastern,Asian
1,Praga-Południe,179 836,22.38,1892.0,52.237396,21.071258,0,Italian,Asian,Japanese,Turkish,Greek
2,Ursynów,150 668,43.79,2647.0,52.141039,21.032321,0,Chinese,Japanese,Italian,Indian,American
3,Wola,140 958,19.26,1755.0,52.236238,20.954781,2,Japanese,Italian,American,Asian,Chinese
4,Bielany,131 910,32.34,2275.0,52.285043,20.943949,2,Asian,Vietnamese,American,Indian,Japanese


In [266]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['District'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [267]:
df_merged.loc[df_merged['Cluster Labels'] == 0, df_merged.columns[[0,1] + list(range(5, df_merged.shape[1]))]]

Unnamed: 0,District,Population,Longitude,Cluster Labels,1st Most Common Cuisine,2nd Most Common Cuisine,3rd Most Common Cuisine,4th Most Common Cuisine,5th Most Common Cuisine
0,Mokotów,217 683,21.045781,0,Italian,Polish,Japanese,Middle Eastern,Asian
1,Praga-Południe,179 836,21.071258,0,Italian,Asian,Japanese,Turkish,Greek
2,Ursynów,150 668,21.032321,0,Chinese,Japanese,Italian,Indian,American
7,Śródmieście,115 395,21.019067,0,Vegetarian / Vegan,Polish,Japanese,Italian,Mexican
13,Żoliborz,52 293,20.979698,0,Italian,Vegetarian / Vegan,Japanese,Thai,Eastern European
15,Wilanów,40 060,21.110441,0,Italian,Japanese,Asian,Eastern European,Polish


In [268]:
df_merged.loc[df_merged['Cluster Labels'] == 1, df_merged.columns[[0,1] + list(range(5, df_merged.shape[1]))]]

Unnamed: 0,District,Population,Longitude,Cluster Labels,1st Most Common Cuisine,2nd Most Common Cuisine,3rd Most Common Cuisine,4th Most Common Cuisine,5th Most Common Cuisine
11,Praga-Północ,64 113,21.027344,1,American,Turkish,Eastern European,Chinese,Vietnamese
14,Włochy,42 862,20.948438,1,American,Italian,Turkish,Asian,Mediterranean


In [269]:
df_merged.loc[df_merged['Cluster Labels'] == 2, df_merged.columns[[0,1] + list(range(5, df_merged.shape[1]))]]

Unnamed: 0,District,Population,Longitude,Cluster Labels,1st Most Common Cuisine,2nd Most Common Cuisine,3rd Most Common Cuisine,4th Most Common Cuisine,5th Most Common Cuisine
3,Wola,140 958,20.954781,2,Japanese,Italian,American,Asian,Chinese
4,Bielany,131 910,20.943949,2,Asian,Vietnamese,American,Indian,Japanese
6,Bemowo,123 932,20.913288,2,Italian,American,Indian,Japanese,Seafood
8,Białołęka,124 125,21.021177,2,American,Italian,Vietnamese,Turkish,Mediterranean
9,Ochota,82 774,20.97263,2,American,Italian,Japanese,Polish,Asian
10,Wawer,77 205,21.137083,2,Italian,American,Japanese,Turkish,Asian
12,Ursus,60 112,20.882899,2,Vietnamese,American,Japanese,Mediterranean,Mexican


In [270]:
df_merged.loc[df_merged['Cluster Labels'] == 3, df_merged.columns[[0,1] + list(range(5, df_merged.shape[1]))]]

Unnamed: 0,District,Population,Longitude,Cluster Labels,1st Most Common Cuisine,2nd Most Common Cuisine,3rd Most Common Cuisine,4th Most Common Cuisine,5th Most Common Cuisine
5,Targówek,124 279,21.058085,3,Eastern European,Vegetarian / Vegan,American,Japanese,Hungarian
