# Applied Data Science Capstone
## Batle of Neighborhoods
### Week 2 - Code
### Author: Miguel Burg Demay
#### July 2021

# Code summary

The code has 5  main steps: 
1. Data preparation: prepare NY and Toronto data for processing - it includes Foursquare data
2. Select places in NY: selecting the neighborhoods in New York which the client likes or would like to live. 
3. Model training: train a model for NY data 
4. Place prediction: use the developed model and Toronto Foursquare data to predict the neighborhoods in Toronto where the client should live
5. Results.




# Data preparation

In [348]:
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression

## Preparing NY data

In [349]:
with open('newyork_data.json') as file:
    newyork_data = json.load(file)


In [350]:
columns = ['Borough', 'NY_Neighborhood', 'Latitude', 'Longitude']
ny_data = pd.DataFrame(columns=columns)

In [351]:
ny_info = newyork_data['features']

for data in ny_info:
    name = data['properties']['name']
    borough = data['properties']['borough']
    longitude = data['geometry']['coordinates'][0]
    latitude = data['geometry']['coordinates'][1]
    ny_data = ny_data.append({'Borough':borough,
                    'NY_Neighborhood':name,
                    'Longitude': longitude,
                    'Latitude': latitude},ignore_index=True)
        


In [5]:
csv_credentials = pd.read_csv('fs.csv')
CLIENT_ID = csv_credentials['CLIENT_ID'][0]
CLIENT_SECRET = csv_credentials['CLIENT_SECRET'][0]
VERSION = VERSION = '20210630' 
LIMIT = 100
radius=500


In [6]:
columns = ['NY_Neighborhood','Venue_categories']
ny_venues=pd.DataFrame(columns=columns)

In [10]:
for i in range(ny_data.shape[0]):
    row = ny_data.iloc[[i]]
    latitude = row['Latitude'][i]
    longitude = row['Longitude'][i]
    neighborhood = row['NY_Neighborhood'][i]

    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, latitude,  longitude,radius, LIMIT)
    results = requests.get(url).json()
    venues=pd.json_normalize(results['response']['groups'][0]['items'])
    for k in range(venues.shape[0]):
         ny_venues = ny_venues.append([{'Neighborhood': neighborhood, 'Venue_categories':venues.loc[k,'venue.categories'][0]['name']}],ignore_index=True)

In [352]:
ny_venues.shape

(10197, 3)

In [353]:
ny_one_hot = pd.get_dummies(ny_venues['Venue_categories'])    
len(ny_venues['Venue_categories'].unique())

431

In [355]:
ny_one_hot.insert(0,'NY_Neighborhood',ny_venues['Neighborhood'])


In [356]:
ny_data=pd.merge(ny_data,ny_one_hot,on='NY_Neighborhood')
ny_data

Unnamed: 0,Borough,NY_Neighborhood,Latitude,Longitude,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Terminal,...,Waste Facility,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yemeni Restaurant,Yoga Studio
0,Bronx,Wakefield,40.894705,-73.847201,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Bronx,Wakefield,40.894705,-73.847201,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Bronx,Wakefield,40.894705,-73.847201,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Bronx,Wakefield,40.894705,-73.847201,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Bronx,Wakefield,40.894705,-73.847201,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10541,Queens,Queensbridge,40.756091,-73.945631,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10542,Staten Island,Fox Hills,40.617311,-74.081740,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10543,Staten Island,Fox Hills,40.617311,-74.081740,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10544,Staten Island,Fox Hills,40.617311,-74.081740,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [357]:
ny_data_grouped = ny_data.groupby('NY_Neighborhood').sum()
ny_data_grouped.reset_index(inplace=True)
ny_data_grouped

Unnamed: 0,NY_Neighborhood,Latitude,Longitude,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Terminal,American Restaurant,...,Waste Facility,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yemeni Restaurant,Yoga Studio
0,Allerton,1225.973636,-2215.779559,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Annadale,445.919256,-815.964035,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
2,Arden Heights,243.295715,-445.115320,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Arlington,284.447276,-519.155729,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,Arrochar,852.522564,-1555.409596,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,Woodhaven,976.557285,-1772.594651,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
296,Woodlawn,1022.456815,-1846.682874,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
297,Woodrow,729.755417,-1335.694425,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
298,Woodside,3178.215229,-5764.343650,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0


## Preparing Toronto Data

In [358]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_data = requests.get(url).text
soup = bs(wiki_data,"html5lib")
table = soup.find('table')

In [359]:
tr=[]
for row in table.findAll('td'):
    cell={}
    if row.span.text != 'Not assigned':
        cell['Postal Code'] = row.p.text[:3]
        cell['Borough'] = row.span.text.split('(')[0]
        cell['Tr_Neighborhood'] = row.span.text.split('(')[1].strip(')').replace(' /',',').replace(')',' ').strip(' ')
        tr.append(cell)
tr_data = pd.DataFrame(tr)
tr_data['Borough']=tr_data['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [360]:
file = 'Geospatial_coordinates.csv' 
geo_data=pd.read_csv(file)
tr_data = pd.merge(tr_data,geo_data, on='Postal Code') # adding geo coordinates

In [56]:
columns = ['Tr_Neighborhood','Venue_categories']
tr_venues=pd.DataFrame(columns=columns)

In [57]:
for i in range(tr_data.shape[0]):
    row = tr_data.iloc[[i]]
    latitude = row['Latitude'][i]
    longitude = row['Longitude'][i]
    neighborhood = row['Tr_Neighborhood'][i]

    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, latitude,  longitude,radius, LIMIT)
    results = requests.get(url).json()
    venues=pd.json_normalize(results['response']['groups'][0]['items'])
    for k in range(venues.shape[0]):
         tr_venues = tr_venues.append([{'Tr_Neighborhood': neighborhood, 'Venue_categories':venues.loc[k,'venue.categories'][0]['name']}],ignore_index=True)

In [361]:
tr_one_hot = pd.get_dummies(tr_venues['Venue_categories'])

In [362]:
#tr_one_hot.drop('Tr_Neighborhood',axis=1,inplace=True)
len(tr_venues['Venue_categories'].unique())

263

In [363]:
tr_one_hot.insert(0,'Tr_Neighborhood',tr_venues['Tr_Neighborhood'])

In [364]:
tr_data=pd.merge(tr_data,tr_one_hot,on='Tr_Neighborhood')

In [365]:
tr_data_grouped = tr_data.groupby('Tr_Neighborhood').sum()
tr_data_grouped.reset_index(inplace=True)
tr_data_grouped

Unnamed: 0,Tr_Neighborhood,Latitude,Longitude,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,175.176801,-317.048118,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Alderwood, Long Branch",348.819310,-636.347873,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Bathurst Manor, Wilson Heights, Downsview North",1006.349551,-1827.171964,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Bayview Village,175.147789,-317.543900,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Bedford Park, Lawrence Manor East",1049.598780,-1906.073993,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Willowdale West,262.696418,-476.653556,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,"Willowdale, Newtonbrook",131.367159,-238.225478,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,Woburn,175.083968,-316.867670,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,Woodbine Heights,305.867407,-555.228721,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


# Select places in NY

In [367]:
client_A_list = ny_data_grouped[ny_data_grouped['Park']>0].index.tolist()
client_A_list= client_A_list + ny_data_grouped[ny_data_grouped['Bus Station']>0].index.tolist()
client_A_list.sort()
limit = int(len(client_A_list)*0.8)
client_A_list = client_A_list[:limit]
client_A_list=set(client_A_list)


In [368]:
client_B_list = ny_data_grouped[ny_data_grouped['American Restaurant']>0].index
client_B_list = client_B_list.tolist() + ny_data_grouped[ny_data_grouped['Baseball Field']>0].index.tolist()
client_B_list = client_B_list + ny_data_grouped[ny_data_grouped['Baseball Stadium']>0].index.tolist()
client_B_list = client_B_list + ny_data_grouped[ny_data_grouped['Basketball Court']>0].index.tolist()
client_B_list = client_B_list + ny_data_grouped[ny_data_grouped['Beer Bar']>0].index.tolist()
client_B_list = client_B_list + ny_data_grouped[ny_data_grouped['Boxing Gym']>0].index.tolist()
client_B_list.sort()
limit = int(len(client_B_list)*0.6)
client_B_list = client_B_list[:limit]
client_B_list=set(client_B_list)

In [369]:
client_C_list = ny_data_grouped[ny_data_grouped['Comedy Club']>0].index.tolist()
client_C_list = client_C_list + ny_data_grouped[ny_data_grouped['Bookstore']>0].index.tolist()
client_C_list = client_C_list + ny_data_grouped[ny_data_grouped['Brazilian Restaurant']>0].index.tolist()
client_C_list = client_C_list + ny_data_grouped[ny_data_grouped['Cocktail Bar']>0].index.tolist()
client_C_list.sort()
limit=int(len(client_C_list)*0.8)
client_C_list=client_C_list[:limit]
client_C_list = set(client_C_list)

In [370]:
ny_data_grouped['Client A']=0
ny_data_grouped['Client B']=0
ny_data_grouped['Client C']=0

In [371]:
for i in client_A_list: ny_data_grouped.loc[i,'Client A'] = 1
for i in client_B_list: ny_data_grouped.loc[i,'Client B'] = 1
for i in client_C_list: ny_data_grouped.loc[i,'Client C'] = 1    

# Model Training|

In [372]:
x_list_ny = ny_data_grouped.columns[3:-3]

In [373]:
x_list_tr = tr_data_grouped.columns[3:]

In [374]:
x_list=[]
for i in x_list_tr:
    if i in x_list_ny: x_list.append(i)


In [375]:
len(x_list)

237

In [376]:
X = np.asarray(ny_data_grouped[x_list])

In [377]:
y_client_A = np.asarray(ny_data_grouped['Client A'])
y_client_B = np.asarray(ny_data_grouped['Client B'])
y_client_C = np.asarray(ny_data_grouped['Client C'])

In [378]:
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split( X, y_client_A, test_size=0.2, random_state=4)
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split( X, y_client_B, test_size=0.2, random_state=4)
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split( X, y_client_C, test_size=0.2, random_state=4)
print ('Train set:', X_train_A.shape,  y_train_A.shape, X_train_B.shape, y_train_B.shape, X_train_C.shape, y_train_C.shape)
print ('Test set:', X_test_A.shape,  y_test_A.shape, X_test_B.shape,  y_test_B.shape, X_test_C.shape,  y_test_C.shape )

Train set: (240, 237) (240,) (240, 237) (240,) (240, 237) (240,)
Test set: (60, 237) (60,) (60, 237) (60,) (60, 237) (60,)


In [379]:
 len(X)

300

In [380]:
LR_A = LogisticRegression(C=0.01, solver='liblinear').fit(X_train_A,y_train_A)
LR_B = LogisticRegression(C=0.01, solver='liblinear').fit(X_train_B,y_train_B)
LR_C = LogisticRegression(C=0.01, solver='liblinear').fit(X_train_C,y_train_C)

In [381]:
yhat_A = LR_A.predict(X_test_A)
yhat_B = LR_B.predict(X_test_B)
yhat_C = LR_C.predict(X_test_C)

In [382]:
yhat_prob_A = LR_A.predict_proba(X_test_A)
yhat_prob_B = LR_B.predict_proba(X_test_B)
yhat_prob_C = LR_B.predict_proba(X_test_C)


In [383]:
jaccard_score(y_test_A, yhat_A,pos_label=0)


0.6415094339622641

In [384]:
jaccard_score(y_test_B, yhat_B,pos_label=0)

0.7192982456140351

In [385]:
jaccard_score(y_test_C, yhat_C,pos_label=0)

0.875

# Place prediction: in which neighborhood would the client rather to live?

In [386]:
X_tr= np.asarray(tr_data_grouped[x_list])

In [387]:
yhat_tr_A = LR_A.predict(X_tr)
yhat_prob_tr_A = LR_A.predict_proba(X_tr)

In [388]:
yhat_tr_B = LR_B.predict(X_tr)
yhat_prob_tr_B = LR_B.predict_proba(X_tr)

In [389]:
yhat_tr_C = LR_C.predict(X_tr)
yhat_prob_tr_C = LR_C.predict_proba(X_tr)

In [391]:
#tr_data_grouped.drop('Likelihood_A',inplace=True,axis=1)
#tr_data_grouped.drop('Likelihood_B',inplace=True,axis=1)
#tr_data_grouped.drop('Likelihood_C',inplace=True,axis=1)

# Results

In [392]:
tr_data_grouped.insert(1,'Likelihood_A',yhat_prob_tr_A[:,1].tolist())

In [393]:
tr_data_grouped.insert(1,'Likelihood_B',yhat_prob_tr_B[:,1].tolist())

In [394]:
tr_data_grouped.insert(1,'Likelihood_C',yhat_prob_tr_C[:,1].tolist())

In [395]:
not_recommended_A = tr_data_grouped.sort_values('Likelihood_A').head()[['Tr_Neighborhood','Likelihood_A']]
not_recommended_A


Unnamed: 0,Tr_Neighborhood,Likelihood_A
68,"Richmond, Adelaide, King",0.225488
34,"First Canadian Place, Underground city",0.232702
87,"Toronto Dominion Centre, Design Exchange",0.263346
36,"Garden District, Ryerson",0.29001
72,"Runnymede, Swansea",0.298846


In [396]:
recommended_A = tr_data_grouped.sort_values('Likelihood_A',ascending=False).head()[['Tr_Neighborhood','Likelihood_A']]
recommended_A

Unnamed: 0,Tr_Neighborhood,Likelihood_A
67,"Regent Park, Harbourfront",0.596527
69,Rosedale,0.590186
96,"Willowdale, Newtonbrook",0.571975
99,York Mills West,0.569812
9,Caledonia-Fairbanks,0.569779


In [397]:
not_recommended_B = tr_data_grouped.sort_values('Likelihood_B').head(5)[['Tr_Neighborhood','Likelihood_B']]
not_recommended_B

Unnamed: 0,Tr_Neighborhood,Likelihood_B
75,"South Steeles, Silverstone, Humbergate, Jamest...",0.343186
2,"Bathurst Manor, Wilson Heights, Downsview North",0.352838
65,"Parkview Hill, Woodbine Gardens",0.354297
1,"Alderwood, Long Branch",0.358251
28,"Dufferin, Dovercourt Village",0.35938


In [398]:
recommended_B = tr_data_grouped.sort_values('Likelihood_B',ascending=False).head()[['Tr_Neighborhood','Likelihood_B']]
recommended_B

Unnamed: 0,Tr_Neighborhood,Likelihood_B
17,"Commerce Court, Victoria Hotel",0.7649
36,"Garden District, Ryerson",0.707946
87,"Toronto Dominion Centre, Design Exchange",0.673757
76,St. James Town,0.664976
31,Enclave of M5E,0.642564


In [399]:
not_recommended_C = tr_data_grouped.sort_values('Likelihood_C').head()[['Tr_Neighborhood','Likelihood_C']]
not_recommended_C

Unnamed: 0,Tr_Neighborhood,Likelihood_C
15,"Clarks Corners, Tam O'Shanter, Sullivan",0.241236
2,"Bathurst Manor, Wilson Heights, Downsview North",0.260574
46,"India Bazaar, The Beaches West",0.263605
86,Thorncliffe Park,0.272554
75,"South Steeles, Silverstone, Humbergate, Jamest...",0.275503


In [400]:
recommended_C = tr_data_grouped.sort_values('Likelihood_C',ascending=False).head()[['Tr_Neighborhood','Likelihood_C']]
recommended_C

Unnamed: 0,Tr_Neighborhood,Likelihood_C
17,"Commerce Court, Victoria Hotel",0.876741
36,"Garden District, Ryerson",0.820029
87,"Toronto Dominion Centre, Design Exchange",0.80635
40,"Harbourfront East, Union Station, Toronto Islands",0.791681
11,Central Bay Street,0.779451
