The idea here is to cluster neighbourhoods with similar pickup profiles to find similar neighbourhoods

In [29]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [60]:
# Data Import
df = pd.read_csv('../data/processed/zones_data_August.csv')
df.head()

Unnamed: 0,pu_time,do_time,pu_nbrhood,do_nbrhood,pass_count,distance,fare,tip,total,payment_type,trip_type
0,08/29/2015 05:16:18 PM,08/29/2015 05:23:48 PM,MN33,MN11,1,1.9,8.0,0.0,8.8,1,1.0
1,08/28/2015 08:05:28 PM,08/28/2015 08:15:25 PM,BK73,BK75,1,2.3,9.5,2.16,12.96,1,1.0
2,08/01/2015 01:07:39 PM,08/01/2015 01:21:50 PM,BK42,BK34,1,2.7,12.0,2.55,15.35,1,1.0
3,08/10/2015 05:35:00 PM,08/10/2015 05:50:06 PM,QN18,QN21,1,3.08,13.0,2.96,17.76,1,1.0
4,08/07/2015 09:18:39 PM,08/07/2015 09:21:57 PM,QN68,QN70,1,0.69,4.5,0.0,5.8,2,1.0


In [61]:
df.shape

(1532343, 11)

In [67]:
# Drop XX00 neighbours
df = df[df.pu_nbrhood != 'XX00']
df.head()

Unnamed: 0,pu_time,do_time,pu_nbrhood,do_nbrhood,pass_count,distance,fare,tip,total,payment_type,trip_type
0,08/29/2015 05:16:18 PM,08/29/2015 05:23:48 PM,MN33,MN11,1,1.9,8.0,0.0,8.8,1,1.0
1,08/28/2015 08:05:28 PM,08/28/2015 08:15:25 PM,BK73,BK75,1,2.3,9.5,2.16,12.96,1,1.0
2,08/01/2015 01:07:39 PM,08/01/2015 01:21:50 PM,BK42,BK34,1,2.7,12.0,2.55,15.35,1,1.0
3,08/10/2015 05:35:00 PM,08/10/2015 05:50:06 PM,QN18,QN21,1,3.08,13.0,2.96,17.76,1,1.0
4,08/07/2015 09:18:39 PM,08/07/2015 09:21:57 PM,QN68,QN70,1,0.69,4.5,0.0,5.8,2,1.0


In [62]:
pu_group = df.groupby('pu_nbrhood').count()
pu_group.head()

Unnamed: 0_level_0,pu_time,do_time,do_nbrhood,pass_count,distance,fare,tip,total,payment_type,trip_type
pu_nbrhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BK09,30284,30284,30284,30284,30284,30284,30284,30284,30284,30284
BK17,4907,4907,4907,4907,4907,4907,4907,4907,4907,4907
BK19,429,429,429,429,429,429,429,429,429,429
BK21,2530,2530,2530,2530,2530,2530,2530,2530,2530,2530
BK23,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118


### Data Prep
For the purposes of clustering, I want an hourly profile of pickup numbers per neighbourhood.

What do I need to do:
- get a list of the neighbourhoods
- convert pu times to pandas datetime and isolate hours
- create function to cycle through each hour and each neighbour and create unique column for each hour

In [63]:
# Create list of NYC neighbourhoods
vector_df = df.pu_nbrhood.unique()
vector_df

array(['MN33', 'BK73', 'BK42', 'QN18', 'QN68', 'BK61', 'MN34', 'BK09',
       'MN11', 'BK60', 'QN31', 'BK33', 'BK37', 'BK72', 'QN29', 'BX31',
       'XX00', 'BK68', 'MN04', 'QN02', 'BK63', 'BK35', 'MN03', 'QN28',
       'BX05', 'QN71', 'BK69', 'BK77', 'MN36', 'BK32', 'BK78', 'MN09',
       'QN63', 'BX35', 'BK90', 'MN35', 'BK75', 'QN61', 'BK38', 'BK31',
       'QN17', 'BX34', 'QN70', 'BX63', 'QN22', 'BK64', 'QN21', 'BK96',
       'QN50', 'BK21', 'BK76', 'BK81', 'QN72', 'BX39', 'QN60', 'BK82',
       'BX28', 'QN52', 'BX01', 'QN26', 'BX26', 'BK17', 'BX55', 'QN54',
       'MN06', 'BX14', 'BX43', 'QN27', 'BX37', 'BK58', 'BK83', 'BK91',
       'BK79', 'QN01', 'BX27', 'MN40', 'QN35', 'BX17', 'QN53', 'BX07',
       'BX75', 'QN19', 'BX40', 'QN25', 'QN62', 'BK41', 'BX46', 'BX30',
       'BX41', 'QN37', 'BX06', 'BX29', 'BX36', 'BK40', 'BK95', 'QN30',
       'QN55', 'MN31', 'BK45', 'BX08', 'BK34', 'BK44', 'BK46', 'QN34',
       'BK23', 'BK88', 'BX33', 'BK27', 'QN20', 'QN06', 'BK29', 'BX49',
      

In [64]:
# Find total trips per neighbourhood
count_df = df.groupby('pu_nbrhood').count()
count_df = count_df.loc[:, ['pu_time']]
count_df.columns = ['total']
count_df.head()

Unnamed: 0_level_0,total
pu_nbrhood,Unnamed: 1_level_1
BK09,30284
BK17,4907
BK19,429
BK21,2530
BK23,1118


In [65]:
nbrhood_df = df.loc[:,['pu_time', 'pu_nbrhood']]
nbrhood_df['pu_time'] = pd.to_datetime(nbrhood_df['pu_time'])

In [66]:
nbrhood_df['day'] = nbrhood_df['pu_time'].dt.day
nbrhood_df['hour'] = nbrhood_df['pu_time'].dt.hour
nbrhood_df.head()

Unnamed: 0,pu_time,pu_nbrhood,day,hour
0,2015-08-29 17:16:18,MN33,29,17
1,2015-08-28 20:05:28,BK73,28,20
2,2015-08-01 13:07:39,BK42,1,13
3,2015-08-10 17:35:00,QN18,10,17
4,2015-08-07 21:18:39,QN68,7,21


In [36]:
# df_temp = nbrhood_df[nbrhood_df.hour == 1]
# df_temp = df_temp.groupby('pu_nbrhood').count()

# temp_name = 'hour_{}'.format(1)

# df_temp[temp_name] = df_temp['hour']
# df_temp = df_temp.loc[:, temp_name]
# df_temp = df_temp.to_frame()
# df_temp
# count_df.join(df_temp, how='left').head()

In [37]:
for i in range(1, 25):
    df_temp = nbrhood_df[nbrhood_df.hour == i]
    df_temp = df_temp.groupby('pu_nbrhood').count()

    temp_name = 'hour_{}'.format(i)
    
    df_temp[temp_name] = df_temp['hour']
    df_temp = df_temp.loc[:, temp_name]
    df_temp = df_temp.to_frame()
    count_df = count_df.join(df_temp, how='left')

In [38]:
count_df.fillna(0, inplace=True)
count_df.head()

Unnamed: 0_level_0,total,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9,...,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,hour_24
pu_nbrhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BK09,570,13.0,9.0,0.0,2.0,5.0,8.0,12.0,21.0,16.0,...,55.0,56.0,25.0,30.0,35.0,34.0,36.0,24.0,27.0,0.0
BK17,98,2.0,1.0,2.0,0.0,0.0,2.0,1.0,3.0,3.0,...,4.0,7.0,5.0,9.0,6.0,6.0,7.0,8.0,4.0,0.0
BK19,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0,0.0,0.0
BK21,32,1.0,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,...,2.0,4.0,2.0,3.0,4.0,3.0,4.0,0.0,1.0,0.0
BK23,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [39]:
for i in range(1, 25, 2):
    temp_name1 = 'hour_{}'.format(i)
    temp_name2 = 'hour_{}'.format(i+1)

    final_df[[temp_name1, temp_name2]] = count_df[[temp_name1, temp_name2]].div(count_df.total, axis=0)


In [52]:
# Reorder columns
cols = final_df.columns.tolist()
cols = cols[-22:] + cols[:-22]
final_df = final_df[cols]
final_df.head()

Unnamed: 0_level_0,hour_5,hour_6,hour_7,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14,...,hour_19,hour_20,hour_21,hour_22,hour_23,hour_24,hour_1,hour_2,hour_3,hour_4
pu_nbrhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BK09,0.008772,0.014035,0.021053,0.036842,0.02807,0.02807,0.057895,0.052632,0.054386,0.068421,...,0.061404,0.059649,0.063158,0.042105,0.047368,0.0,0.022807,0.015789,0.0,0.003509
BK17,0.0,0.020408,0.010204,0.030612,0.030612,0.020408,0.040816,0.020408,0.05102,0.071429,...,0.061224,0.061224,0.071429,0.081633,0.040816,0.0,0.020408,0.010204,0.020408,0.0
BK19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181818,0.090909,...,0.090909,0.090909,0.090909,0.181818,0.0,0.0,0.0,0.0,0.0,0.0
BK21,0.0,0.0,0.0,0.03125,0.0,0.0,0.03125,0.09375,0.0,0.0,...,0.125,0.09375,0.125,0.0,0.03125,0.0,0.03125,0.0,0.03125,0.0625
BK23,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25


In [69]:
zone_names = pd.read_csv('../code/zones.csv', index_col=0)
zone_names

Unnamed: 0_level_0,borough,zone,service_zone,nta_code
location_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,EWR,Newark Airport,EWR,NJ01
2,Queens,Jamaica Bay,Boro Zone,QN61
3,Bronx,Allerton/Pelham Gardens,Boro Zone,BX31
4,Manhattan,Alphabet City,Yellow Zone,MN22
5,Staten Island,Arden Heights,Boro Zone,SI48
6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone,SI14
7,Queens,Astoria,Boro Zone,QN70
8,Queens,Astoria Park,Boro Zone,QN70
9,Queens,Auburndale,Boro Zone,QN48
10,Queens,Baisley Park,Boro Zone,QN76


In [54]:
from sklearn.cluster import KMeans

In [57]:
kmeans = KMeans(n_clusters=5, random_state=0).fit(final_df)

In [58]:
kmeans.labels_

array([0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 3, 0], dtype=int32)