In [1]:
import numpy as np
import pandas as pd
import networkx as nx 
import matplotlib.pyplot as plt

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Data directories, given relative to the notebook's working directory.
# r_dir holds the raw inputs, c_dir the cleaned trip files.
# Both end with a slash because file names are appended by plain string concatenation.
r_dir = r'../data/raw_data/'
c_dir = r'../data/cleaned_data/'

In [3]:
# Load the taxi-zone lookup table from the raw-data directory and preview it.
# The columns used later in the notebook are LocationID (node id) and zone (display name).
tz=pd.read_csv(r_dir+'taxi_zones.csv')
tz.head()

Unnamed: 0,OBJECTID,Shape_Leng,the_geom,Shape_Area,zone,LocationID,borough
0,1,0.116357,MULTIPOLYGON (((-74.18445299999996 40.69499599...,0.000782,Newark Airport,1,EWR
1,2,0.43347,MULTIPOLYGON (((-73.82337597260663 40.63898704...,0.004866,Jamaica Bay,2,Queens
2,3,0.084341,MULTIPOLYGON (((-73.84792614099985 40.87134223...,0.000314,Allerton/Pelham Gardens,3,Bronx
3,4,0.043567,MULTIPOLYGON (((-73.97177410965318 40.72582128...,0.000112,Alphabet City,4,Manhattan
4,5,0.092146,MULTIPOLYGON (((-74.17421738099989 40.56256808...,0.000498,Arden Heights,5,Staten Island


In [4]:
# Total number of cells (rows x columns) in the zone table.
# Uses the DataFrame attribute instead of the bare `size`, which only exists
# in the namespace because of the %pylab star-import.
tz.size

1841

In [5]:
# Load the cleaned December 2018 yellow-cab trips and preview them.
# index_col=0 makes the first CSV column the row index instead of a data column.
taxi = pd.read_csv(c_dir+'yellow_12_2018.csv' , index_col=0 )
taxi.head()

  mask |= (ar1 == a)


Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,PULocationID,DOLocationID,DOW,Hour_of_Day,DOM
0,2018-12-01 00:28:22,2018-12-01 00:44:07,2,148,234,5,0,1
1,2018-12-01 00:52:29,2018-12-01 01:11:37,3,170,144,5,0,1
2,2018-12-01 00:12:52,2018-12-01 00:36:23,1,113,193,5,0,1
3,2018-12-01 00:35:08,2018-12-01 00:43:11,1,95,92,5,0,1
4,2018-12-01 00:21:54,2018-12-01 01:15:13,1,163,228,5,0,1


In [6]:
# Build the zone network: one node per LocationID listed in the zone table.
# nx.Graph is an undirected simple graph, so trip direction and the number of
# trips between two zones are not represented once edges are added.
taxig=nx.Graph()
taxig.add_nodes_from(tz['LocationID'])

In [7]:
# Keep only the pickup / dropoff zone ids of every trip.
# Columns are selected by name rather than by position so the cell keeps
# working (or fails loudly) if the column order of the CSV ever changes.
links = taxi[['PULocationID', 'DOLocationID']].copy()
links.head()

Unnamed: 0,PULocationID,DOLocationID
0,148,234
1,170,144
2,113,193
3,95,92
4,163,228


In [8]:
# Number of trips (rows) in links.
# len() replaces the bare `size(..., 0)` that is only available through %pylab.
len(links)

8173231

In [9]:
# Create the list of edges as (pickup, dropoff) tuples, one per trip.
# The two columns are zipped directly instead of looking up every element by
# index label: this avoids millions of per-element Series lookups and does not
# require the row labels to be exactly 0..EN-1.
EN = len(links)
edgelist = list(zip(links['PULocationID'], links['DOLocationID']))

In [10]:
# Connect every pickup zone to its dropoff zone. taxig is an nx.Graph, so
# repeated pairs collapse into a single undirected edge.
taxig.add_edges_from(edgelist)

In [11]:
# Most connected location: the node with the largest degree
# (the first such node in graph order if several tie).
LocDeg = dict(taxig.degree())
TopLoc = max(LocDeg, key=LocDeg.get)
print('Most connected taxi location is {0} with {1} connections'.format(TopLoc, LocDeg[TopLoc]))

Most connected taxi location is 132 with 260 connections


In [12]:
# Least connected location: the node with the smallest degree
# (the first such node in graph order if several tie).
# Note: TopLoc is reused here and now holds the *least* connected node.
LocDeg = dict(taxig.degree())
TopLoc = min(LocDeg, key=LocDeg.get)
print('Least connected taxi location is {0} with {1} connections'.format(TopLoc, LocDeg[TopLoc]))

Least connected taxi location is 103 with 0 connections


In [13]:
# Compute the network's average number of connections per location and its density.
# For an undirected graph with n nodes, density = average degree / (n - 1).
# np.mean is called explicitly instead of the bare `mean` injected by %pylab.
# NOTE(review): trips whose pickup and dropoff zone coincide create self-loops,
# which networkx counts twice in a node's degree - confirm whether they should
# be excluded from these figures.
k_avg = np.mean(list(LocDeg.values()))
dens = k_avg / (len(taxig) - 1)
print('Average number of connections per location is {0}; network density ={1}'.format(k_avg, dens))

Average number of connections per location is 156.3560606060606; network density =0.5945097361447171


In [14]:
# Lookup table from LocationID to zone name; if an id occurs more than once
# in tz, the last row wins.
Name = dict(zip(tz['LocationID'], tz['zone']))

In [15]:
# All-pairs shortest-path lengths between locations, stored as
# {source: {target: length}}. No edge weight is passed, so each length is a
# number of hops in the graph, not a travel time.
tt=dict(nx.shortest_path_length(taxig))

In [16]:
def topdict(d, tn, names=None):
    """Print the ``tn`` highest-scoring entries of ``d`` and return the full ranking.

    Each printed line has the form ``rank|id - name : score``.

    Parameters
    ----------
    d : dict
        Maps a location id to a sortable score (e.g. a centrality value).
    tn : int
        Number of leading entries to print. Values larger than ``len(d)``
        print every entry; values <= 0 print nothing.
    names : dict, optional
        Maps a location id to its display name. Defaults to the notebook-level
        ``Name`` dictionary. Ids without an entry are shown as ``Unknown``
        instead of raising ``KeyError``.

    Returns
    -------
    list
        All keys of ``d`` ordered by descending score; ties keep the
        insertion order of ``d``.
    """
    if names is None:
        # Fall back to the notebook-wide lookup table (resolved at call time).
        names = Name
    ind = sorted(d, key=d.get, reverse=True)
    # Slicing caps the loop at len(d); max() keeps a negative tn from
    # turning into an "all but the last n" slice.
    for rank, key in enumerate(ind[:max(tn, 0)], start=1):
        print('{0}|{1} - {2} : {3}'.format(rank, key, names.get(key, 'Unknown'), d[key]))
    return ind

In [17]:
# Degree centrality ranking: number of distinct connections per location.
mc1 = dict(taxig.degree())
ind1 = topdict(mc1, 10)

1|132 - JFK Airport : 260
2|138 - LaGuardia Airport : 258
3|170 - Murray Hill : 258
4|161 - Midtown Center : 257
5|230 - Times Sq/Theatre District : 257
6|164 - Midtown South : 256
7|68 - East Chelsea : 254
8|186 - Penn Station/Madison Sq West : 254
9|79 - East Village : 253
10|48 - Clinton East : 252


In [18]:
# PageRank ranking with a damping factor of 0.85.
mc2 = nx.pagerank(taxig, alpha=0.85)
ind2 = topdict(mc2, 10)

1|170 - Murray Hill : 0.006550190299988254
2|132 - JFK Airport : 0.006191595141521971
3|138 - LaGuardia Airport : 0.0061321332834533854
4|161 - Midtown Center : 0.00611154272142503
5|230 - Times Sq/Theatre District : 0.006087124032923785
6|164 - Midtown South : 0.006031692248575814
7|79 - East Village : 0.005978208002421607
8|68 - East Chelsea : 0.005959117227319554
9|186 - Penn Station/Madison Sq West : 0.005951420962821146
10|100 - Garment District : 0.005939657362076628


In [19]:
# Betweenness centrality ranking: how often a location lies on shortest
# paths between other pairs of locations.
mc3 = nx.betweenness_centrality(taxig)
ind3=topdict(mc3,10)

1|170 - Murray Hill : 0.014205797168085349
2|132 - JFK Airport : 0.00836958384909408
3|161 - Midtown Center : 0.007982976652115227
4|138 - LaGuardia Airport : 0.00789604566395584
5|230 - Times Sq/Theatre District : 0.0075370667353093414
6|164 - Midtown South : 0.006782042100359145
7|107 - Gramercy : 0.0067643931280112835
8|79 - East Village : 0.006682026214465491
9|231 - TriBeCa/Civic Center : 0.006667633082766549
10|100 - Garment District : 0.006328857749646809


In [20]:
# Closeness centrality ranking: locations that are, on average, the fewest
# hops away from the locations they can reach.
mc4 = nx.closeness_centrality(taxig)
ind4=topdict(mc4,10)

1|132 - JFK Airport : 0.9811182163843761
2|138 - LaGuardia Airport : 0.9737413876897567
3|170 - Murray Hill : 0.9737413876897567
4|161 - Midtown Center : 0.9700944162002819
5|230 - Times Sq/Theatre District : 0.9700944162002819
6|164 - Midtown South : 0.9664746609159526
7|68 - East Chelsea : 0.9593155893536122
8|186 - Penn Station/Madison Sq West : 0.9593155893536122
9|79 - East Village : 0.9557756794297981
10|48 - Clinton East : 0.9522617982554239


In [21]:
# Number of connected components; a value above 1 means some locations
# cannot be reached from the rest of the network.
nx.number_connected_components(taxig)

3

In [22]:
# Print the number of locations in each connected component, one per line.
component_sizes = [len(nodes) for nodes in nx.connected_components(taxig)]
for n_nodes in component_sizes:
    print(n_nodes)

262
1
1


In [23]:
# One subgraph view per connected component, in the order networkx yields them.
ls = list(map(taxig.subgraph, nx.connected_components(taxig)))

In [24]:
# Largest connected component (by node count); despite the plural name this
# is a single graph. On ties the earliest component in ls is kept.
graphs = max(ls, key=len)

In [25]:
# Diameter of the largest component: the longest shortest path, in hops.
# It is computed on the component because the full graph is disconnected.
print(nx.diameter(graphs))

3


In [26]:
# Average number of hops between pairs of locations in the largest component.
print(nx.average_shortest_path_length(graphs))

1.404053698341669
