In [52]:
import pandas as pd
import folium
import os
import requests
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from collections import OrderedDict
import matplotlib.colors as mcolors
import geopandas as gpd
import json
import io
import time
import psutil
from folium import plugins
import shapely
from folium.plugins import HeatMap
import plotly.express as px
%pylab inline 

Populating the interactive namespace from numpy and matplotlib


# Read in the datasets

## Requests csv

In [2]:
requests = pd.read_csv('requests.csv')

In [3]:
requests.head()

Unnamed: 0,Timestamp,Requested_Pickup,Requested_Dropoff,Actual_Pickup,Actual_Dropoff
0,1380949200,1149,725,218,3601
1,1380949200,1225,1714,2003,3601
2,1380949200,1237,3143,1314,2218
3,1380949200,1323,3566,1330,2573
4,1380949200,1328,2466,1323,2473


## Vehicle Events

In [4]:
v_e = pd.read_csv('vehicle_events.csv')

In [5]:
v_e.head()

Unnamed: 0,Timestamp,Vehicle_ID,Stop_Intersection,Stop_Passengers,Requested_Stop_Intersection
0,1380949200,83,2767,0,2764
1,1380949200,89,3070,0,1818
2,1380949200,112,97,0,972
3,1380949200,121,215,0,396
4,1380949200,141,1314,0,1237


## Vehicle Path

In [6]:
v_p = pd.read_csv('vehicle_paths.csv')

In [7]:
v_p.head()

Unnamed: 0,Timestamp,Vehicle_ID,Latitude,Longitude,Num_Passengers
0,1380949200,83,40.73062,-73.99065,0
1,1380949200,89,40.71429,-74.01153,0
2,1380949200,112,40.77178,-73.97919,0
3,1380949200,121,40.73974,-74.00249,0
4,1380949200,141,40.77114,-73.96392,0


# Second Challenge

## Data Processing

In [8]:
#Perform first join between first two data sets
first_join = requests.merge(v_e, 
                            left_on = ['Requested_Pickup','Timestamp'], 
                            right_on = ['Requested_Stop_Intersection','Timestamp'], 
                            how = 'inner')

In [9]:
#Perform second join to have all three datasets together
second_join = first_join.merge(v_p,
                              left_on = ['Vehicle_ID','Timestamp'],
                              right_on = ['Vehicle_ID','Timestamp'],
                              how = 'inner')

In [10]:
second_join.head()

Unnamed: 0,Timestamp,Requested_Pickup,Requested_Dropoff,Actual_Pickup,Actual_Dropoff,Vehicle_ID,Stop_Intersection,Stop_Passengers,Requested_Stop_Intersection,Latitude,Longitude,Num_Passengers
0,1380949200,1237,3143,1314,2218,141,1314,0,1237,40.77114,-73.96392,0
1,1380949200,1328,2466,1323,2473,211,1323,0,1328,40.73855,-73.99968,0
2,1380949200,1336,1311,1350,1326,495,1350,0,1336,40.77954,-73.95781,0
3,1380949200,1581,720,3501,1032,330,3501,0,1581,40.7273,-73.99363,0
4,1380949200,1818,2470,3070,4006,89,3070,0,1818,40.71429,-74.01153,0


In [11]:
#Find where no actual pikcup occured
no_pickup = second_join.loc[second_join['Actual_Pickup']==-1]

In [16]:
no_pickup.head()

Unnamed: 0,Timestamp,Requested_Pickup,Requested_Dropoff,Actual_Pickup,Actual_Dropoff,Vehicle_ID,Stop_Intersection,Stop_Passengers,Requested_Stop_Intersection,Latitude,Longitude,Num_Passengers,date_time
113,1380949427,1389,3147,-1,-1,245,3188,1,1389,40.75552,-73.98362,1,
115,1380949440,1856,1670,-1,-1,166,232,1,1856,40.75286,-73.99298,4,
129,1380949631,396,2941,-1,-1,310,396,0,396,40.7394,-74.00645,1,
130,1380949631,396,2941,-1,-1,310,396,1,396,40.7394,-74.00645,1,
131,1380949639,2448,227,-1,-1,384,2451,0,2448,40.76143,-73.98405,3,


In [24]:
#Convert linux timestamp into datetime
no_pickup['date_time'] = NaN
no_pickup = no_pickup.reset_index()
for i in range(len(no_pickup)):
    no_pickup['date_time'][i] = time.strftime("%D %H:%M", time.localtime(no_pickup['Timestamp'][i]))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [28]:
#Get just the hour from the timestamp
no_pickup['date_time_split'] = no_pickup['date_time'].str.split(' ')
no_pickup['hour'] = NaN
for i in range(len(no_pickup)):
    no_pickup['hour'][i] = no_pickup['date_time_split'][i][1]
    no_pickup['hour'][i] = no_pickup['hour'].str.split(':')[i][0]
no_pickup['hour'] = no_pickup['hour'].astype(float)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [31]:
#Replace zero with 24 to indicate the next day, October 6th
no_pickup['hour'] = no_pickup.hour.replace(0,24)

In [32]:
#Write out as a csv
no_pickup.to_csv('challenge_2_dv.csv')

## Visualization

In [37]:
#Create a base and heatmap
m_2 = folium.Map(location=[40.774, -73.95], zoom_start = 12)

lat_long_date = no_pickup[['Latitude', 'Longitude','hour']]
lat_long_date['Timestamp'] = lat_long_date['hour'].astype(float)
lat_long_date = lat_long_date.dropna(axis=0, subset=['Latitude','Longitude','hour'])

# List comprehension to make out list of lists
lat_long_date = [[[row['Latitude'],row['Longitude']] for index, 
             row in lat_long_date[lat_long_date['hour'] == i].iterrows()] for i in range(1,25)]

hm = plugins.HeatMapWithTime(lat_long_date,auto_play=True,max_opacity=0.8)
hm.add_to(m_2)
# Display the map
m_2



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [309]:
m_2.save('challenge_2.html')

# Fourth Challenge

In [323]:
lat_long_date

[[[40.755520000000004, -73.98362],
  [40.75286, -73.99298],
  [40.7394, -74.00645],
  [40.7394, -74.00645],
  [40.76143, -73.98405],
  [40.76143, -73.98405],
  [40.739740000000005, -74.00249000000001],
  [40.739740000000005, -74.00249000000001],
  [40.75802, -73.96623000000001],
  [40.75802, -73.96623000000001],
  [40.76167, -73.97491],
  [40.737359999999995, -73.99684],
  [40.75456, -73.97165],
  [40.75456, -73.97165],
  [40.75796, -73.98553000000001],
  [40.75796, -73.98553000000001],
  [40.75029, -73.99485],
  [40.75029, -73.99485],
  [40.74285, -74.00027],
  [40.75166, -73.99013000000001],
  [40.73755, -74.00649],
  [40.742129999999996, -74.00455],
  [40.73876, -74.00646]],
 [[40.734790000000004, -73.99072],
  [40.734790000000004, -73.99072],
  [40.74085, -74.00513000000001],
  [40.74085, -74.00513000000001],
  [40.73295, -73.99799],
  [40.73785, -74.00565999999999],
  [40.71983, -73.98754],
  [40.71983, -73.98754]],
 [[40.72388, -73.98208000000001]],
 [[40.74288, -74.004], [40.742

## Data Processing

In [289]:
nyc_zip = gpd.read_file('https://raw.githubusercontent.com/hvo/datasets/master/nyc_zip.geojson')

In [290]:
nyc_zip.head()

Unnamed: 0,zipcode,geometry
0,10471,"(POLYGON ((-73.881919 40.906666, -73.878423 40..."
1,10463,"(POLYGON ((-73.920646 40.887237, -73.920376 40..."
2,10475,"(POLYGON ((-73.827224 40.89093, -73.825115 40...."
3,10464,"(POLYGON ((-73.815387 40.889394, -73.815265999..."
4,11222,"(POLYGON ((-73.954223 40.739107, -73.954183 40..."


In [291]:
v_p_with_v = pd.read_csv('vehicle_paths_pnas.csv')

In [292]:
violators = v_p_with_v.loc[v_p_with_v['Num_Passengers']>4]

In [293]:
violators_movements  = v_e.merge(violators,
                              left_on = ['Vehicle_ID','Timestamp'],
                              right_on = ['Vehicle_ID','Timestamp'],
                              how = 'inner')

In [294]:
violators_movements['lonlat']=list(zip(violators_movements.Longitude,violators_movements.Latitude))
violators_movements['geometry']= violators_movements[['lonlat']].applymap(lambda x:shapely.geometry.Point(x))

In [295]:
violators_movements_geo = gpd.GeoDataFrame(violators_movements)

In [296]:
violators_movements_zip =  gpd.sjoin(nyc_zip, violators_movements_geo, how="inner", op='contains')

In [297]:
violation_data = violators_movements_zip[['zipcode','Timestamp']].reset_index().reset_index()

In [298]:
violation_data = violation_data[['level_0','zipcode','Timestamp']]

In [299]:
violation_over_time = violation_data.groupby(['zipcode','Timestamp']).count().reset_index()

In [300]:
violation_over_time['date_time'] = violation_over_time.Timestamp.apply(lambda x: (datetime.datetime.fromtimestamp(x)))

In [301]:
violation_over_time['hour'] = violation_over_time.date_time.apply(lambda x: x.hour)

In [302]:
violation_over_time['hour'] = violation_over_time.hour.replace(0,24)

In [303]:
violation_over_time.columns = [['zipcode','Timestamp','count','date_time','hour']]

In [304]:
violation_data_find = violation_over_time[['zipcode','hour','count']]

In [305]:
violation_time = violation_data_find[['hour','count']]

In [306]:
violation_time.columns = ['HOUR','COUNT']

In [307]:
violation_GB = violation_time.groupby(['HOUR']).sum().reset_index()

In [308]:
fig = px.bar(violation_GB, x='HOUR', y='COUNT')
fig.show()

In [319]:
import plotly
plotly.offline.plot(fig, filename = 'challenge_4.html', auto_open=False)

'challenge_4.html'

# Eigth Challenge

## Data Processing

In [38]:
#Perform join to get vehicle movements
v_m  = v_e.merge(v_p,
                              left_on = ['Vehicle_ID','Timestamp'],
                              right_on = ['Vehicle_ID','Timestamp'],
                              how = 'inner')

In [39]:
#Get locations by vehcile grouped together
v_m_counts = v_m[['Vehicle_ID']].reset_index().groupby(['Vehicle_ID']).count()

In [40]:
#Reset the index for further processing
v_m_counts = v_m_counts.reset_index()

In [41]:
#Sort values to find a value that could be mapped easily
v_m_top = v_m_counts.sort_values(by=['index'],ascending=False)

In [42]:
#Use vehicle ID 269 with only 22 stops
v_m_269 = v_m.loc[v_m['Vehicle_ID'] == 269]

In [46]:
#Write out to csv
v_m_269.to_csv('challenge_8_dv.csv')

In [43]:
#Create a list of points travelled by the vehicle
geo_list = list(zip(v_m_269['Latitude'],v_m_269['Longitude']))

In [44]:
#List of times stoped by the vehicle
trip_list = list(range(len(v_m_269)))

## Visualization

In [326]:
#Vusalize a base and line plot
m_8 = folium.Map(location=[40.774, -73.95], zoom_start = 12)

folium.ColorLine(geo_list,
                 colors = trip_list,
                 colormap = ['blue', 'red'],
                 weight = 10,
                 opacity = 0.8).add_to(m_8)
m_8

In [310]:
m_8.save('challenge_8.html')