# 2. Data Summary 

## 2.1 Rearranging the data and plotting

I decided that the most obvious and easiest way to analyse this data was to visualise where and when the rides were taking place. To start, I reshaped the data into CSV files that count trips by location (the zones defined in the provided shapefile) and by hour of the day. Each CSV has one row per location ID and one column per hour, plus an additional `all` column holding the total for the whole day. Each cell contains the total number of trips for that hour in that location. The code below produces two CSVs: one for pickups and one for dropoffs. 

In [None]:
import pandas as pd
import numpy as np
import cartopy.io.shapereader as shpreader

# Build two zone-by-hour trip-count tables (pickups and dropoffs) from the
# cleaned trip records and write each one to a CSV file.
#
# NOTE(review): the name `input` shadows the Python builtin. It is kept here
# because cells outside this view may still refer to it.
input = pd.read_csv('tlc_yellow_trips_2018_11_22_CLEAN.csv')
print(input.head())

input['pickup_datetime'] = pd.to_datetime(input['pickup_datetime'])
input['dropoff_datetime'] = pd.to_datetime(input['dropoff_datetime'])

# The shapefile supplies the list of zone IDs, so every zone gets a row in the
# output even when no trip touched it.
shpfilename = '/home/Earth/mfalls/Downloads/junior-data-scientist-test-data-team-master/tlc_yellow_geom.shp'
reader = shpreader.Reader(shpfilename)
zone_ids = sorted(int(zone.attributes['zone_id']) for zone in reader.records())
n = len(zone_ids)


def _trips_by_zone_and_hour(location_ids, timestamps, zone_ids):
    """Count trips per zone and per hour of the day.

    Parameters
    ----------
    location_ids : pandas.Series
        Zone ID of each trip.
    timestamps : pandas.Series
        Datetime of each trip (same length and index as ``location_ids``).
    zone_ids : list of int
        Zone IDs that make up the rows of the result, in output order.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``zone_id`` with columns ``'all'``, ``'0'`` ... ``'23'``
        holding float trip counts. Trips whose location ID is not listed in
        ``zone_ids`` are left out of the table.
    """
    hour_labels = list(range(24))
    counts = pd.crosstab(location_ids, timestamps.dt.hour)
    # Guarantee one row per zone and one column per hour, including the
    # zone/hour combinations in which no trip occurred.
    counts = counts.reindex(index=zone_ids, columns=hour_labels, fill_value=0)
    counts.columns = [str(hour) for hour in hour_labels]
    counts = counts.rename_axis('zone_id')
    # Daily total goes first so the column order is 'all', '0', ..., '23'.
    counts.insert(0, 'all', counts.sum(axis=1))
    # Counts are stored as floats, matching the zero-initialised float tables
    # this cell used to fill in row by row.
    return counts.astype(float)


# Counting is done in one vectorised pass. Incrementing cells through chained
# indexing (table[col][row] = ...) is not reliable: under pandas Copy-on-Write
# the assignment lands on a temporary object and the table stays at zero.
pickup = _trips_by_zone_and_hour(
    input['pickup_location_id'], input['pickup_datetime'], zone_ids)
dropoff = _trips_by_zone_and_hour(
    input['dropoff_location_id'], input['dropoff_datetime'], zone_ids)

pickup.to_csv('pickup_timeplace.csv')
dropoff.to_csv('dropoff_timeplace.csv')

The code below maps the hourly data, drawing one subplot per hour in a separate figure for pickups and for dropoffs. To make comparison easier, both figures use the same colour scale. 

In [None]:
import matplotlib.pyplot as plt
import cartopy.io.shapereader as shpreader
import cartopy.crs as ccrs
import pandas as pd
import numpy as np
from shapely.geometry import MultiPolygon
import math as m

# Draw, for pickups and then for dropoffs, a 4x6 grid of maps (one per hour of
# the day) in which every taxi zone is shaded by its trip count for that hour.
# Each figure is saved as '<name>_locs_hourly.png' and then displayed.
pickup = pd.read_csv('pickup_timeplace.csv')
dropoff = pd.read_csv('dropoff_timeplace.csv')
zone_ids = np.array(pickup['zone_id'])
pickup = pickup.set_index('zone_id')
dropoff = dropoff.set_index('zone_id')
# Largest single-hour count in each table; [:, 1:] skips the leading 'all'
# column so only the hourly columns are considered.
pickmax = np.max(pickup.to_numpy()[:, 1:])
dropmax = np.max(dropoff.to_numpy()[:, 1:])
hourlymax = max([pickmax, dropmax])

datas = [pickup, dropoff]
names = ['pickup', 'dropoff']

# Read the zone geometries once; they are identical for every subplot.
shpfilename = '/home/Earth/mfalls/Downloads/junior-data-scientist-test-data-team-master/tlc_yellow_geom.shp'
reader = shpreader.Reader(shpfilename)
# Zones '56' and '103' are never drawn.
# NOTE(review): presumably these IDs are duplicated in the shapefile, which
# would make the per-zone lookup ambiguous -- confirm.
zones = [zone for zone in reader.records()
         if zone.attributes['zone_id'] not in ('56', '103')]

# Shared colour scale: white for zero trips, then one shade of red per 100
# trips, with everything above 500 in the darkest shade.
count_edges = [0, 100, 200, 300, 400, 500]
zone_colours = ['#FFFFFF', '#FFECEC', '#FFB5B5', '#ff7575',
                '#FF2D2D', '#EA0000', '#AE0000']
map_crs = ccrs.PlateCarree()

for name, data in zip(names, datas):
    # Create the map axes directly with the projection instead of creating
    # plain axes and overlaying projected ones on top of them.
    fig, axes = plt.subplots(4, 6, figsize=[12.8, 9.6],
                             subplot_kw={'projection': map_crs})
    for h, ax in enumerate(axes.flat):
        ax.set_extent([-74.27, -73.68, 40.48, 40.92], map_crs)
        hourly = data[str(h)]
        for zone in zones:
            r = hourly[int(zone.attributes['zone_id'])]
            # Number of bin edges strictly below r selects the shade:
            # 0 -> index 0, (0, 100] -> 1, ..., above 500 -> 6.
            shade = int(sum(r > edge for edge in count_edges))
            # Wrapping the geometry in a list lets single polygons and
            # multi-polygons go through the same call, so no fallback
            # conversion is required.
            ax.add_geometries([zone.geometry], map_crs,
                              facecolor=zone_colours[shade],
                              linewidth=0.1, edgecolor="black")

        ax.set_title(str(h) + '.00-' + str(h + 1) + '.00')

    # Lay out once all subplots and titles exist.
    fig.tight_layout()
    print(name + '_locs_hourly.png')
    # Save before showing: once the figure has been shown it may be closed,
    # and saving afterwards would write an empty image.
    fig.savefig(name + '_locs_hourly.png')
    plt.show()

## 2.2 Commentary on Data

Looking at both the pickup and dropoff data, we notice that the busiest area for both pickups and dropoffs is midtown Manhattan, starting from around noon and continuing until late evening. When we compare this to [NYC population density data](https://viewing.nyc/media/ac2b6e5c581107f23f1a3648986719f1/), we can see that this isn't the most densely populated area in the city, and given that it's a holiday, not many people will be going to work there. So why is this location the dominant trend? Considering the events of the day, we can see that the most intensely red region corresponds to the time and route of [Macy's Thanksgiving Parade](https://www.6sqft.com/map-where-to-watch-the-macys-thanksgiving-day-parade/).