### FHV_DATA
#### Used for Preprocessing (Creating our Data Tables)
#### -----------------------------------------------------------------------------------
#### OUTLINE
#### -----------------------------------------------------------------------------------
##### I.   Import Data Tools & Data Files
##### II.  Clean data to add coordinates, map coordinates, and merge tables
##### III. Filter the data into separate table and see differences
##### IV.  Explore the data with graphs and add additional features
##### V.   Prepare a script / pipeline to pre-process data for the other two data tables
##### VI.  Design a ML Model for Prediction
##### VII. Tweak the parameters for highest prediction

### Loading Data Tool Packages

In [1]:
#data manipulation & analysis packages
import pandas as pd
import numpy as np

#plotting packages
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

#statistical packages
import scipy.stats

#time and date functions
import time
from datetime import datetime as dt
from datetime import timedelta as td

#map packages
from geopy.geocoders import Nominatim
from branca.element import Figure
import folium

#web-scraping packages
import requests

### Loading in CSV / Data

In [2]:
#FHV_Data December 2019
fhv_data = pd.read_csv('fhv_tripdata_2019-12.csv')
fhv_data = pd.DataFrame(fhv_data)

#Taxi Zones
taxi_zones_id = pd.read_csv('taxi_zones_id.csv')

### Prepped Map Data

In [3]:
#Function: Replace '/' in Zone
new_zone = []
for i in taxi_zones_id['Zone']:
    try:
        x = i.replace("/", ", ")
        new_zone.append(x)
    except:
        new_zone.append('Unable to Find')

#Function: Combine Zone, Borough, & State for Search
taxi_zones_id.Zone = new_zone
taxi_zones_id['ZoneandBorough'] = taxi_zones_id['Zone'] + ', ' + taxi_zones_id['Borough'] + ', NY'

### Google API calls

In [4]:
print('Running!')
start_time = time.time()

#api key located in txt file
apikey = open('apikey_02.txt','r+').readline()
latarr = []
longarr = []
loctypesarr = []

for address in taxi_zones_id.ZoneandBorough:
    response = requests.get('https://maps.googleapis.com/maps/api/geocode/json?address='+address+'&key='+apikey)
    response_json_payload = response.json()
    lat = response_json_payload['results'][0]['geometry']['location']['lat']
    long = response_json_payload['results'][0]['geometry']['location']['lng']
    loctypes = response_json_payload['results'][0]['types']
    latarr.append(lat)
    longarr.append(long)
    loctypesarr.append(loctypes)

print("Program Runtime:", (time.time() - start_time)/60, "minutes")

Running!
Program Runtime: 3.5273402333259583 minutes


### Map to DataFrame

In [5]:
#Function: Add to dataframe & filter out of bounds value
taxi_zones_id['Longitude'] = latarr
taxi_zones_id['Latitude'] = longarr
taxi_zones_id['Type_of_Place'] = loctypesarr
taxi_zones_id_not_NA = taxi_zones_id[taxi_zones_id.Latitude >= -116]

#Find the min and max for coordinate map
geoBox = ((taxi_zones_id_not_NA.Longitude.min(), taxi_zones_id_not_NA.Longitude.max(), taxi_zones_id_not_NA.Latitude.min(), taxi_zones_id_not_NA.Latitude.max()))

### Checkpoint for Coordinates

In [None]:
#Output to CSV
#taxi_zones_id.to_csv('taxi_zones_id_with_coordinates.csv')

### Folium Map

In [6]:
fig2=Figure(width=550,height=350)

map1 = folium_map = folium.Map(
    location=[taxi_zones_id_not_NA.Longitude.min(), taxi_zones_id_not_NA.Latitude.min()],
    #tiles='CartoDB dark_matter',
    zoom_start=11,
)

fig2.add_child(map1)

folium.TileLayer('Stamen Terrain').add_to(map1)
folium.TileLayer('Stamen Toner').add_to(map1)
folium.TileLayer('Stamen Water Color').add_to(map1)
folium.TileLayer('cartodbpositron').add_to(map1)
folium.TileLayer('cartodbdark_matter').add_to(map1)

folium.LayerControl().add_to(map1)
taxi_zones_id_not_NA.apply(lambda row: folium.CircleMarker(location=[row['Longitude'], row['Latitude']], popup='Default popup Marker1',tooltip='Click here to see Popup').add_to(map1), axis=1)
#map1

0      <folium.vector_layers.CircleMarker object at 0...
1      <folium.vector_layers.CircleMarker object at 0...
2      <folium.vector_layers.CircleMarker object at 0...
3      <folium.vector_layers.CircleMarker object at 0...
4      <folium.vector_layers.CircleMarker object at 0...
                             ...                        
259    <folium.vector_layers.CircleMarker object at 0...
260    <folium.vector_layers.CircleMarker object at 0...
261    <folium.vector_layers.CircleMarker object at 0...
262    <folium.vector_layers.CircleMarker object at 0...
264    <folium.vector_layers.CircleMarker object at 0...
Length: 264, dtype: object

In [7]:
#Merging the For Hire Pick-up Vehicle Data and Taxi Zones
fhv_data = (fhv_data.merge(taxi_zones_id, left_on='PULocationID', right_on='LocationID'))

In [8]:
#Merging the For Hire Drop Off Vehicle Data and Taxi Zones
taxi_zones_id_Dropoff = taxi_zones_id
taxi_zones_id_Dropoff.columns =  ['LocationIDDO', 'BoroughDO', 'ZoneDO', 'service_zoneDO', 'TypeDO', 'ZoneandBoroughDO', 'LongitudeDO', 'LatitudeDO', 'Type_of_PlaceDO']
fhv_data = (fhv_data.merge(taxi_zones_id_Dropoff, left_on='DOLocationID', right_on='LocationIDDO'))

### Finish Data Tables

In [9]:
#Filtering out the most common Pickup Location ID:
fhv_data_filtered_locations = fhv_data[fhv_data.PULocationID != 264]

In [None]:
fhv_data_filtered_locations.head(5)

In [10]:
#Noting the differences between filtering Pickup Location 264
print('Pre-filtered Pickup:', fhv_data.shape)
print('Post-filtered Pickup:', fhv_data_filtered_locations.shape)

Pre-filtered Pickup: (1759399, 24)
Post-filtered Pickup: (373886, 24)


### For-Hire Vehicle ("FHV") trip records in December 2019 (Unfiltered)

In [None]:
#Dropping Unncessary Column
fhv_data = fhv_data.drop(['SR_Flag'], axis = 1)

In [None]:
#Adding time features to the data table: day & hour for pickup & dropoff with total time of ride
fhv_data.pickup_datetime = pd.to_datetime(fhv_data.pickup_datetime)
fhv_data.dropoff_datetime = pd.to_datetime(fhv_data.dropoff_datetime)
fhv_data['time_of_ride'] = fhv_data.dropoff_datetime - fhv_data.pickup_datetime
#fhv_data['pickup_month'] = fhv_data.pickup_datetime.dt.month
fhv_data['pickup_day'] = fhv_data.pickup_datetime.dt.day
fhv_data['pickup_hour'] = fhv_data.pickup_datetime.dt.hour
fhv_data['dropoff_day'] = fhv_data.dropoff_datetime.dt.day
fhv_data['dropoff_hour'] = fhv_data.dropoff_datetime.dt.hour
time_of_ride_seconds = []
for i in fhv_data['time_of_ride']:
    time_of_ride_seconds.append(td.total_seconds(i)/60)
fhv_data['time_of_ride_minutes'] = time_of_ride_seconds

In [None]:
def graphZone(column):
    ZoneDictionary = {}
    for time in column:
        if time not in ZoneDictionary:
            ZoneDictionary[time] = 1
        else:
            ZoneDictionary[time] += 1
    
    #Zone Values equal to 'NV' sum is too large and must be dropped. (1385513 counts)
    del ZoneDictionary['NV']
    
    #Plotting top values
    ZoneDictionary = dict((key, value) for key, value in ZoneDictionary.items() if value > 8000)
    #print("Top5:", ZoneDictionary)
    names = list(ZoneDictionary.keys())
    values = list(ZoneDictionary.values())
    
    #Graphing bar plot
    plt.figure(figsize=(12,3))
    plt.bar(range(len(ZoneDictionary)), values, tick_label=names)
    plt.title('Number of Rides vs City Zones:', fontsize=20)
    plt.xlabel('Zones', fontsize=16)
    plt.ylabel('Number of Rides', fontsize=16)
    plt.rc('xtick',labelsize=10)
    plt.xticks(rotation=0)
    

graphZone(fhv_data.Zone)

In [None]:
#Grapphing the opposite, looking for cities with the lowest amount of rides
def graphZone(column):
    ZoneDictionary = {}
    for time in column:
        if time not in ZoneDictionary:
            ZoneDictionary[time] = 1
        else:
            ZoneDictionary[time] += 1
            
    #Zone Values equal to 'NV' sum is too large and must be dropped. (1385513 counts)
    del ZoneDictionary['NV']
    
    #Plotting top values
    ZoneDictionary = dict((key, value) for key, value in ZoneDictionary.items() if value < 10)
    print("Top Values:", ZoneDictionary)
    
    names = list(ZoneDictionary.keys())
    values = list(ZoneDictionary.values())
    
    #Graphing bar plot
    plt.figure(figsize=(12,3))
    plt.bar(range(len(ZoneDictionary)), values, tick_label=names)
    plt.title('Least Numbers of Rides and City Zones:', fontsize=20)
    plt.xlabel('Zones', fontsize=16)
    plt.ylabel('Number of Rides', fontsize=16)
    plt.rc('xtick',labelsize=10)
    plt.xticks(rotation=0)
    

graphZone(fhv_data.Zone)

In [None]:
fhv_data.pickup_datetime.head(5)
fhv_data.pickup_datetime = pd.to_datetime(fhv_data.pickup_datetime)

In [None]:
#Using Calendar to input days.
import calendar
fhv_weekday = []
fhv_weekday_number = []
for i in fhv_data.pickup_datetime:
    weekday = calendar.weekday(i.year, i.month, i.day)
    if weekday == 0:
        fhv_weekday.append("Monday")
        fhv_weekday_number.append(0)
    elif weekday == 1:
        fhv_weekday.append("Tuesday")
        fhv_weekday_number.append(1)
    elif weekday == 2:
        fhv_weekday.append("Wednesday")
        fhv_weekday_number.append(2)
    elif weekday == 3:
        fhv_weekday.append("Thursday")
        fhv_weekday_number.append(3)
    elif weekday == 4:
        fhv_weekday.append("Friday")
        fhv_weekday_number.append(4)
    elif weekday == 5:
        fhv_weekday.append("Saturday")
        fhv_weekday_number.append(5)
    elif weekday == 6:
        fhv_weekday.append("Sunday")
        fhv_weekday_number.append(6)
fhv_data['pickup_weekday'] = fhv_weekday

In [None]:
def graphWeekday(column):
    MinutesDictionary = {}
    for time in column:
        if time not in MinutesDictionary:
            MinutesDictionary[time] = 1
        else:
            MinutesDictionary[time] += 1
    #print(MinutesDictionary)
    plt.bar(*zip(*MinutesDictionary.items()))
    plt.title('Rides vs Day:', fontsize=20)
    plt.xlabel('Day', fontsize=16)
    plt.ylabel('Ride Count', fontsize=16)
    plt.rc('xtick',labelsize=8)
    #plt.savefig('Initial Time Range and Number of Rides',dpi = 300, quality =100)
    plt.show()

graphWeekday(fhv_data.pickup_weekday)

In [None]:
#
def graphShiftTimes(column):
    MinutesDictionary = {}
    typeOfShiftArray = []
    def convertIntoRanges(time):
    
        if time >= 7 and time <= 15:
            typeOfShift = 'First Shift'
            typeOfShiftArray.append(typeOfShift)
            
        elif time <= 23 and time >= 15:
            typeOfShift = 'Second Shift'
            typeOfShiftArray.append(typeOfShift)
            
        elif time <= 7:
            typeOfShift = 'Third Shift'
            typeOfShiftArray.append(typeOfShift)
            
    for time in column:
        convertIntoRanges(time)
    for typeOfShift in typeOfShiftArray:
        if typeOfShift not in MinutesDictionary:
            MinutesDictionary[typeOfShift] = 1
        else:
            MinutesDictionary[typeOfShift] += 1
    print(MinutesDictionary)
    plt.bar(*zip(*MinutesDictionary.items()))
    plt.title('Number of Rides vs Shift Times:', fontsize=20)
    plt.xlabel('Day Range', fontsize=16)
    plt.ylabel('Number of Rides', fontsize=16)
    plt.rc('xtick',labelsize=8)
    plt.show()

graphShiftTimes(fhv_data.dropoff_hour)

In [None]:
fhv_data[fhv_data.pickup_hour < 8].head(5)

In [None]:
def graphRideByLocation(column):
    MinutesDictionary = {}
    for time in column:
        if time not in MinutesDictionary and time != '264':
            MinutesDictionary[time] = 1
        else:
            MinutesDictionary[time] += 1
            
    #Pickup Location ID Values equal to '264' sum is too large and must be dropped. (1385513 counts)
    del MinutesDictionary[264]

    plt.bar(*zip(*MinutesDictionary.items()))
    plt.title('Number of Rides in Location:')
    plt.xlabel('Location ID')
    plt.ylabel('Number of Rides')
    #plt.savefig('Initial Time Range and Number of Rides',dpi = 300, quality =100)
    plt.show()



In [None]:
graphRideByLocation(fhv_data.PULocationID)

In [None]:
#Minutes Dictionary
# '0-5','5-10','10-15','15-20','20-30','30-45','45+'
ArrayZerotoFive=[]
ArrayFivetoTen=[]
ArrayTentoFifteen=[]
ArrayFifteentoTwenty=[]
ArrayTwentytoThirty=[]
ArrayThirtytoFortyFive=[]
ArrayFortyFivePlus=[]
MinutesDictionary = {'0-5':0,'5-10':0,'10-15':0,'15-20':0,'20-30':0,'30-45':0,'45+':0}
stepcount = 0 
for time in fhv_data.time_of_ride_minutes:
    def dict_call(time):
        if time not in MinutesDictionary:
            MinutesDictionary[time] = 1
        else:
            MinutesDictionary[time] += 1
    time = round(time, 0)
    stepcount += 1
    if time <= 5:
        ArrayZerotoFive.append(stepcount)
        time = '0-5'
        dict_call(time)
    elif time <= 10:
        time = '5-10'
        dict_call(time)
        ArrayFivetoTen.append(stepcount)
    elif time <= 15:
        time = '10-15'
        dict_call(time)
        ArrayTentoFifteen.append(stepcount)
    elif time <= 20:
        time = '15-20'
        dict_call(time)
        ArrayFifteentoTwenty.append(stepcount)
    elif time <= 30:
        time = '20-30'
        dict_call(time)
        ArrayTwentytoThirty.append(stepcount)
    elif time <= 45:
        time = '30-45'
        dict_call(time)
        ArrayThirtytoFortyFive.append(stepcount)
    else:
        time = '45+'
        dict_call(time)
        ArrayFortyFivePlus.append(stepcount)

#print(ArrayZerotoFive)
#print(MinutesDictionary)
plt.bar(*zip(*MinutesDictionary.items()))
plt.title('Number of Rides in Time Range')
plt.xlabel('Time Range')
plt.ylabel('Number of Rides')
#plt.savefig('Initial Time Range and Number of Rides',dpi = 300, quality =100)
plt.show()

In [None]:
ArrayColumn0to5 = fhv_data.loc[ArrayZerotoFive]
#print(ArrayFivetoTen)
#ArrayColumn5to10 = fhv_data.loc[ArrayFivetoTen]
ArrayColumn10to15 = fhv_data.loc[ArrayTentoFifteen]
graphRideByLocation(ArrayColumn0to5.PULocationID)
graphRideByLocation(ArrayColumn0to5.DOLocationID)
#graphRideByLocation(ArrayColumn5to10.PULocationID)
graphRideByLocation(ArrayColumn10to15.DOLocationID)

In [None]:
fhv_data.head(5)
#fhv_data.to_csv('fhv_data_and_taxi_zones_id.csv')

In [None]:
filtering_checkpoint = pd.read_csv('fhv_data_and_taxi_zones_id.csv')

In [None]:
#Top5: {'Saint George/New Brighton': 9316, 'Jackson Heights': 10710, nan: 9247, 'Astoria': 9121, 'Flushing': 8906}
#'NV': 1385513,
fhv_data_NV = fhv_data[fhv_data.Zone == 'NV']
graphWeekday(fhv_data_NV.pickup_weekday)
countperday(fhv_data_NV.pickup_day, 'pickup_calls_per_day')
countperday(fhv_data_NV.pickup_hour,'pickup_calls_per_hour')
countperday(fhv_data_NV.dropoff_day, 'dropoff_calls_per_day')
countperday(fhv_data_NV.dropoff_hour, 'dropoff_calls_per_hour')

In [None]:
fhv_data.time_of_ride.describe()

In [None]:
countOfDayCalls = fhv_data.nunique()

In [None]:
def countperday(data, name):
    title = name
    name = {}
    for item in data:
        if item not in name:
            name[item] = 1
        else:
            name[item] += 1
    #print(name)
    high_name = {}
    for key in name:
        if name[key] > 60000:
            high_name[key] = name[key]
    #print(high_name)
    plt.bar(*zip(*name.items()))
    plt.title(title)
    plt.xlabel('Time Unit')
    plt.ylabel('Counts')
    plt.show()
    
countperday(fhv_data.pickup_day, 'pickup_calls_per_day')
countperday(fhv_data.pickup_hour,'pickup_calls_per_hour')
countperday(fhv_data.dropoff_day, 'dropoff_calls_per_day')
countperday(fhv_data.dropoff_hour, 'dropoff_calls_per_hour')


In [None]:
def specialcountperday(data, name):
    title = name
    name = {}
    for item in data:
        if item not in name:
            name[item] = 1
        else:
            name[item] += 1
    #print(name)
    newDict = dict(filter(lambda elem: elem[0] >= 60000, name.items()))
    #print('Filtered Dictionary : ')
    #print(newDict)
    plt.bar(*zip(*name.items()))
    plt.title(title)
    plt.xlabel('Time Unit')
    plt.ylabel('Counts')
    plt.show()

specialcountperday(fhv_data.pickup_hour,'calls_per_hour')

In [None]:
#print('Look for dtypes: \n')
#print(fhv_data.dtypes)

#print('\nLook for total NaN\'s: \n')
#print(fhv_data.isnull().sum(axis=0))

In [None]:
fhv_data_columns = fhv_data.columns
#print('Columns:', fhv_data_columns)

#Countplot for all columns:
fhv_data_columns_countplot = fhv_data_columns.drop(['pickup_datetime','dropoff_datetime'])

#for column in fhv_data_columns_countplot:
    #print('Value Counts:', fhv_data[column].value_counts())
    #sns.countplot(fhv_data[column])
    #plt.show()

## Filtered FHV

In [11]:
fhv_data_filtered_locations = fhv_data_filtered_locations.drop(['SR_Flag'], axis = 1)
fhv_data_filtered_locations.pickup_datetime = pd.to_datetime(fhv_data_filtered_locations.pickup_datetime)
fhv_data_filtered_locations.dropoff_datetime = pd.to_datetime(fhv_data_filtered_locations.dropoff_datetime)
fhv_data_filtered_locations['time_of_ride'] = fhv_data_filtered_locations.dropoff_datetime - fhv_data_filtered_locations.pickup_datetime
#fhv_data['pickup_month'] = fhv_data.pickup_datetime.dt.month
fhv_data_filtered_locations['pickup_day'] = fhv_data_filtered_locations.pickup_datetime.dt.day
fhv_data_filtered_locations['pickup_hour'] = fhv_data_filtered_locations.pickup_datetime.dt.hour
fhv_data_filtered_locations['dropoff_day'] = fhv_data_filtered_locations.dropoff_datetime.dt.day
fhv_data_filtered_locations['dropoff_hour'] = fhv_data_filtered_locations.dropoff_datetime.dt.hour
time_of_ride_seconds = []
for i in fhv_data_filtered_locations['time_of_ride']:
    time_of_ride_seconds.append(td.total_seconds(i)/60)
fhv_data_filtered_locations['time_of_ride_minutes'] = time_of_ride_seconds


#Countplot for all columns:
#print(fhv_data_filtered_locations.columns)
#fhv_data_columns_countplot = fhv_data_filtered_locations.columns.drop(['pickup_datetime','dropoff_datetime'])

In [12]:
print(fhv_data_filtered_locations.pickup_datetime)
fhv_data_filtered_locations.pickup_datetime = pd.to_datetime(fhv_data_filtered_locations.pickup_datetime)
fhv_data_filtered_locations['pickup_month'] = fhv_data_filtered_locations['pickup_datetime'].dt.strftime('%m')

305627    2019-12-29 07:21:03
305628    2019-12-21 10:22:23
305629    2019-12-06 05:48:01
305630    2019-12-09 20:48:16
305631    2019-12-11 23:50:54
                  ...        
1759391   2019-12-31 16:14:39
1759392   2019-12-26 11:53:37
1759393   2019-12-23 12:00:09
1759394   2019-12-23 19:58:58
1759398   2019-12-05 16:11:00
Name: pickup_datetime, Length: 373886, dtype: datetime64[ns]


In [None]:
print(fhv_data_filtered_locations.head(5))

In [13]:
#Append the Day, Number of the Day for the Week, Month, and Season
import calendar
fhv_data_filtered_locations.pickup_datetime = pd.to_datetime(fhv_data_filtered_locations.pickup_datetime)
fhv_weekday = []
fhv_weekday_number = []
fhv_month = []

for i in fhv_data_filtered_locations.pickup_datetime:
    weekday = calendar.weekday(i.year, i.month, i.day)
    
    #fhv_month.append(i.month)
    if weekday == 0:
        fhv_weekday.append("Monday")
        fhv_weekday_number.append(0)
    elif weekday == 1:
        fhv_weekday.append("Tuesday")
        fhv_weekday_number.append(1)
    elif weekday == 2:
        fhv_weekday.append("Wednesday")
        fhv_weekday_number.append(2)
    elif weekday == 3:
        fhv_weekday.append("Thursday")
        fhv_weekday_number.append(3)
    elif weekday == 4:
        fhv_weekday.append("Friday")
        fhv_weekday_number.append(4)
    elif weekday == 5:
        fhv_weekday.append("Saturday")
        fhv_weekday_number.append(5)
    elif weekday == 6:
        fhv_weekday.append("Sunday")
        fhv_weekday_number.append(6)
fhv_data_filtered_locations['pickup_weekday'] = fhv_weekday
fhv_data_filtered_locations['pickup_weekday_num'] = fhv_weekday_number

In [14]:
#Determine the Seasons
seasons = []
seasons_number = []
for month in fhv_data_filtered_locations.pickup_month:
    month = int(month)
    #December to February: Winter | March to May: Spring | June to July: Summer | August to October: Fall
    if month == 12 or month == 1 or month == 2:
        seasons.append("Winter")
        seasons_number.append(1)
    elif month >= 3 and month <= 5:
        seasons.append("Spring")
        seasons_number.append(2)
    elif month >= 6 and month <= 8:
        seasons.append("Spring")
        seasons_number.append(3)
    elif month >= 9 and month <= 11:
        seasons.append("Spring")
        seasons_number.append(4)
        
fhv_data_filtered_locations['seasons'] = seasons
fhv_data_filtered_locations['seasons_number'] = seasons_number

In [15]:
# Length of Ride Interval Categories (LORI)
# 1:'0-5' | 2:'5-10' | 3:'10-15' | 4:'15-20' | 5:'20-30' | 6:'30-45' | 7:'45+'
LORI = []
SNL = []

#Create a column for LORI
for time in fhv_data_filtered_locations.time_of_ride_minutes:
    time = round(time, 0)
    if time <= 5:
        LORI.append(1)
    elif time <= 10:
        LORI.append(2)
    elif time <= 15:
        LORI.append(3)
    elif time <= 20:
        LORI.append(4)
    elif time <= 30:
        LORI.append(5)
    elif time <= 45:
        LORI.append(6)
    else:
        LORI.append(7)

for time in fhv_data_filtered_locations.time_of_ride_minutes:
    time = round(time, 0)
    if time <= 20:
        SNL.append(0)
    elif time >= 20:
        SNL.append(1)
        
fhv_data_filtered_locations['LORI'] = LORI

In [16]:
#First, Second, & Third Shift
typeOfShiftArray = []

#Categorize: First Shift from 7 AM to 3 PM, Second Shift from 3 PM to 11 AM, & Third Shift from 11 AM to 7 AM.
for time in fhv_data_filtered_locations.pickup_hour:
    if time >= 7 and time <= 15:
        typeOfShiftArray.append(1)
    elif time <= 23 and time >= 15:
        typeOfShiftArray.append(2)
    elif time <= 7:
        typeOfShiftArray.append(3)
        
fhv_data_filtered_locations['ShiftType'] = typeOfShiftArray

In [17]:
#Day, Afternoon, Evening, or Night (DAEN)
#5 AM to 11 AM, 12PM to 4PM, 5PM to 8PM, 9PM to 4 AM

DAEN = []

for time in fhv_data_filtered_locations.pickup_hour:
    if time >= 4 and time <= 10:
        DAEN.append(1)
    elif time <= 16 and time >= 11:
        DAEN.append(2)
    else:
        DAEN.append(3)

fhv_data_filtered_locations['DAEN'] = DAEN

In [18]:
fhv_data_filtered_locations.head(5)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,LocationID,Borough,Zone,service_zone,Type,...,dropoff_hour,time_of_ride_minutes,pickup_month,pickup_weekday,pickup_weekday_num,seasons,seasons_number,LORI,ShiftType,DAEN
305627,B00889,2019-12-29 07:21:03,2019-12-29 07:56:48,56,264,56,Queens,Corona,Boro Zone,Neighborhood,...,7,35.75,12,Sunday,6,Winter,1,6,1,1
305628,B00446,2019-12-21 10:22:23,2019-12-21 10:38:53,82,264,82,Queens,Elmhurst,Boro Zone,Neighborhood,...,10,16.5,12,Saturday,5,Winter,1,4,1,1
305629,B00446,2019-12-06 05:48:01,2019-12-06 06:05:13,93,264,93,Queens,Flushing Meadows-Corona Park,Boro Zone,Park,...,6,17.2,12,Friday,4,Winter,1,4,3,1
305630,B00446,2019-12-09 20:48:16,2019-12-09 23:20:02,93,264,93,Queens,Flushing Meadows-Corona Park,Boro Zone,Park,...,23,151.766667,12,Monday,0,Winter,1,7,2,3
305631,B00446,2019-12-11 23:50:54,2019-12-12 00:20:41,93,264,93,Queens,Flushing Meadows-Corona Park,Boro Zone,Park,...,0,29.783333,12,Wednesday,2,Winter,1,5,2,3


In [None]:
fhv_data_filtered_locations.to_csv('fhv_data_filtered_locations.csv')

### Checkpoint for Filtered Locations

In [19]:
fhv_data_filtered_locations = pd.read_csv('fhv_data_filtered_locations.csv')

In [20]:
fhv_data_filtered_locations.head(5)

Unnamed: 0.1,Unnamed: 0,dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,LocationID,Borough,Zone,service_zone,...,dropoff_hour,time_of_ride_minutes,pickup_month,pickup_weekday,pickup_weekday_num,seasons,seasons_number,LORI,ShiftType,DAEN
0,305627,B00889,2019-12-29 07:21:03,2019-12-29 07:56:48,56,264,56,Queens,Corona,Boro Zone,...,7,35.75,12,Sunday,6,Winter,1,6,1,1
1,305628,B00446,2019-12-21 10:22:23,2019-12-21 10:38:53,82,264,82,Queens,Elmhurst,Boro Zone,...,10,16.5,12,Saturday,5,Winter,1,4,1,1
2,305629,B00446,2019-12-06 05:48:01,2019-12-06 06:05:13,93,264,93,Queens,Flushing Meadows-Corona Park,Boro Zone,...,6,17.2,12,Friday,4,Winter,1,4,3,1
3,305630,B00446,2019-12-09 20:48:16,2019-12-09 23:20:02,93,264,93,Queens,Flushing Meadows-Corona Park,Boro Zone,...,23,151.766667,12,Monday,0,Winter,1,7,2,3
4,305631,B00446,2019-12-11 23:50:54,2019-12-12 00:20:41,93,264,93,Queens,Flushing Meadows-Corona Park,Boro Zone,...,0,29.783333,12,Wednesday,2,Winter,1,5,2,3


In [None]:
#sns.countplot(fhv_data_filtered_locations.LocationID)

In [None]:
for column in fhv_data_columns_countplot:
    #print('Value Counts:', fhv_data[column].value_counts())
    sns.countplot(fhv_data[column])
    plt.show()