In [16]:
from plotnine import ggplot
import arrow
import dfply
import tidypython
import itables
import pandas as pd
from datetime import datetime

In [4]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [5]:
# Create list of colors to be used in plots

colors = ['#003f5c', '#374c80', '#7a5195', '#bc5090', '#ef5675', '#ff764a', '#ffa600']

In [10]:
apr_data = pd.read_csv('data/uber-raw-data-apr14.csv')
may_data = pd.read_csv('data/uber-raw-data-may14.csv')
jun_data = pd.read_csv('data/uber-raw-data-jun14.csv')
jul_data = pd.read_csv('data/uber-raw-data-jul14.csv')
aug_data = pd.read_csv('data/uber-raw-data-aug14.csv')
sep_data = pd.read_csv('data/uber-raw-data-sep14.csv')
jun_data.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,6/1/2014 0:00:00,40.7293,-73.992,B02512
1,6/1/2014 0:01:00,40.7131,-74.0097,B02512
2,6/1/2014 0:04:00,40.3461,-74.661,B02512
3,6/1/2014 0:04:00,40.7555,-73.9833,B02512
4,6/1/2014 0:07:00,40.688,-74.1831,B02512


In [11]:
jun_data.shape

(663844, 4)

In [12]:
data = pd.concat([apr_data, may_data, jun_data, jul_data, aug_data, sep_data])
data.shape

(4534327, 4)

In [14]:
data['Date/Time'] = pd.to_datetime(data['Date/Time'], format='%m/%d/%Y %H:%M:%S')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4534327 entries, 0 to 1028135
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   Date/Time  datetime64[ns]
 1   Lat        float64       
 2   Lon        float64       
 3   Base       object        
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 173.0+ MB


In [19]:
def get_time(date_time):
    return date_time.strftime('%H:%M:%S')

data['Time'] = data['Date/Time'].map(get_time)
data.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Time
0,2014-04-01 00:11:00,40.769,-73.9549,B02512,00:11:00
1,2014-04-01 00:17:00,40.7267,-74.0345,B02512,00:17:00
2,2014-04-01 00:21:00,40.7316,-73.9873,B02512,00:21:00
3,2014-04-01 00:28:00,40.7588,-73.9776,B02512,00:28:00
4,2014-04-01 00:33:00,40.7594,-73.9722,B02512,00:33:00


In [21]:
data['Time'] = pd.to_datetime(data.Time, format='%H:%M:%S')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4534327 entries, 0 to 1028135
Data columns (total 5 columns):
 #   Column     Dtype         
---  ------     -----         
 0   Date/Time  datetime64[ns]
 1   Lat        float64       
 2   Lon        float64       
 3   Base       object        
 4   Time       datetime64[ns]
dtypes: datetime64[ns](2), float64(2), object(1)
memory usage: 207.6+ MB


In [27]:
data['Day'] = data['Date/Time'].dt.day
data['Month'] = data['Date/Time'].dt.month
data['Year'] = data['Date/Time'].dt.year
data['Day_of_Week'] = data['Date/Time'].dt.dayofweek
data.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Time,Day,Month,Year,Day_of_Week
0,2014-04-01 00:11:00,40.769,-73.9549,B02512,1900-01-01 00:11:00,1,4,2014,1
1,2014-04-01 00:17:00,40.7267,-74.0345,B02512,1900-01-01 00:17:00,1,4,2014,1
2,2014-04-01 00:21:00,40.7316,-73.9873,B02512,1900-01-01 00:21:00,1,4,2014,1
3,2014-04-01 00:28:00,40.7588,-73.9776,B02512,1900-01-01 00:28:00,1,4,2014,1
4,2014-04-01 00:33:00,40.7594,-73.9722,B02512,1900-01-01 00:33:00,1,4,2014,1


In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4534327 entries, 0 to 1028135
Data columns (total 8 columns):
 #   Column     Dtype         
---  ------     -----         
 0   Date/Time  datetime64[ns]
 1   Lat        float64       
 2   Lon        float64       
 3   Base       object        
 4   Time       datetime64[ns]
 5   Day        int64         
 6   Month      int64         
 7   Year       int64         
dtypes: datetime64[ns](2), float64(2), int64(3), object(1)
memory usage: 311.3+ MB


In [29]:
data['Day_Name'] = data['Day_of_Week'].map({0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thur', 4: 'Fri', 5: 'Sat', 6: 'Sun'})
data.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Time,Day,Month,Year,Day_of_Week,Day_Name
0,2014-04-01 00:11:00,40.769,-73.9549,B02512,1900-01-01 00:11:00,1,4,2014,1,Tue
1,2014-04-01 00:17:00,40.7267,-74.0345,B02512,1900-01-01 00:17:00,1,4,2014,1,Tue
2,2014-04-01 00:21:00,40.7316,-73.9873,B02512,1900-01-01 00:21:00,1,4,2014,1,Tue
3,2014-04-01 00:28:00,40.7588,-73.9776,B02512,1900-01-01 00:28:00,1,4,2014,1,Tue
4,2014-04-01 00:33:00,40.7594,-73.9722,B02512,1900-01-01 00:33:00,1,4,2014,1,Tue
