In [21]:
import pandas as pd
import os.path

In [22]:
# Folder where all data for current uber information is kept
raw_data_folder = '/Users/bryan/Movies/Data For TDI Project/Raw_City_Information'
processed_data_folder = '/Users/bryan/Movies/Data For TDI Project/Processed_Data_City_Level'

In [23]:
# List of feature names to drop from this list, as we will not use them for the averaged data I pull out
# Note that although I drop std dev I will be calculating what is arguably a more useful std dev for my analysis
drop_features = ['sourceid','dstid','standard_deviation_travel_time','geometric_mean_travel_time',
                'geometric_standard_deviation_travel_time']

In [24]:
# This holds the definition used to create the smaller, more generalized, dataframes used for comparison

def calc_df(df_weekdays,df_weekends,drop_features):
    
    # Drop selected features from both datasets
    df_weekdays.drop(columns=drop_features,inplace=True)
    df_weekdays.reset_index(inplace=True,drop=True)
    df_weekends.drop(columns=drop_features,inplace=True)
    df_weekends.reset_index(inplace=True,drop=True)

    # Initialize dataframes for holding all information
    main_df = pd.DataFrame()
    temp_travel_df = pd.DataFrame()
    temp_stdDev_df = pd.DataFrame()

    # Add information for hour of day to both dataframes
    main_df['hod'] = df_weekdays['hod'].unique()
    temp_travel_df['hod'] = df_weekdays['hod'].unique()
    temp_stdDev_df['hod'] = df_weekdays['hod'].unique()

    # Store travel information in temp dataframe so it will always be overwritten, avoiding memory problems
    temp_travel_df['mean_travel_time'] = df_weekdays.groupby(df_weekdays['hod']).mean()
    temp_stdDev_df['std_dev_travel_time'] = df_weekdays.groupby(df_weekdays['hod']).std()
    # Delete original dataframe to avoid memory problems
    del df_weekdays
    # Merge this information with the main Dataframe
    main_df = pd.merge(main_df,temp_travel_df[['hod','mean_travel_time']],how='outer',on='hod')
    main_df = pd.merge(main_df,temp_stdDev_df[['hod','std_dev_travel_time']],how='outer',on='hod')
    # After including weekday information rename the columns in the dataframe
    main_df.rename(index=str, columns={"mean_travel_time": "Mean Travel Time (Weekday)", "std_dev_travel_time": "Std Dev Travel Time (Weekdays)"},inplace=True)

    # Now include the same information for the weekend
    # Store travel information in temp dataframe so it will always be overwritten, avoiding memory problems
    temp_travel_df['mean_travel_time'] = df_weekends.groupby(df_weekends['hod']).mean()
    temp_stdDev_df['std_dev_travel_time'] = df_weekends.groupby(df_weekends['hod']).std()
    # Delete original dataframe to avoid memory problems
    del df_weekends
    # Merge this information with the main Dataframe
    main_df = pd.merge(main_df,temp_travel_df[['hod','mean_travel_time']],how='outer',on='hod')
    main_df = pd.merge(main_df,temp_stdDev_df[['hod','std_dev_travel_time']],how='outer',on='hod')
    # After including weekday information rename the columns in the dataframe
    main_df.rename(index=str, columns={"hod":"Hour of Day","mean_travel_time": "Mean Travel Time (Weekend)", "std_dev_travel_time": "Std Dev Travel Time (Weekend)"},inplace=True)
    return main_df

In [25]:
# We now calculate a single average travel time for each hour of the day (separate for weekdays and weekends)
# At the same time we calculate the standard deviation for all average travel times for each hour

###### This box examines the city of Boston ##########

# Read in data for Boston (weekday and weekend separately)
df_weekdays = pd.read_csv(os.path.join(raw_data_folder,'Boston/boston-taz-2018-4-OnlyWeekdays-HourlyAggregate.csv'))
df_weekends = pd.read_csv(os.path.join(raw_data_folder,'Boston/boston-taz-2018-4-OnlyWeekends-HourlyAggregate.csv'))

# Calculate the smaller, more generalized dataframe for this area
boston_df = calc_df(df_weekdays,df_weekends,drop_features)
del df_weekdays
del df_weekends

# Now sort so we are in increasing order of time of day
boston_df.sort_values(by=['Hour of Day'],inplace=True)
boston_df.reset_index(inplace=True,drop=True)

boston_df.to_csv(os.path.join(processed_data_folder,'boston_traffic.csv'))
boston_df.head(24)

Unnamed: 0,Hour of Day,Mean Travel Time (Weekday),Std Dev Travel Time (Weekdays),Mean Travel Time (Weekend),Std Dev Travel Time (Weekend)
0,0,959.338452,442.334943,868.650337,416.670598
1,1,1084.600218,468.667742,917.426563,423.353915
2,2,1088.495127,500.096649,957.745715,460.385979
3,3,992.609426,435.50942,836.058005,389.498182
4,4,1431.816327,733.852476,992.332108,478.359157
5,5,792.003532,477.129194,684.820257,376.506069
6,6,989.076785,447.476777,858.353935,387.608626
7,7,1054.486724,470.48137,959.481385,461.935837
8,8,1051.73035,486.377707,867.940469,421.629185
9,9,1312.985084,578.515322,1010.019201,466.143763


In [26]:
# We now calculate a single average travel time for each hour of the day (separate for weekdays and weekends)
# At the same time we calculate the standard deviation for all average travel times for each hour

###### This box examines the city of Cincinnati ##########

# Read in data for Boston (weekday and weekend separately)
df_weekdays = pd.read_csv(os.path.join(raw_data_folder,'Cincinnati/cincinnati-taz2010-2018-4-OnlyWeekdays-HourlyAggregate.csv'))
df_weekends = pd.read_csv(os.path.join(raw_data_folder,'Cincinnati/cincinnati-taz2010-2018-4-OnlyWeekends-HourlyAggregate.csv'))

# Calculate the smaller, more generalized dataframe for this area
cincinnati_df = calc_df(df_weekdays,df_weekends,drop_features)
del df_weekdays
del df_weekends

# Now sort so we are in increasing order of time of day
cincinnati_df.sort_values(by=['Hour of Day'],inplace=True)
cincinnati_df.reset_index(inplace=True,drop=True)

cincinnati_df.to_csv(os.path.join(processed_data_folder,'cincinnati_traffic.csv'))
cincinnati_df.head(24)

Unnamed: 0,Hour of Day,Mean Travel Time (Weekday),Std Dev Travel Time (Weekdays),Mean Travel Time (Weekend),Std Dev Travel Time (Weekend)
0,0,809.465079,455.41769,666.889399,421.556466
1,1,845.63762,454.778487,749.41348,400.426665
2,2,640.173821,357.049068,611.012842,301.10377
3,3,712.590125,367.109742,688.541933,359.19838
4,4,933.142717,551.942485,741.037372,412.500971
5,5,654.388866,379.785913,634.593082,329.733111
6,6,714.359886,363.934781,660.06731,351.440182
7,7,721.371504,376.716455,627.02607,344.228435
8,8,733.327192,498.553648,569.824786,338.363931
9,9,738.150197,384.124164,691.784748,369.117707


In [27]:
# We now calculate a single average travel time for each hour of the day (separate for weekdays and weekends)
# At the same time we calculate the standard deviation for all average travel times for each hour

###### This box examines the city of Los Angeles ##########

# Read in data for Boston (weekday and weekend separately)
df_weekdays = pd.read_csv(os.path.join(raw_data_folder,'Los Angeles/los_angeles-taz-2018-4-OnlyWeekdays-HourlyAggregate.csv'))
df_weekends = pd.read_csv(os.path.join(raw_data_folder,'Los Angeles/los_angeles-taz-2018-4-OnlyWeekends-HourlyAggregate.csv'))

# Calculate the smaller, more generalized dataframe for this area
los_angeles_df = calc_df(df_weekdays,df_weekends,drop_features)
del df_weekdays
del df_weekends

# Now sort so we are in increasing order of time of day
los_angeles_df.sort_values(by=['Hour of Day'],inplace=True)
los_angeles_df.reset_index(inplace=True,drop=True)

los_angeles_df.to_csv(os.path.join(processed_data_folder,'los_angeles_traffic.csv'))
los_angeles_df.head(24)

Unnamed: 0,Hour of Day,Mean Travel Time (Weekday),Std Dev Travel Time (Weekdays),Mean Travel Time (Weekend),Std Dev Travel Time (Weekend)
0,0,961.442177,433.452156,974.97762,434.433063
1,1,1185.06196,505.680387,1068.927135,476.449346
2,2,2071.339222,1106.101918,1380.15301,665.132843
3,3,1416.076975,639.696927,1254.676376,604.233962
4,4,1515.726378,697.157981,1193.099811,543.498913
5,5,1372.98186,620.793668,1189.972028,568.838955
6,6,1148.641128,487.655852,1061.890192,469.356168
7,7,1271.516703,558.415478,1083.504625,489.13511
8,8,1766.285369,905.072605,886.938958,426.553494
9,9,1497.474435,725.455732,1061.062822,494.787278


In [28]:
# We now calculate a single average travel time for each hour of the day (separate for weekdays and weekends)
# At the same time we calculate the standard deviation for all average travel times for each hour

###### This box examines the city of Pittsburgh ##########

# Read in data for Boston (weekday and weekend separately)
df_weekdays = pd.read_csv(os.path.join(raw_data_folder,'Pittsburgh/pittsburgh-taz-2018-3-OnlyWeekdays-HourlyAggregate.csv'))
df_weekends = pd.read_csv(os.path.join(raw_data_folder,'Pittsburgh/pittsburgh-taz-2018-3-OnlyWeekends-HourlyAggregate.csv'))

# Calculate the smaller, more generalized dataframe for this area
pittsburgh_df = calc_df(df_weekdays,df_weekends,drop_features)
del df_weekdays
del df_weekends

# Now sort so we are in increasing order of time of day
pittsburgh_df.sort_values(by=['Hour of Day'],inplace=True)
pittsburgh_df.reset_index(inplace=True,drop=True)

pittsburgh_df.to_csv(os.path.join(processed_data_folder,'pittsburgh_traffic.csv'))
pittsburgh_df.head(24)

Unnamed: 0,Hour of Day,Mean Travel Time (Weekday),Std Dev Travel Time (Weekdays),Mean Travel Time (Weekend),Std Dev Travel Time (Weekend)
0,0,1237.219596,696.008976,942.031712,501.140577
1,1,712.584151,352.844369,791.792255,349.96057
2,2,960.668858,498.144444,776.882232,409.521603
3,3,897.432062,408.930119,859.396747,386.101303
4,4,888.476919,487.313561,784.925596,432.484731
5,5,848.134893,486.256635,765.29982,434.616024
6,6,1161.713585,643.766275,694.015576,362.725499
7,7,711.380464,352.025087,765.788316,345.781366
8,8,928.872099,489.929923,872.555217,465.803266
9,9,1338.709997,724.155188,962.110185,489.689998


In [29]:
# We now calculate a single average travel time for each hour of the day (separate for weekdays and weekends)
# At the same time we calculate the standard deviation for all average travel times for each hour

###### This box examines the city of San Francisco ##########

# Read in data for Boston (weekday and weekend separately)
df_weekdays = pd.read_csv(os.path.join(raw_data_folder,'San Francisco/san_francisco-taz-2018-3-OnlyWeekdays-HourlyAggregate.csv'))
df_weekends = pd.read_csv(os.path.join(raw_data_folder,'San Francisco/san_francisco-taz-2018-3-OnlyWeekends-HourlyAggregate.csv'))

# Calculate the smaller, more generalized dataframe for this area
san_francisco_df = calc_df(df_weekdays,df_weekends,drop_features)
del df_weekdays
del df_weekends

# Now sort so we are in increasing order of time of day
san_francisco_df.sort_values(by=['Hour of Day'],inplace=True)
san_francisco_df.reset_index(inplace=True,drop=True)

san_francisco_df.to_csv(os.path.join(processed_data_folder,'san_francisco_traffic.csv'))
san_francisco_df.head(24)

Unnamed: 0,Hour of Day,Mean Travel Time (Weekday),Std Dev Travel Time (Weekdays),Mean Travel Time (Weekend),Std Dev Travel Time (Weekend)
0,0,1392.398794,815.803641,1215.105404,745.504882
1,1,1395.298357,845.826945,1313.739803,813.750191
2,2,1121.457283,640.256313,1027.158103,575.912738
3,3,1276.451646,708.486284,1141.147694,664.590478
4,4,1222.041081,809.126871,863.633103,540.863828
5,5,1642.597805,1054.579548,1293.879579,790.987111
6,6,877.098658,565.069857,818.399663,519.826947
7,7,1549.261536,867.135813,1246.43686,738.225568
8,8,1770.094736,1130.991646,1334.706875,819.939576
9,9,920.298074,570.990412,841.873479,537.445441


In [30]:
# We now calculate a single average travel time for each hour of the day (separate for weekdays and weekends)
# At the same time we calculate the standard deviation for all average travel times for each hour

###### This box examines the city of Seattle ##########

# Read in data for Boston (weekday and weekend separately)
df_weekdays = pd.read_csv(os.path.join(raw_data_folder,'Seattle/seattle-taz-2018-3-OnlyWeekdays-HourlyAggregate.csv'))
df_weekends = pd.read_csv(os.path.join(raw_data_folder,'Seattle/seattle-taz-2018-3-OnlyWeekends-HourlyAggregate.csv'))

# Calculate the smaller, more generalized dataframe for this area
seattle_df = calc_df(df_weekdays,df_weekends,drop_features)
del df_weekdays
del df_weekends

# Now sort so we are in increasing order of time of day
seattle_df.sort_values(by=['Hour of Day'],inplace=True)
seattle_df.reset_index(inplace=True,drop=True)

seattle_df.to_csv(os.path.join(processed_data_folder,'seattle_traffic.csv'))
seattle_df.head(24)

Unnamed: 0,Hour of Day,Mean Travel Time (Weekday),Std Dev Travel Time (Weekdays),Mean Travel Time (Weekend),Std Dev Travel Time (Weekend)
0,0,622.303971,387.302091,631.655849,355.859088
1,1,1016.122413,551.88522,636.169089,361.527962
2,2,1213.207521,640.552927,830.092356,417.856676
3,3,684.997047,450.03834,603.471895,370.971204
4,4,965.457969,513.55411,721.998688,395.33324
5,5,987.435981,494.791423,781.662474,387.613883
6,6,701.695108,432.641278,627.997166,398.619266
7,7,718.956178,440.059416,675.848796,416.40504
8,8,1042.204982,613.267846,832.346622,447.782845
9,9,781.240954,394.551392,752.643286,373.316084


In [31]:
# We now calculate a single average travel time for each hour of the day (separate for weekdays and weekends)
# At the same time we calculate the standard deviation for all average travel times for each hour

###### This box examines the city of Washington DC ##########

# Read in data for Boston (weekday and weekend separately)
df_weekdays = pd.read_csv(os.path.join(raw_data_folder,'Washington DC/washington_DC-taz-2018-3-OnlyWeekdays-HourlyAggregate.csv'))
df_weekends = pd.read_csv(os.path.join(raw_data_folder,'Washington DC/washington_DC-taz-2018-3-OnlyWeekends-HourlyAggregate.csv'))

# Calculate the smaller, more generalized dataframe for this area
washington_DC_df = calc_df(df_weekdays,df_weekends,drop_features)
del df_weekdays
del df_weekends

# Now sort so we are in increasing order of time of day
washington_DC_df.sort_values(by=['Hour of Day'],inplace=True)
washington_DC_df.reset_index(inplace=True,drop=True)

washington_DC_df.to_csv(os.path.join(processed_data_folder,'washington_DC_traffic.csv'))
washington_DC_df.head(24)

Unnamed: 0,Hour of Day,Mean Travel Time (Weekday),Std Dev Travel Time (Weekdays),Mean Travel Time (Weekend),Std Dev Travel Time (Weekend)
0,0,1101.963295,508.240445,812.466364,378.151562
1,1,1218.068567,558.239575,953.969785,436.565934
2,2,733.91649,342.836173,682.136164,321.897831
3,3,965.533668,435.862397,872.612025,396.253297
4,4,671.439026,322.211393,753.676176,349.454399
5,5,982.724871,448.724748,882.900383,405.604511
6,6,965.507803,429.362603,902.986992,405.688883
7,7,882.206924,435.707908,744.977643,363.683526
8,8,996.348672,460.384726,846.821775,385.26016
9,9,771.627978,372.525108,691.301659,332.90616
