# About the notebook

This jupyter notebook calculates the average frequency of each route in GTFS data. The main idea is to download the data from transitfeed website (https://transitfeeds.com/), clean data, and calculate the average frequency. Here, an average frequency of the route defined as the average waiting time of the successive bus at each stops aggregated for the route. These frequencies are grouped into time of the data; namely Early: 4 am - 6 am, AM Peak: 6 am - 9 am, Midday: 9 am - 3 pm, PM Peak: 3 pm - 7 pm, Evening: 7 pm - 11 pm, and Late Night: 11 pm - 4 am.

The current version of this code is just limited to the data analysis, whereas ArcGIS is used for visualization and basic mobility analysis. But, it can be expanded in Python too. 

In [None]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt

## Download transit data

In [None]:
def download_GTFS_data(GTFS_URLs):
    #--------------------------------------------
    #downloads the GTFS data inside data\ for all the GTFS data passed as dictionary
    #INPUT: GTFS_URLs: dictionary of agency name and 
    #OUTPUT: None
    #---------------------------------------------
    
    #get current working directory
    current_path=os.getcwd()

    #make new directory data
    data_folder_path=os.path.join(current_path,'data')
    if not os.path.exists(data_folder_path):
        os.mkdir(data_folder_path)
    
    for agencies in list(GTFS_URLs.keys()):
        #make new directory for each all GTFS input link
        data_dir=os.path.join(data_folder_path, agencies)
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)
        
        #download data and extract it
        import requests, zipfile, io
        r = requests.get(GTFS_URLs[agencies])
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall(data_dir)

In [None]:
#add url of GTFS of all agencies as dictionary
GTFS_URLs={'DDOT':'https://transitfeeds.com/p/detroit-department-of-transportation/299/latest/download',
          'DPM':'https://transitfeeds.com/p/detroit-transportation-corporation/1043/latest/download'}

#call function to download data
download_GTFS_data(GTFS_URLs)

# Frequency analysis for all agencies


Ideally, we would operate a for loop to calculate the average frequency of each routes. However, there was some unresolved errors in a for loop. Thus, the set of codes is repeated for each agency manually.

In [None]:
gtfs_path=r'data/DDOT/'

final_freq_df=pd.DataFrame()

#list of files to read
gtfs_files_to_read=['routes.txt', 'trips.txt', 'stop_times.txt','calendar.txt','stops.txt']

#read the files and create pandas dataframe with same filename
for files in gtfs_files_to_read:
    vars()[files[:-4]]  = pd.read_csv(gtfs_path+files)

#merge dataframes
trips_routes=pd.merge(trips, routes, on='route_id',how='left')
stoptime_trips_routes=pd.merge(stop_times, trips_routes, on='trip_id', how='left')
data_df=pd.merge(stoptime_trips_routes,stops, on='stop_id', how='left')

#retain only useful column
data=data_df[['trip_id','route_id', 'service_id', 'direction_id','shape_id',
         'arrival_time', 'departure_time', 'stop_id', 'stop_sequence',
        'stop_code', 'stop_name', 'stop_desc', 'stop_lat','stop_lon',
         'route_short_name', 'route_long_name','route_type','route_color']]


#convert arrival_time column into pandas datetime format
data['arrival_time_pd']=data.apply(lambda x: pd.to_datetime(x['arrival_time'].strip(),format='%H:%M:%S')
                                   if int(x['arrival_time'].strip().split(':')[0])<24 
                                   else np.nan,axis=1)
data['arrival_time_hr']=data['arrival_time_pd'].dt.hour

#bin arrival time of bus based on categorical time of day
## Early: 4 am - 6 am
## AM Peak: 6 am - 9 am
## Midday: 9 am - 3 pm
## PM Peak: 3 pm - 7 pm
## Evening: 7 pm - 11 pm
## Late Night: 11 pm - 4 am
data['interval']=pd.cut(data['arrival_time_hr'],[0,4,6,9,15,19,23,24],
                        labels=['Late Night (11 pm - 4 am)','Early (4 am - 6 am)','AM Peak (6 am - 9 am)',
                                'Midday (9 am - 3 pm)','PM Peak (3 pm - 7 pm)','Evening (7 pm - 11 pm)','Late Night1'])
#replace 'Late Night1' with 'Late Night'
data['interval']=data['interval'].replace({'Late Night1':'Late Night'})


#frequency at stops by routes
freq_at_stops_by_routes=(data.groupby(['stop_id','route_id','service_id','interval','shape_id'])
                  .arrival_time_pd
                  .agg({'count': 'count',
                        'avg_freq': lambda group: group.sort_values().diff().mean().seconds/60}))
freq_at_stops_by_routes.reset_index(inplace=True)
freq_at_stops_by_routes.set_index('stop_id',inplace=True)

#merge lon and lat values of station
freq_at_stops_by_routes_data=pd.merge(freq_at_stops_by_routes, stops[['stop_id','stop_lat','stop_lon']], on='stop_id',how='inner')
#add agency name in a column
freq_at_stops_by_routes_data['agency']=gtfs_path[5:-1]

#append in the final data frame
final_freq_df=final_freq_df.append(freq_at_stops_by_routes_data)

In [None]:
gtfs_path=r'data/DPM/'


#list of files to read
gtfs_files_to_read=['routes.txt', 'trips.txt', 'stop_times.txt','calendar.txt','stops.txt']

#read the files and create pandas dataframe with same filename
for files in gtfs_files_to_read:
    vars()[files[:-4]]  = pd.read_csv(gtfs_path+files)

#merge dataframes
trips_routes=pd.merge(trips, routes, on='route_id',how='left')
stoptime_trips_routes=pd.merge(stop_times, trips_routes, on='trip_id', how='left')
data_df=pd.merge(stoptime_trips_routes,stops, on='stop_id', how='left')

#retain only useful column
data=data_df[['trip_id','route_id', 'service_id', 'direction_id','shape_id',
         'arrival_time', 'departure_time', 'stop_id', 'stop_sequence',
        'stop_code', 'stop_name', 'stop_desc', 'stop_lat','stop_lon',
         'route_short_name', 'route_long_name','route_type','route_color']]


#convert arrival_time column into pandas datetime format
data['arrival_time_pd']=data.apply(lambda x: pd.to_datetime(x['arrival_time'].strip(),format='%H:%M:%S')
                                   if int(x['arrival_time'].strip().split(':')[0])<24 
                                   else np.nan,axis=1)
data['arrival_time_hr']=data['arrival_time_pd'].dt.hour

#bin arrival time of bus based on categorical time of day
## Early: 4 am - 6 am
## AM Peak: 6 am - 9 am
## Midday: 9 am - 3 pm
## PM Peak: 3 pm - 7 pm
## Evening: 7 pm - 11 pm
## Late Night: 11 pm - 4 am
data['interval']=pd.cut(data['arrival_time_hr'],[0,4,6,9,15,19,23,24],
                        labels=['Late Night (11 pm - 4 am)','Early (4 am - 6 am)','AM Peak (6 am - 9 am)',
                                'Midday (9 am - 3 pm)','PM Peak (3 pm - 7 pm)','Evening (7 pm - 11 pm)','Late Night1'])
#replace 'Late Night1' with 'Late Night'
data['interval']=data['interval'].replace({'Late Night1':'Late Night'})


#frequency at stops by routes
freq_at_stops_by_routes=(data.groupby(['stop_id','route_id','service_id','interval','shape_id'])
                  .arrival_time_pd
                  .agg({'count': 'count',
                        'avg_freq': lambda group: group.sort_values().diff().mean().seconds/60}))
freq_at_stops_by_routes.reset_index(inplace=True)
freq_at_stops_by_routes.set_index('stop_id',inplace=True)

#merge lon and lat values of station
freq_at_stops_by_routes_data=pd.merge(freq_at_stops_by_routes, stops[['stop_id','stop_lat','stop_lon']], on='stop_id',how='inner')
#add agency name in a column
freq_at_stops_by_routes_data['agency']=gtfs_path[5:-1]

#append in the final data frame
final_freq_df=final_freq_df.append(freq_at_stops_by_routes_data)

This dataframe has frequency of bus at each stops.

In [11]:
final_freq_df.head()

Unnamed: 0,stop_id,route_id,service_id,interval,shape_id,count,avg_freq,stop_lat,stop_lon,agency
0,15,7029,1,Early (4 am - 6 am),54068,2,60.0,42.327192,-83.156513,DDOT
1,15,7029,1,AM Peak (6 am - 9 am),54068,3,59.5,42.327192,-83.156513,DDOT
2,15,7029,1,Midday (9 am - 3 pm),54068,6,60.0,42.327192,-83.156513,DDOT
3,15,7029,1,PM Peak (3 pm - 7 pm),54068,2,60.0,42.327192,-83.156513,DDOT
4,15,7029,1,PM Peak (3 pm - 7 pm),54069,2,60.0,42.327192,-83.156513,DDOT


In [12]:
final_freq_df.describe(include='all')

Unnamed: 0,stop_id,route_id,service_id,interval,shape_id,count,avg_freq,stop_lat,stop_lon,agency
count,88852.0,88852.0,88852.0,88852,88852.0,88852.0,75674.0,88852.0,88852.0,88852
unique,,,,6,,,,,,2
top,,,,PM Peak (3 pm - 7 pm),,,,,,DDOT
freq,,,,18788,,,,,,88826
mean,5242.231677,7017.233354,11.592266,,53997.159377,5.226647,43.319146,42.38627,-83.101532,
std,15774.523881,260.131378,569.139716,,272.237902,4.282268,17.15779,0.039999,0.089753,
min,15.0,6996.0,1.0,,53928.0,1.0,10.666667,42.255418,-83.333502,
25%,2563.0,7003.0,1.0,,53947.0,2.0,30.0,42.356957,-83.171487,
50%,4978.0,7011.0,2.0,,53987.0,4.0,45.0,42.388575,-83.104374,
75%,7383.0,7020.0,3.0,,54028.0,7.0,60.0,42.419606,-83.034183,


For an average frequency of each routes, we can take average of the 'avg_freq' column on multiple columns in an order of agency, route id, service id, and interval

In [15]:
freq_df=final_freq_df[['agency','route_id', 'service_id', 'interval', 'count','shape_id',
       'avg_freq']].groupby(['agency','route_id','service_id','shape_id','interval']).agg(np.mean)

#remove 'Late Night1' rows in final data
freq_df=freq_df.reset_index()
freq_df=freq_df[freq_df['interval']!='Late Night1']
freq_df.head()

Unnamed: 0,agency,route_id,service_id,shape_id,interval,count,avg_freq
0,DDOT,6996,1,53928,Late Night (11 pm - 4 am),1.522727,40.0
1,DDOT,6996,1,53928,Early (4 am - 6 am),3.863636,31.564394
2,DDOT,6996,1,53928,AM Peak (6 am - 9 am),8.840909,20.118561
3,DDOT,6996,1,53928,Midday (9 am - 3 pm),15.431818,23.446591
4,DDOT,6996,1,53928,PM Peak (3 pm - 7 pm),9.318182,25.529924


### reference code (all attempts of data analysis)

In [None]:
# #convert arrival_time column into pandas datetime format
# data['arrival_time_pd']=data.apply(lambda x: pd.to_datetime(x['arrival_time'].strip(),format='%H:%M:%S')
#                                    if int(x['arrival_time'].strip().split(':')[0])<24 
#                                    else np.nan,axis=1)
# data['arrival_time_hr']=data['arrival_time_pd'].dt.hour

# #bin arrival time of bus based on categorical time of day
# ## Early: 4 am - 6 am
# ## AM Peak: 6 am - 9 am
# ## Midday: 9 am - 3 pm
# ## PM Peak: 3 pm - 7 pm
# ## Evening: 7 pm - 11 pm
# ## Late Night: 11 pm - 4 am
# data['interval']=pd.cut(data['arrival_time_hr'],[0,4,6,9,15,19,23,24],
#                         labels=['Late Night','Early','AM Peak','Midday','PM Peak','Evening','Late Night1'])
# #replace 'Late Night1' with 'Late Night'
# data['interval']=data['interval'].replace({'Late Night1':'Late Night'})

# #groupby stops and calaculate frequency and average time difference 
# # freq_at_stops=(data.groupby(['stop_id','interval'])
# #                   .arrival_time_pd
# #                   .agg({'count': 'count',
# #                         'avg_time_diff': lambda group: group.sort_values().diff().mean().seconds/60}))
# # freq_at_stops.reset_index(inplace=True)
# # freq_at_stops.set_index('stop_id',inplace=True)


# #frequency at stops by routes
# freq_at_stops_by_routes=(data.groupby(['stop_id','route_id','service_id','interval'])
#                   .arrival_time_pd
#                   .agg({'count': 'count',
#                         'avg_time_diff': lambda group: group.sort_values().diff().mean().seconds/60}))
# freq_at_stops_by_routes.reset_index(inplace=True)
# freq_at_stops_by_routes.set_index('stop_id',inplace=True)

# #summary of bus schedule at each stops
# # bus_schedule_at_stops=(data.groupby(['stop_id'])
# #                   .arrival_time_pd
# #                   .agg({'count': 'count',
# #                         'first_bus': lambda group: group.min().time(),
# #                        'last_bus': lambda group: group.max().time()}))
# # bus_schedule_at_stops=bus_schedule_at_stops.join(data[['stop_id','route_id']].groupby(['stop_id']).nunique(),
# #                                on='stop_id',how='left').drop(columns=['stop_id']).rename(columns={'route_id':'number_of_routes'})

# #merge lon and lat values of station
# #freq_at_stops_data=pd.merge(freq_at_stops, stops[['stop_id','stop_lat','stop_lon']], on='stop_id',how='inner')
# #bus_schedule_at_stops_data=pd.merge(bus_schedule_at_stops, stops[['stop_id','stop_lat','stop_lon']], on='stop_id',how='inner')
# freq_at_stops_by_routes_data=pd.merge(freq_at_stops_by_routes, stops[['stop_id','stop_lat','stop_lon']], on='stop_id',how='inner')

### save csv files

In [16]:
#get current working directory
current_path=os.getcwd()

#make new directory data
data_folder_path=os.path.join(current_path,'output')
if not os.path.exists(data_folder_path):
    os.mkdir(data_folder_path)
#freq_at_stops_data.to_csv('output/freq_at_stops_data.csv')
freq_df.to_csv('output/freq_at_stops_by_routes_data_only_freq.csv')
#bus_schedule_at_stops_data.to_csv('output/bus_schedule_at_stops_data.csv')