In [1]:
import pandas as pd
from datetime import datetime, timedelta
# from geopy.geocoders import Nominatim

In [2]:
# Define time threshold to ensure there are no rides under 60 seconds

time_threshold = timedelta(seconds=60)

In [3]:
# Define standard column names to be used throughout

standard_cols = ['tripduration', 'starttime', 'stoptime', 'start station id', 'start station name', 
                 'start station latitude', 'start station longitude', 'end station id', 'end station name', 
                 'end station latitude', 'end station longitude', 'bikeid', 'usertype', 'birth year', 'gender']

In [4]:
# Define preprocessing steps for starttime / stoptime column name

def pre_process(df):
    
    '''
    Processing a dataframe of individual ride data by performing the following steps:
    - Converts start time to dataype datetime and set it as the index
    - Group the data by 1D frequency, include start station id, latitude, longitude
    - Aggregate data by count
    - Rename "count" column "ride_count"
    - Reset the index then move starttime back to the index (removes multi-index layers)
    '''
    
    df['starttime'] = pd.to_datetime(df['starttime'])
    
    df = df.set_index('starttime')
    
    grouper = df.groupby([pd.Grouper(freq='1D'), 'start station id', 'start station latitude', 'start station longitude'])
    
    df_grouped = pd.DataFrame(grouper['start station id'].count())
    
    df_grouped = df_grouped.rename(columns={'start station id': 'ride_count'})
    
    df_grouped = df_grouped.reset_index().set_index('starttime')
    
    return df_grouped

In [None]:
# Define function for rounding the station ID

def convert_station(station):
    '''
    Convert station ID to a round number
    Leave non-numeric station IDs as they are
    '''
    try:
        return round(float(station))
    except ValueError:
        return station

In [51]:
# Define preprocessing steps for starttime / stoptime column name

def pre_process_2021(df):
    
    '''
    Same as other processing function except customized slightly given ride share data in 
    2021 has different column names
    
    Processing a dataframe of individual ride data by performing the following steps:
    - Converts start time to dataype datetime and set it as the index
    - Group the data by 1D frequency, include start station id, latitude, longitude
    - Aggregate data by count
    - Rename "count" column "ride_count"
    - Reset the index then move starttime back to the index (removes multi-index layers)
    - Renaming rest of columns to match previous version
    '''
    
    df['starttime'] = pd.to_datetime(df['started_at'])
    
    df = df.set_index('starttime')
    
    grouper = df.groupby([pd.Grouper(freq='1D'), 'start_station_id', 'start_lat', 'start_lng'])
    
    df_grouped = pd.DataFrame(grouper['start_station_id'].count())
    
    df_grouped = df_grouped.rename(columns={'start_station_id': 'ride_count'})
    
    df_grouped = df_grouped.reset_index().set_index('starttime')
    
    df_grouped['start_station_id'] = df_grouped['start_station_id'].apply(lambda x: convert_station(x))
    
    df_grouped.rename(columns={'start_station_id': 'start station id', 
                             'start_lat': 'start station latitude', 
                             'start_lng': 'start station longitude'}, inplace=True)
    
    return df_grouped

In [47]:
def convert_station(station):
    '''
    Convert station ID to a round number
    Leave non-numeric station IDs as they are
    '''
    try:
        return round(float(station))
    except ValueError:
        return station

In [None]:
# Test month from Feb 2021

### Read in data in six steps:
- June 2013 standalone
- July 2013 through August 2014
- Sept 2014 through Sept 2016
- Oct 2016 through March 2017 (pass in different column names)
- April 2017 through Jan 2021
- Feb 2021 through Oct 2021

In [5]:
# Read in June 2013

df_201306 = pd.read_csv('./ridership_raw/201306-citibike-tripdata.csv')

In [6]:
# Preprocess June 2013 dataset

df_201306 = pre_process(df_201306)

df_201306.to_csv('./processed/201306.csv')

In [7]:
# Read in July 2013 through August 2014

# df_list1 = []

month = 7
year = 2013

for i in range(14):
    
    if month > 12:
        month = 1
        year = 2014
    
    if month >= 10:
        df = pd.read_csv(f"./ridership_raw/{year}-{month} - Citi Bike trip data.csv")
        df = pre_process(df)
        df.to_csv(f"./processed/{year}{month}.csv")
    else:
        df = pd.read_csv(f"./ridership_raw/{year}-0{month} - Citi Bike trip data.csv")
        df = pre_process(df)
        df.to_csv(f"./processed/{year}0{month}.csv")
    
    month += 1

In [8]:
# Read in Sept 2014 through Sept 2016

# df_list2 = []

month = 9
year = 2014

for i in range(25):
    
    if month > 12:
        month = 1
        year += 1
    
    if month >= 10:
        df = pd.read_csv(f"./ridership_raw/{year}{month}-citibike-tripdata.csv")
        df = pre_process(df)
        df.to_csv(f"./processed/{year}{month}.csv")
    else:
        df = pd.read_csv(f"./ridership_raw/{year}0{month}-citibike-tripdata.csv")
        df = pre_process(df)
        df.to_csv(f"./processed/{year}0{month}.csv")
    month += 1

In [9]:
# Read in one of the files to make sure it's exported correctly

test = pd.read_csv('./processed/201409.csv')

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9748 entries, 0 to 9747
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   starttime                9748 non-null   object 
 1   start station id         9748 non-null   int64  
 2   start station latitude   9748 non-null   float64
 3   start station longitude  9748 non-null   float64
 4   ride_count               9748 non-null   int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 380.9+ KB


In [10]:
# Read in Oct 2016 through March 2017 with different column names

# df_list3 = []

month = 10
year = 2016

for i in range(6):
    
    if month > 12:
        month = 1
        year += 1
    
    if month >= 10:
        df = pd.read_csv(f"./ridership_raw/{year}{month}-citibike-tripdata.csv", names=standard_cols, skiprows=1)
        df = pre_process(df)
        df.to_csv(f"./processed/{year}{month}.csv")

    else:
        df = pd.read_csv(f"./ridership_raw/{year}0{month}-citibike-tripdata.csv", names=standard_cols, skiprows=1)
        df = pre_process(df)
        df.to_csv(f"./processed/{year}0{month}.csv")
    
    month += 1

In [11]:
# Read in April 2017 through Jan 2021

# df_list4 = []

month = 4
year = 2017

for i in range(46):
    
    if month > 12:
        month = 1
        year += 1
    
    if month >= 10:
        df = pd.read_csv(f"./ridership_raw/{year}{month}-citibike-tripdata.csv")
        df = pre_process(df)
        df.to_csv(f"./processed/{year}{month}.csv")
    else:
        df = pd.read_csv(f"./ridership_raw/{year}0{month}-citibike-tripdata.csv")
        df = pre_process(df)
        df.to_csv(f"./processed/{year}0{month}.csv")
    
    month += 1

In [12]:
test2 = pd.read_csv('./ridership_raw/202102-citibike-tripdata.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [15]:
test2.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,BBA33D73DECE976F,docked_bike,2021-02-26 16:38:54,2021-02-26 16:44:37,E 84 St & Park Ave,7243.04,E 78 St & 2 Ave,7057.07,40.778626,-73.95772,40.772797,-73.955778,casual
1,B63D7AFF9AC5B6D4,docked_bike,2021-02-17 11:09:11,2021-02-17 11:26:47,Macon St & Nostrand Ave,4214.03,Bond St & Fulton St,4479.06,40.680983,-73.950047,40.689622,-73.983043,member
2,52B829195C469D99,docked_bike,2021-02-26 18:33:29,2021-02-26 19:05:41,Macon St & Nostrand Ave,4214.03,Lefferts Pl & Franklin Ave,4222.02,40.680983,-73.950047,40.680342,-73.955769,casual
3,19C84ECA2B468476,docked_bike,2021-02-26 12:48:35,2021-02-26 13:07:24,Macon St & Nostrand Ave,4214.03,Bond St & Fulton St,4479.06,40.680983,-73.950047,40.689622,-73.983043,member
4,C0DDB771E70D9DF5,docked_bike,2021-02-25 17:23:22,2021-02-25 17:28:20,Madison Ave & E 26 St,6131.12,W 37 St & 5 Ave,6398.06,40.742685,-73.986713,40.75038,-73.98339,member


In [56]:
# Read in remainder through Oct 2021

# df_list5 = []

month = 2
year = 2021

for i in range(9):
    
    if month > 12:
        month = 1
        year += 1
    
    if month >= 10:
        df = pd.read_csv(f"./ridership_raw/{year}{month}-citibike-tripdata.csv", low_memory=False)
        df = pre_process_2021(df)
        df.to_csv(f"./processed/{year}{month}.csv")
    else:
        df = pd.read_csv(f"./ridership_raw/{year}0{month}-citibike-tripdata.csv", low_memory=False)
        df = pre_process_2021(df)
        df.to_csv(f"./processed/{year}0{month}.csv")
    
    month += 1