In [2]:
import pandas as pd
import os
import gzip
import numpy as np

# Names of everything in the data/ directory, in sorted order.
datafiles = sorted(os.listdir('data'))

def adjust_colnames(df: pd.DataFrame) -> pd.DataFrame:
    """Bring a trip frame onto the shared column layout.

    Two input layouts are recognised by the presence of a ``tripduration``
    column:

    * With ``tripduration``: the space-separated column names are renamed to
      their snake_case equivalents, ``tripduration``, ``birth year``,
      ``bikeid`` and ``gender`` are dropped, an all-NaN ``rideable_type``
      column is appended, and the ``member_casual`` values ``Subscriber`` /
      ``Customer`` are mapped to ``member`` / ``casual``.
    * Without ``tripduration``: only the ``ride_id`` column is dropped.

    :param df: trip records in either layout.
    :return: a new frame; the input frame is not modified.
    :raises KeyError: if a column that must be dropped is missing.
    """
    if 'tripduration' not in df.columns:
        # Already uses the snake_case names; only the ride identifier goes.
        return df.drop(columns=['ride_id'])

    renames = {
        'starttime': 'started_at',
        'stoptime': 'ended_at',
        'start station id': 'start_station_id',
        'start station name': 'start_station_name',
        'start station latitude': 'start_lat',
        'start station longitude': 'start_lng',
        'end station id': 'end_station_id',
        'end station name': 'end_station_name',
        'end station latitude': 'end_lat',
        'end station longitude': 'end_lng',
        'usertype': 'member_casual',
    }
    rider_labels = {'Subscriber': 'member', 'Customer': 'casual'}

    converted = df.rename(columns=renames)
    converted = converted.drop(columns=['tripduration', 'birth year', 'bikeid', 'gender'])
    # This layout carries no bike-type information, so the column is all NaN.
    converted['rideable_type'] = np.nan
    converted['member_casual'] = converted['member_casual'].replace(rider_labels)
    return converted

# Read every CSV in data/, normalise its columns, and stack the results.
# The per-file frames get their own name so that `all_data` only ever refers
# to the combined DataFrame.
frames = []

for file in datafiles:
    if file.endswith('.csv'):
        data = adjust_colnames(pd.read_csv(os.path.join('data', file), low_memory=False))
        frames.append(data)

# Each file's index restarts at 0; ignore_index=True gives the combined frame
# a unique 0..n-1 index instead of repeated labels.
all_data = pd.concat(frames, ignore_index=True)


In [3]:
# Parse both timestamp columns; format='mixed' lets each value's format be
# inferred individually.
for ts_col in ('started_at', 'ended_at'):
    all_data[ts_col] = pd.to_datetime(all_data[ts_col], format='mixed')

# Ride length in seconds, from the parsed end and start timestamps.
all_data['ride_duration_secs'] = all_data['ended_at'].sub(all_data['started_at']).dt.total_seconds()

In [4]:
def get_euclidean_distance(row):
    """Straight-line distance between a trip's start and end coordinates.

    :param row: anything indexable by ``'start_lat'``, ``'end_lat'``,
        ``'start_lng'`` and ``'end_lng'`` whose values support arithmetic.
    :return: ``sqrt(d_lat**2 + d_lng**2)`` in the same units as the inputs.
        NOTE(review): if the inputs are latitude/longitude in degrees, as the
        names suggest, the result is in degrees rather than a physical
        distance -- confirm that is intended.
    """
    d_lat = row['start_lat'] - row['end_lat']
    d_lng = row['start_lng'] - row['end_lng']
    return np.sqrt(d_lat**2 + d_lng**2)

# get_euclidean_distance only indexes columns by name and applies arithmetic,
# so passing the whole frame computes every row in one vectorized pass
# instead of one Python call per row via DataFrame.apply(axis=1).
all_data['distance_traveled'] = get_euclidean_distance(all_data)

In [5]:
def get_trip_matrix(df):
    """
    Summarise trips per (start station, end station) pair.

    For every station pair the result holds the mean ride duration and mean
    distance traveled, split by the values of ``member_casual``
    (columns named ``<metric>_<member_casual value>``), plus the total number
    of trips for that pair in ``trip_count``.

    Averages for a rider type that has no trips on a pair are filled with 0,
    so a 0 in an average column can mean "no such trips" rather than a
    measured zero. ``trip_count`` counts all rows for the pair, including rows
    whose ``member_casual`` is missing.

    :param df: trip records with the columns ``start_station_id``,
        ``end_station_id``, ``ride_duration_secs``, ``distance_traveled`` and
        ``member_casual``; any other columns are ignored.
    :return: one row per station pair with the key columns, the per-rider-type
        averages and ``trip_count``.
    """
    pair_keys = ['start_station_id', 'end_station_id']
    metrics = ['ride_duration_secs', 'distance_traveled']

    trip_counts = df.groupby(pair_keys).size().reset_index(name='trip_count')

    # A single pivot_table computes the per-pair, per-rider-type means; a
    # separate groupby beforehand would only feed it single values to average.
    averages = df.pivot_table(
        index=pair_keys,
        columns='member_casual',
        values=metrics,
        aggfunc='mean',
    ).reset_index()

    # Flatten the (metric, rider type) column MultiIndex. The key columns have
    # an empty second level, which leaves a trailing '_' to remove.
    averages.columns = ['_'.join(levels).strip() for levels in averages.columns.values]
    averages = averages.rename(columns={f'{key}_': key for key in pair_keys})

    averages = averages.fillna(0)
    return pd.merge(averages, trip_counts, on=pair_keys, how='left')

def get_trip_count_matrix(df):
    """
    Count the trips made between each (start station, end station) pair.

    :param df: trip records containing ``start_station_id`` and
        ``end_station_id`` columns; other columns are ignored.
    :return: a dataframe with one row per station pair and the columns
        ``start_station_id``, ``end_station_id`` and ``trip_count``.
    """
    station_pair = ['start_station_id', 'end_station_id']
    return (
        df[station_pair]
        .groupby(station_pair)
        .size()
        .reset_index(name='trip_count')
    )

    
    

# Aggregate all loaded trips into one row per (start station, end station) pair.
trip_matrix = get_trip_matrix(all_data)

# Preview the first rows of the aggregated table.
trip_matrix.head()

Unnamed: 0,start_station_id,end_station_id,distance_traveled_casual,distance_traveled_member,ride_duration_secs_casual,ride_duration_secs_member,trip_count
0,72,72,0.0,0.0,2200.54,792.102843,62
1,72,79,0.049813,0.0,1747.9,0.0,1
2,72,116,0.0,0.026596,0.0,840.623,1
3,72,127,0.037787,0.0,1399.094,0.0,1
4,72,128,0.041175,0.041175,1252.312,1468.311,2


In [6]:
# Row for the station pair with the highest trip count. idxmax() returns an
# index *label*, so the lookup uses .loc; .iloc is positional and only agrees
# with it while the frame has a default 0..n-1 index.
trip_matrix.loc[trip_matrix.trip_count.idxmax()]

start_station_id                 6876.04
end_station_id                   6876.04
distance_traveled_casual             0.0
distance_traveled_member             0.0
ride_duration_secs_casual    2331.718682
ride_duration_secs_member     1186.70138
trip_count                          6631
Name: 679341, dtype: object

In [7]:
# List the columns of the combined trip frame, including the derived
# ride_duration_secs and distance_traveled columns.
all_data.columns

Index(['started_at', 'ended_at', 'start_station_id', 'start_station_name',
       'start_lat', 'start_lng', 'end_station_id', 'end_station_name',
       'end_lat', 'end_lng', 'member_casual', 'rideable_type',
       'ride_duration_secs', 'distance_traveled'],
      dtype='object')