In [1]:
import pandas as pd
import os
from tqdm import tqdm
import geopandas as gpd
from shapely.geometry import LineString

In [2]:
_path = "../bike_network_data/" #put your directory path here

In [15]:
def create_dataframe (city):
    dir_path = os.path.join(_path, city)
    export_path = os.path.join(dir_path)
    
    # Get a list of all csv files in the directory
    csv_files = [f for f in os.listdir(dir_path) if f.endswith('.csv')]
    dataframes = []
    
    for file in tqdm(csv_files):
        # Extract year and month from filename
        year, month = file.split('.')[0].split('_')
    
        # Read csv file into a dataframe
        df = pd.read_csv(os.path.join(dir_path, file))
    
        # Add a new column for the month and year
        df['month'] = month
        df['year'] = year
    
        # Check if the DataFrame is not empty or does not contain only NaN values (avoid warning when concatenating dfs)
        if not df.empty and not df.isna().all().all():
            dataframes.append(df)
    
    # Concatenate all dataframes into a single dataframe
    if dataframes:
        combined_df = pd.concat(dataframes)
    
        # Write the combined dataframe to a new csv file in the export directory
        combined_df.to_csv(f'{export_path}/preprocessed_bike_rides.csv', index=False)
    return

In [29]:
# functions to create dataframes for stations and rides separately, can be used to create bike network
def create_stations_gdf(df: pd.DataFrame, crs_in: int = 4326, crs_out: int = 3857) -> gpd.GeoDataFrame:
    df = df.copy()  # Create a copy to avoid changing the original DataFrame

    # Create unique dataframes for start and end stations
    start_stations = df[
        [
            "start_station_id",
            "start_station_name",
            "start_station_description",
            "start_station_latitude",
            "start_station_longitude",
            "month",
            "year",
        ]
    ].drop_duplicates()
    end_stations = df[
        [
            "end_station_id",
            "end_station_name",
            "end_station_description",
            "end_station_latitude",
            "end_station_longitude",
            "month",
            "year",
        ]
    ].drop_duplicates()

    # Rename columns for uniformity
    start_stations.columns = [
        "station_id",
        "station_name",
        "station_description",
        "latitude",
        "longitude",
        "month",
        "year",
    ]
    end_stations.columns = [
        "station_id",
        "station_name",
        "station_description",
        "latitude",
        "longitude",
        "month",
        "year",
    ]

    # Concatenate the dfs and drop duplicates
    stations = pd.concat([start_stations, end_stations]).drop_duplicates()
    
    # Create GeoDataFrame
    gdf_stations = gpd.GeoDataFrame(
        stations,
        geometry=gpd.points_from_xy(stations.longitude, stations.latitude),
        crs=crs_in,
    )
    gdf_stations = gdf_stations.to_crs(epsg=crs_out)

    return gdf_stations

def create_rides_gdf(df: pd.DataFrame, crs_in: int = 4326, crs_out: int = 3857) -> gpd.GeoDataFrame:
    df = df.copy()  # Create a copy to avoid changing the original DataFrame
    
    # Create a new DataFrame with necessary columns
    df_rides = df[
        [
            "start_station_id",
            "end_station_id",
            "started_at",
            "ended_at",
            "duration",
            "start_station_name",
            "start_station_description",
            "start_station_latitude",
            "start_station_longitude",
            "end_station_name",
            "end_station_description",
            "end_station_latitude",
            "end_station_longitude",
            "month",
            "year",
        ]
    ].copy() # copy to avoid warning

    # Create LineString objects
    df_rides.loc[:, "geometry"] = df_rides.apply(
        lambda row: LineString(
            [
                (row["start_station_longitude"], row["start_station_latitude"]),
                (row["end_station_longitude"], row["end_station_latitude"]),
            ]
        ),
        axis=1,
    )

    # Create GeoDataFrame
    gdf_rides = gpd.GeoDataFrame(df_rides, geometry="geometry", crs=crs_in)
    gdf_rides = gdf_rides.to_crs(epsg=crs_out)
    return gdf_rides

def washington_create_stations_gdf(df: pd.DataFrame, crs_in: int = 4326, crs_out: int = 3857) -> gpd.GeoDataFrame:
    df = df.copy()  # Create a copy to avoid changing the original DataFrame

    # Create unique dataframes for start and end stations
    start_stations = df[
        [
            "start_station_id",
            "start_station_name",
            "start_lat",
            "start_lng",
            "month",
            "year",
        ]
    ].drop_duplicates()
    end_stations = df[
        [
            "end_station_id",
            "end_station_name",
            "end_lat",
            "end_lng",
            "month",
            "year",
        ]
    ].drop_duplicates()

    # Rename columns for uniformity
    start_stations.columns = [
        "station_id",
        "station_name",
        "latitude",
        "longitude",
        "month",
        "year",
    ]
    end_stations.columns = [
        "station_id",
        "station_name",
        "latitude",
        "longitude",
        "month",
        "year",
    ]

    # Concatenate the dfs and drop duplicates
    stations = pd.concat([start_stations, end_stations]).drop_duplicates()

    # Create GeoDataFrame
    gdf_stations = gpd.GeoDataFrame(
        stations,
        geometry=gpd.points_from_xy(stations.longitude, stations.latitude),
        crs=crs_in,
    )
    gdf_stations = gdf_stations.to_crs(epsg=crs_out)

    return gdf_stations

def portland_create_stations_gdf(df: pd.DataFrame, crs_in: int = 4326, crs_out: int = 3857) -> gpd.GeoDataFrame:
    df = df.copy()  # Create a copy to avoid changing the original DataFrame

    # Create unique dataframes for start and end stations
    start_stations = df[
        [
        "StartHub",
        "StartLatitude",
        "StartLongitude",
        "month",
        "year",
    ]
    ].drop_duplicates()
    end_stations = df[
        [
        "EndHub",
        "EndLatitude",
        "EndLongitude",
        "month",
        "year",
    ]
    ].drop_duplicates()

    # Rename columns for uniformity
    start_stations.columns = [
        "station_name",
        "latitude",
        "longitude",
        "month",
        "year",
    ]
    end_stations.columns = [
        "station_name",
        "latitude",
        "longitude",
        "month",
        "year",
    ]

    # Concatenate the dfs and drop duplicates
    stations = pd.concat([start_stations, end_stations]).drop_duplicates()

    # Create GeoDataFrame
    gdf_stations = gpd.GeoDataFrame(
        stations,
        geometry=gpd.points_from_xy(stations.longitude, stations.latitude),
        crs=crs_in,
    )
    gdf_stations = gdf_stations.to_crs(epsg=crs_out)

    return gdf_stations

def create_rides_washington_gdf(df: pd.DataFrame, crs_in: int = 4326, crs_out: int = 3857) -> gpd.GeoDataFrame:
    df = df.copy()  # Create a copy to avoid changing the original DataFrame
    
    # Create a new DataFrame with necessary columns
    df_rides = df[
        [
            "start_station_id",
            "end_station_id",
            "started_at",
            "ended_at",
            "start_station_name",
            "start_lat",
            "start_lng",
            "end_station_name",
            "end_lat",
            "end_lng",
            "month",
            "year",
        ]
    ].copy() # copy to avoid warning
    
    df_rides.columns = [
        "start_station_id",
        "end_station_id",
        "started_at",
        "ended_at",
        "start_station_name",
        "start_station_latitude",
        "start_station_longitude",
        "end_station_name",
        "end_station_latitude",
        "end_station_longitude",
        "month",
        "year",
    ]

    # Create LineString objects
    df_rides.loc[:, "geometry"] = df_rides.apply(
        lambda row: LineString(
            [
                (row["start_station_longitude"], row["start_station_latitude"]),
                (row["end_station_longitude"], row["end_station_latitude"]),
            ]
        ),
        axis=1,
    )

    # Create GeoDataFrame
    gdf_rides = gpd.GeoDataFrame(df_rides, geometry="geometry", crs=crs_in)
    gdf_rides = gdf_rides.to_crs(epsg=crs_out)
    return gdf_rides


def create_rides_portland_gdf(df: pd.DataFrame, crs_in: int = 4326, crs_out: int = 3857) -> gpd.GeoDataFrame:
    df = df.copy()  # Create a copy to avoid changing the original DataFrame

    # Create a new DataFrame with necessary columns
    df_rides = df[
        [
            "StartDate",
            "EndDate",
            "StartHub",
            "StartLatitude",
            "StartLongitude",
            "EndHub",
            "EndLatitude",
            "EndLongitude",
            "month",
            "year",
        ]
    ].copy()  # copy to avoid warning

    df_rides.columns = [
        "started_at",
        "ended_at",
        "start_station_name",
        "start_station_latitude",
        "start_station_longitude",
        "end_station_name",
        "end_station_latitude",
        "end_station_longitude",
        "month",
        "year",
    ]

    # Create LineString objects
    df_rides.loc[:, "geometry"] = df_rides.apply(
        lambda row: LineString(
            [
                (row["start_station_longitude"], row["start_station_latitude"]),
                (row["end_station_longitude"], row["end_station_latitude"]),
            ]
        ),
        axis=1,
    )

    # Create GeoDataFrame
    gdf_rides = gpd.GeoDataFrame(df_rides, geometry="geometry", crs=crs_in)
    gdf_rides = gdf_rides.to_crs(epsg=crs_out)
    return gdf_rides

In [5]:
df = pd.read_csv(f'{_path}/preprocessed_bike_rides.csv')

stations_oslo = create_stations_gdf(df)
rides_oslo = create_rides_gdf(df)

In [11]:
stations_oslo = stations_oslo.drop_duplicates(subset=['station_id'])

In [12]:
stations_oslo.shape

(277, 8)

In [13]:
rides_oslo.shape

(2407818, 16)

## Bergen

In [16]:
create_dataframe('Bergen')

100%|██████████| 24/24 [00:02<00:00, 10.65it/s]


In [17]:
bergen_path = os.path.join(_path, 'Bergen')
df = pd.read_csv(f'{bergen_path}/preprocessed_bike_rides.csv')

stations_bergen = create_stations_gdf(df)
rides_bergen = create_rides_gdf(df)

In [19]:
stations_bergen = stations_bergen.drop_duplicates(subset=['station_id'])

In [20]:
stations_bergen.shape

(123, 8)

In [21]:
rides_bergen.shape

(1003177, 16)

## Trondheim

In [22]:
create_dataframe('Trondheim')

100%|██████████| 18/18 [00:00<00:00, 25.91it/s]


In [23]:
trondheim_path = os.path.join(_path, 'Trondheim')
df = pd.read_csv(f'{trondheim_path}/preprocessed_bike_rides.csv')

stations_trondheim = create_stations_gdf(df)
rides_trondheim = create_rides_gdf(df)
stations_trondheim = stations_trondheim.drop_duplicates(subset=['station_id'])

In [24]:
stations_trondheim.shape

(73, 8)

In [25]:
rides_trondheim.shape

(286833, 16)

## Washington D.C.

In [26]:
create_dataframe('Washington_DC')

100%|██████████| 24/24 [00:14<00:00,  1.63it/s]


In [38]:
washington_path = os.path.join(_path, 'Washington_DC')
df = pd.read_csv(f'{washington_path}/preprocessed_bike_rides.csv')

stations_washington_nan = washington_create_stations_gdf(df)
rides_washington_nan = create_rides_washington_gdf(df)

In [42]:
stations_washington_nan.shape[0] - stations_washington_nan.dropna(subset=['station_id']).shape[0]

11334

In [43]:
stations_washington_nan = stations_washington_nan.drop_duplicates(subset=['station_id'])

Unnamed: 0,station_id,station_name,latitude,longitude,month,year,geometry
0,31136.0,11th & Clifton St NW,38.922089,-77.027172,10,23,POINT (-8574625.561 4710517.658)
2,31670.0,12th St & New York Ave NW,38.901104,-77.028061,10,23,POINT (-8574724.527 4707515.453)
3,31304.0,36th & Calvert St NW / Glover Park,38.922581,-77.070334,10,23,POINT (-8579430.336 4710588.024)
4,32220.0,W&OD Trail/Sunset Hills Rd & Isaac Newton Sq,38.951443,-77.340377,10,23,POINT (-8609491.385 4714718.584)
11,32422.0,The Mall at Prince Georges,38.968922,-76.954219,10,23,POINT (-8566504.511 4717220.844)
...,...,...,...,...,...,...,...
6505880,31103.0,15th & Harvard St NW,38.926124,-77.035421,1,22,POINT (-8575543.804 4711095.019)
421045,32909.0,tech trailer V-1,38.964459,-77.010759,10,23,POINT (-8572798.477 4716581.907)
880073,32900.0,Motivate BX Tech office,38.964406,-77.010759,4,23,POINT (-8572798.490 4716574.300)
3676665,32902.0,Motivate Tech Office,0.000000,0.000000,9,22,POINT (0.000 0.000)


In [44]:
stations_washington = stations_washington_nan.dropna(subset=['station_id'])

In [45]:
stations_washington.shape

(774, 7)

In [46]:
rides_washington_nan.shape

(7944116, 13)

## Portland

In [47]:
create_dataframe('Portland')

100%|██████████| 24/24 [00:01<00:00, 18.40it/s]


In [48]:
portland_path = os.path.join(_path, 'Portland')
df = pd.read_csv(f'{portland_path}/preprocessed_bike_rides.csv')

stations_portland_nan = portland_create_stations_gdf(df)
rides_portland_nan = create_rides_portland_gdf(df)

In [49]:
stations_portland_nan.shape[0] - stations_portland_nan.dropna(subset=['station_name']).shape[0]

32698

In [50]:
stations_portland_nan = stations_portland_nan.drop_duplicates(subset=['station_name'])

Unnamed: 0,station_name,latitude,longitude,month,year,geometry
0,SE 30th at Division,45.504689,-122.634555,11,19,POINT (-13651616.227 5701327.465)
1,,45.510808,-122.658733,11,19,POINT (-13654307.743 5702299.456)
3,SW 12th at Clay,45.515074,-122.687030,11,19,POINT (-13657457.684 5702977.130)
4,NE Broadway at 12th,45.534969,-122.653631,11,19,POINT (-13653739.747 5706138.220)
8,N Mason at Williams,45.553353,-122.666915,11,19,POINT (-13655218.470 5709060.377)
...,...,...,...,...,...,...
609796,SE Taylor at Chavez,45.514975,-122.623157,1,18,POINT (-13650347.397 5702961.401)
611839,NW Savier at 23rd - Disabled,45.534777,-122.698872,1,18,POINT (-13658775.919 5706107.724)
612260,N Russell at Interstate,45.541078,-122.676972,1,18,POINT (-13656338.011 5707109.209)
611118,Portland Night Market,45.517457,-122.663980,1,18,POINT (-13654891.792 5703355.658)


In [51]:
stations_portland = stations_portland_nan.dropna(subset=['station_name'])

In [52]:
stations_portland.shape

(185, 6)

In [53]:
rides_portland_nan.shape

(723744, 11)

In [67]:
rides_portland = rides_portland_nan.dropna(subset=['start_station_name'])
rides_portland_end = rides_portland_nan.dropna(subset=['end_station_name'])

In [68]:
rides_portland.shape

(454390, 11)

In [104]:
rides_portland = rides_portland_nan.dropna(thresh=10)

In [105]:
rides_portland.shape

(512854, 11)

In [113]:
rides_portland = rides_portland_nan.drop(rides_portland_nan.query("start_station_name.isna() & end_station_name.isna()").index, axis = 0)

In [114]:
rides_portland.shape

(611313, 11)