In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 

import cartopy.crs as ccrs
import cartopy.feature as cfeature

import folium
from folium.plugins import FastMarkerCluster
from folium.plugins import HeatMap

import utm 
import os

import requests
import json

import pyarrow

In [3]:
# Execute functions.ipynb inside this kernel so that the names it defines
# become available here. combine_csv_files and combine_csv_files_in_years are
# called below but never defined in this notebook, so they presumably come
# from that file -- TODO confirm.
%run functions.ipynb

In [4]:
# def Generate_basemap():
#     basemap = folium.Map(location=[40.730610 , -73.935242])
#     return basemap

# 1 Data Preparation

## 1.1 List of Stations (Lat,Lon,Capacity)

In [5]:
# GBFS "station_information" feed served by Lyft (one record per station).
url = 'https://gbfs.lyft.com/gbfs/2.3/bkn/en/station_information.json'

# Fetch the feed. requests applies no timeout by default, so without one an
# unresponsive server would leave this cell hanging indefinitely.
response = requests.get(url, timeout=30)

# Stop here on any 4xx/5xx answer. Merely printing the status code and carrying
# on would either raise a confusing NameError on `json_data` (fresh kernel) or,
# worse, silently rebuild the table from a stale `json_data` left over from an
# earlier run of this cell.
response.raise_for_status()

# Decode the JSON payload into plain Python objects.
json_data = response.json()

# The station records live under data -> stations; build one row per station.
station_data = pd.DataFrame(json_data['data']['stations'])

# Keep only the identifying, location and capacity columns used downstream.
station_data = station_data[['short_name', 'name', 'region_id', 'lat', 'lon', 'capacity']]

### 1.1.1 Station Visualisation

In [6]:
# basemap = Generate_basemap()
# FastMarkerCluster(station_data[['lat', 'lon' , 'capacity']]).add_to(basemap)

# HeatMap(station_data[['lat', 'lon' , 'capacity']]).add_to(basemap)
# # basemap

## 1.2 Ride Data

### 1.2.1 Ride Data (2015-2019)

In [7]:
# Load every yearly folder in one go -- this is the expensive cell.
# Year range handed to the loader (whether end_year is inclusive depends on
# combine_csv_files_in_years, which is not defined in this notebook).
start_year, end_year = 2015, 2019

# The per-year folders are looked up under <working directory>/data.
file_name = "data"
current_directory = os.getcwd()
base_folder_path = os.path.join(current_directory, file_name)

combined_data = combine_csv_files_in_years(base_folder_path, start_year, end_year)

In [8]:
# Sanity check: size of the combined multi-year load (displayed as cell output).
len(combined_data)

1354234

In [9]:
# combined_data.columns

In [10]:
# From 2015 - 2021, 15 columns
# From 2022 - 2024, 13 columns
# Issues with 2017
# Issues with 2021

# current_directory = os.getcwd()
# file_name = "data//2016"
# folder_path = os.path.join(current_directory, file_name)

# data = combine_csv_files(folder_path)
# data.columns

In [11]:
# combined_data['starttime'] = pd.to_datetime(combined_data['starttime'])
# combined_data['stoptime'] = pd.to_datetime(combined_data['stoptime'])

# # Reduce memory usage
# cols = ['start station name', 'end station name', 'bikeid', 'usertype', 'gender']
# for col in cols:
#     combined_data[col] = combined_data[col].astype('category')

### 1.2.2 Ride Data (2020)

In [12]:
# Resolve <working directory>/data/2020, the folder holding the 2020 trip files.
file_name = "data/2020"
current_directory = os.getcwd()
folder_path = os.path.join(current_directory, file_name)

# Read the folder into a single table and preview its first three rows.
data_2020 = combine_csv_files(folder_path)
data_2020.iloc[:3]

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,226,2020-01-01 00:04:50.1920,2020-01-01 00:08:37.0370,3186,Grove St PATH,40.719586,-74.043117,3211,Newark Ave,40.721525,-74.046305,29444,Subscriber,1984,2
1,377,2020-01-01 00:16:01.6700,2020-01-01 00:22:19.0800,3186,Grove St PATH,40.719586,-74.043117,3269,Brunswick & 6th,40.726012,-74.050389,26305,Subscriber,1989,2
2,288,2020-01-01 00:17:33.8770,2020-01-01 00:22:22.4420,3186,Grove St PATH,40.719586,-74.043117,3269,Brunswick & 6th,40.726012,-74.050389,29268,Customer,1989,1


In [13]:
# Parse the trip start/stop timestamps into datetime values.
for ts_col in ('starttime', 'stoptime'):
    data_2020[ts_col] = pd.to_datetime(data_2020[ts_col])

# Reduce memory usage by storing these columns as pandas categoricals.
cols = [
    'start station name',
    'end station name',
    'bikeid',
    'usertype',
    'gender',
]
for col in cols:
    data_2020[col] = data_2020[col].astype('category')

### 1.2.3 Ride Data (2024)

In [14]:
# 2024 only.
# Resolve <working directory>/data/2024, the folder holding the 2024 trip files.
file_name = "data/2024"
current_directory = os.getcwd()
folder_path = os.path.join(current_directory, file_name)

# Read the folder into a single table.
data_2024 = combine_csv_files(folder_path)

# data_2024['start_lat'] = pd.to_numeric(data_2024["start_lat"])
# data_2024['end_lat'] = pd.to_numeric(data_2024["end_lat"])
# data_2024['start_lng'] = pd.to_numeric(data_2024["start_lng"])
# data_2024['end_lng'] = pd.to_numeric(data_2024["end_lng"])

In [15]:
# Parse the ride start/end timestamps into datetime values.
for ts_col in ('started_at', 'ended_at'):
    data_2024[ts_col] = pd.to_datetime(data_2024[ts_col])

# Store the station, ride and rider-type columns as pandas categoricals.
# NOTE(review): ride_id looks like a per-ride identifier; if it is unique per
# row, a categorical will not save memory for that column -- confirm.
cols = [
    'start_station_name',
    'end_station_name',
    'start_station_id',
    'end_station_id',
    'ride_id',
    'rideable_type',
    'member_casual',
]
for col in cols:
    data_2024[col] = data_2024[col].astype('category')

### 1.2.4 Ride Data (2019) - Baseline

In [16]:
# 2019 only -- used as the baseline year.
# Resolve <working directory>/data/2019, the folder holding the 2019 trip files.
file_name = "data/2019"
current_directory = os.getcwd()
folder_path = os.path.join(current_directory, file_name)

# Read the folder into a single table and preview its first three rows.
data_2019 = combine_csv_files(folder_path)
data_2019.iloc[:3]

# data_2019['start_lat'] = pd.to_numeric(data_2019["start_lat"])
# data_2019['end_lat'] = pd.to_numeric(data_2019["end_lat"])
# data_2019['start_lng'] = pd.to_numeric(data_2019["start_lng"])
# data_2019['end_lng'] = pd.to_numeric(data_2019["end_lng"])

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,201,2019-01-01 03:09:09.7110,2019-01-01 03:12:30.8790,3183,Exchange Place,40.716247,-74.033459,3214,Essex Light Rail,40.712774,-74.036486,29612,Subscriber,1993,1
1,505,2019-01-01 05:18:00.1060,2019-01-01 05:26:25.9050,3183,Exchange Place,40.716247,-74.033459,3638,Washington St,40.724294,-74.035483,29213,Subscriber,1972,2
2,756,2019-01-01 10:36:33.3400,2019-01-01 10:49:10.2600,3183,Exchange Place,40.716247,-74.033459,3192,Liberty Light Rail,40.711242,-74.055701,26164,Subscriber,1985,1


In [17]:
# Parse the trip start/stop timestamps into datetime values.
for ts_col in ('starttime', 'stoptime'):
    data_2019[ts_col] = pd.to_datetime(data_2019[ts_col])

In [18]:
# Quick reference of the frames built above: name, content, column count.
summary_lines = (
    "Accessible dataframes    Description                            # of columns  ",
    "combined_data            Gives combined data from 2015-2019     15",
    "data_2020                Gives data for 2020                    15",
    "data_2024                Gives data for 2024                    13",
    "data_2019                Gives data for 2019-Baseline           15",
    "station_data             Existing stations in NYC from Lyft      6",
)
# One print call; joining with newlines yields the same six output lines.
print("\n".join(summary_lines))

Accessible dataframes    Description                            # of columns  
combined_data            Gives combined data from 2015-2019     15
data_2020                Gives data for 2020                    15
data_2024                Gives data for 2024                    13
data_2019                Gives data for 2019-Baseline           15
station_data             Existing stations in NYC from Lyft      6
