In [36]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 

import cartopy.crs as ccrs
import cartopy.feature as cfeature

import folium
from folium.plugins import FastMarkerCluster
from folium.plugins import HeatMap

import utm 
import os

import requests
import json

import pyarrow

In [37]:
%run functions.ipynb

In [38]:
# def Generate_basemap():
#     basemap = folium.Map(location=[40.730610 , -73.935242])
#     return basemap

# 1 Data Preparation

## 1.1 List of Stations (Lat,Lon,Capacity)

In [39]:
# URL of the JSON file
url = 'https://gbfs.lyft.com/gbfs/2.3/bkn/en/station_information.json'

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the JSON response
    json_data = response.json()
    
    # Now you can work with the JSON data)
else:
    print('Failed to retrieve data:', response.status_code)

# Create a DataFrame from the 'stations' list
station_data = pd.DataFrame(json_data['data']['stations'])

# Select only the required columns
station_data = station_data[['short_name', 'name', 'region_id', 'lat', 'lon', 'capacity']]

### 1.1.1 Station Visualisation

In [40]:
# basemap = Generate_basemap()
# FastMarkerCluster(station_data[['lat', 'lon' , 'capacity']]).add_to(basemap)

# HeatMap(station_data[['lat', 'lon' , 'capacity']]).add_to(basemap)
# # basemap

## 1.2 Ride Data

### 1.2.1 Ride Data (2015-2021)

In [41]:
#############################################
# # Open all files / computing intensive
# current_directory = os.getcwd()
# file_name = "data"
# base_folder_path = os.path.join(current_directory, file_name)
# start_year = 2015
# end_year = 2019
# combined_data = combine_csv_files_in_years(base_folder_path,start_year,end_year)
# len(combined_data)
#############################################

In [42]:
# combined_data.columns

In [43]:
# From 2015 - 2021, 15 columns
# From 2022 - 2024, 13 columns
# Issues with 2017
# Issues with 2021

# current_directory = os.getcwd()
# file_name = "data//2016"
# folder_path = os.path.join(current_directory, file_name)

# data = combine_csv_files(folder_path)
# data.columns

In [44]:
# combined_data['starttime'] = pd.to_datetime(combined_data['starttime'])
# combined_data['stoptime'] = pd.to_datetime(combined_data['stoptime'])

# # Reduce memory usage
# cols = ['start station name', 'end station name', 'bikeid', 'usertype', 'gender']
# for col in cols:
#     combined_data[col] = combined_data[col].astype('category')

### 1.2.2 Ride Data (2020)

In [64]:
folder_path = os.path.join(os.getcwd(), "data", "2020")

# List all files in the folder
file_names = os.listdir(folder_path)

# Initialize an empty list to store DataFrame chunks
chunks = []

# Loop through each file, load it, and append it to the list
for file_name in file_names:
    if file_name.endswith(".pkl.gz"):
        file_path = os.path.join(folder_path, file_name)
        chunk = pd.read_pickle(file_path, compression='gzip')
        chunks.append(chunk)

# Concatenate all DataFrame chunks into a single DataFrame
data_2020 = pd.concat(chunks, ignore_index=True)

In [61]:
# current_directory = os.getcwd()
# file_name = "data/2020/data_2020.pkl"
# folder_path = os.path.join(current_directory, file_name)

# # Load the DataFrame from the pickle file
# data_2020 = pd.read_pickle(folder_path)

In [63]:
# # Split the DataFrame into six chunks
# chunk_size = len(data_2020) // 6
# chunks = [data_2020[i:i+chunk_size] for i in range(0, len(data_2020), chunk_size)]

# # Define the folder path where the split files will be saved
# folder_path = os.path.join(os.getcwd(), "data", "2020")

# # Save each chunk as a separate gzip-compressed pickle file
# for i, chunk in enumerate(chunks):
#     chunk_file_name = f"data_2020_chunk_{i}.pkl.gz"
#     chunk_file_path = os.path.join(folder_path, chunk_file_name)
#     chunk.to_pickle(chunk_file_path, compression='gzip')

In [46]:
# #############################################
# current_directory = os.getcwd()
# file_name = "data/2020"
# folder_path = os.path.join(current_directory, file_name)

# data_2020 = combine_csv_files(folder_path)
# data_2020.iloc[:3]
# data_2020['starttime'] = pd.to_datetime(data_2020['starttime'])
# data_2020['stoptime'] = pd.to_datetime(data_2020['stoptime'])

# # Reduce memory usage
# cols = ['start station name', 'end station name', 'bikeid', 'usertype', 'gender']
# for col in cols:
#     data_2020[col] = data_2020[col].astype('category')

# # Define the file path
# current_directory = os.getcwd()
# file_name = "data/2020/data_2020.pkl"
# file_path = os.path.join(current_directory, file_name)

# # Save the DataFrame as a pickle file
# data_2020.to_pickle(file_path)
# #############################################

### 1.2.3 Ride Data (2021)

In [None]:
# folder_path = os.path.join(os.getcwd(), "data", "2021")

# # List all files in the folder
# file_names = os.listdir(folder_path)

# # Initialize an empty list to store DataFrame chunks
# chunks = []

# # Loop through each file, load it, and append it to the list
# for file_name in file_names:
#     if file_name.endswith(".pkl.gz"):
#         file_path = os.path.join(folder_path, file_name)
#         chunk = pd.read_pickle(file_path, compression='gzip')
#         chunks.append(chunk)

# # Concatenate all DataFrame chunks into a single DataFrame
# data_2021 = pd.concat(chunks, ignore_index=True)

In [47]:
# current_directory = os.getcwd()
# file_name = "data/2021/data_2021.pkl"
# folder_path = os.path.join(current_directory, file_name)

# # Load the DataFrame from the pickle file
# data_2021 = pd.read_pickle(folder_path)

In [74]:
# # Split the DataFrame into six chunks
# chunk_size = len(data_2021) // 30
# chunks = [data_2021[i:i+chunk_size] for i in range(0, len(data_2021), chunk_size)]

# # Define the folder path where the split files will be saved
# folder_path = os.path.join(os.getcwd(), "data", "2021")

# # Save each chunk as a separate gzip-compressed pickle file
# for i, chunk in enumerate(chunks):
#     chunk_file_name = f"data_2021_chunk_{i}.pkl.gz"
#     chunk_file_path = os.path.join(folder_path, chunk_file_name)
#     chunk.to_pickle(chunk_file_path, compression='gzip')

In [73]:

# station_data_21 = data_2021[['start_station_name','start_lat','start_lng']]
# station_data_21 = station_data_21.drop_duplicates(subset='start_station_name')
# station_data_21['capacity'] = 1
# # len(station_data_21['start_station_name'].unique())
# # len(station_data_21)

# station_data_21 = station_data_21.dropna(subset=['start_lat', 'start_lng'])

# # Sample data
# locations = station_data_21['start_station_name']
# latitudes = station_data_21['start_lat']
# longitudes = station_data_21['start_lng']
# sizes = station_data_21['capacity']
# #values = [20, 30, 25, 40]  # Values for the color scale

# # Initialize the map centered around the first location
# mymap = folium.Map()

# # Iterate over locations
# for lat, lon, size, location in zip(latitudes, longitudes, sizes, locations):
#     # Add circle marker with varying sizes
#     folium.CircleMarker(
#         location=[lat, lon],
#         radius=size / 10,  # Normalize size for better visualization
#         color='blue',
#         fill=True,
#         fill_color='blue',
#         fill_opacity=0.6,
#         popup=location
#     ).add_to(mymap)

# # Save the map to an HTML file
# mymap.save("nyc_bike_stations_21.html")

### 1.2.3 Ride Data (2024)

In [57]:

file_name = "data/2024/data_2024.pkl.gz"
folder_path = os.path.join(os.getcwd(), file_name)

# Load the DataFrame from the pickle file
# data_2019 = pd.read_pickle(folder_path)
data_2024 = pd.read_pickle(folder_path, compression='gzip')

In [59]:
# # Open only 2024
# ############################################
# current_directory = os.getcwd()
# file_name = "data/2024"
# folder_path = os.path.join(current_directory, file_name)

# data_2024 = combine_csv_files(folder_path)

# data_2024['started_at'] = pd.to_datetime(data_2024['started_at'])
# data_2024['ended_at'] = pd.to_datetime(data_2024['ended_at'])

# cols = ['start_station_name', 'end_station_name', 'start_station_id', 'end_station_id', 'ride_id', 'rideable_type', 'member_casual']
# for col in cols:
#     data_2024[col] = data_2024[col].astype('category')
# ############################################

In [55]:
# current_directory = os.getcwd()
# file_name = "data/2024/data_2024.pkl.gz"
# file_path = os.path.join(current_directory, file_name)

# # Save the DataFrame as a pickle file
# data_2024.to_pickle(file_path, compression='gzip')

In [11]:

# data_2024['start_lat'] = pd.to_numeric(data_2024["start_lat"])
# data_2024['end_lat'] = pd.to_numeric(data_2024["end_lat"])
# data_2024['start_lng'] = pd.to_numeric(data_2024["start_lng"])
# data_2024['end_lng'] = pd.to_numeric(data_2024["end_lng"])

### 1.2.4. Ride Data (2019) - Baseline

In [49]:
folder_path = os.path.join(os.getcwd(), "data", "2019")

# List all files in the folder
file_names = os.listdir(folder_path)

# Initialize an empty list to store DataFrame chunks
chunks = []

# Loop through each file, load it, and append it to the list
for file_name in file_names:
    if file_name.endswith(".pkl.gz"):
        file_path = os.path.join(folder_path, file_name)
        chunk = pd.read_pickle(file_path, compression='gzip')
        chunks.append(chunk)

# Concatenate all DataFrame chunks into a single DataFrame
data_2019 = pd.concat(chunks, ignore_index=True)

In [33]:
# # Split the DataFrame into six chunks
# chunk_size = len(data_2019) // 6
# chunks = [data_2019[i:i+chunk_size] for i in range(0, len(data_2019), chunk_size)]

# # Define the folder path where the split files will be saved
# folder_path = os.path.join(os.getcwd(), "data", "2019")

# # Save each chunk as a separate gzip-compressed pickle file
# for i, chunk in enumerate(chunks):
#     chunk_file_name = f"data_2019_chunk_{i}.pkl.gz"
#     chunk_file_path = os.path.join(folder_path, chunk_file_name)
#     chunk.to_pickle(chunk_file_path, compression='gzip')

In [27]:
# current_directory = os.getcwd()
# file_name = "data/2019/data_2019.pkl"
# folder_path = os.path.join(current_directory, file_name)
# data_2019.to_pickle(folder_path, compression='gzip')

# current_directory = os.getcwd()
# file_name = "data/2019/data_2019.pkl"
# folder_path = os.path.join(current_directory, file_name)

# # Load the DataFrame from the pickle file
# # data_2019 = pd.read_pickle(folder_path)
# data_2019 = pd.read_pickle(folder_path, compression='gzip')

In [12]:
# Open only 2019

# current_directory = os.getcwd()
# file_name = "data/2019"
# folder_path = os.path.join(current_directory, file_name)

# data_2019 = combine_csv_files(folder_path)
# data_2019.iloc[:3]

# # Reduce memory usage
# data_2019['starttime'] = pd.to_datetime(data_2019['starttime'])
# data_2019['stoptime'] = pd.to_datetime(data_2019['stoptime'])

# cols = ['start station name', 'end station name', 'bikeid', 'usertype', 'gender']
# for col in cols:
#     data_2019[col] = data_2019[col].astype('category')

# # Define the file path
# current_directory = os.getcwd()
# file_name = "data/2019/data_2019.pkl"
# file_path = os.path.join(current_directory, file_name)

# # Save the DataFrame as a pickle file
# data_2019.to_pickle(file_path)

In [60]:
print("Accessible dataframes    Description                            # of columns  ")
# print("combined_data            Gives combined data from 2015-2019     15")
print("data_2019                Gives data for 2019-Baseline           15")
# print("data_2020                Gives data for 2020                    15")
# print("data_2021                Gives data for 2021                    15")
print("data_2024                Gives data for 2024                    13")
print("station_data             Existing stations in NYC from Lyft      6")

Accessible dataframes    Description                            # of columns  
data_2019                Gives data for 2019-Baseline           15
data_2024                Gives data for 2024                    13
station_data             Existing stations in NYC from Lyft      6
