In [10]:
from pathlib import Path
import pandas as pd
from io import BytesIO
import zipfile
import requests
import os

In [38]:
def fetch_raw_data(year:int, month:int) -> str:
    if (year == 2024 and month in range(1, 5)) or (year == 2025 and month == 3):
        url = f"https://s3.amazonaws.com/tripdata/{year}{month:02}-citibike-tripdata.csv.zip"

        response = requests.get(url)
        if response.status_code == 200:
            path = Path('..') / 'data' / 'raw' / f"citi_rides_{year}_{month:02}.parquet"
            path.parent.mkdir(parents=True, exist_ok=True)

            with zipfile.ZipFile(BytesIO(response.content)) as z:
                csv_files = [f for f in z.namelist() if f.lower().endswith('.csv') and 'macosx' not in f.lower()]

                if not csv_files:
                    print("No CSV files found in zip file")
                else:
                    dfs = []
                    for csv_file in csv_files:
                        with z.open(csv_file) as f:
                            df = pd.read_csv(f)
                            for col in ['start_station_id', 'end_station_id']:
                                if col in df.columns:
                                    df[col] = df[col].astype(str)

                            dfs.append(df)

                    if dfs:
                        combined_df = pd.concat(dfs, ignore_index=True)
                        combined_df.to_parquet(path, index=False)
                        print(f"Saved {len(combined_df)} rows to {path}")
                        return str(path)
        else:
            print("Incorrect URL")

    elif (year >= 2024 and month in range (5, 13)) or (year == 2025 and month in (1, 2)):
        url = f"https://s3.amazonaws.com/tripdata/{year}{month:02}-citibike-tripdata.zip"

        response = requests.get(url)
        if response.status_code == 200:
            path = Path('..') / 'data' / 'raw' / f"citi_rides_{year}_{month:02}.parquet"
            path.parent.mkdir(parents=True, exist_ok=True)

            with zipfile.ZipFile(BytesIO(response.content)) as z:
                csv_files = [f for f in z.namelist() if f.lower().endswith('.csv') and 'macosx' not in f.lower()]

                if not csv_files:
                    print("No CSV files found in zip file")
                else:
                    dfs = []
                    for csv_file in csv_files:
                        with z.open(csv_file) as f:
                            df = pd.read_csv(f)
                            for col in ['start_station_id', 'end_station_id']:
                                if col in df.columns:
                                    df[col] = df[col].astype(str)

                            dfs.append(df)

                    if dfs:
                        combined_df = pd.concat(dfs, ignore_index=True)
                        combined_df.to_parquet(path, index=False)
                        print(f"Saved {len(combined_df)} rows to {path}")
                        return str(path)
        else:
            print("Incorrect URL")

    elif year < 2024:
        url = f"https://s3.amazonaws.com/tripdata/{year}-citibike-tripdata.zip"

        response = requests.get(url)
        if response.status_code == 200:
            path = Path('..') / 'data' / 'raw' / f"citi_rides_{year}_{month:02}.parquet"
            path.parent.mkdir(parents=True, exist_ok=True)

            outer_zip = zipfile.ZipFile(BytesIO(response.content))
            target_month_prefix = f"{year}{month:02}"
            monthly_zip_name = next(
                (f for f in outer_zip.namelist()
                 if f.endswith('.zip') and target_month_prefix in f and 'macosx' not in f.lower()),
                 None
            )

            if not monthly_zip_name:
                print(f"No zip found for {target_month_prefix}")
        
            with outer_zip.open(monthly_zip_name) as nested_zip_file:
                with zipfile.ZipFile(BytesIO(nested_zip_file.read())) as inner_zip:
                    csv_files = [f for f in inner_zip.namelist()
                                if f.endswith('.csv') and 'macosx' not in f.lower()]
                    dfs = []

                    for csv_file in csv_files:
                        with inner_zip.open(csv_file) as f:
                            df = pd.read_csv(f)

                            for col in ['start_station_id', 'end_station_id']:
                                if col in df.columns:
                                    df[col] = df[col].astype(str)

                            dfs.append(df)

                    if dfs:
                        combined_df = pd.concat(dfs, ignore_index=True)
                        combined_df.to_parquet(path, index=False)
                        print(f"Saved {len(combined_df)} rows to {path}")
                        return str(path)
                    else:
                        print("No CSV files found in the inner ZIP.")
        else:
            print("File not found")

In [27]:
fetch_raw_data(2023, 1)

  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 1795412 rows to ..\data\raw\citi_rides_2023_01.parquet


'..\\data\\raw\\citi_rides_2023_01.parquet'

In [28]:
for month in range(2,13):
    fetch_raw_data(2023, month=month)

  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 1696171 rows to ..\data\raw\citi_rides_2023_02.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 2118932 rows to ..\data\raw\citi_rides_2023_03.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 2749716 rows to ..\data\raw\citi_rides_2023_04.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 3453152 rows to ..\data\raw\citi_rides_2023_05.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 3451549 rows to ..\data\raw\citi_rides_2023_06.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 3659581 rows to ..\data\raw\citi_rides_2023_07.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 3964180 rows to ..\data\raw\citi_rides_2023_08.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 3471150 rows to ..\data\raw\citi_rides_2023_09.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 3725336 rows to ..\data\raw\citi_rides_2023_10.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 2816977 rows to ..\data\raw\citi_rides_2023_11.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 2204874 rows to ..\data\raw\citi_rides_2023_12.parquet


In [39]:
for month in range (5,13):
    fetch_raw_data(2024, month)

  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 4230360 rows to ..\data\raw\citi_rides_2024_05.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 4783576 rows to ..\data\raw\citi_rides_2024_06.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 4722896 rows to ..\data\raw\citi_rides_2024_07.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 4603575 rows to ..\data\raw\citi_rides_2024_08.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 4997898 rows to ..\data\raw\citi_rides_2024_09.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 5150054 rows to ..\data\raw\citi_rides_2024_10.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 3710134 rows to ..\data\raw\citi_rides_2024_11.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 2311171 rows to ..\data\raw\citi_rides_2024_12.parquet


In [40]:
for month in range (1,4):
    fetch_raw_data(2025, month)

  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 2124475 rows to ..\data\raw\citi_rides_2025_01.parquet


  df = pd.read_csv(f)
  df = pd.read_csv(f)


Saved 2031257 rows to ..\data\raw\citi_rides_2025_02.parquet


  df = pd.read_csv(f)


Saved 3168271 rows to ..\data\raw\citi_rides_2025_03.parquet
