In [2]:
import os
from pathlib import Path
import pandas as pd
import requests
import zipfile
import io

def fetch_raw_data(year: int, month: int, timeout: float = 60.0) -> Path:
    """Download one month's JC-prefixed Citi Bike trip-data ZIP and extract it.

    The archive is requested from the public ``tripdata`` S3 bucket, saved
    under ``../data/raw`` (relative to the current working directory) and
    extracted into that same directory.

    Args:
        year: Four-digit year used in the file name.
        month: Month number; zero-padded to two digits in the file name.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        Path to the extracted CSV file. This is the ZIP name without its
        ``.zip`` suffix when the archive contains that member; otherwise it
        is the single CSV member found in the archive.

    Raises:
        Exception: If the server does not answer with HTTP 200, or if no
            unambiguous CSV member can be found in the archive.
        Errors raised by ``requests.get`` itself (for example on timeout)
        propagate unchanged.
    """
    # Construct the URL for JC-style file
    file_name = f"JC-{year}{month:02}-citibike-tripdata.csv.zip"
    url = f"https://s3.amazonaws.com/tripdata/{file_name}"

    response = requests.get(url, timeout=timeout)

    # Guard clause: anything other than 200 means there is no usable payload.
    if response.status_code != 200:
        raise Exception(f"{url} is not available (status code: {response.status_code})")

    # Directory that receives both the ZIP and its extracted contents
    raw_dir = Path("..") / "data" / "raw"
    raw_dir.mkdir(parents=True, exist_ok=True)
    zip_path = raw_dir / file_name

    # Save ZIP
    with open(zip_path, 'wb') as f:
        f.write(response.content)
    print(f"Successfully fetched ZIP file: {zip_path}")

    # Extract contents (from the in-memory bytes, so the file is not re-read)
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        z.extractall(raw_dir)
        members = z.namelist()
    print(f"Extracted ZIP file to: {raw_dir}")

    # Prefer the conventional member name: the ZIP name minus ".zip".
    expected_csv = file_name[: -len(".zip")]
    if expected_csv in members:
        return raw_dir / expected_csv

    # Otherwise fall back to the archive's own CSV member, ignoring macOS
    # resource-fork entries that are not real data files.
    csv_members = [
        name
        for name in members
        if name.lower().endswith(".csv")
        and not name.startswith("__MACOSX/")
        and not name.rsplit("/", 1)[-1].startswith("._")
    ]
    if len(csv_members) == 1:
        return raw_dir / csv_members[0]

    raise Exception(
        f"Could not identify the CSV inside {zip_path}: expected {expected_csv!r}, "
        f"archive contains {members}"
    )

def process_data(year: int, month: int):
    """Fetch the raw trip-data CSV for one month and load it with pandas.

    Args:
        year: Year forwarded to ``fetch_raw_data``.
        month: Month forwarded to ``fetch_raw_data``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` on the fetched CSV.
    """
    source_file = fetch_raw_data(year, month)
    trips = pd.read_csv(source_file)
    print(f"Successfully loaded data from {source_file}")
    return trips

def fetch_all_data(year: int = 2023, months=range(1, 2)) -> None:
    """Fetch and load trip data for each requested month of one year.

    With the defaults only January 2023 is processed (``range(1, 2)`` yields
    just ``1``). Pass e.g. ``months=range(1, 4)`` to cover January through
    March.

    Args:
        year: Year passed to ``process_data`` for every month.
        months: Iterable of month numbers to process, in order.

    Returns:
        None. The loaded DataFrames are not kept.
    """
    for month in months:
        try:
            process_data(year, month)
        except Exception as e:
            # Best-effort: report the failure and carry on with the next month.
            print(e)

# Trigger data fetching when this cell runs; fetch_all_data prints any
# failure instead of raising, so this call itself does not raise on a failed fetch.
fetch_all_data()


Successfully fetched ZIP file: ..\data\raw\JC-202301-citibike-tripdata.csv.zip
Extracted ZIP file to: ..\data\raw
Successfully loaded data from ..\data\raw\JC-202301-citibike-tripdata.csv


In [3]:
# Call the fetcher directly to display the path it returns for January 2023
# (this downloads and extracts the archive again).
fetch_raw_data(2023, 1)

Successfully fetched ZIP file: ..\data\raw\JC-202301-citibike-tripdata.csv.zip
Extracted ZIP file to: ..\data\raw


WindowsPath('../data/raw/JC-202301-citibike-tripdata.csv')