In [None]:
import pandas as pd

# ===========================
# 1. SETTINGS – EDIT THESE
# ===========================
# Sampling knobs
RANDOM_SEED = 42        # seed handed to the .sample() calls below
TARGET_ROWS = 50_000    # desired number of rows in the sampled output

# File locations (relative paths, resolved against the current working directory)
csv_path = "../data/202301-citibike-tripdata_1.csv"   # <- your original file
output_path = "../data/202301-citibike-tripdata_sampled_50k.csv"  # sampled CSV is written here

# ===========================
# 2. LOAD DATA
# ===========================
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the recorded run shows a warning raised from this read_csv
# call — presumably a mixed-type DtypeWarning; confirm it is gone after this change.
df = pd.read_csv(csv_path, low_memory=False)

print("Original rows:", len(df))
print("Columns:", df.columns.tolist()[:10], "...")

# Make sure the time column is parsed correctly
df["started_at"] = pd.to_datetime(df["started_at"])

# Add a pure date column (YYYY-MM-DD) used as the per-day grouping key below
df["date"] = df["started_at"].dt.date
print("Unique days in dataset:", df["date"].nunique())

# ===========================
# 3. COMPUTE PER-DAY SAMPLE SIZE
#    - nominal quota is an even split of TARGET_ROWS across days
#    - a day holding fewer rows than its share contributes every row,
#      and the unused part of its share is handed on to the busier days,
#      so sparse days no longer leave the sample short of TARGET_ROWS
# ===========================
day_sizes = df.groupby("date").size().sort_values()  # smallest days first
n_days = len(day_sizes)
if n_days == 0:
    raise ValueError("No rows with a usable 'date' value to sample from")

rows_per_day = TARGET_ROWS // n_days  # floor division
print("Target rows per day:", rows_per_day)

day_quota = {}
rows_left, days_left = TARGET_ROWS, n_days
for day, day_size in day_sizes.items():
    # Re-deriving the share at every step redistributes whatever the
    # smaller days (visited first) were unable to supply.
    day_quota[day] = min(int(day_size), rows_left // days_left)
    rows_left -= day_quota[day]
    days_left -= 1

# ===========================
# 4. STRATIFIED SAMPLING
#    - for each day, sample ~day_quota[day] rows
#    - within each day, preserve proportions of
#      member_casual and rideable_type
# ===========================
sampled_daily = []

for day, day_df in df.groupby("date"):
    # fraction of that day's rows to sample; the quota never exceeds the
    # day's size, so this is always within [0, 1]
    sample_frac = day_quota[day] / len(day_df)

    # GroupBy.sample draws the same fraction from every
    # (member_casual, rideable_type) cell and keeps all columns, unlike
    # groupby(...).apply(...), which warns about operating on the grouping
    # columns (newer pandas excludes them from the result).
    day_sample = (
        day_df
        .groupby(["member_casual", "rideable_type"])
        .sample(frac=sample_frac, random_state=RANDOM_SEED)
    )

    sampled_daily.append(day_sample)

sampled_df = pd.concat(sampled_daily, ignore_index=True)
print("Rows after per-day stratified sampling:", len(sampled_df))

# ===========================
# 5. OPTIONAL: TRIM TO EXACTLY TARGET_ROWS
# ===========================
# Only an overshoot is corrected; a sample at or below the target is left as-is.
overshoot = len(sampled_df) - TARGET_ROWS
if overshoot > 0:
    trimmed = sampled_df.sample(n=TARGET_ROWS, random_state=RANDOM_SEED)
    sampled_df = trimmed.reset_index(drop=True)

print("Final sampled rows:", len(sampled_df))

# Quick sanity checks (optional): category shares and date coverage of the sample
member_share = sampled_df["member_casual"].value_counts(normalize=True)
print("\nmember_casual distribution (sampled):")
print(member_share)

bike_type_share = sampled_df["rideable_type"].value_counts(normalize=True)
print("\nrideable_type distribution (sampled):")
print(bike_type_share)

first_day = sampled_df["date"].min()
last_day = sampled_df["date"].max()
print("\nDate range in sampled data:", first_day, "to", last_day)

# ===========================
# 6. SAVE TO CSV
# ===========================
# index=False keeps the row index out of the written file
sampled_df.to_csv(output_path, index=False)
print("\nSaved sampled dataset to:", output_path)


  df = pd.read_csv(csv_path)


Original rows: 1000000
Columns: ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng'] ...
Unique days in dataset: 35
Target rows per day: 1428


  .apply(lambda g: g.sample(frac=sample_frac, random_state=RANDOM_SEED))
  .apply(lambda g: g.sample(frac=sample_frac, random_state=RANDOM_SEED))
  .apply(lambda g: g.sample(frac=sample_frac, random_state=RANDOM_SEED))
  .apply(lambda g: g.sample(frac=sample_frac, random_state=RANDOM_SEED))
  .apply(lambda g: g.sample(frac=sample_frac, random_state=RANDOM_SEED))
  .apply(lambda g: g.sample(frac=sample_frac, random_state=RANDOM_SEED))
  .apply(lambda g: g.sample(frac=sample_frac, random_state=RANDOM_SEED))
  .apply(lambda g: g.sample(frac=sample_frac, random_state=RANDOM_SEED))
  .apply(lambda g: g.sample(frac=sample_frac, random_state=RANDOM_SEED))
  .apply(lambda g: g.sample(frac=sample_frac, random_state=RANDOM_SEED))
  .apply(lambda g: g.sample(frac=sample_frac, random_state=RANDOM_SEED))
  .apply(lambda g: g.sample(frac=sample_frac, random_state=RANDOM_SEED))
  .apply(lambda g: g.sample(frac=sample_frac, random_state=RANDOM_SEED))
  .apply(lambda g: g.sample(frac=sample_frac, rando

Rows after per-day stratified sampling: 44541
Final sampled rows: 44541

member_casual distribution (sampled):
member_casual
member    0.865135
casual    0.134865
Name: proportion, dtype: float64

rideable_type distribution (sampled):
rideable_type
classic_bike     0.501426
electric_bike    0.498574
Name: proportion, dtype: float64

Date range in sampled data: 2022-12-14 to 2023-01-31

Saved sampled dataset to: 202301-citibike-tripdata_sampled_50k.csv


In [None]:
# Reload the sampled CSV from output_path and restrict it to January 2023.
cleaned_df = pd.read_csv(output_path)
print("\nReloaded cleaned data, rows:", len(cleaned_df))

# Check start date and end date (the column is still plain text here,
# since read_csv was not asked to parse dates)
date_text = cleaned_df["date"]
print("Date range in cleaned data:",
      date_text.min(), "to", date_text.max())

# Only keep rows in January 2023 (half-open interval: start inclusive, end exclusive)
cleaned_df["date"] = pd.to_datetime(cleaned_df["date"])
jan_start, feb_start = "2023-01-01", "2023-02-01"
on_or_after_start = cleaned_df["date"] >= jan_start
before_end = cleaned_df["date"] < feb_start
cleaned_df = cleaned_df[on_or_after_start & before_end]
print("Rows after filtering to January 2023:", len(cleaned_df))
display(cleaned_df)

# Check start date and end date again, now on parsed datetimes
earliest = cleaned_df["date"].min()
latest = cleaned_df["date"].max()
print("Date range in cleaned data:",
      earliest, "to", latest)

# Persist the January-only subset
cleaned_df.to_csv("../data/cleaned_df.csv", index=False)



Reloaded cleaned data, rows: 44265
Date range in cleaned data: 2023-01-01 to 2023-01-31
Rows after filtering to January 2023: 44265


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date
0,E441DF12E01D900E,classic_bike,2023-01-01 13:36:56.305,2023-01-01 13:59:16.036,5 Ave & E 72 St,7100.07,5 Ave & E 63 St,6904.06,40.772828,-73.966853,40.766368,-73.971518,casual,2023-01-01
1,D0C5B4E524A0B7FB,classic_bike,2023-01-01 13:23:04.688,2023-01-01 13:41:52.653,3 Ave & E 72 St,7028.04,2 Ave & E 31 St,6197.02,40.769943,-73.960607,40.742909,-73.977061,casual,2023-01-01
2,BA2F79F3298E68CA,classic_bike,2023-01-01 12:24:57.138,2023-01-01 12:48:13.620,Fulton St & Broadway,5175.08,Fulton St & Broadway,5175.08,40.711066,-74.009447,40.711066,-74.009447,casual,2023-01-01
3,4E343A88DF619C04,classic_bike,2023-01-01 10:54:09.455,2023-01-01 11:42:20.771,5 Ave & E 72 St,7100.07,5 Ave & E 72 St,7100.07,40.772828,-73.966853,40.772828,-73.966853,casual,2023-01-01
4,97361E0404890052,classic_bike,2023-01-01 13:53:06.154,2023-01-01 14:13:22.392,Centre St & Chambers St,5207.01,N Moore St & Hudson St,5470.02,40.712733,-74.004607,40.719961,-74.008443,casual,2023-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44260,3DC1F545A9049C0D,electric_bike,2023-01-31 19:29:18.975,2023-01-31 19:35:18.405,E 80 St & 2 Ave,7121.02,E 91 St & 2 Ave,7286.01,40.773914,-73.954395,40.781153,-73.949630,member,2023-01-31
44261,326C4939768CF3E8,electric_bike,2023-01-31 17:06:59.853,2023-01-31 17:14:53.048,W 13 St & 5 Ave,5947.04,E 2 St & Avenue B,5515.02,40.735445,-73.994310,40.722174,-73.983688,member,2023-01-31
44262,7B2A676003FBE645,electric_bike,2023-01-31 16:12:50.222,2023-01-31 16:46:07.273,Dean St & Franklin Ave,4107.13,St Marks Pl & 2 Ave,5669.10,40.677592,-73.955637,40.728419,-73.987140,member,2023-01-31
44263,0F659DF655960412,electric_bike,2023-01-31 12:34:20.805,2023-01-31 12:38:17.233,E 48 St & 5 Ave,6626.01,W 42 St & 6 Ave,6517.08,40.757246,-73.978059,40.754920,-73.984550,member,2023-01-31


Date range in cleaned data: 2023-01-01 00:00:00 to 2023-01-31 00:00:00
