## Installing dependencies and setting up

In [None]:
# + pyarrow (fast CSV/Parquet IO) + polars (fast ETL) + fastparquet (optional)
!pip install pandas numpy tqdm scikit-learn xgboost lightgbm matplotlib pyarrow polars fastparquet


In [None]:
#!/usr/bin/env python3
import os
import polars as pl

# Convert the single CSV found in ./combine into ./parquet/<name>_clean.parquet.
# The conversion is lazy end to end (scan_csv -> sink_parquet), so the CSV is
# never collected into memory as a whole.
# Exits with SystemExit when the folder is missing, holds no CSV, or holds
# several CSVs and none of them is named 'combined.csv'.

# Look inside the 'combine' output folder
combine_dir = os.path.join(os.getcwd(), "combine")
if not os.path.isdir(combine_dir):
    raise SystemExit(f"Folder not found: {combine_dir}")

# Ensure parquet output folder exists
parquet_dir = os.path.join(os.getcwd(), "parquet")
os.makedirs(parquet_dir, exist_ok=True)

# Find CSVs in combine/. Only regular files qualify: a sub-directory whose
# name happens to end in ".csv" is not a usable input (same rule as the
# Parquet listing used when cleaning the files).
csv_files = [
    f for f in os.listdir(combine_dir)
    if f.lower().endswith(".csv")
    and os.path.isfile(os.path.join(combine_dir, f))
]
if not csv_files:
    raise SystemExit(f"No CSV files found in {combine_dir!r}.")

# Prefer 'combined.csv' if multiple exist
preferred = next((f for f in csv_files if f.lower() == "combined.csv"), None)
if preferred:
    csv_files = [preferred]

if len(csv_files) > 1:
    raise SystemExit(f"Expected a single CSV, found {len(csv_files)}: {csv_files}")

src = os.path.join(combine_dir, csv_files[0])
dst = os.path.join(parquet_dir, os.path.splitext(csv_files[0])[0] + "_clean.parquet")

# Stream CSV -> Parquet (no full collect)
(
    pl.scan_csv(
        src,
        try_parse_dates=True,
        # Number of rows sampled to infer column dtypes. Use None to scan ALL
        # rows (slower but safest). Do NOT use 0 for that: 0 disables
        # inference and every column is read as String.
        infer_schema_length=200_000,
        # quote_char=None,            # uncomment only if you know there are NO quoted fields
        # ignore_errors=True,         # uncomment if you can drop occasional bad lines
    )
    .sink_parquet(
        dst,
        compression="zstd",
        compression_level=1,          # fast writes
        statistics=False,             # faster; enable if you'll filter heavily later
        maintain_order=False,         # row order of the CSV is NOT preserved in the output
        # row_group_size=500_000,     # tune for huge/wide files
    )
)

print(f"CSV -> Parquet: {src}  ➜  {dst}")


In [None]:
#!/usr/bin/env python3
import os
import pandas as pd

# Remove every row that contains at least one missing value from each Parquet
# file in ./parquet, rewriting the file under its original name.
# NOTE: each file is loaded completely into memory by pandas.

# Point to the 'parquet' folder next to your current working directory
folder_path = os.path.join(os.getcwd(), "parquet")

if not os.path.isdir(folder_path):
    raise SystemExit(f"Folder not found: {folder_path}")

# Find all Parquet files in parquet/
parquet_files = [f for f in os.listdir(folder_path)
                 if f.lower().endswith(".parquet")
                 and os.path.isfile(os.path.join(folder_path, f))]

if not parquet_files:
    print(f"No Parquet files found in {folder_path!r}.")
else:
    for parquet_file in parquet_files:
        file_path = os.path.join(folder_path, parquet_file)

        # Load Parquet
        df = pd.read_parquet(file_path)

        # Drop any rows with NaNs
        df_cleaned = df.dropna()

        # Replace the original file atomically: write to a temporary sibling
        # first and swap it in only once the write has fully succeeded, so an
        # interrupted or failed write never leaves a truncated Parquet file
        # under the real name. The ".tmp" suffix keeps the partial file out of
        # any "*.parquet" listing.
        tmp_path = file_path + ".tmp"
        try:
            df_cleaned.to_parquet(tmp_path, index=False)
            os.replace(tmp_path, file_path)
        finally:
            # Only still present when the write or the swap failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        print(f"Cleaned {parquet_file} → {len(df_cleaned)} rows kept")


In [None]:
#!/usr/bin/env python3
import os
import glob
import polars as pl

# Report the distribution of the "Label" column across all
# ./parquet/*_clean.parquet files: total row count, per-label counts
# (most frequent first) and the number of distinct labels.
# Rows whose Label is null are excluded from every figure.

# Point to parquet/ folder
parquet_dir = os.path.join(os.getcwd(), "parquet")
if not os.path.isdir(parquet_dir):
    raise SystemExit(f"Folder not found: {parquet_dir}")

pattern = os.path.join(parquet_dir, "*_clean.parquet")

# Ensure there are matching files before scanning
matches = glob.glob(pattern)
if not matches:
    raise SystemExit(f"No Parquet files matching {pattern!r}")

# Scan, then select only the Label column (projection pushdown keeps it fast)
q = (
    pl.scan_parquet(pattern)
      .select(pl.col("Label").cast(pl.Categorical))
      .filter(pl.col("Label").is_not_null())
)

# Label counts (materialize only the small result)
label_counts = (
    q.group_by("Label")
     .len()
     .sort("len", descending=True)
     .collect(engine="streaming")
     .rename({"len": "count"})
)

labels = label_counts["Label"].to_list()
counts = label_counts["count"].to_list()

# Total rows: every row of `q` belongs to exactly one label group, so the
# group sizes add up to the row count. Deriving it here avoids scanning the
# Parquet files a second time just to count rows. Summing Python ints also
# yields 0 for an empty result.
n_rows = sum(counts)

print(f"Analyzing Parquet files in: {parquet_dir}")
print(f"Total rows: {n_rows}\n")
print("Unique labels and their counts:")
for label, count in zip(labels, counts):
    print(f"  {label}: {count}")
print(f"\nTotal unique labels: {label_counts.shape[0]}")
