#### CSV to Parquet

In [None]:
#!/usr/bin/env python3
import os
import polars as pl

# Source directory holding the combined CSV, and the directory that receives
# the Parquet output (both resolved against the current working directory).
input_folder = os.path.join(os.getcwd(), "combined")
output_folder = os.path.join(os.getcwd(), "drop_nan_and_get_label_count")

# The output directory may already exist from a previous run.
os.makedirs(output_folder, exist_ok=True)

# Collect every entry in combined/ whose name ends in ".csv" (case-insensitive).
csv_files = list(
    filter(lambda name: name.lower().endswith(".csv"), os.listdir(input_folder))
)

# Exactly one CSV is expected; anything else aborts with an explanatory message.
if len(csv_files) != 1:
    if not csv_files:
        raise SystemExit(f"No CSV files found in folder: {input_folder}")
    raise SystemExit(f"Expected a single CSV in {input_folder}, found multiple: {csv_files}")

# Derive the source path and the "<stem>_clean.parquet" destination path.
csv_name = csv_files[0]
csv_stem = os.path.splitext(csv_name)[0]
src = os.path.join(input_folder, csv_name)
dst = os.path.join(output_folder, csv_stem + "_clean.parquet")

# Lazily scan the CSV so the conversion streams instead of loading the file.
lazy_csv = pl.scan_csv(
    src,
    try_parse_dates=True,
    infer_schema_length=200_000,  # look at many rows before fixing the dtypes
)

# Sink straight to Parquet; the options favour write speed over file size
# and row order is not preserved.
lazy_csv.sink_parquet(
    dst,
    compression="zstd",
    compression_level=1,
    statistics=False,
    maintain_order=False,
)

print(f"✅ CSV -> Parquet completed:\n  Input:  {src}\n  Output: {dst}")


#### Removing rows that contain NaN from the Parquet files

In [None]:
#!/usr/bin/env python3
import os
import pandas as pd

# Folder containing Parquet files
folder_path = os.path.join(os.getcwd(), "drop_nan_and_get_label_count")

# Find all Parquet files in that folder (sorted so the processing order and
# the progress output are reproducible across runs and platforms).
parquet_files = sorted(
    f for f in os.listdir(folder_path) if f.lower().endswith('.parquet')
)

if not parquet_files:
    print(f"No Parquet files found in folder: {folder_path}")
else:
    for parquet_file in parquet_files:
        file_path = os.path.join(folder_path, parquet_file)

        # Load the whole file into memory.
        df = pd.read_parquet(file_path)

        # Drop every row that has a missing value in any column.
        df_cleaned = df.dropna()

        # Write to a sibling temp file first and swap it in with os.replace,
        # so a failed or interrupted write never leaves the original file
        # truncated. The ".tmp" suffix keeps it out of the "*.parquet" listing.
        tmp_path = file_path + ".tmp"
        try:
            df_cleaned.to_parquet(tmp_path, index=False)
            os.replace(tmp_path, file_path)
        finally:
            # Only present here if the write or the swap failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        print(f"✅ Cleaned {parquet_file} → {len(df_cleaned)} rows kept")

    # Report success only when at least one file was actually processed.
    print("\n🎉 All Parquet files in drop_nan_and_get_label_count have been cleaned.")


In [None]:
#!/usr/bin/env python3
import os
import polars as pl

# Folder containing cleaned Parquet files
folder = os.path.join(os.getcwd(), "drop_nan_and_get_label_count")

# Input pattern and output file
pattern = os.path.join(folder, "*_clean.parquet")
out_file = os.path.join(folder, "label_counts.txt")

# Lazy query that reads only the Label column and drops null labels.
q = (
    pl.scan_parquet(pattern)
      .select(pl.col("Label").cast(pl.Categorical))
      .filter(pl.col("Label").is_not_null())
)

# Per-label counts, most frequent first.
label_counts = (
    q.group_by("Label")
     .len()
     .sort("len", descending=True)
     .collect(engine="streaming")
     .rename({"len": "count"})
)

labels = label_counts["Label"].to_list()
counts = label_counts["count"].to_list()

# Total number of rows with a non-null label. Every such row falls in exactly
# one group, so summing the group sizes gives the same number as counting the
# filtered rows directly, without scanning the Parquet files a second time.
n_rows = sum(counts)

# Write results to a text file. The encoding is pinned to UTF-8 so labels with
# non-ASCII characters cannot fail on platforms whose default encoding is
# narrower (e.g. cp1252 on Windows).
with open(out_file, "w", encoding="utf-8") as f:
    f.write(f"Analyzing Parquet files from: {folder}\n\n")
    f.write(f"Total rows: {n_rows}\n\n")
    f.write("Unique labels and their counts:\n")
    f.writelines(f"  {label}: {count}\n" for label, count in zip(labels, counts))
    f.write(f"\nTotal unique labels: {label_counts.shape[0]}\n")

print(f"✅ Label count summary written to: {out_file}")
