In [None]:
# Combine two parquet files in the same folder as this notebook
import os
import polars as pl

# --- config ---
# Both inputs and the output live in the current working directory.
# Adjust the input names below if your files are called differently.
folder = os.getcwd()
file1, file2 = (
    os.path.join(folder, name)
    for name in (
        "combined_final_ciciomt2024_clean.parquet",
        "combined_final_mqttset_clean.parquet",
    )
)
output = os.path.join(folder, "combined_2.parquet")

# --- load + combine ---
# Both files are read eagerly (fully into memory) before being stacked.
df1, df2 = (pl.read_parquet(path) for path in (file1, file2))
# "diagonal_relaxed" is the Polars concat strategy requested here; see the
# Polars `concat` docs for how it handles columns/dtypes that differ
# between the two frames.
combined = pl.concat([df1, df2], how="diagonal_relaxed")

# --- save ---
combined.write_parquet(output, compression="zstd", compression_level=1)

# --- report ---
print(f"✅ Combined {file1} + {file2} → {output}")
for caption, value in (
    ("Rows:", combined.height),
    ("Columns:", combined.width),
    ("Schema:", combined.schema),
):
    print(caption, value)


In [None]:
#!/usr/bin/env python3
import os
import polars as pl

folder = os.getcwd()
# Path of the single combined file to analyze. (The name `pattern` is kept
# for compatibility; it is a plain path, not a glob.)
pattern = os.path.join(folder, "combined_2.parquet")
out_path = os.path.join(folder, "label_counts.txt")

# Total rows in the file, counted on the UNFILTERED scan so that rows whose
# Label is null are still included in the total.
n_rows = (
    pl.scan_parquet(pattern)
      .select(pl.len().alias("rows"))
      .collect(engine="streaming")["rows"][0]
)

# Lazily select only the Label column and drop null labels; nothing is read
# until a .collect() below.
q = (
    pl.scan_parquet(pattern)
      .select(pl.col("Label").cast(pl.Categorical))
      .filter(pl.col("Label").is_not_null())
)

# Per-label counts, most frequent first (materializes only the small result).
label_counts = (
    q.group_by("Label")
     .len()
     .sort("len", descending=True)
     .collect(engine="streaming")
     .rename({"len": "count"})
)

# Rows that carry no label at all: everything not accounted for by a label.
null_label_rows = n_rows - label_counts["count"].sum()

# Build text lines. The header names the file that is actually scanned.
lines = [
    f"Analyzing combined Parquet file: {os.path.basename(pattern)}",
    f"Total rows: {n_rows}",
]
if null_label_rows:
    # Only emitted when there is something to report, so the layout is
    # unchanged for files without null labels.
    lines.append(f"Rows with null Label (not counted below): {null_label_rows}")
lines.append("")  # blank separator line before the per-label section
lines.append("Unique labels and their counts:")
lines.extend(
    f"  {label}: {count}"
    for label, count in zip(label_counts["Label"].to_list(),
                            label_counts["count"].to_list())
)
lines.append(f"\nTotal unique labels: {label_counts.shape[0]}")

# Write to txt
with open(out_path, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print(f"Wrote summary to: {out_path}")
