# 1a. Data Optimization: CSV to Parquet

This notebook converts the large `master_dataframe.csv` file to the Parquet format. The CSV is read chunk by chunk, and each chunk is written directly with the `pyarrow` library, so the full dataset never has to fit in memory and potential versioning conflicts with pandas' `append` functionality are avoided.

In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
import os

# Convert the master CSV to a single Parquet file, one chunk at a time, so the
# whole dataset never has to be held in memory at once.

# Define file paths
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable data directory so the notebook runs elsewhere.
csv_path = '/Users/mariostam/Documents/Projects/thesis/llm-powered-feature-engineering/data/master_dataframe.csv'
parquet_path = '/Users/mariostam/Documents/Projects/thesis/llm-powered-feature-engineering/data/master_dataframe.parquet'
chunk_size = 100000  # 100,000 rows per chunk

# Create a reader object that yields DataFrames of up to `chunk_size` rows
reader = pd.read_csv(csv_path, chunksize=chunk_size)

# 1. Get the first chunk to infer the schema
print("Inferring schema from the first chunk...")
first_chunk = next(reader, None)
if first_chunk is None:
    # A header-only CSV yields no chunks; fail with a clear message instead of
    # a bare StopIteration.
    raise ValueError(f"No data rows found in {csv_path}; nothing to convert.")
schema = pa.Table.from_pandas(first_chunk).schema

# 2. Count lines for the progress bar. The file is opened in a `with` block so
#    the handle is always released; undecodable bytes are replaced because only
#    the number of lines matters here, not their content.
#    This is an estimate of the row count: a quoted field containing a newline
#    spans several physical lines but is a single CSV row.
with open(csv_path, 'r', encoding='utf-8', errors='replace') as csv_file:
    total_lines = sum(1 for _ in csv_file) - 1  # Subtract header

print(f"Converting {total_lines:,} rows from CSV to Parquet...")

# Number of chunks still to come after the first one. Ceiling division counts
# the final partial chunk; max() keeps the total non-negative for small files.
total_chunks = -(-total_lines // chunk_size)
remaining_chunks = max(total_chunks - 1, 0)

# 3. Write every chunk with the schema inferred from the first chunk.
#    The `with` block closes the writer (finalizing the Parquet file) even if
#    a chunk fails to convert.
#    NOTE(review): all chunks are coerced to the first chunk's schema;
#    presumably a later chunk whose values do not fit those types will raise
#    here — verify against the data, or pass explicit dtypes to read_csv.
with pq.ParquetWriter(parquet_path, schema) as writer:
    # The first chunk was already pulled from the reader for schema inference.
    table = pa.Table.from_pandas(first_chunk, schema=schema)
    writer.write_table(table)

    # 4. Loop through the rest of the chunks and write them
    for chunk in tqdm(reader, total=remaining_chunks):
        table = pa.Table.from_pandas(chunk, schema=schema)
        writer.write_table(table)

print("\nConversion complete!")
print(f"Parquet file saved to: {parquet_path}")