# Merge Parquet Files

Concatenates the three class Parquet files (Class 1 Human, Class 2 Standard AI, Class 3 Imposter AI) into a single `data.parquet` in the top-level `data/` folder. The `origin_ref` column, if present, is dropped from the merged output.

In [None]:
import os
from pathlib import Path
import pandas as pd

# Base directories to search, nearest first: the working directory,
# then its parent, then its grandparent.
_cwd = Path.cwd()
candidates = [_cwd, _cwd.parent, _cwd.parent.parent]

def find_file(rel_path, bases=None):
    """Locate ``rel_path`` under the first base directory that contains it.

    Parameters
    ----------
    rel_path : str or os.PathLike
        Path to look for, relative to each base directory.
    bases : iterable of str or os.PathLike, optional
        Directories to search, in priority order. When omitted, the
        module-level ``candidates`` list is used.

    Returns
    -------
    pathlib.Path or None
        The resolved path of the first match, or ``None`` when ``rel_path``
        does not exist under any base directory.
    """
    search_dirs = candidates if bases is None else bases
    for base in search_dirs:
        # Path(base) lets callers pass plain strings as well as Path objects.
        p = Path(base).joinpath(rel_path)
        if p.exists():
            return p.resolve()
    return None

def read_parquet_safely(rel_path, label):
    """Read a parquet file located via ``find_file`` without ever raising.

    Parameters
    ----------
    rel_path : str or os.PathLike
        Relative path handed to ``find_file``.
    label : str
        Human-readable name used in the progress messages.

    Returns
    -------
    tuple
        ``(DataFrame, path)``. The frame is empty when the file was not
        found or could not be read; ``path`` is ``None`` only when the file
        was not found.
    """
    resolved = find_file(rel_path)
    if resolved is None:
        print(f"{label} not found at expected locations: {rel_path}")
        return pd.DataFrame(), None

    # Start from the failure result and replace it only after a clean read.
    frame = pd.DataFrame()
    try:
        loaded = pd.read_parquet(resolved)
        print(f"Read {len(loaded)} rows from {label} ({resolved})")
        frame = loaded
    except Exception as err:
        print(f"Failed to read {label} at {resolved}: {err}")
    return frame, resolved

In [2]:
# Load the Class 1 (human) samples; the file name doubles as the log label.
human_parts = ('data_human', 'processed', 'human_class1.parquet')
human_rel = os.path.join(*human_parts)
df_human, human_path = read_parquet_safely(rel_path=human_rel, label=human_parts[-1])

✅ Read 500 rows from human_class1.parquet (W:\Programming\PKOG\preprecog\data\data_human\processed\human_class1.parquet)


In [3]:
# Load the Class 2 (standard AI) samples; the file name doubles as the log label.
ai2_parts = ('data_ai', 'processed', 'ai_class2.parquet')
ai2_rel = os.path.join(*ai2_parts)
df_ai2, ai2_path = read_parquet_safely(rel_path=ai2_rel, label=ai2_parts[-1])

✅ Read 504 rows from ai_class2.parquet (W:\Programming\PKOG\preprecog\data\data_ai\processed\ai_class2.parquet)


In [4]:
# Load the Class 3 (imposter AI) samples; the file name doubles as the log label.
ai3_parts = ('data_ai', 'processed', 'ai_class3.parquet')
ai3_rel = os.path.join(*ai3_parts)
df_ai3, ai3_path = read_parquet_safely(rel_path=ai3_rel, label=ai3_parts[-1])

✅ Read 504 rows from ai_class3.parquet (W:\Programming\PKOG\preprecog\data\data_ai\processed\ai_class3.parquet)


In [None]:
# The inputs live at <root>/<source>/processed/<file>, so three levels up from
# any located file is the shared root folder. Fall back to the working
# directory when none of the inputs was located.
source_paths = [found for found in (human_path, ai2_path, ai3_path) if found is not None]
data_root = source_paths[0].parents[2] if source_paths else Path.cwd()
out_path = data_root / 'data.parquet'

# One (label, frame) entry per class, in the order they are concatenated.
class_frames = (
    ('human_class1', df_human),
    ('ai_class2', df_ai2),
    ('ai_class3', df_ai3),
)

# Only non-empty frames take part in the merge.
frames = [frame for _, frame in class_frames if not frame.empty]

if frames:
    df_combined = pd.concat(frames, ignore_index=True)
    if 'origin_ref' in df_combined.columns:
        df_combined = df_combined.drop(columns=['origin_ref'])
        print("Dropped 'origin_ref' column from merged data")
    df_combined.to_parquet(out_path)
    print(f"Wrote {len(df_combined)} total rows to {out_path}")
    # Per-class row counts, including classes that contributed nothing.
    for class_name, frame in class_frames:
        print(f" - {class_name}: {len(frame)}")
else:
    print("No dataframes read. Nothing to write.")

ℹ️ Dropped 'origin_ref' column from merged data
✅ Wrote 1508 total rows to W:\Programming\PKOG\preprecog\data\data.parquet
Breakdown:
 - human_class1.parquet: 500
 - ai_class2.parquet: 504
 - ai_class3.parquet: 504
