In [2]:
import json
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

RANDOM_STATE = 2026

## Load raw data

In [3]:
# Read the raw CSV. The file has no header row, so column names are supplied
# here: the first column becomes "output" (the label) and the second "input"
# (the text). The file is decoded as cp1252.
df = pd.read_csv(
    "../data/all-data.csv",
    header=None,
    names=["output", "input"],
    encoding="cp1252",
)

In [4]:
# -----------------------------
# Drop missing values
# -----------------------------
# This must happen BEFORE casting to str: astype(str) turns NaN into the
# literal string "nan", after which dropna() can no longer detect the row and
# it would be exported as a real training example.
before_dropna = len(df)
df = df.dropna(subset=["input", "output"])

# -----------------------------
# Basic cleaning
# -----------------------------
# Strip whitespace and lower-case the labels. assign() returns a new frame, so
# nothing is written into a possibly-filtered view of the original.
df = df.assign(
    input=df["input"].astype(str).str.strip(),
    output=df["output"].astype(str).str.strip().str.lower(),  # normalize labels
)

# Values that are empty once stripped carry no information either, so they are
# treated as missing and counted together with the NaN rows below.
df = df[df[["input", "output"]].ne("").all(axis=1)]
after_dropna = len(df)

# Add instruction column (constant prompt shared by every record)
df = df.assign(instruction="Classify the sentiment of the following news.")

print(f"Rows before dropna: {before_dropna}")
print(f"Rows after dropna:  {after_dropna}")
print(f"Rows dropped by dropna: {before_dropna - after_dropna}")

# -----------------------------
# Drop duplicates
# -----------------------------
# Only exact (input, output) repeats are removed; the same text carrying two
# different labels is kept as two rows.
before_dedup = len(df)
df = df.drop_duplicates(subset=["input", "output"])
after_dedup = len(df)

print(f"Rows before drop_duplicates: {before_dedup}")
print(f"Rows after drop_duplicates:  {after_dedup}")
print(f"Rows dropped by drop_duplicates: {before_dedup - after_dedup}")

# -----------------------------
# Final sanity check
# -----------------------------
print("\nLabel distribution:")
print(df["output"].value_counts())


Rows before dropna: 4846
Rows after dropna:  4846
Rows dropped by dropna: 0
Rows before drop_duplicates: 4846
Rows after drop_duplicates:  4840
Rows dropped by drop_duplicates: 6

Label distribution:
output
neutral     2873
positive    1363
negative     604
Name: count, dtype: int64


## Stratified Split (80 / 10 / 10)

In [5]:
# Stage 1: hold out 20% of the cleaned frame as a temporary pool, keeping the
# label proportions of df["output"] in both parts. The remaining 80% is train.
train_df, temp_df = train_test_split(
    df, stratify=df["output"], test_size=0.2, random_state=RANDOM_STATE
)

# Stage 2: cut the temporary pool in half, again stratified on the label,
# giving validation and test sets of 10% each.
val_df, test_df = train_test_split(
    temp_df, stratify=temp_df["output"], test_size=0.5, random_state=RANDOM_STATE
)

## Verify class balance after split

In [6]:
# Print the relative label frequencies of each split so they can be compared
# by eye; the three tables should be nearly identical after a stratified split.
for name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    label_share = split_df["output"].value_counts(normalize=True)
    print(f"\n{name.upper()} distribution:")
    print(label_share)


TRAIN distribution:
output
neutral     0.593492
positive    0.281767
negative    0.124742
Name: proportion, dtype: float64

VAL distribution:
output
neutral     0.592975
positive    0.280992
negative    0.126033
Name: proportion, dtype: float64

TEST distribution:
output
neutral     0.595041
positive    0.280992
negative    0.123967
Name: proportion, dtype: float64


## Reduce dataset size (50%) for faster experiments

In [7]:
# Write a reduced copy of every split to disk as JSON records
# (instruction / input / output) for faster debugging experiments.
SAMPLE_FRAC = 0.5  # fraction of each split that is kept
OUTPUT_DIR = Path("../data/processed")

# to_json does not create missing directories, so make sure the target exists
# before the first write (a fresh checkout may not have it).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

for split, split_df in {
    "train": train_df,
    "val": val_df,
    "test": test_df
}.items():

    # Subsample with stratification on the label. A plain DataFrame.sample
    # would let the class proportions drift, which is most visible for the
    # smallest class in the small val/test splits. Only the first part of the
    # split is kept; the second part is discarded.
    # Note: stratifying requires at least two rows per label in split_df.
    reduced_df, _ = train_test_split(
        split_df,
        train_size=SAMPLE_FRAC,
        random_state=RANDOM_STATE,
        stratify=split_df["output"],
    )

    # Save as JSON for LLM training format
    reduced_df[["instruction", "input", "output"]].to_json(
        OUTPUT_DIR / f"{split}.json",
        orient="records",
        indent=2
    )