In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load Dataset

### **Disclaimer**

Because `final_dataset.csv` is larger than 100 MB, we were unable to commit it to GitHub.

As such, please download the dataset from [Google Drive](https://drive.google.com/file/d/1icZ2Yq1FZkN6FeYhYLmu9Bi7Fm0SiJgk/view?usp=share_link).

Please place the downloaded file in the `final_data` folder, so that it is available at `final_data/final_dataset.csv`.

In [51]:
# Read the complete dataset from the local `final_data` folder
# (the CSV must be downloaded manually beforehand).
df = pd.read_csv(
    filepath_or_buffer="final_data/final_dataset.csv",
)

# Dataset Splitting

To minimise data leakage and keep the class distribution consistent across the splits, we performed the train–validation–test split as follows:
- Split ratio: 70% train, 10% validation, 20% test.

- Stratification: The split is stratified on each user's majority label, so that the distribution of the 5 classes is approximately preserved across all three sets.

- User grouping constraint: Reviews from the same user were grouped together to ensure that a user only appears in one set (train, validation, or test), preventing data leakage.

In [53]:
# Drop rows with missing user IDs: a review without a user cannot be assigned to a group
df = df.dropna(subset=["user_id"]).reset_index(drop=True)


def _majority_label(labels):
    """Return the most frequent non-null label in `labels`.

    `Series.mode()` ignores nulls by default and returns the modes in sorted
    order, so ties resolve to the smallest label.

    Raises:
        ValueError: if `labels` holds no non-null value. Indexing the empty
            `mode()` result directly would otherwise fail with an opaque
            lookup error.
    """
    modes = labels.mode()
    if modes.empty:
        raise ValueError(
            "Cannot stratify: at least one user has no non-null `label` value."
        )
    return modes.iloc[0]


# --- Grouping by user ---
# Each user is one group, represented by the majority label of their reviews.
# The stratification below therefore balances users' majority labels,
# not the labels of individual reviews.
user_labels = df.groupby("user_id")["label"].agg(_majority_label)
users = pd.DataFrame({"user_id": user_labels.index, "label": user_labels.values})

# --- Train/val/test split at user level ---
# First carve off 30% of the users, then divide that 30% into val and test.
train_users, temp_users = train_test_split(
    users,
    test_size=0.30,
    stratify=users["label"],
    random_state=42
)

val_users, test_users = train_test_split(
    temp_users,
    test_size=2/3,  # 2/3 of 30% = 20%
    stratify=temp_users["label"],
    random_state=42
)

# --- Map back to reviews ---
# Every review follows its user into that user's split.
train_df = df[df["user_id"].isin(train_users["user_id"])]
val_df   = df[df["user_id"].isin(val_users["user_id"])]
test_df  = df[df["user_id"].isin(test_users["user_id"])]

# --- Sanity checks ---
# No user may appear in more than one split (this is the leakage guard) ...
train_ids = set(train_users["user_id"])
val_ids = set(val_users["user_id"])
test_ids = set(test_users["user_id"])
assert train_ids.isdisjoint(val_ids), "user leakage between train and val"
assert train_ids.isdisjoint(test_ids), "user leakage between train and test"
assert val_ids.isdisjoint(test_ids), "user leakage between val and test"
# ... and every review must land in exactly one split.
assert len(train_df) + len(val_df) + len(test_df) == len(df), "reviews lost or duplicated by the split"

print("Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))

Train: 13569 Val: 1989 Test: 3858


# Save Dataset

In [54]:
# Write each split to its own CSV in `final_data`, without the index column.
# The `utf-8-sig` codec prepends a UTF-8 BOM to each file.
split_outputs = {
    "final_data/train_dataset.csv": train_df,
    "final_data/val_dataset.csv": val_df,
    "final_data/test_dataset.csv": test_df,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False, encoding='utf-8-sig')