In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load Dataset

In [51]:
# Load final_data/final_dataset.csv into a DataFrame named `df`.
# NOTE(review): the preview rendered below lists an "Unnamed: 0" column, which
# looks like a row index that was saved into the CSV — confirm whether it
# should be carried through to later steps.
df = pd.read_csv(filepath_or_buffer="final_data/final_dataset.csv")

In [52]:
# Preview the loaded frame: as the last expression of the cell, `df` is
# rendered by the notebook (the output below shows the first and last rows).
df

Unnamed: 0,review_id,user_id,user_name,time,rating,review,gmap_id,business_name,latitude,longitude,...,review_length_words,review_embedding,business_desc_embedding,name_validity,review_similarity,category_similarity,avg_review_rating_by_user,std_review_rating_by_user,posts_per_year,count_significantly_different_ratings
0,1,1.100000e+20,Michelle Banks,2018-03-02 14:13:20,5.0,It's a beautiful place to read books and have ...,0x80c8bf81f68a634f:0xe605b4c3043783c9,Barnes & Noble,36.157754,-115.289418,...,17,[ 1.90505534e-02 -7.93722086e-03 -2.14198250e-...,[-1.83555495e-03 9.61866044e-03 2.88142487e-...,True,0.0,0.0,4.320354,1.188759,43.461538,136.0
1,2,1.060000e+20,Steven DeRyck [Staff],2018-10-20 01:46:40,4.0,"As previous reviews have stated, two small pie...",0x80c8c415f0a42c77:0x55c554fdc4ad8b9c,Carnegie Deli,36.120556,-115.173611,...,39,[-3.98575701e-02 4.56756018e-02 3.53269391e-...,[-4.08036783e-02 2.61357054e-02 3.17493156e-...,True,0.0,0.0,4.155598,1.368606,37.642857,147.0
2,3,1.100000e+20,Stevey Markovich,2020-09-13 12:26:40,5.0,Absolutely love this office! Afton is truly am...,0x80c8ce0f7732ee7b:0xea13348742f64327,Center for Cosmetic and Family Dentistry,36.001929,-115.107484,...,16,[-4.05264683e-02 -3.00896112e-02 -9.05165076e-...,[-8.50196630e-02 2.05087401e-02 -4.43694256e-...,True,0.0,0.0,4.320354,1.188759,43.461538,136.0
3,4,1.020000e+20,William Campbell,2018-10-20 01:46:40,3.0,The food is as good as it usually is,0x80c8dc9da25847c7:0x27b862b824ac757c,Asian Garden,36.168901,-115.060601,...,9,[ 1.34939998e-02 -1.85529497e-02 3.62251103e-...,[-2.38598082e-02 5.62610254e-02 4.09539938e-...,True,0.0,0.0,4.167969,1.325318,46.545455,145.0
4,5,1.120000e+20,Beverly Thorman,2018-03-02 14:13:20,5.0,We came in without an appointment on a Saturda...,0x80c8c03de37488fd:0xdc3302fd9f8f44a,Great Clips,36.191055,-115.258969,...,48,[-2.75374460e-03 1.00887353e-02 4.39300016e-...,[ 2.24305522e-02 -1.58977900e-02 1.25814928e-...,True,0.0,0.0,4.213084,1.262917,44.583333,147.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19485,19486,1.171940e+20,Rowan White,2025-06-08 13:40:00,3.0,"This spot is secretly a spaceship, probably.",0x80c8d0f443e4993d:0x21970488a62b5304,Shoe Palace,36.067121,-115.042427,...,7,[ 4.53867577e-02 -1.71024147e-02 5.96188568e-...,[-4.38414179e-02 1.12900045e-02 -2.52069859e-...,True,,,3.000000,,1.000000,1.0
19486,19487,1.397370e+20,Dakota Scott,2017-06-12 15:06:40,4.0,"Feels like my old school hallways, but this is...",0x80999aef1429f34f:0x84a9d151a2d7aead,Safeway,38.986503,-119.943734,...,12,[ 9.99895036e-02 8.81674979e-03 4.94781919e-...,[-2.60437261e-02 1.79667026e-02 -6.35378994e-...,False,,,4.000000,,1.000000,0.0
19487,19488,1.784250e+20,Casey Scott,2020-01-12 00:46:40,5.0,Sheesh this location exists ig.,0x8099410c27001a2f:0xecb44ab56ff35042,Casa Grande 2,39.484121,-119.852747,...,5,[-8.31940584e-03 2.37789107e-04 7.20330253e-...,[-1.26966881e-02 2.79331654e-02 -2.49020793e-...,True,,,5.000000,,1.000000,0.0
19488,19489,1.487900e+20,Jamie King,2023-03-12 19:40:00,3.0,Never gonna give you up—this place is on the m...,0x80c8ce478dc00207:0x99bc66a8ab3ad89,Chuck E. Cheese,36.021982,-115.116911,...,13,[-2.26007961e-02 -4.82657291e-02 4.31942381e-...,[-5.08386381e-02 -1.64995044e-02 -2.41248067e-...,True,,,3.000000,,1.000000,1.0


In [53]:
# Build train/val/test splits at the *user* level so that all reviews written
# by one user end up in exactly one split, stratified on each user's majority
# label.
# NOTE(review): the frame preview shows user_id as a float (e.g. 1.1e+20 shared
# by differently named reviewers), so distinct accounts may already be merged
# into one group upstream — confirm before relying on user-level grouping.

SPLIT_SEED = 42                # fixed seed so the split is reproducible
HOLDOUT_FRACTION = 0.30        # share of users held out of training (val + test)
TEST_SHARE_OF_HOLDOUT = 2 / 3  # 2/3 of 30% = 20% test, leaving 10% for val


def _majority_label(labels):
    """Return the most frequent non-null value in ``labels``.

    Parameters
    ----------
    labels : pd.Series
        The ``label`` values of all reviews belonging to one user.

    Returns
    -------
    The first entry of ``labels.mode()``; when several labels are tied this is
    the first of the tied modes in the order pandas returns them.

    Raises
    ------
    ValueError
        If the user has no non-null label, in which case no majority class
        exists and the user cannot be stratified.
    """
    modes = labels.mode()
    if modes.empty:
        raise ValueError(
            "Cannot assign a majority label: a user has no non-null 'label' values"
        )
    return modes.iloc[0]


# Drop rows with missing IDs (they cannot be assigned to a user group)
df = df.dropna(subset=["user_id"]).reset_index(drop=True)

# --- Grouping by user ---
# Each user is one group, labelled with the majority class of their reviews.
user_labels = df.groupby("user_id")["label"].agg(_majority_label)
users = pd.DataFrame({"user_id": user_labels.index, "label": user_labels.values})

# --- Train/val/test split at user level ---
# First carve off the hold-out users, then divide the hold-out into val/test.
train_users, temp_users = train_test_split(
    users,
    test_size=HOLDOUT_FRACTION,
    stratify=users["label"],
    random_state=SPLIT_SEED
)

val_users, test_users = train_test_split(
    temp_users,
    test_size=TEST_SHARE_OF_HOLDOUT,
    stratify=temp_users["label"],
    random_state=SPLIT_SEED
)

# --- Map back to reviews ---
train_df = df[df["user_id"].isin(train_users["user_id"])]
val_df   = df[df["user_id"].isin(val_users["user_id"])]
test_df  = df[df["user_id"].isin(test_users["user_id"])]

# --- Sanity checks: no user leakage, and every review is assigned once ---
train_ids, val_ids, test_ids = (
    set(part["user_id"]) for part in (train_users, val_users, test_users)
)
assert (
    train_ids.isdisjoint(val_ids)
    and train_ids.isdisjoint(test_ids)
    and val_ids.isdisjoint(test_ids)
), "a user_id appears in more than one split"
assert len(train_df) + len(val_df) + len(test_df) == len(df), (
    "split sizes do not add up to the number of reviews with a user_id"
)

print("Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))

Train: 13569 Val: 1989 Test: 3858


In [54]:
# Persist each split to its own CSV file. The row index is omitted
# (index=False) and the files are written with the 'utf-8-sig' encoding.
split_outputs = {
    "final_data/train_dataset.csv": train_df,
    "final_data/val_dataset.csv": val_df,
    "final_data/test_dataset.csv": test_df,
}
for out_path, split_frame in split_outputs.items():
    split_frame.to_csv(out_path, index=False, encoding='utf-8-sig')