In [1]:
# Read the cleaned interaction table from disk into a DataFrame
import pandas as pd

df = pd.read_csv(filepath_or_buffer="subset_cleaned.csv")

# Report (rows, columns), then preview the first five records.
# NOTE(review): the preview includes an "Unnamed: 0" column, which looks like
# a row index written out by an earlier export -- confirm whether it is needed.
print(df.shape)
df.head(n=5)


(5217467, 18)


Unnamed: 0,user_id,app_id,is_recommended,hours,helpful,funny,date,title,date_release,rating,positive_ratio,user_reviews,price_final,discount,win,mac,linux,steam_deck
0,22793,534380,1,40.6,0,0,2022-10-08,Dying Light 2 Stay Human,2022-02-03,Mostly Positive,79,113138,30.0,0.0,True,False,False,True
1,912612,438100,1,8.1,0,0,2020-09-10,VRChat,2017-02-01,Mostly Positive,73,202265,0.0,0.0,True,False,False,True
2,461080,730,1,21.4,0,0,2021-11-19,Counter-Strike: Global Offensive,2012-08-21,Very Positive,88,7494460,15.0,0.0,True,True,True,True
3,737481,602960,1,41.9,0,0,2022-01-05,Barotrauma,2023-03-13,Very Positive,93,35639,24.0,0.0,True,True,True,True
4,4470638,105600,1,528.3,2,0,2016-10-09,Terraria,2011-05-16,Overwhelmingly Positive,97,943413,10.0,0.0,True,True,True,True


In [2]:
# Sample 5% of users (seeded, so the subset is reproducible) and save the sampled subset

import numpy as np

# Fraction of distinct users to keep, and the seed that pins which users are drawn.
SAMPLE_FRACTION = 0.05
SAMPLE_SEED = 42

# Get all unique users
unique_users = df.user_id.unique()

# Sample users without replacement from a dedicated, seeded generator.
# An unseeded np.random.choice draws a different user set on every run, so
# subset_sampled.csv and every count derived from it could not be regenerated.
sample_size = int(SAMPLE_FRACTION * len(unique_users))
rng = np.random.default_rng(SAMPLE_SEED)
sampled_users = rng.choice(unique_users, size=sample_size, replace=False)

# Keep all rows associated with the sampled users
# (sampling is per user, so each kept user retains all of their rows)
df_sampled = df[df.user_id.isin(sampled_users)]

# Print dataset size before and after sampling
print("Original dataset shape:", df.shape)
print("Sampled dataset shape:", df_sampled.shape)

# Save to CSV (index=False: do not write the DataFrame index as a column)
df_sampled.to_csv("subset_sampled.csv", index=False)

print("Saved subset_sampled.csv successfully.")


Original dataset shape: (5217467, 18)
Sampled dataset shape: (261195, 18)
Saved subset_sampled.csv successfully.


In [3]:
# Partition the sampled interactions by recommendation label

# Rows where is_recommended equals 1 (positive edges); .copy() detaches the
# result from df_sampled so later edits do not touch the parent frame
pos = df_sampled.loc[df_sampled["is_recommended"] == 1].copy()

# Rows where is_recommended equals 0 (negative edges)
neg = df_sampled.loc[df_sampled["is_recommended"] == 0].copy()

# Row counts of each partition
for caption, subset in (("Positive samples:", pos), ("Negative samples:", neg)):
    print(caption, len(subset))

# Full (rows, columns) shape of each partition
for caption, subset in (("pos shape:", pos), ("neg shape:", neg)):
    print(caption, subset.shape)


Positive samples: 222249
Negative samples: 38946
pos shape: (222249, 18)
neg shape: (38946, 18)


In [4]:
# 80/20 train-test split, done independently for the positive and the
# negative edges so each class is divided with the same ratio

from sklearn.model_selection import train_test_split

# Shared split settings: 20% held out, fixed seed so the split is repeatable
split_opts = dict(test_size=0.2, random_state=42)

# Positive edges -> (train, test)
pos_train, pos_test = train_test_split(pos, **split_opts)

# Negative edges -> (train, test)
neg_train, neg_test = train_test_split(neg, **split_opts)

# Row counts, to confirm the 80/20 ratio held for both classes
for caption, subset in (
    ("Positive train:", pos_train),
    ("Positive test:", pos_test),
    ("Negative train:", neg_train),
    ("Negative test:", neg_test),
):
    print(caption, len(subset))

# Full shapes of the four partitions
for caption, subset in (
    ("pos_train shape:", pos_train),
    ("pos_test shape:", pos_test),
    ("neg_train shape:", neg_train),
    ("neg_test shape:", neg_test),
):
    print(caption, subset.shape)

Positive train: 177799
Positive test: 44450
Negative train: 31156
Negative test: 7790
pos_train shape: (177799, 18)
pos_test shape: (44450, 18)
neg_train shape: (31156, 18)
neg_test shape: (7790, 18)


In [5]:
# Merge the per-class splits into the final train and test edge tables
# and write them to CSV so the whole team works from the same files

# Training edges: positives stacked on top of negatives, with a fresh 0..n-1 index
train = pd.concat(objs=[pos_train, neg_train], axis=0, ignore_index=True)

# Test edges: same construction from the held-out partitions
test = pd.concat(objs=[pos_test, neg_test], axis=0, ignore_index=True)

# Shapes of the combined tables, for verification
for caption, subset in (
    ("Final train set shape:", train),
    ("Final test set shape:", test),
):
    print(caption, subset.shape)

# Write both tables to disk without the index column
for subset, path in (
    (train, "train_edges_sampled.csv"),
    (test, "test_edges_sampled.csv"),
):
    subset.to_csv(path, index=False)


Final train set shape: (208955, 18)
Final test set shape: (52240, 18)
