In [7]:
# Check overlap between train and test

# Get unique users from each set
train_users = set(train_df[user_col].unique())
test_users = set(test_df[user_col].unique())

print(f"amount of unique users in training set: {len(train_users)}")
print(f"amount unique users in test set: {len(test_users)}")

# Find users in test that aren't in train
users_not_in_train = test_users - train_users
print(f"Users in test but not in train: {len(users_not_in_train)}")

# Remove those users from test set
if len(users_not_in_train) > 0:
    test_df = test_df[~test_df[user_col].isin(users_not_in_train)]
    print(f"Test set size after removal: {len(test_df)}")
    print(f"Unique users in test after removal: {len(test_df[user_col].unique())}")
else:
    print("All test users appear in training set")

amount of unique users in training set: 1389
amount unique users in test set: 1389
Users in test but not in train: 0
All test users appear in training set


In [6]:
#clean dataset

# Defining column names
user_col = 'user_id'
item_col = 'item_id'
rating_col = 'rating'

# Check for missing values
print("\nMissing values in training set:")
print(train_df.isnull().sum())

# Drop rows with missing user_id or item_id
train_df = train_df.dropna(subset=[user_col, item_col])
test_df = test_df.dropna(subset=[user_col, item_col])

print(f"\nAfter dropping missing values:")
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

# Check for duplicates
duplicates = train_df.duplicated(subset=[user_col, item_col], keep=False)
print(f"\nNumber of duplicate user-item pairs: {duplicates.sum()}")

# Show duplicates
if duplicates.sum() > 0:
    print("\nExample of duplicates:")
    print(train_df[duplicates].head(10))

# Remove duplicates
train_df = train_df.drop_duplicates(subset=[user_col, item_col], keep='first')
test_df = test_df.drop_duplicates(subset=[user_col, item_col], keep='first')

print(f"\nAfter removing duplicates:")
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")


Missing values in training set:
item_id    0
user_id    0
rating     0
dtype: int64

After dropping missing values:
Training set shape: (30000, 3)
Test set shape: (6645, 3)

Number of duplicate user-item pairs: 6608

Example of duplicates:
       item_id                       user_id  rating
16  B07Y59H3K2  AGMWACNMAG74AXBF7IJ22IOZSZPA     2.0
30  B0036F0V4G  AHR7SVRVXDOSN43C6VDUUX64IDAA     4.0
31  B00DBDPOZ4  AHNJQNJSASC2EPCN6TQZJCO4KYVQ     5.0
36  B007X5103Q  AFPZBAMPLF7ZQH3ZRCR7ONBUDJQQ     5.0
55  B00DJRLDMU  AFIJTAKKNPWTFQQOML32MHXGCL5A     5.0
58  B003ZUXQD0  AHYS63TR4TOZXK6QNCCWIO7MVSCQ     5.0
63  B001EYUNVC  AEXKHRMC56RCLI6WHL57UOQ6NOSA     3.0
64  B004R1QUIO  AGYR6AMANPWMZJRHMLQLKNXPJTLA     2.0
77  B00M3D8EFU  AHJYTEUCAGPLF4ESBSA64AZARH3A     5.0
94  B00C1TTF86  AFAEQJ6Q3ILNRSVDO7XOWXYZBN4Q     2.0

After removing duplicates:
Training set shape: (26580, 3)
Test set shape: (6645, 3)


In [3]:
#load dataset
import pandas as pd

train_df = pd.read_parquet('data/train_video_games.parquet')
test_df = pd.read_parquet('data/test_video_games.parquet')

# Quick check that initial dataset was loaded
print(train_df.head())
print(train_df.info())

      item_id                       user_id  rating
0  B00ZM5OXD8  AHOGCWGRSFQ6YZH6QLYUMNQ4N3KA     5.0
1  B00O3JSRHW  AEKOQRDUY64SGH4PONBFTSIM2I2Q     4.0
2  B0056WJA30  AFA43JCV3C72LHM5BIVPV7UEJ2CA     5.0
3  B002I0J4NE  AFD3QCEDODUYBYMQGA6NWILJW7KA     5.0
4  B0031SWWPO  AGYF4ZSMTSZHCE3OH6CO5SJK5Y3A     5.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   item_id  30000 non-null  object 
 1   user_id  30000 non-null  object 
 2   rating   30000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 703.2+ KB
None
