In [None]:
import pandas as pd

# Load the data
train_df = pd.read_parquet('data/train_video_games.parquet')

# See all column names
print("COLUMN NAMES:")
print(train_df.columns.tolist())

# See first few rows to understand the data
print("\nFIRST FEW ROWS:")
print(train_df.head())

# See data types
print("\nDATA TYPES:")
print(train_df.dtypes)

In [4]:
#clean the dataset

print("\n" + "="*50)
print("STEP 2: Cleaning the dataset")
print("="*50)

# Check missing values
print("\nMissing values in training set:")
print(train_df.isnull().sum())

# Drop rows with missing user_id or item_id 
# Based on output use the actual column names

print("\nActual column names:", train_df.columns.tolist())

# I'll assume typical names - adjust these based on what you see above
user_col = 'user_id' if 'user_id' in train_df.columns else 'user'
item_col = 'item_id' if 'item_id' in train_df.columns else 'item'

train_df = train_df.dropna(subset=[user_col, item_col])
test_df = test_df.dropna(subset=[user_col, item_col])

print(f"\nAfter dropping missing values:")
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

# Check for duplicates (same user rating same item multiple times)
duplicates = train_df.duplicated(subset=[user_col, item_col], keep=False)
print(f"\nNumber of duplicate user-item pairs: {duplicates.sum()}")

# Remove duplicates, keeping the first occurrence
train_df = train_df.drop_duplicates(subset=[user_col, item_col], keep='first')
test_df = test_df.drop_duplicates(subset=[user_col, item_col], keep='first')

print(f"After removing duplicates:")
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")


STEP 2: Cleaning the dataset

Missing values in training set:
item_id    0
user_id    0
rating     0
dtype: int64

Actual column names: ['item_id', 'user_id', 'rating']

After dropping missing values:
Training set shape: (30000, 3)
Test set shape: (7500, 3)

Number of duplicate user-item pairs: 6608
After removing duplicates:
Training set shape: (26580, 3)
Test set shape: (6645, 3)


In [3]:
#load dataset
import pandas as pd

train_df = pd.read_parquet('data/train_video_games.parquet')
test_df = pd.read_parquet('data/test_video_games.parquet')

# Quick check that initial dataset was loaded
print(train_df.head())
print(train_df.info())

      item_id                       user_id  rating
0  B00ZM5OXD8  AHOGCWGRSFQ6YZH6QLYUMNQ4N3KA     5.0
1  B00O3JSRHW  AEKOQRDUY64SGH4PONBFTSIM2I2Q     4.0
2  B0056WJA30  AFA43JCV3C72LHM5BIVPV7UEJ2CA     5.0
3  B002I0J4NE  AFD3QCEDODUYBYMQGA6NWILJW7KA     5.0
4  B0031SWWPO  AGYF4ZSMTSZHCE3OH6CO5SJK5Y3A     5.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   item_id  30000 non-null  object 
 1   user_id  30000 non-null  object 
 2   rating   30000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 703.2+ KB
None
