In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazon/test_amazon_ratings.csv
/kaggle/input/amazon/train_amazon_ratings.csv


In [2]:
# Install
!pip install scikit-surprise pyspark --quiet

**Look at the dataset**

In [4]:
# 📂 Read the train and test rating tables
train_df = pd.read_csv("/kaggle/input/amazon/train_amazon_ratings.csv")
test_df = pd.read_csv("/kaggle/input/amazon/test_amazon_ratings.csv")

# Distinct user / item IDs seen in each split
train_users, train_items = set(train_df["UserID"]), set(train_df["ItemID"])
test_users, test_items = set(test_df["UserID"]), set(test_df["ItemID"])

# IDs present in both splits vs. IDs that only occur in test (cold-start cases)
common_users = train_users.intersection(test_users)
common_items = train_items.intersection(test_items)
new_users = test_users.difference(train_users)
new_items = test_items.difference(train_items)

# Summary statistics: one group of counts for users, one for items
separator = "—" * 40
count_groups = [
    [
        ("Total train users:", train_users),
        ("Total test users:", test_users),
        ("Overlapping users:", common_users),
        ("New users (cold-start):", new_users),
    ],
    [
        ("Total train items:", train_items),
        ("Total test items:", test_items),
        ("Overlapping items:", common_items),
        ("New items (cold-start):", new_items),
    ],
]
for group in count_groups:
    for label, ids in group:
        print(label, len(ids))
    print(separator)

# Share of test IDs that also appear in train; the complement of the user rate
# is reported as the estimated share of user-based fallback predictions.
user_overlap_rate = len(common_users) / len(test_users)
print(f"Test user overlap rate: {user_overlap_rate:.2%}")
item_overlap_rate = len(common_items) / len(test_items)
print(f"Test item overlap rate: {item_overlap_rate:.2%}")
print(f"Estimated fallback usage (user-based): {(1 - user_overlap_rate) * 100:.2f}%")

Total train users: 9300
Total test users: 8144
Overlapping users: 8140
New users (cold-start): 4
————————————————————————————————————————
Total train items: 2970
Total test items: 2960
Overlapping items: 2960
New items (cold-start): 0
————————————————————————————————————————
Test user overlap rate: 99.95%
Test item overlap rate: 100.00%
Estimated fallback usage (user-based): 0.05%


***Baseline + GridSearch — best performance so far: 0.882***

In [7]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, BaselineOnly
from surprise.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load data
train_df = pd.read_csv("/kaggle/input/amazon/train_amazon_ratings.csv")
test_df = pd.read_csv("/kaggle/input/amazon/test_amazon_ratings.csv")
# Ratings are declared to Surprise as lying on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_df[["UserID", "ItemID", "Rating"]], reader)

# GridSearchCV for BaselineOnly
# 3 x 3 grid over the ALS regularisation strengths for item and user biases.
param_grid = {
    "bsl_options": {
        "method": ["als"],
        "reg_i": [5, 10, 15],
        "reg_u": [5, 10, 15]
    }
}
# Seed the fold shuffling explicitly: with a bare `cv=5` the folds are shuffled
# with an unseeded RNG, so the reported CV RMSE (and possibly the selected
# parameters) would change between runs. 42 matches the seed of the manual split below.
cv_folds = KFold(n_splits=5, random_state=42, shuffle=True)
gs = GridSearchCV(BaselineOnly, param_grid, measures=["rmse"], cv=cv_folds, n_jobs=-1)
gs.fit(data)
best_params = gs.best_params['rmse']
print("Best BaselineOnly CV RMSE:", gs.best_score['rmse'])
print("Best Params:", best_params)

# Manual Validation Split
# Hold out 20% of the training rows and fit the tuned model on the remaining 80%.
train_split, val_split = train_test_split(train_df, test_size=0.2, random_state=42)
trainset = Dataset.load_from_df(train_split[["UserID", "ItemID", "Rating"]], reader).build_full_trainset()
model = BaselineOnly(**best_params)
model.fit(trainset)

# Fallback Dictionaries
# Per-user / per-item mean rating and rating count, computed on the 80% split only,
# plus the sets of IDs for which such statistics exist. `fallback` reads these
# module-level names at call time.
global_mean = train_split["Rating"].mean()
user_avg = train_split.groupby("UserID")["Rating"].mean().to_dict()
item_avg = train_split.groupby("ItemID")["Rating"].mean().to_dict()
user_count = train_split["UserID"].value_counts().to_dict()
item_count = train_split["ItemID"].value_counts().to_dict()
known_users = set(user_avg)
known_items = set(item_avg)

def fallback(uid, iid, prior_weight=10):
    """Estimate a rating from count-weighted user/item averages.

    Blends whichever of the user's and the item's mean rating are available with
    ``global_mean``. Each available mean is weighted by its rating count and the
    global mean by ``prior_weight`` pseudo-ratings. The statistics are read from the
    module-level ``user_avg``, ``item_avg``, ``user_count``, ``item_count`` and
    ``global_mean`` at call time.

    Args:
        uid: User ID, looked up in ``user_avg`` / ``user_count``.
        iid: Item ID, looked up in ``item_avg`` / ``item_count``.
        prior_weight: Weight given to ``global_mean`` in the blend (default 10,
            the value previously hard-coded).

    Returns:
        The blended estimate, or ``global_mean`` itself when neither the user nor
        the item has an entry in the average dictionaries.
    """
    u_avg = user_avg.get(uid)
    i_avg = item_avg.get(iid)
    u_c = user_count.get(uid, 0)
    i_c = item_count.get(iid, 0)

    # Test for presence with `is not None` rather than truthiness: a stored mean
    # of 0.0 is a valid value and must not be mistaken for "unknown".
    has_user = u_avg is not None
    has_item = i_avg is not None

    if has_user and has_item:
        total = u_c + i_c + prior_weight
        return (u_c / total) * u_avg + (i_c / total) * i_avg + (prior_weight / total) * global_mean
    if has_user:
        total = u_c + prior_weight
        return (u_c / total) * u_avg + (prior_weight / total) * global_mean
    if has_item:
        total = i_c + prior_weight
        return (i_c / total) * i_avg + (prior_weight / total) * global_mean
    return global_mean

# Manual Validation RMSE
# Score every held-out row: use the fitted model when both IDs have statistics in
# the 80% split, otherwise use the averaged fallback and count that it was needed.
preds, truths, fallback_count = [], [], 0
# Iterate over the columns directly rather than with DataFrame.iterrows():
# iterrows() builds one Series per row and coerces all fields to a common dtype,
# so integer IDs can be turned into floats before the lookups below.
for uid, iid, true in zip(val_split["UserID"], val_split["ItemID"], val_split["Rating"]):
    if uid in known_users and iid in known_items:
        pred = model.predict(uid, iid).est
    else:
        pred = fallback(uid, iid)
        fallback_count += 1
    preds.append(pred)
    truths.append(true)
# RMSE = sqrt(MSE). The `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in later releases, so take the
# square root explicitly; this works on every scikit-learn version.
manual_rmse = float(np.sqrt(mean_squared_error(truths, preds)))
print(f"BaselineOnly Manual RMSE: {manual_rmse:.5f} | Fallback: {fallback_count / len(val_split):.2%}")

# Final train on full data
final_model = BaselineOnly(**best_params)
final_model.fit(data.build_full_trainset())

# Rebuild the fallback statistics from the FULL training data.
# Up to this point these names hold statistics of the 80% validation split; the
# final model, however, is fitted on all of train_df. Without this refresh, users
# or items that only occur in the held-out 20% would be sent to the fallback even
# though the final model has seen them, and the fallback itself would average over
# only part of the available ratings. `fallback` reads these module-level names
# at call time, so rebinding them here is enough.
global_mean = train_df["Rating"].mean()
user_avg = train_df.groupby("UserID")["Rating"].mean().to_dict()
item_avg = train_df.groupby("ItemID")["Rating"].mean().to_dict()
user_count = train_df["UserID"].value_counts().to_dict()
item_count = train_df["ItemID"].value_counts().to_dict()
known_users = set(user_avg)
known_items = set(item_avg)

# Generate submission
# Model prediction when both IDs occur in the training data, fallback otherwise.
# Iterating over the two ID columns avoids building a Series per row as
# DataFrame.apply(axis=1) does.
submission = test_df.copy()
submission["Prediction"] = [
    final_model.predict(uid, iid).est
    if uid in known_users and iid in known_items
    else fallback(uid, iid)
    for uid, iid in zip(submission["UserID"], submission["ItemID"])
]
submission[["id", "Prediction"]].to_csv("submission_baselineonly.csv", index=False)
print("submission_baselineonly.csv generated!")

Best BaselineOnly CV RMSE: 0.9367438489620156
Best Params: {'bsl_options': {'method': 'als', 'reg_i': 5, 'reg_u': 5}}
Estimating biases using als...
BaselineOnly Manual RMSE: 0.94373 | Fallback: 0.27%
Estimating biases using als...
submission_baselineonly.csv generated!
