# Classification
1. In this notebook we evaluate classification models that predict whether a user rates a movie highly (a rating of 4 or above), given movie features and user preference features.<br>
Because the dataset contains millions of ratings from the top 10k users across 9k movies, we randomly sample a small subset of users and work only with their ratings.<br>

2. We also tune the hyperparameters of the XGBoost model using Randomized Search, then evaluate the tuned model's performance and the importance it assigns to each feature.

- Evaluate Logistic Regression and XGBOOST using K-Fold cross-validation
- Print out classification metrics
- Visualize Confusion Matrix and ROC-AUC
- Tune XGBOOST Hyperparameters using Randomized Search and evaluate
- Plot the most and least important learned feature weights

In [1]:
import glob
import gzip
import json
import os
import pickle

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.stats import uniform, randint
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    roc_curve,
    classification_report,
)
from xgboost import XGBClassifier
# Select Altair's 'default' renderer.
alt.renderers.enable("default")

# Suffix shared by the processed artefacts loaded below
# (e.g. processed/movie_features_<MIN_OCCURRENCES>.pickle).
# NOTE(review): presumably the minimum feature-occurrence threshold used during
# preprocessing -- confirm against the preprocessing notebook.
MIN_OCCURRENCES = 20

# Data locations, relative to the notebook's working directory.
DATA_PATH = "../data"
IMDB_FOLDER_PATH = os.path.join(DATA_PATH, "imdb")
NETFLIX_FOLDER_PATH = os.path.join(DATA_PATH, "netflix_prize")

## Load Data

### Movie Features

In [None]:
# Both artefacts live under <DATA_PATH>/processed and carry the MIN_OCCURRENCES suffix.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
MOVIE_FEATURES_PATH = os.path.join(DATA_PATH, f"processed/movie_features_{MIN_OCCURRENCES}.pickle")
FEATURE_MAPPING_PATH = os.path.join(DATA_PATH, f"processed/feature_mapping_{MIN_OCCURRENCES}.pickle")

# Per-movie features
with open(MOVIE_FEATURES_PATH, "rb") as movie_features_file:
    movie_features = pickle.load(movie_features_file)

# Feature <-> ID lookup tables, stored together in one pickle
with open(FEATURE_MAPPING_PATH, "rb") as feature_mapping_file:
    feature_mapping = pickle.load(feature_mapping_file)

id_to_feature = feature_mapping['id_to_feature']
feature_to_id = feature_mapping['feature_to_id']

# Size of the feature vocabulary
num_features = len(feature_to_id)
print(f"Number of features: {num_features}")

### Create Movie Feature Matrix

In [None]:
# Row labels (movies) and column labels (feature IDs) of the indicator matrix
movie_ids = list(movie_features)
feature_ids = list(id_to_feature)

# Start from an all-zero matrix ...
movie_feature_matrix = pd.DataFrame(0, index=movie_ids, columns=feature_ids)

# ... then, movie by movie, set the columns listed for that movie to 1
for movie_id in movie_ids:
    movie_feature_matrix.loc[movie_id, movie_features[movie_id]] = 1

print(f"Movie feature matrix shape: {movie_feature_matrix.shape}")

### User Profiles
- Import top 10k users
- Randomly sample k users

In [4]:
# Seed shared by the user sampling and the K-Fold splitter
RANDOM_SEED = 42
# Number of users to draw at random from the loaded user profiles
k = 30

In [None]:
# User profiles sit next to the movie features and carry the same MIN_OCCURRENCES suffix.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
USER_PROFILES_PATH = os.path.join(DATA_PATH, f"processed/user_profiles_{MIN_OCCURRENCES}.pickle")
with open(USER_PROFILES_PATH, "rb") as profiles_file:
    user_profiles = pickle.load(profiles_file)

# Every user with a profile is a sampling candidate
user_ids = list(user_profiles)

# Seed NumPy's global RNG so the same k users are drawn on every run,
# then draw k distinct users (no replacement)
np.random.seed(RANDOM_SEED)
sampled_user_ids = np.random.choice(user_ids, k, replace=False)
sampled_user_ids_set = set(sampled_user_ids)

print(f"Number of users sampled: {len(sampled_user_ids)}")


### User Feature Matrix for Sampled Users

In [None]:
# Keep only the profiles of the sampled users
sampled_user_profiles = {user_id: user_profiles[user_id] for user_id in sampled_user_ids}

# One row per sampled user, one column per feature ID; -1 marks "no preference recorded".
# The frame is created as float (-1.0) on purpose: preference values are written into it
# cell by cell below, and writing a float into an integer column relies on silent
# upcasting, which pandas >= 2.1 deprecates (FutureWarning, error in later versions).
user_feature_matrix = pd.DataFrame(-1.0, index=sampled_user_ids, columns=feature_ids)

# Overwrite the sentinel with each user's recorded feature preferences
for user_id, user_data in sampled_user_profiles.items():
    feature_preferences = user_data['feature_preferences']
    for feature_id, preference in feature_preferences.items():
        user_feature_matrix.loc[user_id, feature_id] = preference

print(f"User feature matrix shape: {user_feature_matrix.shape}")

### Netflix-IMDB Mapping

In [7]:
# JSON lookup table stored at the root of the data folder
# (presumably Netflix movie ID -> IMDB ID, judging by the file name -- verify).
NETFLIX_TO_IMDB_PATH = os.path.join(DATA_PATH, "netflix_to_imdb.json")
with open(NETFLIX_TO_IMDB_PATH, "r") as mapping_file:
    netflix_to_imdb = json.loads(mapping_file.read())

### Load Ratings Data for Sampled Users

In [None]:
def _load_sampled_ratings(file_paths, wanted_movie_ids, wanted_user_ids, max_entries):
    """Collect rating records for the wanted movies and users.

    Each file is read as one movie: the first line is "<movie_id>:" and every
    following line is "<user_id>,<rating>,<date>". Files whose movie ID is not in
    ``wanted_movie_ids`` are skipped after reading only their first line, and lines
    that do not split into exactly three comma-separated parts are ignored.

    Args:
        file_paths: Paths of the per-movie rating files, in the order to read them.
        wanted_movie_ids: Set of movie IDs (as they appear in the files) to keep.
        wanted_user_ids: Set of user IDs (as they appear in the files) to keep.
        max_entries: Stop as soon as this many records have been collected.

    Returns:
        A list of dicts with keys 'movie_id', 'user_id', 'rating' (int) and 'date'.

    Raises:
        ValueError: If a kept line has a rating that is not an integer.
    """
    records = []
    for file_path in file_paths:
        with open(file_path, 'r') as f:
            header = f.readline()
            if not header:
                continue  # empty file
            # The first line contains the movie ID, ending with ':'
            movie_id = header.strip()[:-1]
            if movie_id not in wanted_movie_ids:
                continue
            # Stream the remaining lines instead of loading the whole file into memory
            for line in f:
                parts = line.strip().split(',')
                if len(parts) != 3:
                    continue
                user_id, rating, date = parts
                if user_id in wanted_user_ids:
                    records.append({
                        'movie_id': movie_id,
                        'user_id': user_id,
                        'rating': int(rating),
                        'date': date
                    })
                    if len(records) >= max_entries:
                        return records
    return records


# Get list of rating files. glob returns paths in arbitrary, filesystem-dependent
# order; sorting fixes the row order of ratings_df (and therefore the K-Fold splits
# and which rows survive the max_entries cut-off) across machines and runs.
rating_files = sorted(glob.glob(os.path.join(NETFLIX_FOLDER_PATH, "training_set", "*.txt")))

# Set of movie IDs and user IDs we care about
movie_ids_set = set(movie_features.keys())
user_ids_set = set(sampled_user_ids)  # Use sampled users

max_entries = 1000000  # Limit the number of entries for the sake of memory

ratings_data = _load_sampled_ratings(rating_files, movie_ids_set, user_ids_set, max_entries)
total_entries = len(ratings_data)

ratings_df = pd.DataFrame(ratings_data)
print(f"Total ratings loaded: {ratings_df.shape[0]}")

In [None]:
# Sanity check: how many distinct users and movies made it into the sample
n_unique_users = ratings_df['user_id'].nunique()
n_unique_movies = ratings_df['movie_id'].nunique()
print(f"Number of unique users in ratings: {n_unique_users}")
print(f"Number of unique movies in ratings: {n_unique_movies}")

## Preprocess

Binarize Ratings: a rating of 4 or higher becomes the positive label (1); anything lower becomes 0.

In [10]:
# Positive class (1) = rating of 4 or above, negative class (0) otherwise.
# Vectorized comparison instead of a per-row Python lambda; int64 matches the dtype
# the original .apply() produced.
ratings_df['label'] = (ratings_df['rating'] >= 4).astype('int64')

Prepare Movie Features

In [11]:
# Reset index and rename columns for merging
movie_feature_matrix.reset_index(inplace=True)
movie_feature_matrix.rename(columns={'index': 'movie_id'}, inplace=True)

# Rename feature columns to avoid overlap
movie_feature_columns = [col for col in movie_feature_matrix.columns if col != 'movie_id']
movie_feature_matrix.rename(columns={col: f'movie_feat_{col}' for col in movie_feature_columns}, inplace=True)

Prepare User Features

In [None]:
# Reset index and rename columns for merging
user_feature_matrix.reset_index(inplace=True)
user_feature_matrix.rename(columns={'index': 'user_id'}, inplace=True)

# Rename feature columns to avoid overlap
user_feature_columns = [col for col in user_feature_matrix.columns if col != 'user_id']
user_feature_matrix.rename(columns={col: f'user_feat_{col}' for col in user_feature_columns}, inplace=True)

Merge DataFrames

In [None]:
# Merge ratings with movie features
ratings_df = ratings_df.merge(movie_feature_matrix, on='movie_id', how='left')

# Merge with user features
ratings_df = ratings_df.merge(user_feature_matrix, on='user_id', how='left')
print(f"Data shape after merging: {ratings_df.shape}")

Prepare Input Features and Labels

In [14]:
# Define feature columns
movie_feature_cols = [col for col in ratings_df.columns if col.startswith('movie_feat_')]
user_feature_cols = [col for col in ratings_df.columns if col.startswith('user_feat_')]

# Input features and target variable
X = ratings_df[movie_feature_cols + user_feature_cols]
y = ratings_df['label']

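No need to impute missing values: missing user preferences are already encoded as -1 in the user feature matrix and are passed to the models unchanged.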

In [15]:
# Intentionally disabled: the -1 sentinel for missing user preferences is passed to
# the models unchanged. The imputation below is kept for reference only.
# # Replace -1 (indicating missing user preferences) with NaN
# X[user_feature_cols] = X[user_feature_cols].replace(-1, np.nan)

# # Fill NaN values with 0 (assuming no preference)
# X.fillna(0, inplace=True)

# Classification

Initialize Models

In [16]:
# Baseline models, both with library defaults apart from the arguments shown.
# max_iter is raised to 1000 for the logistic regression solver.
lr = LogisticRegression(max_iter=1000)
# Seeded with RANDOM_SEED so the baseline is configured consistently with the
# estimator used later for hyperparameter tuning.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_SEED)

Define K-Fold Cross-Validation

In [22]:
# 5-fold splitter used for the cross-validated predictions of every model in this
# notebook. Rows are shuffled with a fixed seed, so each model is evaluated on the
# same splits. (KFold is already imported in the notebook's first cell.)
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

## Logistic Regression

In [None]:
# Out-of-fold probability of the positive class (column 1) for every rating row.
# cross_val_predict is already imported in the notebook's first cell.
lr_probas = cross_val_predict(lr, X, y, cv=kf, method='predict_proba', n_jobs=-1)[:, 1]
# Hard predictions at the conventional 0.5 threshold
lr_preds = (lr_probas >= 0.5).astype(int)

# Metrics
lr_accuracy = accuracy_score(y, lr_preds)
lr_precision = precision_score(y, lr_preds)
lr_recall = recall_score(y, lr_preds)
lr_f1 = f1_score(y, lr_preds)
lr_auc = roc_auc_score(y, lr_probas)  # threshold-free, so computed on probabilities
lr_cm = confusion_matrix(y, lr_preds)

print("Logistic Regression Metrics:")
for metric_label, metric_value in (
    ("Accuracy", lr_accuracy),
    ("Precision", lr_precision),
    ("Recall", lr_recall),
    ("F1 Score", lr_f1),
    ("ROC AUC Score", lr_auc),
):
    print(f"{metric_label}: {metric_value:.4f}")

## XGBOOST

In [None]:
# Cross-validated predictions
xgb_probas = cross_val_predict(xgb, X, y, cv=kf, method='predict_proba', n_jobs=-1)[:, 1]
xgb_preds = (xgb_probas >= 0.5).astype(int)

# Metrics
xgb_accuracy = accuracy_score(y, xgb_preds)
xgb_precision = precision_score(y, xgb_preds)
xgb_recall = recall_score(y, xgb_preds)
xgb_f1 = f1_score(y, xgb_preds)
xgb_auc = roc_auc_score(y, xgb_probas)
xgb_cm = confusion_matrix(y, xgb_preds)

print("XGBoost Metrics:")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1 Score: {xgb_f1:.4f}")
print(f"ROC AUC Score: {xgb_auc:.4f}")

## Visualization

In [None]:
# Confusion matrix of the out-of-fold logistic regression predictions
lr_cm_fig, lr_cm_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(lr_cm, annot=True, fmt='d', cmap='Blues', ax=lr_cm_ax)
lr_cm_ax.set_title('Logistic Regression Confusion Matrix')
lr_cm_ax.set_xlabel('Predicted')
lr_cm_ax.set_ylabel('Actual')
plt.show()

# ROC curve, with the diagonal of a random classifier as reference
lr_fpr, lr_tpr, _ = roc_curve(y, lr_probas)
lr_roc_fig, lr_roc_ax = plt.subplots(figsize=(6, 4))
lr_roc_ax.plot(lr_fpr, lr_tpr, label=f'ROC Curve (AUC = {lr_auc:.4f})')
lr_roc_ax.plot([0, 1], [0, 1], 'k--')
lr_roc_ax.set_title('Logistic Regression ROC Curve')
lr_roc_ax.set_xlabel('False Positive Rate')
lr_roc_ax.set_ylabel('True Positive Rate')
lr_roc_ax.legend()
plt.show()

In [None]:
# Confusion matrix of the out-of-fold XGBoost predictions
xgb_cm_fig, xgb_cm_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(xgb_cm, annot=True, fmt='d', cmap='Greens', ax=xgb_cm_ax)
xgb_cm_ax.set_title('XGBoost Confusion Matrix')
xgb_cm_ax.set_xlabel('Predicted')
xgb_cm_ax.set_ylabel('Actual')
plt.show()

# ROC curve, with the diagonal of a random classifier as reference
xgb_fpr, xgb_tpr, _ = roc_curve(y, xgb_probas)
xgb_roc_fig, xgb_roc_ax = plt.subplots(figsize=(6, 4))
xgb_roc_ax.plot(xgb_fpr, xgb_tpr, label=f'ROC Curve (AUC = {xgb_auc:.4f})', color='green')
xgb_roc_ax.plot([0, 1], [0, 1], 'k--')
xgb_roc_ax.set_title('XGBoost ROC Curve')
xgb_roc_ax.set_xlabel('False Positive Rate')
xgb_roc_ax.set_ylabel('True Positive Rate')
xgb_roc_ax.legend()
plt.show()

# Hyperparameter Tuning

In [18]:
# Hyperparameter space sampled by the randomized search.
# scipy conventions: uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from low up to high - 1.
param_distributions = {
    'n_estimators': randint(50, 300),  # 50 to 299
    'max_depth': randint(3, 10),  # 3 to 9
    'learning_rate': uniform(0.01, 0.3),  # From 0.01 to 0.31
    'subsample': uniform(0.5, 0.5),  # From 0.5 to 1.0
    'colsample_bytree': uniform(0.5, 0.5),  # From 0.5 to 1.0
    'gamma': uniform(0, 0.5),  # From 0 to 0.5
    'min_child_weight': randint(1, 10),  # 1 to 9
    'reg_alpha': uniform(0, 0.5),  # From 0 to 0.5
    'reg_lambda': uniform(0.5, 0.5)  # From 0.5 to 1.0
}

# Base estimator for the search. This rebinds the name `xgb` that the baseline
# evaluation used earlier. Seeds come from RANDOM_SEED instead of a repeated literal.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_SEED)

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=50,  # Number of parameter settings that are sampled
    scoring='roc_auc',
    # NOTE(review): an integer cv lets scikit-learn choose the splitter, so the search
    # does not use the shuffled `kf` splits used for evaluation elsewhere in this
    # notebook -- confirm this is intended.
    cv=5,
    verbose=1,
    random_state=RANDOM_SEED,
    n_jobs=-1
)

In [None]:
# Run the search on the full X, y: 50 sampled settings, each scored with 5-fold CV
# on ROC AUC. This is the slow cell of the notebook.
random_search.fit(X, y)

### Evaluate

In [None]:
# Best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:")
print(best_params)

In [None]:
# Get the best estimator
best_xgb = random_search.best_estimator_

# Cross-validated predictions
best_xgb_probas = cross_val_predict(
    best_xgb, X, y, cv=kf, method='predict_proba', n_jobs=-1
)[:, 1]
best_xgb_preds = (best_xgb_probas >= 0.5).astype(int)

In [None]:
# Metrics
best_xgb_accuracy = accuracy_score(y, best_xgb_preds)
best_xgb_precision = precision_score(y, best_xgb_preds)
best_xgb_recall = recall_score(y, best_xgb_preds)
best_xgb_f1 = f1_score(y, best_xgb_preds)
best_xgb_auc = roc_auc_score(y, best_xgb_probas)
best_xgb_cm = confusion_matrix(y, best_xgb_preds)

print("Tuned XGBoost Metrics:")
print(f"Accuracy: {best_xgb_accuracy:.4f}")
print(f"Precision: {best_xgb_precision:.4f}")
print(f"Recall: {best_xgb_recall:.4f}")
print(f"F1 Score: {best_xgb_f1:.4f}")
print(f"ROC AUC Score: {best_xgb_auc:.4f}")

### Visualize

In [None]:
# Confusion matrix of the out-of-fold predictions of the tuned model
tuned_cm_fig, tuned_cm_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(best_xgb_cm, annot=True, fmt='d', cmap='Purples', ax=tuned_cm_ax)
tuned_cm_ax.set_title('Tuned XGBoost Confusion Matrix')
tuned_cm_ax.set_xlabel('Predicted')
tuned_cm_ax.set_ylabel('Actual')
plt.show()

# ROC curve, with the diagonal of a random classifier as reference
best_xgb_fpr, best_xgb_tpr, _ = roc_curve(y, best_xgb_probas)
tuned_roc_fig, tuned_roc_ax = plt.subplots(figsize=(6, 4))
tuned_roc_ax.plot(
    best_xgb_fpr,
    best_xgb_tpr,
    label=f'ROC Curve (AUC = {best_xgb_auc:.4f})',
    color='purple',
)
tuned_roc_ax.plot([0, 1], [0, 1], 'k--')
tuned_roc_ax.set_title('Tuned XGBoost ROC Curve')
tuned_roc_ax.set_xlabel('False Positive Rate')
tuned_roc_ax.set_ylabel('True Positive Rate')
tuned_roc_ax.legend()
plt.show()


### Rank Learned Feature Importance Weights

In [None]:
# Get learned feature weights
importances = best_xgb.feature_importances_
feature_names = X.columns
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': importances})

# Map feature IDs to actual feature names
feature_importances['feature'] = feature_importances['feature'].apply(
    lambda x: feature_mapping['id_to_feature'][int(x.split('_')[-1])] 
    if x.split('_')[-1].isdigit() else x
)

# Sort by importance
feature_importances.sort_values(by='importance', ascending=False, inplace=True)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))

# Plot top 20 features
sns.barplot(x='importance', y='feature', data=feature_importances.head(20), palette='viridis', ax=ax1)
ax1.set_title('Top 20 Most Important Features')
ax1.set_xlabel('Importance')
ax1.set_ylabel('Feature')

# Plot bottom 20 features
sns.barplot(x='importance', y='feature', data=feature_importances.tail(20), palette='viridis', ax=ax2)
ax2.set_title('20 Least Important Features') 
ax2.set_xlabel('Importance')
ax2.set_ylabel('Feature')

plt.tight_layout()
plt.show()