In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kurtosis
from sklearn.decomposition import PCA
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    Ridge,
    Lasso,
    SGDRegressor
    )
from sklearn.tree import (
    DecisionTreeRegressor
)
from sklearn.ensemble import (
    RandomForestRegressor,
    BaggingRegressor
)
from sklearn.metrics import (
    mean_absolute_error,
    accuracy_score
)
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    StratifiedKFold
)
from sklearn.cluster import KMeans
from sklearn.ensemble import StackingRegressor
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Loading datasets
first_batch  = np.load("data/Week1/first_batch_regression_labelled.npz")
second_batch  = np.load("data/Week2/second_batch_regression_labelled.npz")
third_batch  = np.load("data/Week3/third_batch_regression_labelled.npz")
fourth_batch  = np.load("data/Week4/fourth_batch_regression_labelled.npz")

# Extract X, y, y_cat for each batch
X1, y1, y1_cat = first_batch["X"], first_batch["yy"], first_batch["yy_cat"]
X2, y2, y2_cat = second_batch["X"], second_batch["yy"], second_batch["yy_cat"]
X3, y3, y3_cat = third_batch["X"], third_batch["yy"], third_batch["yy_cat"]
X4, y4, y4_cat = fourth_batch["X"], fourth_batch["yy"], fourth_batch["yy_cat"]

# Convert to DataFrames and rename columns for each batch
X1     = pd.DataFrame(X1, columns=["user", "item", "rating"])
y1     = pd.DataFrame(y1, columns=["user", "label"])
y1_cat = pd.DataFrame(y1_cat, columns=["user", "label", "anomtype"])
X2     = pd.DataFrame(X2, columns=["user", "item", "rating"])
y2     = pd.DataFrame(y2, columns=["user", "label"])
y2_cat = pd.DataFrame(y2_cat, columns=["user", "label", "anomtype"])
X3     = pd.DataFrame(X3, columns=["user", "item", "rating"])
y3     = pd.DataFrame(y3, columns=["user", "label"])
y3_cat = pd.DataFrame(y3_cat, columns=["user", "label", "anomtype"])
X4     = pd.DataFrame(X4, columns=["user", "item", "rating"])
y4     = pd.DataFrame(y4, columns=["user", "label"])
y4_cat = pd.DataFrame(y4_cat, columns=["user", "label", "anomtype"])

# Combine the data 
X = pd.concat([X1, X2, X3, X4], ignore_index=True)
y = pd.concat([y1, y2, y3, y4], ignore_index=True)
y_cat = pd.concat([y1_cat, y2_cat, y3_cat, y4_cat], ignore_index=True)

# Parse to correct types
y     = y.astype({"user": int, "label": float})
y_cat = y_cat.astype({"user": int, "label": float, "anomtype": int})

# Latest test data
XX    = np.load("data/Week4/fifth_batch_regression_unlabelled.npz")['X']
XX    = pd.DataFrame(XX, columns=["user", "item", "rating"])

In [3]:
# Checking loads:
X

Unnamed: 0,user,item,rating
0,0,94,2
1,0,90,1
2,0,97,2
3,0,100,4
4,0,101,2
...,...,...,...
1144737,3599,396,4
1144738,3599,183,3
1144739,3599,877,3
1144740,3599,961,5


In [4]:
# Checking loads: (Pt. 2)
y

Unnamed: 0,user,label
0,0,0.962817
1,1,0.031248
2,2,0.068668
3,3,0.349012
4,4,0.917704
...,...,...
3595,3595,0.720721
3596,3596,0.705247
3597,3597,0.362698
3598,3598,0.072459


In [5]:
# Checking loads: (Pt. 3)
y_cat.shape

(60, 3)

In [6]:
# Checking loads: (Pt. 4)
XX

Unnamed: 0,user,item,rating
0,3600,849,5
1,3600,722,5
2,3600,462,4
3,3600,982,4
4,3600,749,4
...,...,...,...
284949,4499,757,4
284950,4499,752,4
284951,4499,751,4
284952,4499,778,4


In [7]:
# Removing duplicates
X = X.drop_duplicates(subset=["user", "item"], keep="last")
XX = XX.drop_duplicates(subset=["user", "item"], keep="last")

# SVD for regression dataframe
X_matrix = X.pivot(index="user", columns="item", values="rating").fillna(0)
XX_matrix = XX.pivot(index="user", columns="item", values="rating").fillna(0)

# SVD --- say WHATTTTTTTT
svd = TruncatedSVD(n_components=650, random_state=42)
X_svd = svd.fit_transform(X_matrix)
XX_svd = svd.transform(XX_matrix)

print(f"Variance Explained by SVD: {svd.explained_variance_ratio_.sum():.4f}")

df_reg_train = pd.DataFrame(X_svd, index=X_matrix.index)
df_reg_test = pd.DataFrame(XX_svd, index=XX_matrix.index)

print(X_matrix.shape)
print(XX_matrix.shape)
print(df_reg_train.shape)
print(df_reg_test.shape)

Variance Explained by SVD: 0.9032
(3600, 1000)
(900, 1000)
(3600, 650)
(900, 650)


In [None]:
# --- FEATURE ENGINEERING (This is mainly what I worked on, sry cause its super messy) ---

# base user stats
X_stats = X.groupby("user").agg(
    mean_rating=('rating', 'mean'),
    std_rating=('rating', 'std'),
    min_rating=('rating', 'min'),
    max_rating=('rating', 'max'),
    total_interactions=('rating', 'count')
)
X_stats['rating_var'] = X_stats['std_rating'] ** 2
X_stats['normalized_std'] = X_stats['std_rating'] / (X_stats['mean_rating'] + 1e-5)
X_stats['skewness'] = X.groupby('user')['rating'].skew()
X_stats['kurtosis'] = X.groupby('user')['rating'].apply(pd.Series.kurt)
X_stats = X_stats.fillna(0)

# --- Extra stuff starts from here... ---
# checking if ratings are biased by global mean per item
global_mean = X['rating'].mean()

# create and merge item-level means/stds BEFORE any references
item_stats = X.groupby('item')['rating'].agg(['mean', 'std']).rename(columns={'mean': 'item_mean', 'std': 'item_std'})
item_stats = item_stats.fillna(0)
X = X.merge(item_stats, on='item', how='left')

# user bias relative to item mean
user_bias = X.groupby('user').apply(lambda df: (df['rating'] - df['item_mean']).mean()).rename('user_bias')
X_stats = X_stats.merge(user_bias, on='user', how='left').fillna(0)

# rating distribution (extremeties p much)
rating_counts = X.groupby(['user', 'rating']).size().unstack(fill_value=0)
rating_counts.columns = [f'rating_{int(c)}_count' for c in rating_counts.columns]
X_stats = X_stats.merge(rating_counts, on='user', how='left').fillna(0)

# proportion of total items rated
num_items = X['item'].nunique()
X_stats['interaction_ratio'] = X_stats['total_interactions'] / num_items

# user outlier fraction (how often ratings deviate strongly from usual)
X['abs_dev'] = abs(X['rating'] - X['item_mean'])
user_outlier_frac = (X['abs_dev'] > 1.5 * X['item_std']).groupby(X['user']).mean().rename('outlier_frac')
X_stats = X_stats.merge(user_outlier_frac, on='user', how='left').fillna(0)

# item anomalies?? maybe the items have something weird going on there...
item_stats_expanded = X.groupby("item").agg(
    item_mean=('rating', 'mean'),
    item_std=('rating', 'std'),
    item_var=('rating', lambda x: np.var(x, ddof=1)),
    item_min=('rating', 'min'),
    item_max=('rating', 'max'),
    item_total=('rating', 'count')
)
item_stats_expanded['item_skew'] = X.groupby('item')['rating'].skew()
item_stats_expanded['item_kurt'] = X.groupby('item')['rating'].apply(pd.Series.kurt)
item_stats_expanded['item_range'] = item_stats_expanded['item_max'] - item_stats_expanded['item_min']
item_stats_expanded['item_mean_rank'] = item_stats_expanded['item_mean'].rank(pct=True)
item_stats_expanded = item_stats_expanded.fillna(0)

# debug
for col in ['item_mean', 'item_std', 'item_mean_x', 'item_mean_y', 'item_std_x', 'item_std_y']:
    if col in X.columns:
        X = X.drop(columns=col)

X = X.merge(item_stats_expanded, on='item', how='left')

X['item_mean'] = X['item_mean']
X['item_std'] = X['item_std']

# mooar extra
X['deviation_from_item_mean'] = X['rating'] - X['item_mean']
X['z_score_item'] = X['deviation_from_item_mean'] / (X['item_std'] + 1e-6)
alignment_by_user = X.groupby('user')['z_score_item'].mean().rename('mean_item_alignment')
X_stats = X_stats.merge(alignment_by_user, on='user', how='left').fillna(0)

# entropy  woooooo
from scipy.stats import entropy
def rating_entropy(ratings):
    counts = ratings.value_counts(normalize=True)
    return entropy(counts)

user_entropy = X.groupby('user')['rating'].apply(rating_entropy).rename('rating_entropy')
extreme_ratio = X.groupby('user').apply(lambda df: ((df['rating']==1)|(df['rating']==5)).mean()).rename('extreme_ratio')
X_stats = X_stats.merge(user_entropy, on='user', how='left').merge(extreme_ratio, on='user', how='left').fillna(0)

# normalizing rank
X_stats['user_mean_rank'] = X_stats['mean_rating'].rank(pct=True)

# merge
df_reg_train = df_reg_train.merge(X_stats, on='user', how='left')
df_reg_train = df_reg_train.merge(y, on='user', how='inner')

# --- same stuff for XX ---
for col in ['item_mean', 'item_std', 'item_mean_x', 'item_mean_y', 'item_std_x', 'item_std_y']:
    if col in XX.columns:
        XX = XX.drop(columns=col)

XX = XX.merge(item_stats_expanded, on='item', how='left')

# debug
XX['item_mean'] = XX['item_mean']
XX['item_std'] = XX['item_std']

XX_stats = XX.groupby("user").agg(
    mean_rating=('rating', 'mean'),
    std_rating=('rating', 'std'),
    min_rating=('rating', 'min'),
    max_rating=('rating', 'max'),
    total_interactions=('rating', 'count')
)
XX_stats['rating_var'] = XX_stats['std_rating'] ** 2
XX_stats['normalized_std'] = XX_stats['std_rating'] / (XX_stats['mean_rating'] + 1e-5)
XX_stats['skewness'] = XX.groupby('user')['rating'].skew()
XX_stats['kurtosis'] = XX.groupby('user')['rating'].apply(pd.Series.kurt)
XX_stats = XX_stats.fillna(0)

num_items_test = XX['item'].nunique()
XX_stats['interaction_ratio'] = XX_stats['total_interactions'] / num_items_test

XX['abs_dev'] = abs(XX['rating'] - XX['item_mean'])
user_outlier_frac_test = (XX['abs_dev'] > 1.5 * XX['item_std']).groupby(XX['user']).mean().rename('outlier_frac')
XX_stats = XX_stats.merge(user_outlier_frac_test, on='user', how='left').fillna(0)

XX['deviation_from_item_mean'] = XX['rating'] - XX['item_mean']
XX['z_score_item'] = XX['deviation_from_item_mean'] / (XX['item_std'] + 1e-6)
alignment_by_user_test = XX.groupby('user')['z_score_item'].mean().rename('mean_item_alignment')
XX_stats = XX_stats.merge(alignment_by_user_test, on='user', how='left').fillna(0)

user_entropy_test = XX.groupby('user')['rating'].apply(rating_entropy).rename('rating_entropy')
extreme_ratio_test = XX.groupby('user').apply(lambda df: ((df['rating']==1)|(df['rating']==5)).mean()).rename('extreme_ratio')
XX_stats = XX_stats.merge(user_entropy_test, on='user', how='left').merge(extreme_ratio_test, on='user', how='left').fillna(0)

XX_stats['user_mean_rank'] = XX_stats['mean_rating'].rank(pct=True)

# merge
df_reg_test = df_reg_test.merge(XX_stats, on='user', how='left')


  user_bias = X.groupby('user').apply(lambda df: (df['rating'] - df['item_mean']).mean()).rename('user_bias')
  extreme_ratio = X.groupby('user').apply(lambda df: ((df['rating']==1)|(df['rating']==5)).mean()).rename('extreme_ratio')
  extreme_ratio_test = XX.groupby('user').apply(lambda df: ((df['rating']==1)|(df['rating']==5)).mean()).rename('extreme_ratio')


In [9]:
X_data = df_reg_train.drop(columns=['user', 'label'])
y_data = df_reg_train['label']
X_data.columns = X_data.columns.astype(str)

y_bins = pd.qcut(y_data, q=10, labels=False)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

xgb_params = dict(
    n_estimators=3500,
    learning_rate=0.008,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    eval_metric='mae'
)

lgb_params = dict(
    n_estimators=3500,
    learning_rate=0.008,
    num_leaves=31,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    objective='mae',
    metric='mae'
)

mae_scores = []

# --- Cross-validation ---
for i, (train_idx, val_idx) in enumerate(kf.split(X_data, y_bins)):
    X_train, X_val = X_data.iloc[train_idx], X_data.iloc[val_idx]
    y_train, y_val = y_data.iloc[train_idx], y_data.iloc[val_idx]

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled   = scaler.transform(X_val)

    xgb = XGBRegressor(**xgb_params)
    lgb = LGBMRegressor(**lgb_params)

    xgb.fit(X_train_scaled, y_train)
    lgb.fit(X_train_scaled, y_train)

    y_pred = (0.6 * xgb.predict(X_val_scaled)) + (0.4 * lgb.predict(X_val_scaled))
    mae = mean_absolute_error(y_val, y_pred)
    mae_scores.append(mae)
    print(f"Fold {i+1} MAE: {mae:.4f}")

print(f"\nMean MAE: {np.mean(mae_scores):.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170162
[LightGBM] [Info] Number of data points in the train set: 2880, number of used features: 672
[LightGBM] [Info] Start training from score 0.506428




Fold 1 MAE: 0.0646
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170197
[LightGBM] [Info] Number of data points in the train set: 2880, number of used features: 672
[LightGBM] [Info] Start training from score 0.506512




Fold 2 MAE: 0.0679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008864 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170197
[LightGBM] [Info] Number of data points in the train set: 2880, number of used features: 672
[LightGBM] [Info] Start training from score 0.506512




Fold 3 MAE: 0.0655
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006568 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170165
[LightGBM] [Info] Number of data points in the train set: 2880, number of used features: 672
[LightGBM] [Info] Start training from score 0.506512




Fold 4 MAE: 0.0657
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170196
[LightGBM] [Info] Number of data points in the train set: 2880, number of used features: 672
[LightGBM] [Info] Start training from score 0.506577
Fold 5 MAE: 0.0666

Mean MAE: 0.0661 ± 0.0011


