### Trying different approaches for prediction

In [2]:
import pandas as pd

def read_parquet_file(file_path):
    """
    Load a Parquet file into a pandas DataFrame using the pyarrow engine.

    Any exception raised while reading is caught: its message is printed
    and None is returned instead, so callers must check for None.

    :param file_path: Location of the Parquet file to load.
    :return: The loaded DataFrame, or None if the read failed.
    """
    try:
        frame = pd.read_parquet(file_path, engine='pyarrow')
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error reading Parquet file: {e}")
        return None
    else:
        return frame

# Load the processed ratings/movies table; every later cell depends on `df`.
df = read_parquet_file('../data/processed/ratings_movies.parquet')
if df is None:
    # read_parquet_file prints the underlying error and returns None on failure.
    # Stop here with a clear message instead of failing later with an
    # AttributeError on None.
    raise RuntimeError(
        "Could not load '../data/processed/ratings_movies.parquet'; see the error printed above."
    )

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,userId,movieId,rating,genre_list,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,17,4.0,"[Drama, Romance]",0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,25,1.0,"[Drama, Romance]",0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,29,2.0,"[Adventure, Drama, Fantasy, Mystery, Sci-Fi]",0,0,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0
3,1,30,5.0,"[Crime, Drama]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,32,5.0,"[Mystery, Sci-Fi, Thriller]",0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,0,0


In [4]:
# user_ids_with_min_entries = df.groupby('userId').filter(lambda x: len(x) >= 8)['userId'].unique()
# print(user_ids_with_min_entries)

In [5]:
# Count the distinct movie ids present in the ratings table (missing ids are not counted).
unique_movie_ids = df['movieId'].nunique(dropna=True)

In [6]:
# Display the distinct-movie count computed in the previous cell.
unique_movie_ids

84432

In [7]:
# Mean rating per user, broadcast back onto every row of that user.
# The mean is taken over all rows of `df`, so it includes the row's own rating.
user_means = df.groupby('userId')['rating'].transform('mean')
df['user_bias'] = user_means

In [8]:
# Mean rating per movie, broadcast back onto every row of that movie.
# The mean is taken over all rows of `df`, so it includes the row's own rating.
movie_means = df.groupby('movieId')['rating'].transform('mean')
df['movie_bias'] = movie_means


In [9]:
from sklearn.preprocessing import MultiLabelBinarizer
# One-hot encode the genres from 'genre_list', a column holding a list of
# genre names per row, e.g. ['Drama', 'Romance'].
mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(
    mlb.fit_transform(df['genre_list']),
    columns=mlb.classes_,
    index=df.index
)

# The loaded frame already carries one-hot genre columns (see the df.head()
# output), so a plain concat would create duplicate column labels. Selecting
# df[genre_cols] would then return every genre twice, and XGBoost rejects
# non-unique feature names. Drop any same-named columns first; this also makes
# the cell safe to re-run.
df = pd.concat(
    [df.drop(columns=genre_dummies.columns, errors='ignore'), genre_dummies],
    axis=1
)

In [10]:
# Bias features: mean rating per user and per movie, broadcast to each row.
# NOTE(review): these means are taken over the whole frame before any
# train/test split, so each row's own rating contributes to its features.
df['user_bias'] = df.groupby('userId')['rating'].transform('mean')
df['movie_bias'] = df.groupby('movieId')['rating'].transform('mean')

# Genre indicator column names, as learned by the binarizer.
genre_cols = [*mlb.classes_]

# Target and feature matrix.
y = df['rating']
X = df[['user_bias', 'movie_bias', *genre_cols]]


In [11]:
import platform
# Print the (bits, linkage) tuple that platform.architecture() reports for the
# running Python interpreter. Presumably an environment check before the
# xgboost import below; confirm with the author.
print(platform.architecture())

('64bit', 'Mach-O')


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# model = Ridge(alpha=1.0)
# model.fit(X_train, y_train)
# rmse = root_mean_squared_error(y_test, model.predict(X_test))
# print("Test RMSE:", rmse)

import xgboost as xgb


: 

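Got a test RMSE of 0.8615835027740744 (presumably from the Ridge model that is now commented out above) — a pretty good result, so XGBoost is tried next. Caveat: `user_bias` and `movie_bias` were computed on the full dataset before the train/test split, so this figure is likely optimistic.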

In [13]:
from sklearn.model_selection import train_test_split

# Step 1: split the rows first, then work on independent copies of each part.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.copy()
test_df = test_df.copy()

# Step 2: rating statistics derived from the training rows only.
global_mean = train_df['rating'].mean()
user_means = train_df.groupby('userId')['rating'].mean().rename('user_bias')
movie_means = train_df.groupby('movieId')['rating'].mean().rename('movie_bias')

# Step 3: attach the per-user / per-movie means to both splits. Users or
# movies that never occur in the training rows fall back to the global mean.
train_df['user_bias'] = train_df['userId'].map(user_means)
train_df['movie_bias'] = train_df['movieId'].map(movie_means)
test_df['user_bias'] = test_df['userId'].map(user_means).fillna(global_mean)
test_df['movie_bias'] = test_df['movieId'].map(movie_means).fillna(global_mean)

# Step 4: feature matrices and targets.
feature_cols = ['user_bias', 'movie_bias', *genre_cols]

X_train = train_df[feature_cols]
y_train = train_df['rating']
X_test = test_df[feature_cols]
y_test = test_df['rating']


In [None]:
# Gradient-boosted regressor on the bias + genre features.
# The estimator gets its own name: rebinding `xgb` would shadow the imported
# xgboost module and make this cell fail when run a second time.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    # Early stopping is configured on the estimator; recent XGBoost releases
    # no longer accept it as a fit() argument.
    early_stopping_rounds=20
)

# NOTE(review): the test split doubles as the early-stopping set, so the RMSE
# reported below is tuned on the same rows it is measured on. Consider carving
# a validation split out of the training rows instead.
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=10
)

preds = xgb_model.predict(X_test)
# root_mean_squared_error already returns the RMSE and takes no `squared` flag.
rmse_xgb = root_mean_squared_error(y_test, preds)
print(f"XGBoost Test RMSE: {rmse_xgb:.4f}")