In [1]:
#using Matrix Factorization-based Algorithm

import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from pandas.api.types import CategoricalDtype

from surprise import SVD
from surprise import Dataset
from surprise import Reader


# HANDLE OUTLIRS FUNCTION
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df


# MOVIES DATASET &  RATINGS DATASET
movies_df = pd.read_csv(r"C:\Users\nh013\Desktop\movie recommendation system dataset\movies.csv")
ratings_df = pd.read_csv(r"C:\Users\nh013\Desktop\movie recommendation system dataset\ratings.csv")



# MOVIES DATAFRAME FEATURE EXTRACT
movies_df = movies_df[['movieId', 'title', 'genres']]

# FEATURE EXTRACT RATING DATAFRAME 
ratings_df = ratings_df[['userId', 'movieId', 'rating', 'timestamp']]



# CHECK MISSING VALUES
movies_df.dropna(inplace=True)
ratings_df.dropna(inplace=True)

# HANDLE OUTLIERS
movies_df = handle_outliers(movies_df, 'movieId')
ratings_df = handle_outliers(ratings_df, 'rating')



# CONVERT CATEGORICAL VALUES TO NUMERICAL TO MOVIES DATAFRAME 
categorical_cols_movies = ['title', 'genres']
for col in categorical_cols_movies:
    movies_df[col] = movies_df[col].astype('category').cat.codes

# CONVERT CATEGORICAL VALUES TO NUMERICAL TO RATING DATAFRAME
categorical_cols_ratings = ['userId']
for col in categorical_cols_ratings:
    ratings_df[col] = ratings_df[col].astype('category').cat.codes

    
    
# LETS CREATE A SURPRISE DATASET FROM RATING_DF
reader = Reader(rating_scale=(-1, 1))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)


#SVD ALGORITHM
algo = SVD()

# TRAIN THE MODEL
trainset = data.build_full_trainset()
algo.fit(trainset)

# GET TOP n RECOMMENDATION FOR USERS
user_id = 1
n = 5

user_predictions = []
for movie_id in movies_df['movieId'].unique():
    pred = algo.predict(str(user_id), str(movie_id))
    user_predictions.append(pred)

# SHORT PREDICTION BY ESTIMATE RATING
user_predictions.sort(key=lambda x: x.est, reverse=True)



print(f"Top {n} recommendations for User {user_id}:")
for pred in user_predictions[:n]:
    movie_id = pred.iid
    estimated_rating = pred.est
    print(f"Movie ID: {movie_id}, Estimated Rating: {estimated_rating}")


Top 5 recommendations for User 1:
Movie ID: 1, Estimated Rating: 1
Movie ID: 2, Estimated Rating: 1
Movie ID: 3, Estimated Rating: 1
Movie ID: 4, Estimated Rating: 1
Movie ID: 5, Estimated Rating: 1


In [5]:
# using XGBOOST model ...........

import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from pandas.api.types import CategoricalDtype

from surprise import SVD
from surprise import Dataset
from surprise import Reader

from surprise.model_selection import cross_validate
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# HANDLE OUTLIER FUNCTION
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df



# MOVIES DATASET & RATINGS DATASET
movies_df = pd.read_csv(r"C:\Users\nh013\Desktop\movie recommendation system dataset\movies.csv")
ratings_df = pd.read_csv(r"C:\Users\nh013\Desktop\movie recommendation system dataset\ratings.csv")




# MOVIES DATAFRAME FEATURE EXTRACT
movies_df = movies_df[['movieId', 'title', 'genres']]

# FEATURE EXTRACT RATING DATAFRAME 
ratings_df = ratings_df[['userId', 'movieId', 'rating', 'timestamp']]



# CHECK MISSING VALUES
movies_df.dropna(inplace=True)
ratings_df.dropna(inplace=True)


# HANDLE OUTLIERS
movies_df = handle_outliers(movies_df, 'movieId')
ratings_df = handle_outliers(ratings_df, 'rating')



# CONVERT CATEGORICAL VALUES TO NUMERICAL TO MOVIES DATAFRAME 
categorical_cols_movies = ['title', 'genres']
for col in categorical_cols_movies:
    movies_df[col] = movies_df[col].astype('category').cat.codes

# CONVERT CATEGORICAL VALUES TO NUMERICAL TO RATING DATAFRAME
categorical_cols_ratings = ['userId']
for col in categorical_cols_ratings:
    ratings_df[col] = ratings_df[col].astype('category').cat.codes

    
    
# SURPRISE DATASET FROM RATING_DF
reader = Reader(rating_scale=(-1, 1))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# SVD ALGORITHM FROM SURPRISE
algo = SVD()

# TRAIN SVD MODEL
trainset = data.build_full_trainset()
algo.fit(trainset)

# GET TOP n RECOMMENDATION FOR USERS USING SURPRISE SVD
user_id = 1
n = 5

user_predictions = []
for movie_id in movies_df['movieId'].unique():
    pred = algo.predict(str(user_id), str(movie_id))
    user_predictions.append(pred)

    
    
# SORT PREDICTION BY ESTIMATE RATINGS
user_predictions.sort(key=lambda x: x.est, reverse=True)


print(f"Top {n} recommendations for User {user_id} using Surprise SVD:")
for pred in user_predictions[:n]:
    movie_id = pred.iid
    estimated_rating = pred.est
    print(f"Movie ID: {movie_id}, Estimated Rating: {estimated_rating}")

    
    
#DATA FOR XGBOOST MODEL
X = ratings_df[['userId', 'movieId']]
y = ratings_df['rating']

# SPLIT DATA INTO TRAINING AND TESTING SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#MODEL
xgb_model = XGBRegressor()

# TRAIN THE MODEL
xgb_model.fit(X_train, y_train)

#PREDICTION ON TEST SET
y_pred = xgb_model.predict(X_test)

# MODEL EVALUATE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"XGBoost RMSE: {rmse}")


Top 5 recommendations for User 1 using Surprise SVD:
Movie ID: 1, Estimated Rating: 1
Movie ID: 2, Estimated Rating: 1
Movie ID: 3, Estimated Rating: 1
Movie ID: 4, Estimated Rating: 1
Movie ID: 5, Estimated Rating: 1
XGBoost RMSE: 0.8727472217117144
