In [None]:
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357286 sha256=26e9c8edc0f2b21c2ade7fee9c9902e8ddd941ae00abd4aca4572148970ce017
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a

## **SVD 1**

In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import pickle
import time
import os


def prepare_data_csv(file_path, split_ratio=0.8):
    """Load the ratings CSV and split each user's history chronologically.

    Rows are ordered by ``user_id`` and then ``user_time``. For every user the
    first ``int(len(rows) * split_ratio)`` rows go to the training frame and
    the remaining rows go to the validation frame, so a user with a single
    row (at the default ratio) appears in the validation frame only.

    Args:
        file_path: Path of a CSV file containing at least the ``user_id`` and
            ``user_time`` columns.
        split_ratio: Fraction of each user's rows kept for training.

    Returns:
        ``(train_df, val_df)``; both frames carry a fresh ``0..n-1`` index.
    """
    ordered = pd.read_csv(file_path).sort_values(by=['user_id', 'user_time'])

    train_parts, val_parts = [], []
    for _, user_rows in ordered.groupby('user_id'):
        cutoff = int(len(user_rows) * split_ratio)
        train_parts.append(user_rows.iloc[:cutoff])
        val_parts.append(user_rows.iloc[cutoff:])

    return (
        pd.concat(train_parts).reset_index(drop=True),
        pd.concat(val_parts).reset_index(drop=True),
    )


def prepare_data_model(train_df, val_df, rating_range=(1, 5)):
    """Wrap the train and validation frames as Surprise ``Dataset`` objects.

    Only the ``user_id``, ``movie_id`` and ``rating`` columns are handed to
    Surprise, each through its own ``Reader`` built with ``rating_range``.

    Args:
        train_df: Training ratings frame.
        val_df: Validation ratings frame.
        rating_range: ``(min, max)`` rating scale passed to ``Reader``.

    Returns:
        ``(train_data, valid_data)``.
    """
    rating_columns = ["user_id", "movie_id", "rating"]
    train_data, valid_data = (
        Dataset.load_from_df(frame[rating_columns], Reader(rating_scale=rating_range))
        for frame in (train_df, val_df)
    )
    return train_data, valid_data


def train_model(train_data, model_name='SVD', n_factors=100, n_epochs=20,
                lr_all=0.005, reg_all=0.02, random_state=None):
    """Fit a biased Surprise ``SVD`` on all of ``train_data`` and pickle it.

    Args:
        train_data: Surprise dataset; its full trainset is built and used.
        model_name: Prefix of the pickle file, ``{model_name}_movie_recommender.pkl``.
        n_factors: Number of latent factors.
        n_epochs: Number of SGD epochs.
        lr_all: Learning rate applied to all parameters.
        reg_all: Regularisation term applied to all parameters.
        random_state: Optional seed forwarded to ``SVD`` for reproducible
            runs; ``None`` keeps the library's default behaviour.

    Returns:
        ``(model, training_time_ms)`` where the time covers building the
        trainset and fitting, in milliseconds.
    """
    model = SVD(n_factors=n_factors, n_epochs=n_epochs, biased=True,
                lr_all=lr_all, reg_all=reg_all, random_state=random_state)

    # perf_counter() is monotonic and high resolution, unlike the wall clock
    # returned by time.time(), so it is the right tool for measuring intervals.
    start = time.perf_counter()
    training_set = train_data.build_full_trainset()
    model.fit(training_set)
    training_time_ms = (time.perf_counter() - start) * 1000

    model_filename = f'{model_name}_movie_recommender.pkl'
    with open(model_filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    print(f"Model saved to {model_filename}")
    return model, training_time_ms


def evaluate(model, data):
    """Score ``model`` on the raw ratings held by ``data`` and return the RMSE.

    The first three fields of every entry in ``data.raw_ratings`` are passed
    to ``model.test``; the RMSE of the resulting predictions is printed by
    ``accuracy.rmse`` (``verbose=True``) and returned.
    """
    testset = [(uid, iid, rating) for uid, iid, rating, *_ in data.raw_ratings]
    return accuracy.rmse(model.test(testset), verbose=True)

def inference_cost_per_input(model, user_id, movie_id):
    """Time one ``model.predict`` call and return the elapsed milliseconds.

    Args:
        model: Object exposing ``predict(uid=..., iid=...)``.
        user_id: User id forwarded as ``uid``.
        movie_id: Item id forwarded as ``iid``.

    Returns:
        Elapsed time of the single call, in milliseconds (float).
    """
    # A single prediction takes microseconds; perf_counter() is the
    # monotonic, high-resolution clock intended for such short intervals.
    start = time.perf_counter()
    model.predict(uid=user_id, iid=movie_id)
    return (time.perf_counter() - start) * 1000  # seconds -> milliseconds


def predict(model, user_id, movie_list, user_movie_list, K=20):
    """Return up to ``K`` unseen movies ranked by predicted rating.

    Args:
        model: Object whose ``predict(uid=..., iid=...)`` result has ``.est``.
        user_id: User to recommend for.
        movie_list: Candidate movie ids.
        user_movie_list: Mapping ``user_id -> container of movie ids`` that
            the user already has; those candidates are skipped.
        K: Maximum number of recommendations.

    Returns:
        Movie ids ordered by descending ``(estimate, movie id)``.
    """
    ranked = sorted(
        (
            (model.predict(uid=user_id, iid=candidate).est, candidate)
            for candidate in movie_list
            if user_id not in user_movie_list or candidate not in user_movie_list[user_id]
        ),
        reverse=True,
    )
    return [candidate for _, candidate in ranked[:K]]
def get_model_size(model_filename):
    """Return the on-disk size of ``model_filename`` in bytes."""
    return os.stat(model_filename).st_size


# --- SVD 1 driver: split, train, evaluate, recommend, and report costs ---
ratings_file = 'extracted_ratings.csv'

# Per-user chronological split (first 80% of each user's rows -> train).
train_df, val_df = prepare_data_csv(ratings_file)

train_data, valid_data = prepare_data_model(train_df, val_df)

# train_model() also pickles the fitted model to 'SVD_movie_recommender.pkl'.
model, training_time = train_model(train_data)

rmse_score = evaluate(model, valid_data)
print(f"Validation RMSE value is {rmse_score}")

# Candidate pool: every movie that occurs in the training split.
all_movies= train_df['movie_id'].unique().tolist()

# user_id -> set of movie ids already present for that user in the training split.
user_movies= train_df.groupby('user_id')['movie_id'].apply(set).to_dict()

test_user_id = 93
test_movie_id = train_df['movie_id'].iloc[0]
recommendations = predict(model, test_user_id, all_movies, user_movies)
print(f"Top 20 recommendations for user {test_user_id}: {recommendations}")

# Latency of a single predict() call for one (user, movie) pair.
inference_time_ms = inference_cost_per_input(model, test_user_id, test_movie_id)
print(f"Inference time per input: {inference_time_ms:.6f} milliseconds")

# The reported "memory size" is the size of the pickle file on disk.
# The filename must match what train_model() wrote (default model_name 'SVD').
model_filename = 'SVD_movie_recommender.pkl'
model_size_bytes = get_model_size(model_filename)
model_size_mb = model_size_bytes / (1024 * 1024)
print(f"Memory size of the model: {model_size_mb:.2f} MB")

print(f"Training time: {training_time:.2f} milliseconds")

Model saved to SVD_movie_recommender.pkl
RMSE: 0.8287
Validation RMSE value is 0.8286712240397239
Top 20 recommendations for user 93: ['the+shawshank+redemption+1994', 'the+lives+of+others+2006', 'das+boot+1981', 'seven+samurai+1954', 'city+of+god+2002', 'the+godfather+1972', 'the+usual+suspects+1995', 'reservoir+dogs+1992', 'american+history+x+1998', 'the+prestige+2006', 'life+is+beautiful+1997', 'memento+2000', 'schindlers+list+1993', 'dr.+strangelove+or+how+i+learned+to+stop+worrying+and+love+the+bomb+1964', 'fight+club+1999', 'shallow+grave+1994', 'amlie+2001', 'the+african+queen+1951', 'the+wrong+trousers+1993', 'my+neighbor+totoro+1988']
Inference time per input: 0.012159 milliseconds
Memory size of the model: 15.21 MB
Training time: 411.73 milliseconds


### **SVD 2**

In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import pickle
import time
import os


def prepare_data_csv(file_path, split_ratio=0.8):
    """Read the ratings CSV and cut every user's timeline into train/validation.

    After ordering by ``user_id`` and ``user_time``, each user's earliest
    ``int(len(rows) * split_ratio)`` rows are assigned to training and the
    rest to validation.

    Args:
        file_path: CSV path; needs ``user_id`` and ``user_time`` columns.
        split_ratio: Share of each user's rows used for training.

    Returns:
        ``(train_df, val_df)`` with re-numbered indices.
    """
    ordered = pd.read_csv(file_path).sort_values(by=['user_id', 'user_time'])

    per_user = [rows for _, rows in ordered.groupby('user_id')]
    cutoffs = [int(len(rows) * split_ratio) for rows in per_user]

    train_df = pd.concat(
        [rows.iloc[:cut] for rows, cut in zip(per_user, cutoffs)]
    ).reset_index(drop=True)
    val_df = pd.concat(
        [rows.iloc[cut:] for rows, cut in zip(per_user, cutoffs)]
    ).reset_index(drop=True)

    return train_df, val_df


def prepare_data_model(train_df, val_df, rating_range=(1, 5)):
    """Convert the two ratings frames into Surprise ``Dataset`` objects.

    Args:
        train_df: Training frame with ``user_id``, ``movie_id``, ``rating``.
        val_df: Validation frame with the same columns.
        rating_range: Rating scale given to each ``Reader``.

    Returns:
        ``(train_data, valid_data)``.
    """
    def to_dataset(frame):
        # Each frame gets its own Reader configured with the same scale.
        reader = Reader(rating_scale=rating_range)
        return Dataset.load_from_df(frame[["user_id", "movie_id", "rating"]], reader)

    return to_dataset(train_df), to_dataset(val_df)


def train_model(train_data, model_name='SVD', n_factors=150, n_epochs=30,
                lr_all=0.01, reg_all=0.05, random_state=None):
    """Fit a biased Surprise ``SVD`` on all of ``train_data`` and pickle it.

    The defaults are this experiment's hyper-parameters; they are exposed as
    keyword arguments so variants can be tried without copying the function.

    Args:
        train_data: Surprise dataset; its full trainset is built and used.
        model_name: Prefix of the pickle file, ``{model_name}_movie_recommender.pkl``.
        n_factors: Number of latent factors.
        n_epochs: Number of SGD epochs.
        lr_all: Learning rate applied to all parameters.
        reg_all: Regularisation term applied to all parameters.
        random_state: Optional seed forwarded to ``SVD``; ``None`` keeps the
            library's default behaviour.

    Returns:
        ``(model, training_time_ms)`` where the time covers building the
        trainset and fitting, in milliseconds.
    """
    model = SVD(n_factors=n_factors, n_epochs=n_epochs, biased=True,
                lr_all=lr_all, reg_all=reg_all, random_state=random_state)

    # perf_counter() is monotonic and high resolution, unlike the wall clock
    # returned by time.time(), so it is the right tool for measuring intervals.
    start = time.perf_counter()
    training_set = train_data.build_full_trainset()
    model.fit(training_set)
    training_time_ms = (time.perf_counter() - start) * 1000

    model_filename = f'{model_name}_movie_recommender.pkl'
    with open(model_filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    print(f"Model saved to {model_filename}")
    return model, training_time_ms


def evaluate(model, data):
    """Score ``model`` on the raw ratings held by ``data`` and return the RMSE.

    Args:
        model: Fitted model exposing ``test(testset)``.
        data: Object with a ``raw_ratings`` sequence whose entries start with
            ``(user id, item id, rating)``.

    Returns:
        The RMSE computed (and printed) by ``accuracy.rmse``.
    """
    # Read from the `data` argument; the earlier version used the notebook
    # global `valid_data`, so the parameter was silently ignored.
    dataset = [(rating[0], rating[1], rating[2]) for rating in data.raw_ratings]

    predictions = model.test(dataset)

    rmse = accuracy.rmse(predictions, verbose=True)

    return rmse

def inference_cost_per_input(model, user_id, movie_id):
    """Measure the latency of a single ``model.predict`` call.

    Args:
        model: Object exposing ``predict(uid=..., iid=...)``.
        user_id: User id forwarded as ``uid``.
        movie_id: Item id forwarded as ``iid``.

    Returns:
        Elapsed time of the call in milliseconds (float).
    """
    # perf_counter() is monotonic and has the resolution needed for a call
    # that completes in microseconds; time.time() is a wall clock.
    started = time.perf_counter()
    model.predict(uid=user_id, iid=movie_id)
    elapsed_seconds = time.perf_counter() - started
    return elapsed_seconds * 1000  # Convert to milliseconds


def predict(model, user_id, movie_list, user_movie_list, K=20):
    """Rank the movies the user does not have yet and keep the best ``K``.

    Args:
        model: Object whose ``predict(uid=..., iid=...)`` result has ``.est``.
        user_id: User to recommend for.
        movie_list: Candidate movie ids.
        user_movie_list: Mapping ``user_id -> container of movie ids`` to skip.
        K: Maximum number of movies returned.

    Returns:
        Movie ids sorted by descending ``(estimate, movie id)``.
    """
    def already_has(movie):
        # True when the user is known and the movie is in their collection.
        return user_id in user_movie_list and movie in user_movie_list[user_id]

    scored = []
    for movie in movie_list:
        if not already_has(movie):
            scored.append((model.predict(uid=user_id, iid=movie).est, movie))

    best = sorted(scored, reverse=True)[:K]
    return [movie for _, movie in best]
def get_model_size(model_filename):
    """Look up how many bytes ``model_filename`` occupies on disk."""
    file_stats = os.stat(model_filename)
    return file_stats.st_size


# --- SVD 2 driver: same pipeline as SVD 1, using this cell's train_model() ---
ratings_file = 'extracted_ratings.csv'

# Per-user chronological split (first 80% of each user's rows -> train).
train_df, val_df = prepare_data_csv(ratings_file)

train_data, valid_data = prepare_data_model(train_df, val_df)

# train_model() pickles to 'SVD_movie_recommender.pkl' (default model_name
# 'SVD'), i.e. the same file name the SVD 1 cell writes to.
model, training_time = train_model(train_data)

rmse_score = evaluate(model, valid_data)
print(f"Validation RMSE value is {rmse_score}")

# Candidate pool: every movie that occurs in the training split.
all_movies= train_df['movie_id'].unique().tolist()

# user_id -> set of movie ids already present for that user in the training split.
user_movies= train_df.groupby('user_id')['movie_id'].apply(set).to_dict()

test_user_id = 93
test_movie_id = train_df['movie_id'].iloc[0]
recommendations = predict(model, test_user_id, all_movies, user_movies)
print(f"Top 20 recommendations for user {test_user_id}: {recommendations}")

# Latency of a single predict() call for one (user, movie) pair.
inference_time_ms = inference_cost_per_input(model, test_user_id, test_movie_id)
print(f"Inference time per input: {inference_time_ms:.6f} milliseconds")

# The reported "memory size" is the size of the pickle file on disk.
model_filename = 'SVD_movie_recommender.pkl'
model_size_bytes = get_model_size(model_filename)
model_size_mb = model_size_bytes / (1024 * 1024)
print(f"Memory size of the model: {model_size_mb:.2f} MB")

print(f"Training time: {training_time:.2f} milliseconds")

Model saved to SVD_movie_recommender.pkl
RMSE: 0.8302
Validation RMSE value is 0.8301501547506631
Top 20 recommendations for user 93: ['the+lives+of+others+2006', 'the+shawshank+redemption+1994', 'das+boot+1981', 'gran+torino+2008', 'dancer+in+the+dark+2000', 'seven+samurai+1954', 'shallow+grave+1994', 'henry+v+1989', 'city+of+god+2002', 'the+prestige+2006', 'the+godfather+1972', 'oldboy+2003', 'memento+2000', 'life+is+beautiful+1997', 'the+usual+suspects+1995', 'dr.+strangelove+or+how+i+learned+to+stop+worrying+and+love+the+bomb+1964', 'spellbound+2002', 'everest+1998', 'persuasion+1995', 'pride++prejudice+2005']
Inference time per input: 0.037432 milliseconds
Memory size of the model: 21.98 MB
Training time: 1506.19 milliseconds


In [None]:
!pip install torch



In [None]:
!pip install psutil



### *SURPRISE MODEL- Experimental*

In [None]:
def get_user_features(age, gender, occupation, le_gender, le_occupation, scaler):
    """Build the 3-element user feature vector ``[age, gender, occupation]``.

    Args:
        age: Raw age, or ``None`` (encoded as ``0``).
        gender: Gender label; labels missing from ``le_gender.classes_`` become ``-1``.
        occupation: Occupation label; labels missing from
            ``le_occupation.classes_`` become ``-1``.
        le_gender: Fitted encoder with ``classes_`` and ``transform``.
        le_occupation: Fitted encoder with ``classes_`` and ``transform``.
        scaler: Fitted scaler whose ``transform`` accepts ``[[age]]``.

    Returns:
        ``np.ndarray`` holding the scaled age and the two encoded labels.
    """
    def encode_or_unknown(encoder, label):
        # Labels the encoder was not fitted on map to the sentinel -1.
        if label in encoder.classes_:
            return encoder.transform([label])[0]
        return -1

    if age is None:
        age_scaled = 0
    else:
        age_scaled = scaler.transform([[age]])[0][0]

    return np.array([
        age_scaled,
        encode_or_unknown(le_gender, gender),
        encode_or_unknown(le_occupation, occupation),
    ])


def predict_rating(base_est, user_features, movie_features, mean_user_factors, user_bias, item_bias, n_factors):
    """Nudge ``base_est`` with a feature-based adjustment and clamp to ``[0, 1]``.

    The user and movie feature vectors are concatenated and right-padded with
    zeros up to ``n_factors`` entries (no padding when they are already that
    long or longer). The adjustment is the dot product of that vector with
    ``mean_user_factors`` plus both biases; 1% of it is added to ``base_est``.

    Returns:
        The adjusted estimate limited to the range 0..1.
    """
    combined = np.concatenate([user_features, movie_features])
    missing = n_factors - len(user_features) - len(movie_features)
    padded = np.pad(combined, (0, missing if missing > 0 else 0))

    adjustment = np.dot(padded, mean_user_factors) + user_bias + item_bias
    nudged = base_est + 0.01 * adjustment
    return max(0, min(1, nudged))

def preprocess_movie_features(movie_details, tfidf):
    """Map every movie id to the TF-IDF vector of its genres string.

    Note: missing ``genres`` values are replaced by ``''`` directly on the
    ``movie_details`` frame that was passed in.

    Args:
        movie_details: Frame with ``movie_id`` and ``genres`` columns.
        tfidf: Fitted vectorizer; ``transform(...).toarray()`` must work.

    Returns:
        ``dict`` of ``movie_id -> 1-D genre vector``.
    """
    movie_details['genres'] = movie_details['genres'].fillna('')
    genre_matrix = tfidf.transform(movie_details['genres']).toarray()
    return {
        movie_id: genre_row
        for movie_id, genre_row in zip(movie_details['movie_id'], genre_matrix)
    }

def get_top_n_recommendations(model, user_id, age, gender, occupation, movie_features, le_gender, le_occupation, scaler, mean_user_factors, user_bias, item_bias, n=20):
    """Score every movie in ``movie_features`` for one user and return the top ``n``.

    Args:
        model: Fitted model exposing ``pu``, ``predict`` and ``trainset.global_mean``.
        user_id: Id passed to ``model.predict``.
        age, gender, occupation: Raw user attributes, encoded via ``get_user_features``.
        movie_features: Mapping ``movie_id -> feature vector``.
        le_gender, le_occupation, scaler: Fitted encoders/scaler for the user attributes.
        mean_user_factors, user_bias, item_bias: Values forwarded to ``predict_rating``.
        n: Number of ``(movie_id, score)`` pairs to return.

    Returns:
        List of ``(movie_id, score)`` tuples sorted by descending score.
    """
    user_features = get_user_features(age, gender, occupation, le_gender, le_occupation, scaler)
    # Width of the latent-factor matrix; used as the padding target in predict_rating().
    n_factors = model.pu.shape[1]

    def process_movie(movie_id_vector):
        """Score a single ``(movie_id, feature_vector)`` item."""
        movie_id, movie_vector = movie_id_vector
        try:
            base_est = model.predict(user_id, movie_id).est
        except ValueError:
            # Fall back to the training set's global mean if predict() raises.
            base_est = model.trainset.global_mean

        rating = predict_rating(base_est, user_features, movie_vector, mean_user_factors, user_bias, item_bias, n_factors)
        # Gaussian jitter (sigma = 0.001) is added after predict_rating() has
        # clamped to [0, 1], so returned scores can fall slightly outside that
        # range and the ordering of near-equal scores varies between calls.
        return movie_id, rating + np.random.normal(0, 0.001)

    # NOTE(review): scoring is pure-Python/NumPy work per movie; presumably the
    # thread pool gives little speed-up here - confirm by timing.
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        user_ratings = list(executor.map(process_movie, movie_features.items()))

    user_ratings.sort(key=lambda x: x[1], reverse=True)
    return user_ratings[:n]

def save_model_and_encoders(model, movie_to_index, le_gender, le_occupation, scaler, movie_features, tfidf, path='recommendation_model.pkl'):
    """Pickle the model, its preprocessing objects and summary statistics.

    Besides the objects passed in, the bundle stores the column-wise mean of
    ``model.pu`` and the means of ``model.bu`` and ``model.bi``.

    Args:
        model: Fitted model exposing ``pu``, ``bu`` and ``bi``.
        movie_to_index: Mapping of movie ids to indices.
        le_gender, le_occupation, scaler, tfidf: Fitted preprocessing objects.
        movie_features: Mapping ``movie_id -> feature vector``.
        path: Destination pickle file.
    """
    bundle = {
        'svd_model': model,
        'movie_to_index': movie_to_index,
        'le_gender': le_gender,
        'le_occupation': le_occupation,
        'scaler': scaler,
        'tfidf': tfidf,
        'movie_features': movie_features,
        'mean_user_factors': np.mean(model.pu, axis=0),
        'user_bias': np.mean(model.bu),
        'item_bias': np.mean(model.bi),
    }

    with open(path, 'wb') as bundle_file:
        pickle.dump(bundle, bundle_file)
    print("Model and data saved successfully.")




def server_predict(user_id, age, gender, occupation):
    """Load the pickled bundle and return top recommendations for one user.

    The bundle is read from ``'recommendation_model.pkl'`` on every call, so
    the printed total time includes the load as well as the scoring.

    Returns:
        Whatever ``get_top_n_recommendations`` returns for this user.
    """
    started = time.time()

    # NOTE: pickle.load executes arbitrary code from the file; only load
    # bundles produced by this notebook.
    with open('recommendation_model.pkl', 'rb') as bundle_file:
        bundle = pickle.load(bundle_file)

    (model, le_gender, le_occupation, scaler,
     movie_features, mean_user_factors, user_bias, item_bias) = (
        bundle[key] for key in (
            'svd_model', 'le_gender', 'le_occupation', 'scaler',
            'movie_features', 'mean_user_factors', 'user_bias', 'item_bias',
        )
    )

    recommendations = get_top_n_recommendations(
        model, user_id, age, gender, occupation, movie_features,
        le_gender, le_occupation, scaler, mean_user_factors, user_bias, item_bias
    )

    total_time = (time.time() - started) * 1000
    print(f"Total prediction time: {total_time:.2f} ms")

    return recommendations


In [None]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import concurrent.futures
import time

# --- Load the raw tables ---
ratings = pd.read_csv('extracted_ratings.csv')
watches = pd.read_csv('extracted_watches.csv')
movie_details = pd.read_csv('movie_details.csv')
user_details = pd.read_csv('user_details.csv')

# Every rating and every watch event becomes an implicit interaction of 1.
# Consequently the target column is constant: there are no 0 examples.
ratings['interaction'] = 1
watches['interaction'] = 1
interactions = pd.concat([
        ratings[['user_id', 'movie_id', 'interaction']],
        watches[['user_id', 'movie_id', 'interaction']]
    ]).drop_duplicates()

# --- Fit encoders / scaler on the user table ---
le_gender = LabelEncoder()
le_occupation = LabelEncoder()
user_details['gender_encoded'] = le_gender.fit_transform(user_details['gender'])
user_details['occupation_encoded'] = le_occupation.fit_transform(user_details['occupation'])
scaler = StandardScaler()
user_details[['age_scaled']] = scaler.fit_transform(user_details[['age']])

# --- TF-IDF over the genres text of each movie ---
movie_details['genres'] = movie_details['genres'].fillna('')
tfidf = TfidfVectorizer(stop_words='english')
movie_details['genres_vector'] = tfidf.fit_transform(movie_details['genres']).toarray().tolist()

interactions = interactions.merge(user_details, on='user_id', how='left')
interactions = interactions.merge(movie_details[['movie_id', 'genres_vector']], on='movie_id', how='left')

# Only (user_id, movie_id, interaction) reach Surprise; the merged user and
# genre columns are not part of the SVD training data.
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(interactions[['user_id', 'movie_id', 'interaction']], reader)

# `testset` is not used anywhere below in this cell.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

print("Training the model...")
model = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)
model.fit(trainset)

print("Preprocessing movie features...")
movie_to_index = {movie: idx for idx, movie in enumerate(movie_details['movie_id'].unique())}
movie_features = preprocess_movie_features(movie_details, tfidf)

print("Saving the model and preprocessed data...")
save_model_and_encoders(model, movie_to_index, le_gender, le_occupation, scaler, movie_features, tfidf)



print("\nServer-side prediction example:")
# server_recommendations = server_predict('33039', 30, 'M', 'homemaker')
# NOTE(review): the user id is passed as the string '93'; if pandas parsed
# user_id as integers the model will treat this as an unknown user - confirm.
server_recommendations = server_predict('93', 55, 'F', 'scientist')
print("\nTop 20 recommendations:")
for movie_id, score in server_recommendations[:20]:
    print(f"Movie ID: {movie_id}, Predicted Score: {score:.4f}")


Training the model...
Preprocessing movie features...
Saving the model and preprocessed data...
Model and data saved successfully.

Server-side prediction example:




Total prediction time: 844.14 ms

Top 20 recommendations:
Movie ID: neighbors+1920, Predicted Score: 1.0037
Movie ID: the+house+that+dripped+blood+1971, Predicted Score: 1.0034
Movie ID: trouble+in+paradise+1932, Predicted Score: 1.0033
Movie ID: our+lady+of+the+assassins+2000, Predicted Score: 1.0032
Movie ID: fatal+instinct+1993, Predicted Score: 1.0031
Movie ID: no+greater+love+2009, Predicted Score: 1.0030
Movie ID: robin+hood+men+in+tights+1993, Predicted Score: 1.0030
Movie ID: future+weather+2012, Predicted Score: 1.0029
Movie ID: twilights+last+gleaming+1977, Predicted Score: 1.0029
Movie ID: plainlands+1988, Predicted Score: 1.0028
Movie ID: autumn+spring+2001, Predicted Score: 1.0028
Movie ID: the+wog+boy+2000, Predicted Score: 1.0027
Movie ID: resurrect+dead+the+mystery+of+the+toynbee+tiles+2011, Predicted Score: 1.0027
Movie ID: backbeat+1994, Predicted Score: 1.0027
Movie ID: return+of+the+jedi+1983, Predicted Score: 1.0027
Movie ID: dream+wife+1953, Predicted Score: 1.002

In [None]:
# Demo request: user '33039', 30-year-old male homemaker; print the ranked list.
print("\nServer-side prediction example:")
server_recommendations = server_predict('33039', 30, 'M', 'homemaker')
for movie_id, score in server_recommendations[:20]:
    print(f"Movie ID: {movie_id}, Predicted Score: {score:.4f}")


Server-side prediction example:




Total prediction time: 866.69 ms
Movie ID: bedtime+stories+2008, Predicted Score: 1.0036
Movie ID: the+inheritance+2003, Predicted Score: 1.0035
Movie ID: my+way+2011, Predicted Score: 1.0032
Movie ID: capturing+the+friedmans+2003, Predicted Score: 1.0031
Movie ID: nausica+of+the+valley+of+the+wind+1984, Predicted Score: 1.0031
Movie ID: nightmare+man+2006, Predicted Score: 1.0030
Movie ID: mouth+to+mouth+2005, Predicted Score: 1.0029
Movie ID: queen+days+of+our+lives+2011, Predicted Score: 1.0029
Movie ID: gumshoe+1971, Predicted Score: 1.0029
Movie ID: in+the+hands+of+the+gods+2007, Predicted Score: 1.0028
Movie ID: lars+and+the+real+girl+2007, Predicted Score: 1.0028
Movie ID: mans+job+2007, Predicted Score: 1.0028
Movie ID: the+scarlet+clue+1945, Predicted Score: 1.0028
Movie ID: lovely+molly+2012, Predicted Score: 1.0027
Movie ID: the+girl+who+kicked+the+hornets+nest+2009, Predicted Score: 1.0027
Movie ID: bunty+aur+babli+2005, Predicted Score: 1.0027
Movie ID: high+tension+2003, 

In [None]:
# Demo request: user '6666', 27-year-old male scientist; print the ranked list.
server_recommendations = server_predict('6666', 27, 'M', 'scientist')
for movie_id, score in server_recommendations[:20]:
    print(f"Movie ID: {movie_id}, Predicted Score: {score:.4f}")



Model load time: 41.47 ms
Inference time: 424.41 ms
Total prediction time: 465.90 ms
Movie ID: the+journey+1992, Predicted Score: 1.0035
Movie ID: pandoras+box+1929, Predicted Score: 1.0033
Movie ID: the+americanization+of+emily+1964, Predicted Score: 1.0032
Movie ID: the+caller+2011, Predicted Score: 1.0031
Movie ID: arabian+nights+1974, Predicted Score: 1.0030
Movie ID: masti+2004, Predicted Score: 1.0030
Movie ID: virtuosity+1995, Predicted Score: 1.0029
Movie ID: midaq+alley+1995, Predicted Score: 1.0029
Movie ID: bad+boy+bubby+1993, Predicted Score: 1.0029
Movie ID: shine+a+light+2008, Predicted Score: 1.0028
Movie ID: pitch+black+2000, Predicted Score: 1.0028
Movie ID: tinker+tailor+soldier+spy+1979, Predicted Score: 1.0028
Movie ID: sharpes+sword+1995, Predicted Score: 1.0028
Movie ID: night+of+the+ghouls+1959, Predicted Score: 1.0028
Movie ID: time+regained+1999, Predicted Score: 1.0027
Movie ID: capitalism+a+love+story+2009, Predicted Score: 1.0027
Movie ID: the+botany+of+desi

In [None]:
# Demo request: user '466', 27-year-old female K-12 student; print the ranked list.
server_recommendations = server_predict('466', 27, 'F', 'K-12 student')
for movie_id, score in server_recommendations[:20]:
    print(f"Movie ID: {movie_id}, Predicted Score: {score:.4f}")

Model load time: 154.16 ms
Inference time: 192.22 ms
Total prediction time: 346.38 ms
Movie ID: the+night+they+raided+minskys+1968, Predicted Score: 1.0037
Movie ID: the+joker+is+wild+1957, Predicted Score: 1.0035
Movie ID: premium+rush+2012, Predicted Score: 1.0033
Movie ID: the+prestige+2006, Predicted Score: 1.0030
Movie ID: the+sexual+life+of+the+belgians+1994, Predicted Score: 1.0029
Movie ID: charly+1968, Predicted Score: 1.0029
Movie ID: horse+feathers+1932, Predicted Score: 1.0029
Movie ID: waxworks+1924, Predicted Score: 1.0029
Movie ID: in++out+1997, Predicted Score: 1.0028
Movie ID: inside+llewyn+davis+2013, Predicted Score: 1.0028
Movie ID: the+intouchables+2011, Predicted Score: 1.0028
Movie ID: blink+1994, Predicted Score: 1.0028
Movie ID: the+kiss+of+the+vampire+1963, Predicted Score: 1.0028
Movie ID: his+girl+friday+1940, Predicted Score: 1.0028
Movie ID: pierrot+le+fou+1965, Predicted Score: 1.0028
Movie ID: nobody+loves+me+1994, Predicted Score: 1.0028
Movie ID: the+t



### SECOND MODEL-METRICS-Experimental

In [None]:
from sklearn.metrics import mean_squared_error, precision_score, recall_score, f1_score

def evaluate_model(model, test_loader, device):
    """Evaluate a binary interaction model on ``test_loader``.

    Args:
        model: ``nn.Module`` called as ``model(user_features, movie_indices)``
            and returning one logit per sample.
        test_loader: Iterable of ``(user_features, movie_indices, labels)``
            batches that also supports ``len()``.
        device: Device the batches are moved to.

    Returns:
        ``(avg_loss, rmse, precision, recall, f1)`` where ``avg_loss`` is the
        mean per-batch BCE-with-logits loss, ``rmse`` compares sigmoid
        probabilities with the labels, and the remaining metrics use a 0.5
        probability threshold.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()
    criterion = nn.BCEWithLogitsLoss()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for user_features, movie_indices, labels in test_loader:
            user_features = user_features.to(device)
            movie_indices = movie_indices.to(device)
            labels = labels.to(device)

            # Flatten to 1-D: a model that squeezes its output returns a 0-d
            # tensor for a single-sample batch, which would fail the loss's
            # shape check and break np.concatenate below.
            outputs = model(user_features, movie_indices).reshape(-1)
            labels = labels.reshape(-1)

            total_loss += criterion(outputs, labels).item()

            all_preds.append(torch.sigmoid(outputs).cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    if not all_preds:
        raise ValueError("test_loader produced no batches")

    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    # Hard 0/1 decisions for the classification metrics.
    binary_preds = (all_preds > 0.5).astype(int)

    # RMSE between predicted probabilities and labels.
    rmse = np.sqrt(mean_squared_error(all_labels, all_preds))

    precision = precision_score(all_labels, binary_preds)
    recall = recall_score(all_labels, binary_preds)
    f1 = f1_score(all_labels, binary_preds)

    avg_loss = total_loss / len(test_loader)

    return avg_loss, rmse, precision, recall, f1

# Evaluate on the held-out loader and print the metrics.
# NOTE(review): `model`, `test_loader` and `device` are not defined in this
# cell; they appear to come from the PyTorch training cell further down, so
# that cell must have been run first - confirm the intended cell order.
test_loss, test_rmse, test_precision, test_recall, test_f1 = evaluate_model(model, test_loader, device)

print(f"Test Loss: {test_loss:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1-Score: {test_f1:.4f}")


Test Loss: 0.0877
Test RMSE: 0.1313
Test Precision: 1.0000
Test Recall: 0.9796
Test F1-Score: 0.9897


### **SECOND MODEL (PyTorch collaborative filtering, not SVD) with parameter changed - Experimental**

In [None]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import time
import psutil
import os

class SimpleCollaborativeFiltering(nn.Module):
    """Small feed-forward recommender over user features and a movie embedding.

    User features are projected to ``embedding_dim`` (linear + ReLU),
    concatenated with the learned movie embedding, and passed through a
    100-unit hidden layer to a single output logit per sample.

    Args:
        num_movies: Size of the movie embedding table.
        num_user_features: Length of each user feature vector.
        embedding_dim: Width of the user projection and the movie embedding.
    """

    def __init__(self, num_movies, num_user_features, embedding_dim=400):
        super(SimpleCollaborativeFiltering, self).__init__()
        self.user_features = nn.Linear(num_user_features, embedding_dim)
        self.movie_embeddings = nn.Embedding(num_movies, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim * 2, 100)
        self.fc2 = nn.Linear(100, 1)
        self.relu = nn.ReLU()

    def forward(self, user_features, movie_indices):
        """Return one raw logit per sample, shape ``(batch,)``.

        Args:
            user_features: Float tensor of shape ``(batch, num_user_features)``.
            movie_indices: Long tensor of shape ``(batch,)``.
        """
        user_embed = self.relu(self.user_features(user_features))
        movie_embed = self.movie_embeddings(movie_indices)
        x = torch.cat([user_embed, movie_embed], dim=1)
        x = self.relu(self.fc1(x))
        # squeeze(-1) removes only the trailing size-1 feature axis. A bare
        # squeeze() would also drop the batch axis for a single-sample batch,
        # yielding a 0-d tensor that no longer matches the labels' shape.
        return self.fc2(x).squeeze(-1)

class InteractionDataset(Dataset):
    """Tensor-backed dataset of ``(user features, movie index, label)`` samples.

    Args:
        user_features: Array-like converted to a float tensor.
        movie_indices: Array-like converted to a long tensor.
        interaction_labels: Array-like converted to a float tensor.
    """

    def __init__(self, user_features, movie_indices, interaction_labels):
        self.user_features = torch.tensor(user_features, dtype=torch.float)
        self.movie_indices = torch.tensor(movie_indices, dtype=torch.long)
        self.labels = torch.tensor(interaction_labels, dtype=torch.float)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(user_features, movie_index, label)`` triple at ``idx``."""
        columns = (self.user_features, self.movie_indices, self.labels)
        return tuple(column[idx] for column in columns)

def preprocess_data():
    """Build the interaction table used to train the PyTorch model.

    Reads the ratings, watches, movie and user CSV files from the working
    directory, labels every rating/watch as an interaction of ``1``, joins
    the user attributes and movie titles, drops rows with a missing ``age``,
    ``gender``, ``occupation`` or ``movie_id``, label-encodes gender and
    occupation, z-scores ``age`` and adds a dense ``movie_idx`` column.

    Returns:
        ``(interactions, movie_to_index, le_gender, le_occupation)``. Note
        that the returned ``age`` column is already standardised.
    """
    ratings, watches, movie_details, user_details = (
        pd.read_csv(csv_name)
        for csv_name in (
            'extracted_ratings.csv',
            'extracted_watches.csv',
            'movie_details.csv',
            'user_details.csv',
        )
    )

    pair_columns = ['user_id', 'movie_id', 'interaction']
    for events in (ratings, watches):
        events['interaction'] = 1
    interactions = pd.concat([ratings[pair_columns], watches[pair_columns]]).drop_duplicates()

    interactions = (
        interactions
        .merge(user_details, on='user_id', how='left')
        .merge(movie_details[['movie_id', 'title']], on='movie_id', how='left')
    )
    interactions.dropna(subset=['age', 'gender', 'occupation', 'movie_id'], inplace=True)

    le_gender, le_occupation = LabelEncoder(), LabelEncoder()
    interactions['gender'] = le_gender.fit_transform(interactions['gender'])
    interactions['occupation'] = le_occupation.fit_transform(interactions['occupation'])

    # Standardise age with the mean / standard deviation of the kept rows.
    raw_age = interactions['age']
    interactions['age'] = (raw_age - raw_age.mean()) / raw_age.std()

    # Dense index per distinct movie id, in order of first appearance.
    movie_to_index = {movie: idx for idx, movie in enumerate(interactions['movie_id'].unique())}
    interactions['movie_idx'] = interactions['movie_id'].map(movie_to_index)

    return interactions, movie_to_index, le_gender, le_occupation

def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / (1024 ** 2)  # bytes -> MB

# --- Build train/test loaders from the preprocessed interactions ---
interactions, movie_to_index, le_gender, le_occupation = preprocess_data()

train_data, test_data = train_test_split(interactions, test_size=0.2, random_state=42)

train_dataset = InteractionDataset(train_data[['age', 'gender', 'occupation']].values,
                                   train_data['movie_idx'].values, train_data['interaction'].values)
test_dataset = InteractionDataset(test_data[['age', 'gender', 'occupation']].values,
                                  test_data['movie_idx'].values, test_data['interaction'].values)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

num_movies = len(movie_to_index)
num_user_features = 3  # age, gender, occupation
model = SimpleCollaborativeFiltering(num_movies, num_user_features)

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Start the timer before the first training epoch. perf_counter() is the
# monotonic clock meant for measuring elapsed intervals.
start_time = time.perf_counter()
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for user_features, movie_indices, labels in train_loader:
        user_features, movie_indices, labels = user_features.to(device), movie_indices.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(user_features, movie_indices)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}')

# End the timer after all training epochs.
end_time = time.perf_counter()
training_duration_ms = (end_time - start_time) * 1000
# Kept under the old name as well, in case other cells read it.
evaluation_duration_ms = training_duration_ms
# The interval covers the training loop only, so it is reported as training
# time (the previous message called it "evaluation time").
print(f"Total training time: {training_duration_ms:.2f} milliseconds")
print(f"Memory usage after training: {get_memory_usage():.2f} MB")
print("------------------------------------------------------------------")



def save_model_and_encoders(model, movie_to_index, le_gender, le_occupation, scaler, path='collaborative_filtering.pth'):
    """Write the model weights and preprocessing objects to one checkpoint.

    The checkpoint is a dict with the keys ``model_state_dict``,
    ``movie_to_index``, ``le_gender``, ``le_occupation`` and ``scaler``,
    serialised with ``torch.save``.

    Note: an earlier cell defines a pickle-based function with the same name
    but a different signature; whichever cell ran last is the one in effect.
    """
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'movie_to_index': movie_to_index,
        'le_gender': le_gender,
        'le_occupation': le_occupation,
        'scaler': scaler,
    }
    torch.save(checkpoint, path)

# Re-run preprocessing to obtain the encoders and movie index that are stored
# alongside the weights.
interactions, movie_to_index, le_gender, le_occupation = preprocess_data()

# preprocess_data() returns `age` already z-scored. Fitting the scaler on that
# column would yield a near-identity transform (mean ~0, scale ~1), and
# preprocess_user() would then feed raw ages (e.g. 20) into a model that was
# trained on z-scores. Fit on the raw ages of the same interaction rows.
# NOTE(review): assumes user_id is unique in user_details.csv - confirm.
# StandardScaler uses the population std while preprocess_data() uses the
# sample std; for a large table the difference is negligible.
raw_ages = interactions[['user_id']].merge(
    pd.read_csv('user_details.csv')[['user_id', 'age']],
    on='user_id',
    how='left',
)[['age']]

scaler = StandardScaler()
# Fit on a plain array so transform([[age]]) does not warn about feature names.
scaler.fit(raw_ages.to_numpy())

save_model_and_encoders(model, movie_to_index, le_gender, le_occupation, scaler)

#USE THIS PART IN CALLING THE MODEL FOR PREDICTIONS

def load_model(path='collaborative_filtering.pth'):
    """Rebuild the network from a checkpoint written by ``save_model_and_encoders``.

    Args:
        path: Checkpoint file produced with ``torch.save``.

    Returns:
        ``(model, checkpoint)``: the model in eval mode on the CPU, and the
        full checkpoint dict (weights, ``movie_to_index``, encoders, scaler).
    """
    # The checkpoint holds pickled scikit-learn objects besides tensors, so it
    # cannot be read with weights_only=True (the default in recent PyTorch).
    # SECURITY: weights_only=False unpickles arbitrary objects - only load
    # checkpoints that this notebook produced.
    # map_location='cpu' lets a checkpoint saved from a CUDA model load on a
    # machine without a GPU.
    checkpoint = torch.load(path, map_location='cpu', weights_only=False)
    num_movies = len(checkpoint['movie_to_index'])
    model = SimpleCollaborativeFiltering(num_movies, 3)  # 3 for user features: age, gender, occupation
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    return model, checkpoint

def preprocess_user(age, occupation, gender, checkpoint):
    """Encode one user's attributes as the model's input feature tensor.

    Args:
        age: Raw age, passed through ``checkpoint['scaler']``.
        occupation: Label encoded with ``checkpoint['le_occupation']``.
        gender: Label encoded with ``checkpoint['le_gender']``.
        checkpoint: Dict holding the fitted ``le_gender``, ``le_occupation``
            and ``scaler`` objects.

    Returns:
        Float tensor ``[age, gender, occupation]`` (in that order).
    """
    le_gender, le_occupation, scaler = (
        checkpoint[key] for key in ('le_gender', 'le_occupation', 'scaler')
    )

    gender_code = le_gender.transform([gender])[0]
    occupation_code = le_occupation.transform([occupation])[0]
    scaled_age = scaler.transform([[age]])[0][0]

    return torch.tensor([scaled_age, gender_code, occupation_code], dtype=torch.float)

def get_top_n_recommendations(model, user_features, movie_to_index, n=20):
    """Score every known movie for one user and return the ``n`` best.

    Args:
        model: Module called as ``model(user_features, movie_indices)`` that
            returns one score per row.
        user_features: 1-D float tensor describing the user.
        movie_to_index: Mapping ``movie_id -> embedding index``.
        n: Number of recommendations to return.

    Returns:
        ``(top_n_movies, scores)``: movie ids ordered by descending score and
        a NumPy array with the matching raw model outputs.
    """
    # Keep ids and indices as parallel lists: position i in `predictions`
    # always belongs to movie_ids[i], whatever values or order the indices
    # in `movie_to_index` have.
    movie_ids = list(movie_to_index.keys())
    movie_indices = torch.tensor(list(movie_to_index.values()), dtype=torch.long)

    model.eval()
    start_time = time.perf_counter()
    with torch.no_grad():
        tiled_user = user_features.unsqueeze(0).repeat(len(movie_ids), 1)
        # reshape(-1) keeps a 1-D result even when only one movie is scored.
        predictions = model(tiled_user, movie_indices).reshape(-1)

    top_positions = predictions.argsort(descending=True)[:n]
    top_n_movies = [movie_ids[position] for position in top_positions.tolist()]

    latency = (time.perf_counter() - start_time) * 1000
    print(f"Inference latency: {latency:.2f} milliseconds")

    return top_n_movies, predictions[top_positions].numpy()

# Reload from disk; this rebinds `model` to the freshly constructed network
# returned by load_model() and keeps the checkpoint dict for the encoders.
model, checkpoint = load_model()



Epoch 1/10, Loss: 0.4690
Epoch 2/10, Loss: 0.0529
Epoch 3/10, Loss: 0.0181
Epoch 4/10, Loss: 0.0093
Epoch 5/10, Loss: 0.0052
Epoch 6/10, Loss: 0.0087
Epoch 7/10, Loss: 0.0026
Epoch 8/10, Loss: 0.0019
Epoch 9/10, Loss: 0.0048
Epoch 10/10, Loss: 0.0011
Total evaluation time: 309.27 milliseconds
Memory usage after training: 561.82 MB
------------------------------------------------------------------


  checkpoint = torch.load(path)


In [None]:
# Example user details
user_id = 474
age = 20
occupation = "scientist"
gender = "M"

user_features = preprocess_user(age, occupation, gender, checkpoint)

movie_to_index = checkpoint['movie_to_index']
top_movies, top_scores = get_top_n_recommendations(model, user_features, movie_to_index, n=20)

print(f"Top 20 movie recommendations for user (ID: {user_id}, Age: {age}, Occupation: {occupation}, Gender: {gender}):")
for movie, score in zip(top_movies, top_scores):
    print(f"Movie ID: {movie}, Score: {score:.4f}")

Inference latency: 5.28 milliseconds
Top 20 movie recommendations for user (ID: 474, Age: 20, Occupation: scientist, Gender: M):
Movie ID: the+godfather+1972, Score: 47.9227
Movie ID: the+english+patient+1996, Score: 47.6316
Movie ID: edward+scissorhands+1990, Score: 47.0703
Movie ID: starlift+1951, Score: 47.0538
Movie ID: harry+potter+and+the+philosophers+stone+2001, Score: 46.9577
Movie ID: the+men+who+stare+at+goats+2009, Score: 46.8793
Movie ID: iron+man+3+2013, Score: 46.8536
Movie ID: the+tin+drum+1979, Score: 46.8247
Movie ID: beauty+and+the+beast+1991, Score: 46.8207
Movie ID: until+the+end+of+the+world+1991, Score: 46.7858
Movie ID: true+lies+1994, Score: 46.7816
Movie ID: the+giver+2014, Score: 46.7418
Movie ID: the+lord+of+the+rings+the+fellowship+of+the+ring+2001, Score: 46.7133
Movie ID: kingsman+the+secret+service+2015, Score: 46.6717
Movie ID: the+inner+circle+1991, Score: 46.6531
Movie ID: seven+samurai+1954, Score: 46.6071
Movie ID: frozen+planet+2011, Score: 46.5982




# COMPARING SVD AND KNN

In [None]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD, SVDpp, KNNBasic
from surprise.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import accuracy
import pickle

# --- Load the interaction logs and the user / movie metadata -----------------
ratings = pd.read_csv('extracted_ratings.csv')
watches = pd.read_csv('extracted_watches.csv')
movie_details = pd.read_csv('movie_details.csv')
user_details = pd.read_csv('user_details.csv')

# Treat both a rating event and a watch event as the same implicit signal and
# keep one row per (user, movie) pair.
ratings['interaction'] = 1
watches['interaction'] = 1
interactions = pd.concat([
    ratings[['user_id', 'movie_id', 'interaction']],
    watches[['user_id', 'movie_id', 'interaction']]
]).drop_duplicates()
# NOTE(review): every row carries interaction == 1, so the model below is fit
# on a constant target. Error metrics computed on it are likely uninformative;
# consider adding negative samples -- confirm intent.

# --- Encode user demographics -------------------------------------------------
le_gender = LabelEncoder()
le_occupation = LabelEncoder()
user_details['gender_encoded'] = le_gender.fit_transform(user_details['gender'])
user_details['occupation_encoded'] = le_occupation.fit_transform(user_details['occupation'])
scaler = StandardScaler()
user_details[['age_scaled']] = scaler.fit_transform(user_details[['age']])

# --- Vectorise movie genres with TF-IDF (missing genres become empty text) ----
movie_details['genres'] = movie_details['genres'].fillna('')
tfidf = TfidfVectorizer(stop_words='english')
movie_details['genres_vector'] = tfidf.fit_transform(movie_details['genres']).toarray().tolist()

# Attach the side information to each interaction row. Only the three id /
# interaction columns are handed to Surprise below.
interactions = interactions.merge(user_details, on='user_id', how='left')
interactions = interactions.merge(movie_details[['movie_id', 'genres_vector']], on='movie_id', how='left')

# --- Build the Surprise dataset and hold out 20% for evaluation ---------------
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(interactions[['user_id', 'movie_id', 'interaction']], reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# --- Pick and fit the collaborative-filtering model ---------------------------
modelname = 'SVDpp'
if modelname == 'SVD':
    model = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)
elif modelname == 'SVDpp':
    model = SVDpp(n_factors=150, n_epochs=10, lr_all=0.01, reg_all=0.03)
elif modelname == 'KNNBasic':
    # Surprise names the neighbourhood size `k`; an unknown keyword such as
    # `n_neighbors` is silently ignored and the default k=40 would be used.
    model = KNNBasic(k=5)
else:
    # Fail loudly instead of fitting whatever `model` an earlier cell left behind.
    raise ValueError(
        f"Unknown modelname {modelname!r}; expected 'SVD', 'SVDpp' or 'KNNBasic'"
    )
model.fit(trainset)

def get_user_features(age, gender, occupation, le_gender, le_occupation, scaler):
    """Encode one user's demographics as a 3-element numpy array.

    Args:
        age: Raw age, or None when unknown.
        gender: Raw gender label.
        occupation: Raw occupation label.
        le_gender: Fitted encoder exposing ``classes_`` and ``transform``.
        le_occupation: Fitted encoder exposing ``classes_`` and ``transform``.
        scaler: Fitted scaler exposing ``transform``.

    Returns:
        ``np.array([age_scaled, gender_code, occupation_code])``. A missing
        age maps to 0; a label the encoder has not seen maps to -1.
    """
    if age is None:
        scaled_age = 0
    else:
        # The scaler works on 2-D input, so wrap and unwrap the single value.
        scaled_age = scaler.transform([[age]])[0][0]

    if gender in le_gender.classes_:
        gender_code = le_gender.transform([gender])[0]
    else:
        gender_code = -1

    if occupation in le_occupation.classes_:
        occupation_code = le_occupation.transform([occupation])[0]
    else:
        occupation_code = -1

    return np.array([scaled_age, gender_code, occupation_code])

def get_movie_features(movie_id):
    """Return the genre vector stored for ``movie_id`` in ``movie_details``.

    Reads the module-level ``movie_details`` frame and ``tfidf`` vectorizer.
    When the movie is not present, a zero vector with one entry per TF-IDF
    feature is returned instead.
    """
    matches = movie_details.loc[movie_details['movie_id'] == movie_id]
    if not matches.empty:
        # Use the first matching row if the id appears more than once.
        return np.array(matches['genres_vector'].iloc[0])
    vocabulary_size = len(tfidf.get_feature_names_out())
    return np.zeros(vocabulary_size)

def predict_rating(model, user_id, movie_id, user_features, movie_features):
    """Return the model's estimate for (user_id, movie_id), limited to [0, 1].

    ``user_features`` and ``movie_features`` are accepted but not used by the
    current implementation; only the collaborative-filtering estimate counts.
    If ``model.predict`` raises ``ValueError``, the training set's global mean
    is used as the estimate.
    """
    try:
        prediction = model.predict(user_id, movie_id)
        estimate = prediction.est
    except ValueError:
        estimate = model.trainset.global_mean

    # Cap from above first, then from below.
    capped = min(1, estimate)
    return max(0, capped)

def get_top_n_recommendations(model, user_id, age, gender, occupation, movie_details, le_gender, le_occupation, scaler, n=20):
    """Score every movie in ``movie_details`` for a user and return the best ``n``.

    Note: this definition replaces the notebook's earlier function of the same
    name, which is called as ``(model, user_features, movie_to_index, n=...)``.

    Args:
        model: Fitted recommender passed through to ``predict_rating``.
        user_id: Raw user id passed through to ``predict_rating``.
        age, gender, occupation: Raw demographics for ``get_user_features``.
        movie_details: Frame with ``movie_id`` and ``genres_vector`` columns.
        le_gender, le_occupation, scaler: Fitted preprocessing objects.
        n: Number of recommendations to return.

    Returns:
        A list of ``(movie_id, score)`` tuples sorted by descending score,
        where each score is the prediction plus a small random jitter.
    """
    user_features = get_user_features(age, gender, occupation, le_gender, le_occupation, scaler)

    # Pass 1: predict a score for every movie in the catalogue.
    scored = []
    for _, movie_row in movie_details.iterrows():
        candidate_id = movie_row['movie_id']
        genre_vector = np.array(movie_row['genres_vector'])
        estimate = predict_rating(model, user_id, candidate_id, user_features, genre_vector)
        scored.append((candidate_id, estimate))

    # Pass 2: add a little Gaussian noise (std 0.001) so tied scores do not
    # always come out in catalogue order.
    jittered = []
    for candidate_id, estimate in scored:
        jittered.append((candidate_id, estimate + np.random.normal(0, 0.001)))

    ranked = sorted(jittered, key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Persist the fitted model together with the preprocessing objects needed at
# inference time. The model is stored under the key 'svd_model' regardless of
# which algorithm `modelname` selected.
filename = f'recommendation_model_{modelname}.pkl'
artifacts = {
    'svd_model': model,
    'le_gender': le_gender,
    'le_occupation': le_occupation,
    'scaler': scaler,
    'tfidf': tfidf,
}
with open(filename, 'wb') as f:
    pickle.dump(artifacts, f)

def server_predict(user_id, age, gender, occupation, movie_details_path, model_path=None):
    """Load the saved model bundle and return recommendations for one user.

    Args:
        user_id: Raw user id handed to the recommender.
        age, gender, occupation: Raw demographics of the user.
        movie_details_path: CSV with at least ``movie_id`` and ``genres`` columns.
        model_path: Pickle file holding the model bundle. Defaults to the
            module-level ``filename`` the notebook saved the bundle under
            (``recommendation_model_<modelname>.pkl``); previously a
            non-matching hard-coded name was opened.

    Returns:
        The list of ``(movie_id, score)`` tuples produced by
        ``get_top_n_recommendations`` with its default ``n``.

    Raises:
        FileNotFoundError: If ``model_path`` or ``movie_details_path`` is missing.
    """
    if model_path is None:
        model_path = filename

    # SECURITY: pickle.load can execute arbitrary code. Only load bundles that
    # were written by this notebook / a trusted pipeline.
    with open(model_path, 'rb') as f:
        bundle = pickle.load(f)

    model = bundle['svd_model']
    le_gender = bundle['le_gender']
    le_occupation = bundle['le_occupation']
    scaler = bundle['scaler']
    tfidf = bundle['tfidf']

    # Rebuild the genre vectors with the vectorizer that was fitted at training
    # time (transform, not fit_transform) so the feature space matches.
    movie_details = pd.read_csv(movie_details_path)
    movie_details['genres'] = movie_details['genres'].fillna('')
    movie_details['genres_vector'] = tfidf.transform(movie_details['genres']).toarray().tolist()

    recommendations = get_top_n_recommendations(model, user_id, age, gender, occupation, movie_details, le_gender, le_occupation, scaler)
    return recommendations

# Request recommendations for one example profile and list the leading 20.
# NOTE(review): the user id is passed as a string; if `user_id` was parsed as
# an integer column when the training data was read, the recommender will not
# recognise this id -- confirm the dtype.
server_recommendations = server_predict(
    user_id='33039',
    age=30,
    gender='M',
    occupation='homemaker',
    movie_details_path='movie_details.csv',
)
for movie_id, score in server_recommendations[:20]:
    print(f"Movie ID: {movie_id}, Predicted Score: {score:.4f}")

In [None]:
# Same request for a second example profile, for side-by-side comparison.
# NOTE(review): the user id is a string here as well -- confirm it matches the
# dtype of `user_id` in the training data.
server_recommendations = server_predict(
    user_id='33095',
    age=27,
    gender='M',
    occupation='scientist',
    movie_details_path='movie_details.csv',
)
for movie_id, score in server_recommendations[:20]:
    print(f"Movie ID: {movie_id}, Predicted Score: {score:.4f}")

In [None]:
import time

# Number of timed calls to average over.
num_iterations = 20
total_inference_time = 0

# Time repeated end-to-end calls to server_predict. Each call also reloads the
# pickled bundle and re-reads / re-vectorises the movie CSV, so the figure is
# the full request cost, not just the model's scoring time.
for i in range(num_iterations):
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump.
    start_time = time.perf_counter()
    server_recommendations = server_predict('466', 27, 'F', 'K-12 student', 'movie_details.csv')
    end_time = time.perf_counter()

    inference_time = end_time - start_time  # Seconds spent in this call
    total_inference_time += inference_time

# Average inference time over all iterations
average_inference_time = total_inference_time / num_iterations
print(f"\nAverage Inference Time: {average_inference_time:.4f} seconds")

In [None]:
# Score the held-out interactions with the fitted model.
predictions = model.test(testset)
# accuracy.rmse prints the score itself when verbose (its default); silence it
# and print once so the value is not shown twice.
rmse = accuracy.rmse(predictions, verbose=False)
print(f"RMSE: {rmse:.4f}")