In [11]:
import django_jupyter
django_jupyter.init()

In [4]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split

# Setup for notebook
import warnings
warnings.filterwarnings('ignore')  # Optional: to keep the notebook tidy
print("Setup complete.")


Setup complete.


In [6]:
import pandas as pd
from django.db.models import F
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from ratings.models import Rating  # Adjust this import based on your actual model location

# Fetch ratings data directly from the database
queryset = Rating.objects.all().values('user', 'movie', 'score')
ratings_data = pd.DataFrame(list(queryset))

# Define the Reader object with the rating scale, adjust the scale as per your actual data
reader = Reader(rating_scale=(0.5, 10))  # Update the scale if your ratings range from 0.5 to 10

# Load the data into a Surprise dataset
data = Dataset.load_from_df(ratings_data[['user', 'movie', 'score']], reader)

# Split the data into training and test sets (e.g., 75% training, 25% testing)
trainset, testset = train_test_split(data, test_size=0.25)


In [7]:
from surprise import SVD
from surprise.model_selection import cross_validate

# Create the SVD algorithm instance
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)

# Predict ratings for the testset
predictions = algo.test(testset)

# Compute and print Root Mean Squared Error and Mean Absolute Error
from surprise import accuracy
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

print("Root Mean Squared Error: ", rmse)
print("Mean Absolute Error: ", mae)


RMSE: 1.7741
MAE:  1.3548
Root Mean Squared Error:  1.774072161599227
Mean Absolute Error:  1.354769353452336


In [8]:
from surprise import SVD
from surprise.model_selection import GridSearchCV

param_grid = {
    'n_epochs': [5, 10, 20], 
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.1]
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print("Best RMSE score attained: ", gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print("Parameters that gave the best RMSE score: ", gs.best_params['rmse'])

# Use the best algorithm as final algorithm
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())


Best RMSE score attained:  1.7590541649503455
Parameters that gave the best RMSE score:  {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.1}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x162bc139d60>

In [9]:
import os
import surprise

# Save the trained algorithm
file_name = os.path.expanduser('~/trained_model.dump')
surprise.dump.dump(file_name, algo=algo)

print("Model saved successfully!")


Model saved successfully!


In [13]:
# Fetch data from the Rating model
ratings_data = Rating.objects.all().values_list('user_id', 'movie_id', 'score')

# Convert to DataFrame
df = pd.DataFrame(list(ratings_data), columns=['user_id', 'movie_id', 'rating'])

# Confirm that data is loaded
print(df.head())

   user_id  movie_id rating
0        4      2114   10.0
1        4      3060   10.0
2        4      3071    8.0
3        4      3169    8.0
4        4      3208    4.0


In [17]:
import os
from surprise import dump

# Load the trained SVD model
file_name = os.path.expanduser('~/trained_model.dump')
_, loaded_algo = dump.load(file_name)

# Now you can use loaded_algo to make predictions
user_id = str(4275)  # Surprise expects raw user ids to be string
movie_id = str(3071)  # Surprise expects raw item ids to be string
actual_rating = 3.2  # Example of an actual rating

# Make prediction
pred = loaded_algo.predict(user_id, movie_id, r_ui=actual_rating, verbose=True)
print(f'Predicted rating: {pred.est}')


user: 4275       item: 3071       r_ui = 3.20   est = 6.86   {'was_impossible': False}
Predicted rating: 6.858483314337212


In [20]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import GridSearchCV
import pandas as pd


# Load the dataset
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df[['user_id', 'movie_id', 'rating']], reader)


# Set the parameter grid for SVD
param_grid = {
    'n_epochs': [5, 10, 20],  # Number of epochs
    'lr_all': [0.002, 0.005, 0.01],  # Learning rate
    'reg_all': [0.02, 0.1, 0.2]  # Regularization term
}


# Setup GridSearchCV
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)


# Perform grid search
gs.fit(data)


# Best RMSE score
print("Best RMSE score attained: ", gs.best_score['rmse'])

# Combination of parameters that gave the best RMSE score
print("Parameters that gave the best RMSE score: ", gs.best_params['rmse'])


Best RMSE score attained:  1.7459636750485765
Parameters that gave the best RMSE score:  {'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.2}


In [21]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse, mae

# Assuming you have a DataFrame 'df' with columns ['user_id', 'movie_id', 'rating']
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df[['user_id', 'movie_id', 'rating']], reader)
trainset = data.build_full_trainset()

# Create the SVD algorithm with optimized parameters
algo = SVD(n_epochs=20, lr_all=0.01, reg_all=0.2)

# Train the model
algo.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x162b5404320>

In [22]:
import os
import surprise

# Save the trained algorithm
file_name = os.path.expanduser('~/trained_model_optimized.dump')
surprise.dump.dump(file_name, algo=algo)

print("Optimized model saved successfully!")


Optimized model saved successfully!


In [23]:
import pandas as pd
from django.db import connection
from surprise import Dataset, Reader, dump
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse, mae

# Define a function to fetch data from the database
def fetch_ratings():
    with connection.cursor() as cursor:
        cursor.execute("SELECT user_id, movie_id, score FROM ratings_rating")
        result = cursor.fetchall()
        return pd.DataFrame(result, columns=['user_id', 'movie_id', 'rating'])

# Fetch the data
ratings_df = fetch_ratings()

# Define the reader with the rating scale
reader = Reader(rating_scale=(1, 10))  # adjust the scale if your ratings differ

# Load the dataset from the DataFrame
data = Dataset.load_from_df(ratings_df, reader)

# Split the dataset into training and test set (use 20% for testing)
trainset, testset = train_test_split(data, test_size=0.2)

# Load the optimized model
file_name = os.path.expanduser('~/trained_model_optimized.dump')
_, loaded_algo = dump.load(file_name)

# Use the algorithm to make predictions on the test set
predictions = loaded_algo.test(testset)

# Calculate and print the accuracy measures
print("RMSE: ", rmse(predictions))
print("MAE: ", mae(predictions))


RMSE: 1.1460
RMSE:  1.1460007892671162
MAE:  0.8934
MAE:  0.8933818852879325


In [24]:
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise import Dataset
from surprise import Reader

# Assume you have a DataFrame 'df' with user ratings
# Ensure df columns are named appropriately for Surprise to process
reader = Reader(rating_scale=(1, 10))  # Adjust rating_scale to match your dataset
data = Dataset.load_from_df(df[['user_id', 'movie_id', 'rating']], reader)

# Define the SVD algorithm with optimized parameters
algo = SVD(n_epochs=20, lr_all=0.01, reg_all=0.2)

# Perform cross-validation
cross_val_results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Print out the average RMSE and MAE across all folds
print(f"Average RMSE: {np.mean(cross_val_results['test_rmse'])}")
print(f"Average MAE: {np.mean(cross_val_results['test_mae'])}")


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6910  1.7430  1.7443  1.7451  1.7350  1.7317  0.0207  
MAE (testset)     1.2989  1.3326  1.3313  1.3317  1.3194  1.3228  0.0129  
Fit time          0.53    0.48    0.61    0.50    0.50    0.52    0.05    
Test time         0.06    0.05    0.55    0.05    0.05    0.15    0.20    
Average RMSE: 1.7316875302698471
Average MAE: 1.3227918379183579


In [25]:
import pandas as pd

# Assuming df is your DataFrame containing user and movie ratings
print("Existing user ID sample:", df['user_id'].sample(1).iloc[0])
print("Existing movie ID sample:", df['movie_id'].sample(1).iloc[0])

# Generate synthetic new user and movie IDs
new_user_id = df['user_id'].max() + 1
new_movie_id = df['movie_id'].max() + 1

print("New user ID for testing:", new_user_id)
print("New movie ID for testing:", new_movie_id)


Existing user ID sample: 4387
Existing movie ID sample: 3475
New user ID for testing: 5630
New movie ID for testing: 36084


In [26]:
# Assuming 'algo' is your trained model loaded and ready for predictions
# Test with edge cases:
test_cases = [
    (5630, 36084, 10.0),  # High rating
    (5630, 36084, 1.0),   # Low rating
    (4387, 36084, 5.0),  # New movie
    (5630, 3475, 5.0)   # New user
]

for user, item, actual_rating in test_cases:
    # We use 'predict' because there might not be actual ratings for these combinations
    prediction = algo.predict(user, item, r_ui=actual_rating, verbose=True)
    print(f"Predicted rating for user {user}, item {item}: {prediction.est}")


user: 5630       item: 36084      r_ui = 10.00   est = 6.86   {'was_impossible': False}
Predicted rating for user 5630, item 36084: 6.860148432840766
user: 5630       item: 36084      r_ui = 1.00   est = 6.86   {'was_impossible': False}
Predicted rating for user 5630, item 36084: 6.860148432840766
user: 4387       item: 36084      r_ui = 5.00   est = 6.96   {'was_impossible': False}
Predicted rating for user 4387, item 36084: 6.95976035583924
user: 5630       item: 3475       r_ui = 5.00   est = 7.34   {'was_impossible': False}
Predicted rating for user 5630, item 3475: 7.341403597402606
