In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate, GridSearchCV

In [3]:
ratings = pd.read_csv('data/ratings.csv')
movies = pd.read_csv('data/movies.csv')

In [4]:
data = pd.merge(ratings, movies, on='movieId')

In [7]:
user_movie_matrix = data.pivot_table(index='userId', columns='title', values='rating')

In [8]:
user_movie_matrix.fillna(0, inplace=True)

In [9]:
user_movie_matrix_mean = user_movie_matrix.mean(axis=1)
user_movie_matrix_demeaned = user_movie_matrix.sub(user_movie_matrix_mean, axis=0)

In [10]:
user_movie_matrix_demeaned_np = user_movie_matrix_demeaned.values

In [11]:
user_movie_matrix_sparse = csr_matrix(user_movie_matrix_demeaned_np)

In [12]:
print(type(user_movie_matrix_sparse))
print(user_movie_matrix_sparse.shape)

<class 'scipy.sparse._csr.csr_matrix'>
(610, 9719)


In [13]:
U, sigma, Vt = svds(user_movie_matrix_sparse, k=50)
sigma = np.diag(sigma)

In [14]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_movie_matrix_mean.values.reshape(-1, 1)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns=user_movie_matrix.columns)

In [15]:
def recommend_movies_svd(user_id, num_recommendations=5):
    user_row_number = user_id - 1  # UserId starts from 1
    sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False)
    recommendations = sorted_user_predictions.head(num_recommendations)
    return recommendations

In [16]:
recommendations_svd = recommend_movies_svd(user_id=1)
print("Recommended movies for user 1 using SVD:\n", recommendations_svd)

Recommended movies for user 1 using SVD:
 title
Star Wars: Episode V - The Empire Strikes Back (1980)                             6.430872
Star Wars: Episode IV - A New Hope (1977)                                         6.341897
Star Wars: Episode VI - Return of the Jedi (1983)                                 6.037529
Indiana Jones and the Last Crusade (1989)                                         6.020816
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    5.995290
Name: 0, dtype: float64


In [19]:
tfidf = TfidfVectorizer(stop_words='english')
movies['genres'] = movies['genres'].fillna('')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [24]:
def get_content_recommendations(title, num_recommendations=5):
    idx = movies[movies['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

In [25]:
content_recommendations = get_content_recommendations("Toy Story (1995)")
print("Content-based recommendations for 'Toy Story (1995)':\n", content_recommendations)

Content-based recommendations for 'Toy Story (1995)':
 1706                                       Antz (1998)
2355                                Toy Story 2 (1999)
2809    Adventures of Rocky and Bullwinkle, The (2000)
3000                  Emperor's New Groove, The (2000)
3568                             Monsters, Inc. (2001)
Name: title, dtype: object


In [26]:
def hybrid_recommendations(user_id, title, num_recommendations=5):
    user_recommendations = recommend_movies_svd(user_id, num_recommendations)
    content_recommendations = get_content_recommendations(title, num_recommendations)
    
    combined_recommendations = pd.concat([user_recommendations, content_recommendations]).drop_duplicates().head(num_recommendations)
    return combined_recommendations

In [27]:
hybrid_recommendations = hybrid_recommendations(user_id=1, title="Toy Story (1995)")
print("Hybrid recommendations for user 1 and 'Toy Story (1995)':\n", hybrid_recommendations)

Hybrid recommendations for user 1 and 'Toy Story (1995)':
 Star Wars: Episode V - The Empire Strikes Back (1980)                             6.430872
Star Wars: Episode IV - A New Hope (1977)                                         6.341897
Star Wars: Episode VI - Return of the Jedi (1983)                                 6.037529
Indiana Jones and the Last Crusade (1989)                                         6.020816
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)     5.99529
dtype: object


In [28]:
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [29]:
algo = SVD()

In [30]:
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8750  0.8676  0.8724  0.8749  0.8771  0.8734  0.0033  
MAE (testset)     0.6717  0.6661  0.6733  0.6698  0.6734  0.6709  0.0027  
Fit time          1.13    0.83    0.81    0.82    0.82    0.88    0.13    
Test time         0.24    0.07    0.07    0.13    0.07    0.12    0.07    


{'test_rmse': array([0.87500955, 0.86755813, 0.8723834 , 0.87494594, 0.877126  ]),
 'test_mae': array([0.6717063 , 0.66609581, 0.67333407, 0.66979311, 0.67339015]),
 'fit_time': (1.1312594413757324,
  0.8257057666778564,
  0.8128676414489746,
  0.8152611255645752,
  0.81691575050354),
 'test_time': (0.2408161163330078,
  0.07204890251159668,
  0.07459259033203125,
  0.13174748420715332,
  0.07200193405151367)}

In [31]:
param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30, 40],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.1]
}

In [32]:
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

In [None]:
best_algo = gs.best_estimator['rmse']
print("Best RMSE score:", gs.best_score['rmse'])
print("Best parameters:", gs.best_params['rmse'])

In [None]:
trainset = data.build_full_trainset()
best_algo.fit(trainset)

In [None]:
def surprise_recommend(user_id, movie_id):
    prediction = best_algo.predict(user_id, movie_id)
    return prediction.est

In [None]:
predicted_rating = surprise_recommend(1, 1)
print(f"Predicted rating for user 1 and movie 1: {predicted_rating}")