In [1]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

**Cargamos los datos: una lista de las películas y la matriz de ratings**

In [2]:
# Each line of the .dat files is one record whose fields are separated by "::".
# The context managers close the file handles as soon as parsing is done, and
# blank lines are skipped so they cannot turn into malformed one-field rows.
with open('ratings.dat', 'r') as ratings_file:
    ratings_list = [line.strip().split("::") for line in ratings_file if line.strip()]
with open('movies.dat', 'r', encoding='UTF-8', errors='ignore') as movies_file:
    movies_list = [line.strip().split("::") for line in movies_file if line.strip()]

# Every parsed field is a string. The ratings columns are cast explicitly
# because passing dtype=int to the DataFrame constructor did not reliably
# produce integer columns for string input (the notebook re-casts them later).
ratings_df = pd.DataFrame(ratings_list, columns=['UserID', 'MovieID', 'Rating', 'Timestamp']).astype('int64')
movies_df = pd.DataFrame(movies_list, columns=['MovieID', 'Title', 'Genres'])

In [3]:
# Preview the first rows of the movies table (MovieID, Title, Genres).
movies_df.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first rows of the ratings table (UserID, MovieID, Rating, Timestamp).
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


**Convertimos MovieID (en ambos dataframes) y UserID (en el de ratings) de string a numérico, pues más adelante lo vamos a necesitar para poder ejecutar un .merge() de los dos datasets**

In [5]:
# Make sure MovieID is numeric in the movies table; it is the merge key used later.
movies_df['MovieID'] = pd.to_numeric(movies_df['MovieID'])

In [6]:
# Make sure MovieID is numeric in the ratings table so it matches movies_df['MovieID'].
ratings_df['MovieID'] = pd.to_numeric(ratings_df['MovieID'])

In [7]:
# Make sure UserID is numeric; recommend_movies compares it against an integer userID.
ratings_df['UserID'] = pd.to_numeric(ratings_df['UserID'])

**Contamos con 6040 usuarios únicos**

In [8]:
# Number of distinct users that appear in the ratings table.
ratings_df['UserID'].nunique()

6040

**En total, han calificado 3706 películas únicas**

In [9]:
# Number of distinct movies that received at least one rating.
ratings_df['MovieID'].nunique()

3706

**Convertimos la matriz de ratings en una que tenga los usuarios en filas y las películas en columnas**

In [10]:
# Build the user x movie matrix: one row per UserID, one column per MovieID.
# Rating is cast to numeric before pivoting so the matrix holds numbers only,
# instead of mixing string ratings with the numeric 0 used as fill value.
# Movies a user has not rated become 0.
R_df = (
    ratings_df
    .assign(Rating=pd.to_numeric(ratings_df['Rating']))
    .pivot(index='UserID', columns='MovieID', values='Rating')
    .fillna(0)
)
R_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**En total, tenemos 6040 filas y 3706 columnas**

In [11]:
# (number of users, number of movies) in the pivoted ratings matrix.
R_df.shape

(6040, 3706)

**Para poder utilizar esta matriz en el recomendador necesitamos 1. convertir el data frame en un numpy array y 2. normalizar restando la media de cada usuario**

In [12]:
# Dense float matrix of ratings (rows: users, columns: movies).
# The builtin float is used because the np.float alias was removed in
# NumPy 1.24 and raises AttributeError there.
R = R_df.to_numpy().astype(float)

# Mean of each user's row, taken over every column of the matrix
# (entries filled with 0 for unrated movies are part of that mean).
user_ratings_mean = np.mean(R, axis=1)

# Subtract each user's mean from that user's row; reshape(-1, 1) turns the
# 1-D vector of means into a column so it broadcasts across the movies.
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

**A continuación, descomponemos la matriz de ratings**

In [13]:
# Truncated SVD of the de-meaned ratings matrix, keeping k = 50 singular
# values and their vectors (svds is imported in the notebook's first cell).
U, sigma, Vt = svds(R_demeaned, k=50)

# svds returns the singular values as a 1-D array; expand them into a
# diagonal matrix so they can take part in the matrix products that follow.
sigma = np.diag(sigma)

**Usaremos su producto matricial para predecir la matriz de ratings completa con la que, finalmente, haremos las recomendaciones**

In [14]:
# Multiply the three factors back together to get the low-rank approximation,
# then add each user's mean (as a column vector) to undo the earlier centering.
all_user_predicted_ratings = (U @ sigma @ Vt) + user_ratings_mean[:, np.newaxis]

# Wrap the predictions in a DataFrame that reuses the MovieID column labels of R_df.
preds_df = pd.DataFrame(data=all_user_predicted_ratings, columns=R_df.columns)

In [15]:
# Preview the predicted ratings; rows are positional (0-based), columns are MovieIDs.
preds_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.288861,0.143055,-0.19508,-0.018843,0.012232,-0.176604,-0.07412,0.141358,-0.059553,-0.19595,...,0.027807,0.00164,0.026395,-0.022024,-0.085415,0.403529,0.105579,0.031912,0.05045,0.08891
1,0.744716,0.169659,0.335418,0.000758,0.022475,1.35305,0.051426,0.071258,0.161601,1.567246,...,-0.056502,-0.013733,-0.01058,0.062576,-0.016248,0.15579,-0.418737,-0.101102,-0.054098,-0.140188
2,1.818824,0.456136,0.090978,-0.043037,-0.025694,-0.158617,-0.131778,0.098977,0.030551,0.73547,...,0.040481,-0.005301,0.012832,0.029349,0.020866,0.121532,0.076205,0.012345,0.015148,-0.109956
3,0.408057,-0.07296,0.039642,0.089363,0.04195,0.237753,-0.049426,0.009467,0.045469,-0.11137,...,0.008571,-0.005425,-0.0085,-0.003417,-0.083982,0.094512,0.057557,-0.02605,0.014841,-0.034224
4,1.574272,0.021239,-0.0513,0.246884,-0.032406,1.552281,-0.19963,-0.01492,-0.060498,0.450512,...,0.110151,0.04601,0.006934,-0.01594,-0.05008,-0.052539,0.507189,0.03383,0.125706,0.199244


**Creamos la función que, consumiendo la matriz de ratings predichos, recomienda a un usuario las películas que más podrían gustarle y aún no vio**

In [16]:
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    """Recommend the unseen movies with the highest predicted rating for a user.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Predicted ratings with one row per user and one column per MovieID.
        Rows are looked up by position: the row for ``userID`` is expected at
        position ``userID - 1``.
    userID : int
        1-based identifier of the user to recommend for.
    movies_df : pandas.DataFrame
        Movie catalogue; must contain a ``MovieID`` column.
    original_ratings_df : pandas.DataFrame
        Observed ratings; must contain ``UserID``, ``MovieID`` and ``Rating``.
    num_recommendations : int, default 5
        Maximum number of movies to recommend.

    Returns
    -------
    user_full : pandas.DataFrame
        The user's observed ratings joined with ``movies_df``, sorted by
        ``Rating`` in descending order.
    recommendations : pandas.DataFrame
        Rows of ``movies_df`` the user has not rated, ordered by predicted
        rating (highest first) and limited to ``num_recommendations`` rows.

    Raises
    ------
    IndexError
        If ``userID`` does not map to a row of ``predictions_df``. Without this
        check a ``userID`` of 0 or below would be treated as a negative
        position and silently return another user's predictions.
    """
    # UserID starts at 1 while row positions start at 0.
    user_row_number = userID - 1
    if not 0 <= user_row_number < len(predictions_df):
        raise IndexError(
            f"userID {userID} is out of range for predictions with {len(predictions_df)} rows"
        )

    # Predicted ratings for this user as a two-column frame (MovieID, Predictions).
    # The names are set explicitly so the result does not depend on how the
    # index and columns of predictions_df happen to be labelled.
    user_predictions = (
        predictions_df.iloc[user_row_number]
        .rename('Predictions')
        .rename_axis('MovieID')
        .reset_index()
    )
    user_predictions['MovieID'] = pd.to_numeric(user_predictions['MovieID'])

    # Movies the user has already rated, enriched with the catalogue columns.
    user_data = original_ratings_df[original_ratings_df['UserID'] == userID]
    user_full = (
        user_data
        .merge(movies_df, how='left', on='MovieID')
        .sort_values(['Rating'], ascending=False)
    )

    # Keep only the movies without a rating from this user, attach the
    # prediction, rank by it and drop the helper column before returning.
    unseen_movies = movies_df[~movies_df['MovieID'].isin(user_full['MovieID'])]
    recommendations = (
        unseen_movies
        .merge(user_predictions, how='left', on='MovieID')
        .sort_values('Predictions', ascending=False)
        .head(num_recommendations)
        .drop(columns='Predictions')
    )

    return user_full, recommendations

**Corremos la función de recomendación para algún UserID**

In [17]:
# Ask for the ten best unseen movies for user 2; the first result holds the
# movies that user already rated, the second the recommendations.
already_rated, predictions = recommend_movies(
    predictions_df=preds_df,
    userID=2,
    movies_df=movies_df,
    original_ratings_df=ratings_df,
    num_recommendations=10,
)

In [18]:
# Highest-rated movies this user has already seen (sorted by Rating inside recommend_movies).
already_rated.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres
0,2,1357,5,978298709,Shine (1996),Drama|Romance
28,2,2236,5,978299220,Simon Birch (1998),Drama
33,2,3147,5,978298652,"Green Mile, The (1999)",Drama|Thriller
35,2,1293,5,978298261,Gandhi (1982),Drama
40,2,110,5,978298625,Braveheart (1995),Action|Drama|War


In [19]:
# The recommended movies for this user, best predicted rating first.
predictions.head(10)

Unnamed: 0,MovieID,Title,Genres
1477,1580,Men in Black (1997),Action|Adventure|Comedy|Sci-Fi
699,733,"Rock, The (1996)",Action|Adventure|Thriller
503,527,Schindler's List (1993),Drama|War
1815,1961,Rain Man (1988),Drama
1590,1704,Good Will Hunting (1997),Drama
361,377,Speed (1994),Action|Romance|Thriller
251,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
1048,1097,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
1503,1608,Air Force One (1997),Action|Thriller
453,474,In the Line of Fire (1993),Action|Thriller


**Data set utilizado:** MovieLens 1M Dataset (disponible en https://grouplens.org/datasets/movielens/)

**Fuente original de este código:** https://beckernick.github.io/matrix-factorization-recommender/