# Collaborative Filtering и Matrix Factorization

План семинара:
1. Теоретическое введение в Collaborative Filtering и Matrix Factorization
2. Построение рекомендательной модели на основе похожих фильмов
3. Построение рекомендательной модели на основе SVD

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, csr_matrix, csc_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

%matplotlib inline

## Подготовка данных

In [2]:
df = pd.read_csv("ml-20m/ratings.csv")

In [3]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [4]:
df.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [5]:
df.groupby("userId").size().min()

20

In [6]:
print("Unique user count", len(df["userId"].unique()))
print("Unique movie count", len(df["movieId"].unique()))

Unique user count 138493
Unique movie count 26744


In [7]:
index_to_item_id_mapping = [x for x in df["movieId"].sort_values().unique()]
item_id_to_index_mapping = {}
for index, value in enumerate(index_to_item_id_mapping, start=1):
    item_id_to_index_mapping[int(value)] = index

assert len(index_to_item_id_mapping) == 26744

In [8]:
df = pd.DataFrame.from_records(
    {"userId": row.userId, "movieId": item_id_to_index_mapping[int(row.movieId)], "rating": row.rating}
    for row in df.itertuples())

Загрузим дополнительную информацию о фильмах

In [9]:
!head ml-20m/movies.csv -n 5

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance


In [10]:
df_movies = pd.read_csv("ml-20m/movies.csv")

In [11]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
!head ml-20m/links.csv -n 5

movieId,imdbId,tmdbId
1,0114709,862
2,0113497,8844
3,0113228,15602
4,0114885,31357


In [13]:
df_links = pd.read_csv("ml-20m/links.csv")
df_movies_joined = pd.merge(df_movies, df_links, on="movieId")

In [14]:
df_movies_joined.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [15]:
assert df_movies_joined.shape[0] == df_movies.shape[0]

## Поиск похожих фильмов

In [16]:
from scipy.sparse import issparse

def compute_similarity_matrix(m):
    sim = m.T.dot(m)
    
    # Это должно уместиться в памяти :)
    if issparse(sim):
        sim = sim.todense()
    
    sim = sim + 1e-9
    norms = np.array(np.sqrt(np.diagonal(sim)))
    return sim / norms / norms.T

In [17]:
M = csc_matrix(
    (df["rating"], (df["userId"] - 1, df["movieId"] - 1)))

In [18]:
cs = cosine_similarity(M.T)

In [19]:
def find_top_similar_movies(movie_id, similarity_matrix, k=10):
    indices = np.asarray(np.argsort(similarity_matrix[movie_id, :])).reshape(-1)[-k:-1]
    transformed_indices = [index_to_item_id_mapping[z] for z in indices]
    return df_movies_joined[df_movies_joined["movieId"].isin(transformed_indices)]

In [20]:
import requests
from IPython.display import Image, display

# Инструкция как получить свой токен: https://developers.themoviedb.org/3/getting-started/introduction
API_KEY = "5b6cf33b91eb09338d2d7816329f52fc"
TMDB_URL = "https://api.themoviedb.org/3/movie/{}/images?api_key={}"

cfg = requests.get("http://api.themoviedb.org/3/configuration",
                   params={"api_key": API_KEY}, headers={"Content-Type": "application/json"})
cfg.raise_for_status()
cfg_json = cfg.json()

base_url = cfg_json["images"]["base_url"] + "w185"

def get_top_similar_movies_posters(similar_movies_df):
    posters = []

    for movie in similar_movies_df.itertuples():
        url = TMDB_URL.format(int(movie.tmdbId), API_KEY)
        rsp = requests.get(url)
        rsp.raise_for_status()
        info = rsp.json()

        poster = info["posters"][0]["file_path"]
        posters.append(Image(url=base_url + poster))
    
    return posters

In [21]:
similar_movies = find_top_similar_movies(0, cs)  # Movies similar to Toy Story

In [22]:
similar_movies

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,76759,11.0
352,356,Forrest Gump (1994),Comedy|Drama|Romance|War,109830,13.0
476,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,107290,329.0
582,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,103639,812.0
640,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,117060,954.0
767,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller,116629,602.0
1184,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,86190,1892.0
1242,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi,88763,105.0
3027,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,120363,863.0


In [23]:
display(*get_top_similar_movies_posters(similar_movies))

Получим латентные вектора через SVD разложение

In [24]:
from scipy.sparse.linalg import svds

U, S, Vt = svds(M, k=30)
cs_svd = cosine_similarity(Vt.T)

In [25]:
similar_movies_svd = find_top_similar_movies(0, cs_svd)  # Toy story

In [26]:
similar_movies_svd

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
105,107,Muppet Treasure Island (1996),Adventure|Children|Comedy|Musical,117110,10874.0
603,609,Homeward Bound II: Lost in San Francisco (1996),Adventure|Children,116552,25059.0
624,631,All Dogs Go to Heaven 2 (1996),Adventure|Animation|Children|Fantasy|Musical|R...,115509,19042.0
653,661,James and the Giant Peach (1996),Adventure|Animation|Children|Fantasy|Musical,116683,10539.0
699,711,Flipper (1996),Adventure|Children,116322,36355.0
813,828,"Adventures of Pinocchio, The (1996)",Adventure|Children,115472,18975.0
1336,1367,101 Dalmatians (1996),Adventure|Children|Comedy,115433,11674.0
2270,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,120623,9487.0
3027,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,120363,863.0


In [27]:
display(*get_top_similar_movies_posters(similar_movies_svd))

In [28]:
similar_movies_svd_sm = find_top_similar_movies(item_id_to_index_mapping[5349] - 1, cs_svd)  # Spider-Man

In [29]:
similar_movies_svd_sm

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
3702,3793,X-Men (2000),Action|Adventure|Sci-Fi,120903,36657.0
4800,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy,241527,671.0
4867,4963,Ocean's Eleven (2001),Crime|Thriller,240772,161.0
5281,5378,Star Wars: Episode II - Attack of the Clones (...,Action|Adventure|Sci-Fi|IMAX,121765,1894.0
5348,5445,Minority Report (2002),Action|Crime|Mystery|Sci-Fi|Thriller,181689,180.0
6234,6333,X2: X-Men United (2003),Action|Adventure|Sci-Fi|Thriller,290334,36658.0
6260,6365,"Matrix Reloaded, The (2003)",Action|Adventure|Sci-Fi|Thriller|IMAX,234215,604.0
6429,6539,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy,325980,22.0
7953,8636,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX,316654,558.0


In [30]:
display(*get_top_similar_movies_posters(similar_movies_svd_sm))

## Предсказание рейтингов с помощью Collaborative Filtering

In [31]:
class ErrorAggregator(object):
    def __init__(self):
        self.sum_ = 0.0
        self.count = 0
        
    def add_value(self, x, count):
        self.sum_ += x
        self.count += count
        
    def get_rmse(self):
        return np.sqrt(self.sum_ / (self.count - 1))

In [33]:
import time

df["random"] = np.random.random(size=(df.shape[0], 1))
idx = df.sort_values(by="random") \
    .reset_index() \
    .groupby("userId") \
    .head(n=7)["index"]

mask = df.index.isin(idx)
train_df = df[~mask]
test_df = df[mask]
    
train_M = csr_matrix(
    (train_df["rating"], (train_df["userId"] - 1, train_df["movieId"] - 1)), shape=(138493, 26744))

train_M_mean = train_M.mean()

U, S, Vt = svds(train_M, k=30)

svd_error = ErrorAggregator()
mean_error = ErrorAggregator()
times = []

for user_id, group in test_df.groupby("userId"):
    start_time = time.time()
    
    uvec = U[user_id - 1, :] * S
    
    movie_ids = group["movieId"] - 1
    ratings = group["rating"]
    
    diff = (uvec.dot(Vt[:, movie_ids]) - ratings.values) ** 2
    svd_error.add_value(np.sum(diff), len(diff))
    
    mean_diff = (ratings.values - train_M_mean) ** 2
    mean_error.add_value(np.sum(mean_diff), len(mean_diff))
    
    times.append(time.time() - start_time)

print("RMSE:", svd_error.get_rmse())
print("Random RMSE:", mean_error.get_rmse())
print("Average evaluation time:", np.mean(times))

RMSE: 2.99555769755
Random RMSE: 3.7621751181
Average evaluation time: 0.000321781755016


Топовый бенчмарк RMSE: ~0.85

In [34]:
len(df[df["userId"] == 10])

38

In [35]:
tenth_user_movies = pd.merge(df[df["userId"] == 10], df_movies, on="movieId")

In [36]:
t = (U[9, :] * S).dot(Vt)

In [37]:
recommendations = np.argsort(t).tolist()

In [38]:
tenth_user_movies

Unnamed: 0,movieId,rating,userId,random,title,genres
0,1,4.0,10,0.826535,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,11,4.0,10,0.999618,"American President, The (1995)",Comedy|Drama|Romance
2,25,4.0,10,0.938792,Leaving Las Vegas (1995),Drama|Romance
3,258,4.0,10,0.083354,"Kid in King Arthur's Court, A (1995)",Adventure|Children|Comedy|Fantasy|Romance
4,353,3.0,10,0.311346,"Crow, The (1994)",Action|Crime|Fantasy|Thriller
5,524,5.0,10,0.134673,Rudy (1993),Drama
6,844,5.0,10,0.399124,"Story of Xinghua, The (Xinghua san yue tian) (...",Drama
7,896,4.0,10,0.330293,Wild Reeds (Les roseaux sauvages) (1994),Drama
8,953,4.0,10,0.314624,It's a Wonderful Life (1946),Drama|Fantasy|Romance
9,1073,3.0,10,0.207523,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical


In [40]:
films = []

for r in recommendations[-100:]:
    if r + 1 not in tenth_user_movies["movieId"].values:  # Не рекомендуем то, что пользователь уже смотрел
        films.append(r)

In [41]:
filtered_movies = df_movies_joined[df_movies_joined["movieId"].isin([index_to_item_id_mapping[x] for x in films])]

In [42]:
display(*get_top_similar_movies_posters(filtered_movies))

## Библиотеки для матричной факторизации

a. http://libfm.org/

Эффективная реализация модели факторизационных машин. Алгоритмы SGD, ALS.

b. GraphLab Collaborative Filtering Toolkit (https://turi.com/products/create/docs/graphlab.toolkits.recommender.html)

Умеет SVD (ALS, SGD), SVD++, Weighted-ALS, non-negative matrix factorization.

c. Vowpal Wabbit (https://github.com/JohnLangford/vowpal_wabbit/wiki/Matrix-factorization-example)

Можно использовать разложение матрички оттуда.

## Заключение

1. Построена рекомендательная модель для каждого фильма с помощью косинусной близости
2. Построена рекомендательная система на основе SVD
3. Познакомились с библиотеками для CF & MF