In [1]:
import os

# Directory holding the "ml-latest-small" CSV files read by this notebook
# (movies.csv, ratings.csv, tags.csv). Set the MOVIELENS_DATA_DIR environment
# variable to point at your own copy; otherwise the original local path is used.
# os.path.join(..., '') guarantees a trailing separator, because later cells
# build file names by plain concatenation (PATH_TO_DATA + 'movies.csv').
PATH_TO_DATA = os.path.join(
    os.environ.get('MOVIELENS_DATA_DIR', '/Users/dgizdatullin/Downloads/ml-latest-small/'),
    '',
)

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [15]:
np.random.seed(42)

In [16]:
import tensorflow as tf

Подготовим словарь с названиями фильмов, чтобы быстро находить название по movieId

In [4]:
import csv

# Build a {movieId: title} lookup from movies.csv.
# csv.reader replaces str.split(','): in CSV a field containing a comma is
# quoted, so a naive split would cut such titles short and keep the quote.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default encoding.
dict_of_titles = dict()
with open(PATH_TO_DATA + 'movies.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    for fields in reader:
        if not fields:
            # csv.reader yields an empty list for a blank line
            continue
        movieId = int(fields[0])
        title = fields[1]
        dict_of_titles[movieId] = title

# Подготовка данных.

### 1. Создадим датафрейм с нашими рейтингами.

In [5]:
ratings = pd.read_csv(PATH_TO_DATA + 'ratings.csv')

In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [17]:
ratings['userId'].max()

610

### 2. Разделим нашу выборку на train (80%) и test (20%)

In [8]:
# Shuffle all rows before the train/test split. A fixed random_state makes the
# shuffle (and therefore the split) reproducible regardless of the order in
# which the seeding cell and this cell are executed.
ratings = ratings.sample(frac=1, random_state=42)

In [9]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
98583,607,2028,5.0,963078901
95972,602,21,4.0,840875720
11707,70,3147,2.5,1355198512
2539,19,1457,3.0,965706728
77943,483,109846,3.5,1415575011


In [10]:
train_size = int(ratings.shape[0] * 0.8)

In [11]:
# The first `train_size` rows of `ratings` form the training set and the
# remaining rows the test set; both are independent (deep) copies.
ratings_train, ratings_test = (
    ratings.iloc[:train_size].copy(),
    ratings.iloc[train_size:].copy(),
)

In [12]:
ratings_train.shape

(80668, 4)

In [13]:
ratings_test.shape

(20168, 4)

### 3. Подготовим метрику для оценки качества наших рекомендаций

In [18]:
# store the test ratings as a dict {userId: set(movieId)}
ratings_test_dict = ratings_test.groupby('userId')['movieId'].agg(set).to_dict()

In [19]:
def avg_precision_topK(res, pred, k=5):
    """Mean precision@k over the users present in both `pred` and `res`.

    res  -- mapping uid -> set of relevant movie ids
    pred -- mapping uid -> sequence of (movie, score) pairs
    k    -- number of highest-scored predictions considered per user

    For each user the k best-scored predictions are kept and the share of
    them found in res[uid] (hits / k) is that user's precision. Users that
    are missing from `res` are ignored. Returns np.mean of the per-user
    precisions.
    """
    per_user = []
    for uid in pred:
        if uid not in res:
            continue
        # highest score first; ties keep their original relative order
        ranked = sorted(pred[uid], key=lambda pair: -pair[1])
        top_movies = {movie for movie, _ in ranked[:k]}
        hits = len(top_movies & res[uid])
        per_user.append(hits / k)
    return np.mean(per_user)

In [20]:
pred = {15: [(47, 0.8), (260, 0.5), (11, 0.3)]}

In [21]:
avg_precision_topK(ratings_test_dict, pred, 3)

0.0

# Холодный старт.

### 1. Подготовим данные для построения векторов тэгов.

In [22]:
tags_df = pd.read_csv(PATH_TO_DATA + 'tags.csv')
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [23]:
tags_vectors = pd.get_dummies(tags_df['tag'])

In [24]:
tags_vectors.shape

(3683, 1589)

In [25]:
tags_vectors.head()

Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# One 1-D vector per row of tags_vectors, in row order, so the vectors can be
# stored as a single object column on tags_df.
tag_matrix = tags_vectors.values
list_of_tags = [tag_matrix[row_idx, :] for row_idx in range(tag_matrix.shape[0])]

In [27]:
tags_df['tag_vectors'] = list_of_tags

In [28]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp,tag_vectors
0,2,60756,funny,1445714994,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,60756,Highly quotable,1445714996,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,60756,will ferrell,1445714992,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,2,89774,Boxing story,1445715207,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,2,89774,MMA,1445715200,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [29]:
# Group the tag rows by movie and add up each movie's per-row tag vectors,
# producing a list of (movieId, summed_vector) pairs.
grouped = tags_df.groupby('movieId')
aggregate = [
    (movie_id, movie_rows['tag_vectors'].sum())
    for movie_id, movie_rows in grouped
]

In [30]:
aggregate[0: 5]

[(1, array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)),
 (2, array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)),
 (3, array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)),
 (5, array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)),
 (7, array([0, 0, 0, ..., 0, 0, 0], dtype=uint8))]

In [31]:
max_movieId = ratings['movieId'].max()

In [32]:
aggregate[0][1].shape

(1589,)

In [33]:
# Dense (max_movieId + 1) x n_tags matrix, so a row can be addressed directly
# by movieId. The width is taken from tags_vectors rather than the hard-coded
# 1589, so it stays correct if the tag vocabulary changes.
item_tags_vectors = np.zeros((max_movieId + 1, tags_vectors.shape[1]))

In [34]:
# Copy each movie's summed tag vector into the row indexed by its movieId.
for movie_id, tag_counts in aggregate:
    item_tags_vectors[movie_id] = tag_counts

In [35]:
del tags_df, grouped, aggregate 

### 2. Написать функцию для поиска похожего по тэгам контента.

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
def find_topK_similar_items(item_id, k):
    ???

In [37]:
dict_of_titles[89745]

In [38]:
for item_id, score in find_topK_similar_items(89745, 10):
    print(dict_of_titles[item_id], score)

In [36]:
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender

In [37]:
model_cosine = CosineRecommender(K=40)

In [76]:
from scipy.sparse import csr_matrix, vstack

In [39]:
model_cosine.fit(csr_matrix(item_tags_vectors))

100%|██████████| 193610/193610 [00:01<00:00, 184473.42it/s]


In [39]:
for item_id, score in model_cosine.similar_items(89745):
    print(dict_of_titles[item_id], score)

In [41]:
model_tfidf = TFIDFRecommender(K=40)

In [42]:
model_tfidf.fit(csr_matrix(item_tags_vectors))

100%|██████████| 193610/193610 [00:00<00:00, 468551.20it/s]


In [40]:
for item_id, score in model_tfidf.similar_items(89745):
    print(dict_of_titles[item_id], score)

### 3. Написать рекомендатель на основе content-based подхода

In [44]:
def return_user_recommendations(uid, k):
    """Content-based recommendations for one user.

    Every movie the user rated in ratings_train passes its mean-centred
    rating on to the 10 items returned by find_topK_similar_items, weighted
    by the similarity score. Each candidate's accumulated total is then
    divided by the sum of the similarity scores it received.

    Parameters
    ----------
    uid : userId to look up in ratings_train.
    k : number of recommendations to return.

    Returns
    -------
    list of (movieId, score) pairs ordered from highest to lowest score,
    containing k entries.
    """
    user_ratings = ratings_train.loc[ratings_train['userId'] == uid, ['movieId', 'rating']]
    # Centre the ratings on the user's mean. The 0.001 offset is kept from the
    # original (presumably so a rating equal to the mean still has a non-zero
    # weight — confirm).
    centred_ratings = user_ratings['rating'] - user_ratings['rating'].mean() + 0.001
    user_recommendations = np.zeros(max_movieId + 1)
    # Start the divisor at 0.001 so movies that received no score do not
    # trigger a division by zero below.
    user_recommendations_divisor = np.full(max_movieId + 1, 0.001)
    # Columns are read by name: positional access (row[1][1], row[1][2]) picks
    # the wrong columns once the frame's layout changes, e.g. after
    # ratings_train.reset_index() inserts an extra leading column.
    for movieId, rating in zip(user_ratings['movieId'], centred_ratings):
        for iid, score in find_topK_similar_items(int(movieId), 10):
            user_recommendations[iid] += score * rating
            user_recommendations_divisor[iid] += score
    user_recommendations = user_recommendations / user_recommendations_divisor
    # [::-1][:k] yields k indices; the previous [-1:-k:-1] slice stopped one
    # short and returned only k - 1 recommendations.
    most_similar_indices = np.argsort(user_recommendations)[::-1][:k]
    ans = list(zip(most_similar_indices, user_recommendations[most_similar_indices]))
    return ans

In [45]:
%%time
rec = return_user_recommendations(15, 20)

CPU times: user 5min 33s, sys: 3min 38s, total: 9min 11s
Wall time: 9min 57s


### 4. Переписать рекомендатель на основе content-based через матричные вычисления.

In [46]:
from sklearn.preprocessing import normalize

In [47]:
item_tags_vectors = csr_matrix(item_tags_vectors)

In [48]:
normalized_matrix = ???

In [49]:
cosine_sim_matrix = ???

In [41]:
cosine_sim_matrix.shape

In [43]:
# Print titles and similarity values for the largest entries in row 89745 of
# cosine_sim_matrix, largest first. Note that the slice [-1: -10: -1] selects
# 9 indices (positions -1 through -9), not 10.
for item_id in np.argsort(cosine_sim_matrix[89745].toarray())[0][-1: -10: -1]:
    print(dict_of_titles[item_id], cosine_sim_matrix[89745, item_id])

In [97]:
# keep only the top m (= 30) values in each row
m = 30
rows = []
for row_id in range(cosine_sim_matrix.shape[0]):
    row = cosine_sim_matrix[row_id]
    # rows with at most m stored entries are kept unchanged
    if row.nnz > m:
        # LIL copy of the row; it becomes "the row minus its top-m entries"
        work_row = row.tolil()
        # zero the m largest stored values in the copy
        # NOTE(review): this pairs the columns from row.nonzero() with argsort
        # positions of row.data — assumes row.data holds no explicit zeros and
        # is in the same order as nonzero(); confirm
        work_row[0, row.nonzero()[1][np.argsort(row.data)[-m:]]] = 0
        # subtracting the copy cancels everything except the m entries zeroed above
        row = row - work_row.tocsr()
    rows.append(row)
# stack the per-row results back into one sparse matrix
topk_matrix = vstack(rows)
# drop entries that are stored explicitly as zero
topk_matrix.eliminate_zeros()

In [98]:
topk_matrix

<193610x193610 sparse matrix of type '<class 'numpy.float64'>'
	with 17724 stored elements in Compressed Sparse Row format>

In [99]:
ratings_train.head()

Unnamed: 0,userId,movieId,rating,timestamp
641,6,201,3.0,845556290
8611,58,590,3.0,847718380
45382,298,104283,3.5,1447560399
36732,249,6503,3.0,1353801377
24478,169,4193,4.0,1059427762


In [108]:
def return_user_recommendations(uid, k):
    ???

In [109]:
%%time
rec = return_user_recommendations(15, 20)

CPU times: user 12.3 ms, sys: 1.81 ms, total: 14.1 ms
Wall time: 12.5 ms


In [44]:
# Build recommendations for every user that has test ratings and score them
# with precision@20. Users whose recommendation call fails are skipped
# (best-effort), but only ordinary exceptions are caught, so a keyboard /
# kernel interrupt still stops the loop. The skipped ids are kept in
# skipped_uids for inspection instead of vanishing silently.
res = dict()
skipped_uids = []
for uid in tqdm(list(ratings_test_dict.keys())):
    try:
        res[uid] = return_user_recommendations(uid, 20)
    except Exception:
        skipped_uids.append(uid)
avg_precision_topK(ratings_test_dict, res, 20)

# Factorization Machines

### 1. Подготовить данные для модели FM.

In [53]:
from tffm import TFFMRegressor, TFFMClassifier

In [54]:
max_movieId

193609

In [55]:
# добавим колонку с векторами тэгов для фильмов
???

In [45]:
ratings_train.head()

In [57]:
ratings_train['movieId'].max()

193609

In [58]:
user_dummies = ???
movie_dummies = ???

In [47]:
user_dummies.shape

In [48]:
movie_dummies.shape

In [61]:
tags_dummies = ???

In [63]:
train_for_FM = ???

In [64]:
from scipy.sparse import csr_matrix

In [65]:
train_for_FM = csr_matrix(train_for_FM)

### 2. Обучить модель FM

In [66]:
# Configure the tffm regressor used as the factorization-machine model:
# order 2, rank 10, Adam with learning rate 0.001, 50 epochs in batches of
# 1024, sparse input.
# NOTE(review): tf.train.AdamOptimizer is the TensorFlow 1.x spelling —
# confirm the TensorFlow version this notebook targets.
model_FM = TFFMRegressor(
    order=2, 
    rank=10, 
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001), 
    n_epochs=50, 
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='sparse'
)

In [67]:
y_train = ratings_train['rating'].values

In [68]:
# Fit on the sparse train_for_FM matrix. The target is the rating linearly
# rescaled as rating * 20 - 10.
# NOTE(review): predictions will be on this rescaled scale — confirm that
# downstream code inverts the transform where original rating units are needed.
model_FM.fit(train_for_FM, y_train*20-10, show_progress=True)



100%|██████████| 50/50 [00:07<00:00,  6.48epoch/s]


### 3. С помощью модели сделать предсказание для одного пользователя

In [69]:
len(ratings_train[ratings_train['userId'] == 1].head(1).index)

1

In [70]:
ratings_train[ratings_train['movieId'] == 1342342314].head(1).index

Int64Index([], dtype='int64')

In [71]:
user_dummies = user_dummies.values
movie_dummies = movie_dummies.values

In [72]:
ratings_train = ratings_train.reset_index()

In [73]:
def get_user_dummy_vector(user_id):
    ???

In [74]:
# NOTE(review): this re-defines get_user_dummy_vector and shadows the
# identically named definition in the previous cell. Given the separate
# user_dummies and movie_dummies arrays, it was presumably meant to be the
# movie counterpart (e.g. get_movie_dummy_vector(movie_id)) — confirm and rename.
def get_user_dummy_vector(user_id):
    ???

In [77]:
def user_recommendations(uid, k):
    ???

In [49]:
user_recommendations(1, 2)

### 4. Обучить ALS-модель и использовать её для отбора кандидатов.

In [58]:
from implicit.als import AlternatingLeastSquares

In [59]:
model_ALS = AlternatingLeastSquares(factors=20)



### 5. Соединить ALS и FM модели.

In [66]:
def user_recommendations(uid):
    ???

In [50]:
# Build recommendations with user_recommendations for every user that has test
# ratings and score them with precision@20. Failing users are skipped
# (best-effort), but only ordinary exceptions are caught, so a keyboard /
# kernel interrupt still stops the loop. The skipped ids are kept in
# skipped_uids for inspection instead of vanishing silently.
res = dict()
skipped_uids = []
for uid in tqdm(list(ratings_test_dict.keys())):
    try:
        res[uid] = user_recommendations(uid)
    except Exception:
        skipped_uids.append(uid)
avg_precision_topK(ratings_test_dict, res, 20)

In [51]:
# Score the ALS model's own top-20 recommendations with precision@20.
# Failing users are skipped (best-effort), but only ordinary exceptions are
# caught, so a keyboard / kernel interrupt still stops the loop. The skipped
# ids are kept in skipped_uids; check it after the run.
# NOTE(review): in the cells visible here model_ALS is never fitted and
# user_item_matrix_csr is never defined. If that is still the case, every
# user ends up in skipped_uids and the metric is computed on an empty result.
res = dict()
skipped_uids = []
for uid in tqdm(list(ratings_test_dict.keys())):
    try:
        res[uid] = model_ALS.recommend(uid, user_item_matrix_csr, N=20)
    except Exception:
        skipped_uids.append(uid)
avg_precision_topK(ratings_test_dict, res, 20)

In [52]:
avg_precision_topK(ratings_test_dict, res, 20)

In [53]:
# user_recommendations(1, 2)

# Onboarding.

In [71]:
from sklearn.cluster import KMeans

In [76]:
items_popularity = ratings_train.groupby('movieId')['userId'].count().to_dict()

In [75]:
kmeans = KMeans(n_clusters=20)

In [77]:
item_clsuters = ???

In [78]:
# 'item_id', 'item_cluster'
df_item_cluster = ???

In [54]:
# For every cluster print its most popular item (popularity = number of
# ratings the movie has in ratings_train) together with that count.
# The cluster count comes from the KMeans configuration instead of a second
# hard-coded 20, so the two cannot drift apart.
for cluster_id in range(kmeans.n_clusters):
    in_cluster = df_item_cluster['item_cluster'] == cluster_id
    items = list(df_item_cluster.loc[in_cluster, 'item_id'].values)
    if not items:
        # max() raises ValueError on an empty sequence; skip empty clusters
        continue
    # items missing from the popularity dict count as 0; ties keep the first item
    top_item = max(items, key=lambda item: items_popularity.get(item, 0))
    print(dict_of_titles[top_item], items_popularity.get(top_item, 0))