# Model training
## Preparation of Data Set


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 500)
pd.set_option('display.expand_frame_repr', False)

In [4]:
# movie_df = pd.read_csv('movies_prediction/movie.csv')
# rating_df = pd.read_csv('movies_prediction/rating.csv')
# tag_df = pd.read_csv('movies_prediction/tag.csv')
# genome_scores_df = pd.read_csv('movies_prediction/genome_scores.csv')
# genome_tags_df = pd.read_csv('movies_prediction/genome_tags.csv')
movie_df = pd.read_csv('drive/MyDrive/movies/movies.csv')
rating_df = pd.read_csv('drive/MyDrive/movies/ratings.csv')

In [5]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [6]:
# rating_df['timestamp'] = pd.to_datetime(rating_df['timestamp'], format='%Y-%m-%d %H:%M:%S')
rating_df['timestamp'] = pd.to_datetime(rating_df['timestamp'])

In [7]:
rating_df['day_of_week'] = rating_df['timestamp'].dt.weekday
rating_df['hour_of_day'] = rating_df['timestamp'].dt.hour
rating_df['month'] = rating_df['timestamp'].dt.month
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,day_of_week,hour_of_day,month
0,1,16,4.0,1970-01-01 00:00:01.217897793,3,0,1
1,1,24,1.5,1970-01-01 00:00:01.217895807,3,0,1
2,1,32,4.0,1970-01-01 00:00:01.217896246,3,0,1
3,1,47,4.0,1970-01-01 00:00:01.217896556,3,0,1
4,1,50,4.0,1970-01-01 00:00:01.217896523,3,0,1


In [8]:
rating_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp,day_of_week,hour_of_day,month
count,105339.0,105339.0,105339.0,105339,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1970-01-01 00:00:01.130423971,3.0,0.0,1.0
min,1.0,1.0,0.5,1970-01-01 00:00:00.828564954,3.0,0.0,1.0
25%,192.0,1073.0,3.0,1970-01-01 00:00:00.971100797,3.0,0.0,1.0
50%,383.0,2497.0,3.5,1970-01-01 00:00:01.115154056,3.0,0.0,1.0
75%,557.0,5991.0,4.0,1970-01-01 00:00:01.275495998,3.0,0.0,1.0
max,668.0,149532.0,5.0,1970-01-01 00:00:01.452404919,3.0,0.0,1.0
std,197.486905,26170.456869,1.044872,,0.0,0.0,0.0


In [9]:
movie_df['genres'] = movie_df['genres'].str.split('|')
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [10]:
genres = set(g for sublist in movie_df['genres'] for g in sublist)
for genre in genres:
    movie_df[genre] = movie_df['genres'].apply(lambda x: int(genre in x))

In [11]:
movie_df.head()

Unnamed: 0,movieId,title,genres,Children,Action,Western,War,Sci-Fi,Thriller,Horror,Documentary,Film-Noir,Romance,Adventure,Comedy,Drama,Crime,Musical,IMAX,Animation,Mystery,(no genres listed),Fantasy
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [15]:
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

In [16]:
user_features_df = rating_df.drop(['movieId', 'rating', 'timestamp'], axis=1)
item_features_df = movie_df.drop(['title', 'genres'], axis=1)

In [17]:
dataset = Dataset()
dataset.fit(rating_df['userId'].unique(), movie_df['movieId'].unique(),
            # user_features=user_features_df.columns.tolist(),
            item_features=item_features_df.columns.tolist())

In [18]:
item_id_mapping = dataset.mapping()[2]
index_to_movie_id = {index: movie_id for movie_id, index in item_id_mapping.items()}

In [19]:
(interactions, weights) = dataset.build_interactions(rating_df[['userId', 'movieId', 'rating']].values)

In [20]:
# user_features = dataset.build_user_features([
#     (row.userId, list(map(str, row[1:].values))) for _, row in user_features_df.iterrows()
# ])

item_features = dataset.build_item_features([
    (row.movieId, dict(zip(item_features_df.columns[1:], row[1:].values))) # Changed this line
    for _, row in item_features_df.iterrows()
])

In [21]:
train_interactions, test_interactions = random_train_test_split(interactions, test_percentage=0.2, random_state=42)


In [35]:
model = LightFM(loss='warp', learning_rate=0.15, no_components=30)
# model.fit(interactions, user_features=user_features, item_features=item_features, epochs=10, num_threads=2)
model.fit(train_interactions, item_features=item_features, epochs=30, num_threads=4)


<lightfm.lightfm.LightFM at 0x78f0eab2ab10>

In [23]:
def recommend_movies(model, user_id, n=5):
    n_items = interactions.shape[1]
    scores = model.predict(user_id, np.arange(n_items), item_features=item_features)
    top_items = np.argsort(-scores)[:n]

    # top_movie_ids = [index_to_movie_id[index] for index in top_items]
    # print(f"Рекомендованные фильмы для пользователя {user_id}:")
    for index in top_items:
        movie_id = index_to_movie_id[index]
        print(movie_df[movie_df['movieId'] == movie_id]['title'].values[0], f'(score {scores[index]})' )

In [31]:
recommend_movies(model, user_id=10, n=5)

Baton Rouge (Bâton rouge) (1988) (score 249.91079711914062)
I Am David (2003) (score 155.9187469482422)
Carnages (a.k.a. Carnage) (2002) (score 149.27133178710938)
Godfather, The (1972) (score 110.53330993652344)
Boys Are Back, The (2009) (score 108.60161590576172)


In [36]:
train_precision = precision_at_k(model, train_interactions, item_features=item_features, k=10).mean()
test_precision = precision_at_k(model, test_interactions, k=10, item_features=item_features).mean()

# Рассчитываем AUC
train_auc = auc_score(model, train_interactions, item_features=item_features).mean()
test_auc = auc_score(model, test_interactions, item_features=item_features).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.55, test 0.08.
AUC: train 0.99, test 0.93.
