# Data Wrangling

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
df_ratings = pd.read_csv('movie_data/ratings.csv')
df_ratings = df_ratings[df_ratings.userId.isin(range(9000, 10010))]
df_movies = pd.read_csv('movie_data/movies.csv')
df_tags = pd.read_csv('movie_data/tags.csv')
df_genome_scores = pd.read_csv('movie_data/genome-scores.csv')
df_genome_tags = pd.read_csv('movie_data/genome-tags.csv')

In [3]:
df_rm = pd.merge(df_ratings, df_movies, on='movieId', how='left')
df_tags.rename(columns={'timestamp':'timestamp_tag'}, inplace=True)
df_rmt = pd.merge(df_rm, df_tags, on=['userId','movieId'], how='left')
df_rmt.dropna().head(3)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,tag,timestamp_tag
3495,9032,152,4.5,1063478591,"Addiction, The (1995)",Drama|Horror,Christopher Walken,1186863000.0
3496,9032,152,4.5,1063478591,"Addiction, The (1995)",Drama|Horror,drug abuse,1186863000.0
3497,9032,152,4.5,1063478591,"Addiction, The (1995)",Drama|Horror,philosophical,1186863000.0


In [4]:
df_rmt_mult_rows = df_rmt[['userId', 'movieId', 'rating', 'timestamp', 'genres', 'tag']]
df_rmt_sing_row = df_rmt_mult_rows\
    .groupby(['userId', 'movieId', 'rating', 'timestamp', 'genres'])\
    .agg({'tag': list})\
    .reset_index()

In [5]:
df_genome = pd.merge(df_genome_scores, df_genome_tags, on=['tagId'], how='inner')
df_movie_genome = df_genome\
    .groupby('movieId')[['tag', 'relevance']]\
    .apply(lambda x:pd.Series({'genome': x.values.tolist()}))\
    .reset_index()
df_rmtg = pd.merge(df_rmt_sing_row, df_movie_genome, on=['movieId'], how='left')
df_rmtg.dropna().head(3)

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag,genome
0,9000,10,4.0,836670560,Action|Adventure|Thriller,[nan],"[[007, 0.99975], [007 (series), 0.99975], [18t..."
1,9000,21,4.0,836670809,Comedy|Crime|Thriller,[nan],"[[007, 0.03125], [007 (series), 0.03425], [18t..."
2,9000,47,3.0,836670587,Mystery|Thriller,[nan],"[[007, 0.01924999999999999], [007 (series), 0...."


In [6]:
genres_values = df_movies['genres'].map(lambda x: x.split('|')).values
genres = list(set([genre for movie_genres in genres_values for genre in movie_genres if genre != "(no genres listed)"]))
genres_dict = {genres[i]: i for i in range(0,len(genres))}

def get_genres_features(genres_str, genres_dict):
    genres_num = len(genres_dict)
    genres_features = [0] * genres_num
    genres_list = [genre for genre in genres_str.split('|') if genre != "(no genres listed)"]
    for genre in genres_list:
        genres_features[genres_dict[genre]] = 1
    return(genres_features)

In [7]:
df_popular_tags = df_tags\
    .groupby('tag')\
    .agg({'userId': 'count'}).reset_index()\
    .sort_values(by=['userId'], ascending=False)\
    .head(150)
# df_popular_tags.userId.sum() / df_tags.shape[0]
popular_tags = df_popular_tags['tag'].values
pop_tags_dict = {popular_tags[i]: i for i in range(0,len(popular_tags))}

def get_pop_tags_features(tags, pop_tags_dict):
    pop_tags = [tag for tag in tags if tag in pop_tags_dict.keys()]
    tags_num = len(pop_tags_dict)
    pop_tags_features = [0] * tags_num
    if (len(pop_tags) > 0):
        weight = 1 / len(tags)
        for tag in pop_tags:
            pop_tags_features[pop_tags_dict[tag]] = weight
    return(pop_tags_features)

In [8]:
df_relevant_tags = df_genome\
    .groupby('tag')\
    .agg({'relevance': 'mean'}).reset_index()\
    .sort_values(by=['relevance'], ascending=False)\
    .head(150)
relevant_tags = df_relevant_tags['tag'].values
rel_tags_dict = {relevant_tags[i]: i for i in range(0,len(relevant_tags))}

def get_rel_tags_features(genomes, rel_tags_dict):
    try:
        rel_genome = [genome for genome in genomes if genome[0] in rel_tags_dict.keys()]
    except:
        rel_genome = []
    tags_num = len(rel_tags_dict)
    rel_tags_features = [0] * tags_num
    if (len(rel_genome) > 0):
        for [tag, relevance] in rel_genome:
            rel_tags_features[rel_tags_dict[tag]] = relevance
    return(rel_tags_features)

In [9]:
df_rmtg['genres_features'] = df_rmtg['genres'].map(lambda x: get_genres_features(x, genres_dict))
df_rmtg['pop_tag_features'] = df_rmtg['tag'].map(lambda x: get_pop_tags_features(x, pop_tags_dict))
df_rmtg['rel_tag_features'] = df_rmtg['genome'].map(lambda x: get_rel_tags_features(x, rel_tags_dict))
df_rmtg.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag,genome,genres_features,pop_tag_features,rel_tag_features
0,9000,10,4.0,836670560,Action|Adventure|Thriller,[nan],"[[007, 0.99975], [007 (series), 0.99975], [18t...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.63975, 0.5435, 0.47675, 0.4515, 0.282749999..."
1,9000,21,4.0,836670809,Comedy|Crime|Thriller,[nan],"[[007, 0.03125], [007 (series), 0.03425], [18t...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.8140000000000001, 0.4585, 0.596, 0.7685, 0...."
2,9000,47,3.0,836670587,Mystery|Thriller,[nan],"[[007, 0.01924999999999999], [007 (series), 0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.8734999999999999, 0.70025, 0.96875, 0.69175..."
3,9000,50,4.0,836670869,Crime|Mystery|Thriller,[nan],"[[007, 0.02875], [007 (series), 0.029000000000...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.90025, 0.733, 0.9885, 0.82725, 0.8387500000..."
4,9000,110,3.0,836670587,Action|Drama|War,[nan],"[[007, 0.03375], [007 (series), 0.031000000000...","[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.81, 0.8262499999999999, 0.8205, 0.652, 0.63..."


In [10]:
df_rmtg['features1'] = df_rmtg['genres_features']
df_rmtg['features2'] = df_rmtg.\
    apply(lambda x: x['genres_features'] + x['rel_tag_features'], axis=1)
df_rmtg['features3'] = df_rmtg.\
    apply(lambda x: x['genres_features'] + x['pop_tag_features'] + x['rel_tag_features'], axis=1)
df_abt = df_rmtg[['userId', 'movieId', 'rating', 'features1', 'features2', 'features3']]

In [11]:
df_abt.head(5)

Unnamed: 0,userId,movieId,rating,features1,features2,features3
0,9000,10,4.0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,9000,21,4.0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
2,9000,47,3.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,9000,50,4.0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,9000,110,3.0,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [12]:
df_movie_features = pd.merge(df_movies, df_movie_genome, on='movieId', how='inner')
df_movie_features['relevance_features'] = df_movie_features['genome']\
    .map(lambda x: get_rel_tags_features(x, rel_tags_dict))
df_movie_features['genre_features'] = df_movie_features['genres']\
    .map(lambda x: get_genres_features(x, genres_dict))
df_movie_features['features1'] = df_movie_features['genre_features']
df_movie_features['features2'] = df_movie_features.\
    apply(lambda x: x['genre_features'] + x['relevance_features'], axis=1)
df_movie_features = df_movie_features[['movieId', 'features1', 'features2']]

In [13]:
df_movie_features.head(5)

Unnamed: 0,movieId,features1,features2
0,1,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, ..."
1,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."
2,3,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
3,4,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
4,5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


# Light FM

In [32]:
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k, auc_score

In [20]:
### Will be reading in CSV and then running on data - once Omar has completed data wrangling.
# UTILIZE THIS: https://lyst.github.io/lightfm/docs/lightfm.data.html
# AND https://github.com/lyst/lightfm/issues/347#issuecomment-407383263
# AND https://github.com/lyst/lightfm/issues/161
# AND https://www.ethanrosenthal.com/2016/11/07/implicit-mf-part-2/


users = df_abt['userId'].values
users = np.reshape(users, (len(users),1))
movies = df_abt['movieId'].values
movies = np.reshape(movies, (len(movies),1))
ratings = df_abt['rating'].values
ratings = np.reshape(ratings, (len(ratings),1))
data = np.concatenate((users, movies, ratings), axis=1)

unique_users = np.unique(data[:, 0])
unique_movies = np.unique(data[:, 1])

movie_features_data = df_movie_features.apply(lambda x: (x['movieId'],x['features1']), axis=1).values
movie_features = []
for row in range(len(movie_features_data)):
    movie = movie_features_data[row][0]
    if (movie in unique_movies):
        feature_map = movie_features_data[row][1]
        feature_dict = {}
        for feat in range(len(feature_map)):
            feature_dict[feat] = feature_map[feat]
        movie_features.append((movie,feature_dict))

In [34]:
# Use Dataset to prep your interactions and weights.
dataset = Dataset()
dataset.fit(users=unique_users, items=unique_movies, item_features=[*movie_features[0][1]])

movie_feature_matrix = dataset.build_item_features(movie_features)
train_interactions, train_weights = dataset.build_interactions((i[0], i[1], i[2]) for i in data)

# Fit the model using your full set of historic data.
model = LightFM()
model.fit(interactions=train_interactions, sample_weight=train_weights)
test_precision = precision_at_k(model, train_interactions, k=5, num_threads=2).mean()
test_auc = auc_score(model, train_interactions, num_threads=2).mean()
print('Model')
print('Precision: ', test_precision)
print('AUC: ', test_auc)

# Now, include item features
model_item_features = LightFM()
model_item_features.fit(interactions=train_interactions, sample_weight=train_weights, item_features=movie_feature_matrix)
test_precision_item_features = precision_at_k(model_item_features, train_interactions, item_features=movie_feature_matrix, k=5, num_threads=2).mean()
test_auc_item_features = auc_score(model_item_features, train_interactions, item_features=movie_feature_matrix, num_threads=2).mean()
print('Model with Item Features')
print('Precision: ', test_precision_item_features)
print('AUC: ', test_auc_item_features)


Model
Precision:  0.4760396
AUC:  0.92514235
Model with Item Features
Precision:  0.11287129
AUC:  0.634772


## Ignore everything below here, just used for testing

In [None]:
data2 = Dataset()
data2.fit([0,1,2,3],[0,1,2],['ufeat1','ufeat2'],['ifeat1'])
data2.build_interactions([(0,0),(1,1),(2,2)])
data2.build_user_features([(0,['ufeat1']), (2,['ufeat1','ufeat2'])],normalize=False)
data2.build_item_features([(0,['ifeat1']),(2,['ifeat1'])])


In [None]:
# Instantiate and train the model
model = LightFM(loss='warp')
model.fit(data['train'], epochs=30, num_threads=2)

In [None]:
# Evaluate the trained model
test_precision = precision_at_k(model, data['test'], k=5).mean()

In [None]:
print(data)
print(repr(data["train"]))
print(repr(data["test"]))
print(repr(data["item_features"]))
print(repr(data["item_feature_labels"]))
print(repr(data["item_labels"]))

In [None]:
from lightfm.datasets import fetch_stackexchange

data_se = fetch_stackexchange('crossvalidated',
                           test_set_fraction=0.1,
                           indicator_features=False,
                           tag_features=True)

In [None]:
print(data_se)
print(data_se['item_features'])