# Meta learning with word2vec

The idea of this notebook is to use Word2Vec to build recommendations for the MovieLens dataset.

In [None]:
!pip install -q matplotlib
!pip install -q pandas
!pip install -q numpy
!pip install -q gensim 

In [None]:
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline

In [None]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

# Download the MovieLens 100k archive and keep it in memory; `zipfile` stays
# open because the movie metadata is read from the same archive further down.
with urlopen("http://files.grouplens.org/datasets/movielens/ml-100k.zip") as resp:
    zipfile = ZipFile(BytesIO(resp.read()))
file = 'ml-100k/u.data'
# u.data is a headerless tab-separated file, so no row is skipped here:
# skipping row 0 would silently discard the first rating.
with zipfile.open(file) as ratings_file:
    df = pd.read_csv(ratings_file, low_memory=False, sep='\t', names=['user', 'item', 'rate', 'time'])
df.head()

First we'll get the data and sort it by time. We sort by time because we need the movies that each user saw in chronological order. We also keep only the movies that were rated above 3.

In [None]:
# Optional hold-out split, left disabled: the whole dataset is used for training.
# np.random.seed(42)
# # split data into train and test set
# msk = np.random.rand(len(df)) < 0.7
# df_train = df[msk]
# df_test = df[~msk]

# Build the training frame as a new object instead of aliasing `df`:
# assigning the converted 'time' column through an alias would also rewrite
# the raw frame that was displayed above.
df_train = (
    df
    .assign(time=lambda ratings: pd.to_datetime(ratings['time'], unit='s'))  # epoch seconds -> timestamps
    .sort_values(by='time')                                                  # chronological order
    .loc[lambda ratings: ratings['rate'] > 3]                                # keep ratings above 3 only
)
df_train.head()

Next we load the movie features; see their description in http://files.grouplens.org/datasets/movielens/ml-100k-README.txt

In [None]:
# Location of the movie metadata inside the downloaded archive.
file_item = 'ml-100k/u.item'

# Column names of u.item in file order: descriptive fields first, then the
# genre indicator columns.
names = (
    ['item', 'movie title', 'release date', 'video release date', 'IMDb URL']
    + ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
)

# The file is pipe-separated and not UTF-8 encoded.
df_items = pd.read_csv(
    zipfile.open(file_item),
    names=names,
    sep='|',
    encoding='ISO-8859-1',
)



In [None]:
# Keep only the id, the title and the genre indicator columns
# (release dates, the IMDb URL and the 'unknown' flag are dropped).
kept_columns = [
    'item', 'movie title',
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
df_items = df_items.filter(items=kept_columns)
display(df_items.head())

We merge the training dataset in order to have items with genre. We look at all the items watched by user '914'.

In [None]:
# Attach title and genre columns to every rating.
# The row order produced by merge is not something to rely on across pandas
# versions, and the per-user sequences built later need chronological order,
# so the result is explicitly re-sorted by time (mergesort is stable, so
# ratings with identical timestamps keep their relative order).
df_train_extended = (
    pd.merge(df_train, df_items, on='item')
    .sort_values(by='time', kind='mergesort')
    .reset_index(drop=True)
)


display(df_train_extended[df_train_extended['user'] == 914])

Now we will create a dataset with all the users and all their watched films sorted by timestamp. Also we'll add the first genre that is related to each movie. Note that a movie can have several genres but this is to keep the example simple. 

In [None]:
# Genre columns looked at when describing a movie.
genrelist = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

def get_features_movie(item_id):
    """Return the genres of a movie, in `genrelist` order.

    Looks up the first row of `df_items` whose 'item' equals `item_id` and
    collects every genre whose column holds a truthy value.

    Raises:
        IndexError: if no row of `df_items` matches `item_id`.
    """
    movie = df_items[df_items['item'] == item_id].iloc[0]
    return [genre for genre in genrelist if movie[genre]]


display(df_items[df_items['item'] == 402])
get_features_movie(402)

In [None]:
def convert_to_list(item_id):
    """Return the first genre of movie `item_id` as a string.

    Despite its name this returns a string, not a list: only the first entry
    of `get_features_movie(item_id)` is kept, and the result is '' when the
    movie has no genre at all.
    """
    first_genre = get_features_movie(item_id)[:1]
    return " ".join(map(str, first_genre))
convert_to_list(402)

In [None]:
# the word2vec training set: one row per user, holding the space-separated
# "<first genre> <item id>" tokens of that user's movies, in the row order of
# df_train_extended.
#
# Users are taken from the data itself rather than from range(min, max):
# that half-open range silently left out the user with the highest id.
# Grouping once also avoids re-filtering the whole frame for every user and
# growing the result frame one row at a time.
watched_rows = []
for user_id, user_items in df_train_extended.groupby('user', sort=True)['item']:
    tokens = []
    for item_id in user_items.values:
        genre = convert_to_list(item_id)
        if genre:  # a movie without any genre contributes only its id
            tokens.append(genre)
        tokens.append(str(item_id))
    watched_rows.append({'user': user_id, 'watched': " ".join(tokens)})

train_watched = pd.DataFrame(watched_rows, columns=['user', 'watched'])

In [None]:
# Peek at the token sequences built for the first users.
train_watched.head()

In the new dataset we can see, for every user, the films they watched, each preceded by its genre.

In [None]:
# Tokenise every user's history into a list of lists of tokens, the corpus
# that is passed to Word2Vec.
# - orient='records' is the full spelling; the abbreviated 'record' is
#   rejected by pandas 2.x.
# - split() without an argument collapses repeated spaces, so no empty-string
#   token can leak into the vocabulary.
list_doc = [
    str(row['watched']).split()
    for row in train_watched.to_dict(orient='records')
]

In [None]:
# Inspect one tokenised history and its number of tokens.
print(list_doc[2])
len(list_doc[2])

In [None]:
from gensim.test.utils import common_texts
from gensim.models.word2vec import Word2Vec

In [None]:
# Train the embeddings on the per-user token sequences.
# A fixed seed together with a single worker thread removes the run-to-run
# variation caused by random initialisation and thread scheduling, so the
# similarity lists below are repeatable within one interpreter session.
# NOTE(review): gensim documents that reproducibility across interpreter
# launches additionally needs PYTHONHASHSEED to be set - confirm if required.
model = Word2Vec(list_doc, window=5, min_count=1, workers=1, seed=42)

In [None]:
def most_similar(item_id_or_genre):
    """Print what is being queried and return its nearest neighbours.

    Args:
        item_id_or_genre: a vocabulary token, either an item id given as a
            digit string (e.g. '402') or a genre name (e.g. 'Action').

    Returns:
        A list of ``(neighbour, label)`` pairs, where ``neighbour`` is the
        entry returned by ``model.wv.most_similar`` and ``label`` is the movie
        title when the neighbour token is a known item id, otherwise the token
        itself. Genre tokens among the neighbours therefore no longer make the
        call fail on ``int()``.
    """
    def describe(token):
        # Item ids are stored in the vocabulary as digit strings; anything
        # else (or an id missing from df_items) is shown as-is.
        token = str(token)
        if token.isdigit():
            titles = df_items.loc[df_items['item'] == int(token), 'movie title']
            if len(titles) > 0:
                return titles.iloc[0]
        return token

    print("Similar of " + describe(item_id_or_genre))
    return [(x, describe(x[0])) for x in model.wv.most_similar(item_id_or_genre)]

Look at the most similar movies to 'Action' genre.

In [None]:
# Nearest neighbours of the 'Action' genre token.
most_similar('Action')

In [None]:
# Nearest neighbours of movie 402; item ids are passed as strings because
# that is how they were written into the training sequences.
most_similar('402')

In [None]:
# Nearest neighbours of the 'Horror' genre token.
most_similar('Horror')

We would like to create recommendations for a user. For example, this is what user 914 saw:

In [None]:
# Item ids, titles and genre columns of the movies user 914 rated above 3.
display(df_train_extended[df_train_extended['user'] == 914].filter(['item', 'movie title']+genrelist))

The simplest way to define a user embedding is to average the embeddings of movies that he/she saw.

In [None]:
def create_avg_user_vector(user_id):
    """Return the user's embedding: the mean vector of the movies they saw.

    Args:
        user_id: value matched against the 'user' column of
            `df_train_extended`.

    Returns:
        The element-wise average of the Word2Vec vectors of the user's items.

    Raises:
        ValueError: if none of the user's items has a vector in the model
            (including the case of a user with no rows at all).
    """
    item_id_list = df_train_extended[df_train_extended['user'] == user_id]['item'].tolist()
    # Items are looked up by their string form. Items that are missing from
    # the vocabulary are skipped instead of aborting with a KeyError.
    vector_item_id_list = [model.wv[str(x)] for x in item_id_list if str(x) in model.wv]
    if not vector_item_id_list:
        raise ValueError("No embedded movies found for user {}".format(user_id))
    return np.average(vector_item_id_list, axis=0)

def most_similar_by_vector(vector):
    """Return the tokens closest to `vector`, each paired with a readable label.

    Returns:
        A list of ``(neighbour, label)`` pairs, where ``neighbour`` is the
        entry returned by ``model.wv.similar_by_vector`` and ``label`` is the
        movie title when the token is a known item id, otherwise the token
        itself (genre tokens share the vocabulary with item ids, and calling
        ``int()`` on them would raise).
    """
    labelled = []
    for x in model.wv.similar_by_vector(vector):
        token = str(x[0])
        label = token
        if token.isdigit():
            titles = df_items.loc[df_items['item'] == int(token), 'movie title']
            if len(titles) > 0:
                label = titles.iloc[0]
        labelled.append((x, label))
    return labelled


# Embed user 914 as the average of their movies' vectors, then show the
# tokens closest to that vector next to their titles.
recomendations = most_similar_by_vector(create_avg_user_vector(914))
display(pd.DataFrame(recomendations))

In [None]:
# Recommended item ids for user 914. Genre tokens live in the same vocabulary
# as item ids, so non-numeric neighbours are filtered out before int().
[int(r[0]) for r in model.wv.similar_by_vector(create_avg_user_vector(914)) if str(r[0]).isdigit()]


### Questions: Evaluate precision@k on the test dataset 

In [None]:
# Reminder of the training frame: ratings above 3, sorted by time.
df_train.head()


Generate recommendations from the trained model for a list of users.

### Define and compute Precision@K score

We first create a validation set for every user, which consists of all the movies that the user rated higher than 3.5 (roughly the mean rating).

We then compute precision@K for our recommendations.

### Create validation set for every user

In [None]:
def create_validation_set(df, minRate=3.5, k=5):
    """Collect, per user, the items they rated above `minRate`.

    Args:
        df: frame with 'user', 'item' and 'rate' columns.
        minRate: an item counts as relevant when its rating is strictly
            greater than this value.
        k: users with fewer than `k` relevant items are left out.

    Returns:
        dict mapping each retained user id to an array of relevant item ids,
        in the row order of `df`.
    """
    users = df['user'].values
    items = df['item'].values
    ratings = df['rate'].values

    validation_set = {}
    for user in np.unique(users):
        rows_of_user = users == user
        liked_items = items[rows_of_user][ratings[rows_of_user] > minRate]
        if len(liked_items) < k:
            continue
        validation_set[user] = liked_items

    return validation_set


Compute precision@k using the recommendations and the validation set.

In [None]:
def precisionAtK(validations_set, recommendations_set, k=5):
    """Mean precision@k over all users of the validation set.

    For each user, the first `k` recommended items are intersected with the
    user's relevant items (duplicates count once) and the number of hits is
    divided by `k`; the per-user values are then averaged.

    Args:
        validations_set: dict user -> relevant item ids.
        recommendations_set: dict user -> ranked recommended item ids; must
            contain every user of `validations_set`.
        k: cut-off rank.

    Returns:
        The average of the per-user precision values.
    """
    per_user_precision = [
        len(np.intersect1d(relevant_items, recommendations_set[user][:k])) / k
        for user, relevant_items in validations_set.items()
    ]
    return np.mean(per_user_precision)


In [None]:
def compute_precisionAtK_from_recommendations(model, df_test, validation_set=None, nrRecommendations=20, k=10):
    """
    Compute precision@k of the recommendations that `model` yields for `df_test`.

    When `validation_set` is not supplied it is derived from `df_test`
    (ratings above 3.5, users with at least `k` such items). Recommendations
    are generated for exactly the users of the validation set and then scored
    with `precisionAtK`.
    """
    relevant_by_user = validation_set
    if relevant_by_user is None:
        relevant_by_user = create_validation_set(df_test, minRate=3.5, k=k)

    recommended_by_user = create_recommendations(
        model,
        df_test,
        relevant_by_user.keys(),
        nrRecommendations=nrRecommendations,
    )
    return precisionAtK(relevant_by_user, recommended_by_user, k=k)



In [None]:
def create_recommendations(model, df, listOfUsers, nrRecommendations=20):
    """Build a ranked list of recommended item ids for every requested user.

    Args:
        model: trained Word2Vec model; its `wv` is queried for neighbours.
        df: kept for interface compatibility; not used by this function.
        listOfUsers: iterable of user ids to produce recommendations for.
        nrRecommendations: number of nearest neighbours requested per user.
            It is now passed on to gensim (which otherwise falls back to its
            own default of 10). The returned list can be shorter, because
            genre tokens among the neighbours are dropped.

    Returns:
        dict mapping each user id to a list of integer item ids, most similar
        first.
    """
    recommendations_set = {}
    for user in listOfUsers:
        neighbours = model.wv.similar_by_vector(
            create_avg_user_vector(user), topn=nrRecommendations
        )
        item_ids = []
        for token, _similarity in neighbours:
            try:
                item_ids.append(int(token))
            except ValueError:
                # Genre words share the vocabulary with item ids; they are not
                # recommendable items, so they are skipped on purpose.
                continue
        recommendations_set[user] = item_ids

    return recommendations_set

In [None]:
# Cut-off shared by the validation set, the metric and the printed label, so
# the three cannot drift apart.
K = 5

# NOTE: the validation set comes from the same ratings the model was trained
# on, so this score is an optimistic estimate rather than a held-out result.
validation_set = create_validation_set(df_train_extended, minRate=3.5, k=K)

precision = compute_precisionAtK_from_recommendations(model, df_train_extended, validation_set=validation_set, nrRecommendations=20, k=K)
# The label names the model that is actually evaluated here (Word2Vec).
print("Word2Vec: Precision@{} is {}".format(K, precision))

