# Meta learning with word2vec

The idea of this notebook is to use Word2Vec in recommendations for MovieLens dataset.

In [None]:
!pip install -q matplotlib
!pip install -q pandas
!pip install -q numpy
!pip install -q gensim 

In [2]:
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline

In [3]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

resp = urlopen("http://files.grouplens.org/datasets/movielens/ml-100k.zip")
zipfile = ZipFile(BytesIO(resp.read()))
file = 'ml-100k/u.data'
df = pd.read_csv(zipfile.open(file), low_memory=False, skiprows=[0], sep='\t', names=['user', 'item', 'rate', 'time'])
df.head()

Unnamed: 0,user,item,rate,time
0,186,302,3,891717742
1,22,377,1,878887116
2,244,51,2,880606923
3,166,346,1,886397596
4,298,474,4,884182806


First we'll get the data and sort by time. The reason of sorting by time is due to the fact that we need the movies that each user saw in chronological order. We also filter movies with ranking above of 3.

In [4]:
# np.random.seed(42)
# # split data into train and test set
# msk = np.random.rand(len(df)) < 0.7
# df_train = df[msk]
# df_test = df[~msk]

df_train = df

df_train['time'] = pd.to_datetime(df_train['time'], unit='s')
df_train = df_train.sort_values(by='time')

df_train = df_train[df_train['rate'] > 3]
df_train.head()

Unnamed: 0,user,item,rate,time
213,259,255,4,1997-09-20 03:05:10
83964,259,286,4,1997-09-20 03:05:27
43026,259,298,4,1997-09-20 03:05:54
21395,259,185,4,1997-09-20 03:06:21
82654,259,173,4,1997-09-20 03:07:23


We will get the movie features, see the description in http://files.grouplens.org/datasets/movielens/ml-100k-README.txt

In [5]:
names = ['item', 'movie title', 'release date', 'video release date',
              'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
              'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
              'Thriller', 'War', 'Western' ]

file_item = 'ml-100k/u.item'

df_items = pd.read_csv(zipfile.open(file_item), names= names, sep='|', encoding = 'ISO-8859-1')



In [6]:
df_items = df_items.filter(['item', 'movie title', 'Action', 'Adventure', 'Animation',
              'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
              'Thriller', 'War', 'Western'])
display(df_items.head())

Unnamed: 0,item,movie title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


We merge the training dataset in order to have items with genre. We look at all the items watched by user '914'.

In [7]:
df_train_extended = pd.merge(df_train, df_items, on='item')


display(df_train_extended[df_train_extended['user'] == 914])

Unnamed: 0,user,item,rate,time,movie title,Action,Adventure,Animation,Children,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
4799,914,402,5,1998-02-10 15:26:16,Ghost (1990),0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0
28338,914,197,4,1998-02-10 14:47:08,"Graduate, The (1967)",0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
36119,914,155,5,1998-02-10 15:22:01,Dirty Dancing (1987),0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
41620,914,781,5,1998-02-10 15:11:04,French Kiss (1995),0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
44408,914,371,4,1998-02-10 14:47:09,"Bridges of Madison County, The (1995)",0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
49559,914,778,5,1998-02-10 14:48:05,Don Juan DeMarco (1995),0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
54987,914,643,4,1998-02-10 15:18:06,The Innocent (1994),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
54989,914,1406,4,1998-02-10 15:18:06,When Night Is Falling (1995),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


Now we will create a dataset with all the users and all their watched films sorted by timestamp. Also we'll add the first genre that is related to each movie. Note that a movie can have several genres but this is to keep the example simple. 

In [8]:
genrelist = ['Action', 'Adventure', 'Animation', 
    'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western']

def get_features_movie(item_id):
    features = []
    row = df_items[df_items['item'] == item_id]
    for i in genrelist:
        if row.iloc[0][i]:
            features = features + [i]
    return features


display(df_items[df_items['item'] == 402])
get_features_movie(402)

Unnamed: 0,item,movie title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
401,402,Ghost (1990),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0


['Comedy', 'Romance', 'Thriller']

In [9]:
def convert_to_list(item_id):
    return " ".join([str(x) for x in get_features_movie(item_id)[:1]])
convert_to_list(402)

'Comedy'

In [10]:
# the word2vec training set
train_watched = pd.DataFrame(columns=['user', 'watched'])

for index, user_id in enumerate(range(min(df_train_extended['user']), max(df_train_extended['user']))):
    user_itemslist = df_train_extended[df_train_extended['user'] == user_id].item.values
    
    l_to_string = " ".join([convert_to_list(x)+" "+str(x) for x in user_itemslist])
    train_watched.loc[index] = [user_id, l_to_string]

In [11]:
train_watched.head()

Unnamed: 0,user,watched
0,1,Horror 185 Action 173 Comedy 108 Action 176 Ac...
1,2,Comedy 255 Drama 286 Action 295 Action 50 Acti...
2,3,Action 181 Drama 318 Action 328 Crime 327 Crim...
3,4,Drama 357 Action 50 Action 300 Crime 11 Drama ...
4,5,Action 173 Action 121 Action 50 Action 385 Act...


In the new dataset we can see for every user the watched films and the genre for each user.

In [12]:
list_doc = []

for row in train_watched.to_dict(orient='record'):
    list_doc.append(str(row['watched']).strip().split(' '))

In [13]:
print(list_doc[2])
len(list_doc[2])

['Action', '181', 'Drama', '318', 'Action', '328', 'Crime', '327', 'Crime', '329', 'Comedy', '321', 'Documentary', '320', 'Action', '260', 'Adventure', '331', 'Drama', '340', 'Comedy', '342', 'Crime', '348', 'Crime', '346', 'Comedy', '347', 'Drama', '344']


30

In [14]:
from gensim.test.utils import common_texts
from gensim.models.word2vec import Word2Vec

In [15]:
model = Word2Vec(list_doc, window=5, min_count=1, workers=4)

In [16]:
def most_similar(item_id_or_genre):
    try:
        print("Similar of "+df_items[df_items['item'] == int(item_id_or_genre)].iloc[0]['movie title'])
    except:
        print("Similar of "+item_id_or_genre)
    return [(x, df_items[df_items['item'] == int(x[0])].iloc[0]['movie title']) for x in model.wv.most_similar(item_id_or_genre)]

Look at the most similar movies to 'Action' genre.

In [17]:
most_similar('Action')

Similar of Action


[(('2', 0.741334080696106), 'GoldenEye (1995)'),
 (('233', 0.7342119216918945), 'Under Siege (1992)'),
 (('226', 0.7213338613510132), 'Die Hard 2 (1990)'),
 (('684', 0.7165249586105347), 'In the Line of Fire (1993)'),
 (('92', 0.6944090723991394), 'True Romance (1993)'),
 (('68', 0.6887273788452148), 'Crow, The (1994)'),
 (('550', 0.6839775443077087), 'Die Hard: With a Vengeance (1995)'),
 (('147', 0.6825858950614929), 'Long Kiss Goodnight, The (1996)'),
 (('121', 0.6758855581283569), 'Independence Day (ID4) (1996)'),
 (('566', 0.6743015050888062), 'Clear and Present Danger (1994)')]

In [18]:
most_similar('402')

Similar of Ghost (1990)


[(('739', 0.9967565536499023), 'Pretty Woman (1990)'),
 (('421', 0.9860407710075378),
  "William Shakespeare's Romeo and Juliet (1996)"),
 (('969', 0.9785128235816956), 'Winnie the Pooh and the Blustery Day (1968)'),
 (('79', 0.9777017831802368), 'Fugitive, The (1993)'),
 (('692', 0.974472165107727), 'American President, The (1995)'),
 (('392', 0.9722467660903931), 'Man Without a Face, The (1993)'),
 (('97', 0.9651090502738953), 'Dances with Wolves (1990)'),
 (('96', 0.9636562466621399), 'Terminator 2: Judgment Day (1991)'),
 (('553', 0.9607328176498413), 'Walk in the Clouds, A (1995)'),
 (('66', 0.9594049453735352), 'While You Were Sleeping (1995)')]

In [19]:
most_similar('Horror')

Similar of Horror


[(('448', 0.9206116199493408), 'Omen, The (1976)'),
 (('219', 0.9172361493110657), 'Nightmare on Elm Street, A (1984)'),
 (('436', 0.8959780931472778), 'American Werewolf in London, An (1981)'),
 (('447', 0.8958535194396973), 'Carrie (1976)'),
 (('217', 0.8873440027236938), "Bram Stoker's Dracula (1992)"),
 (('443', 0.8856233358383179), 'Birds, The (1963)'),
 (('164', 0.8559550046920776), 'Abyss, The (1989)'),
 (('559', 0.8458583354949951), 'Interview with the Vampire (1994)'),
 (('185', 0.8397208452224731), 'Psycho (1960)'),
 (('772', 0.8202890157699585), 'Kids (1995)')]

We would like to create recommendations for a user. For example, this is what user 914 saw:

In [20]:
display(df_train_extended[df_train_extended['user'] == 914].filter(['item', 'movie title']+l))

NameError: name 'l' is not defined

The simplest way to define a user embedding is to average the embeddings of movies that he/she saw.

In [None]:
def create_avg_user_vector(user_id):
    item_id_list = df_train_extended[df_train_extended['user'] == user_id]['item'].tolist()
    vector_item_id_list = [model.wv[str(x)] for x in item_id_list]
    return np.average(vector_item_id_list, axis=0)

def most_similar_by_vector(vector):
    return [(x, df_items[df_items['item'] == int(x[0])].iloc[0]['movie title']) for x in model.wv.similar_by_vector(vector)]


recomendations = most_similar_by_vector(create_avg_user_vector(914))
display(pd.DataFrame(recomendations))

In [None]:
[int(r[0]) for r in model.wv.similar_by_vector(create_avg_user_vector(914))]


### Questions: Evaluate precision@k on the test dataset 

In [None]:
df_train.head()


Generate recommendations from the trained model for a list of users.

### Define and compute Precision@K score

We first create a validation set for every user which consists of all the products that the user rated higher than 3.5 (the value of the mean rate).

We then compute precistion@K for our recommendations.

### Create validation set for every user

In [None]:
def create_validation_set(df, minRate=3.5, k=5):
    validation_set = {}
    
    for user in np.unique(df['user'].values) - 1:
        rated_items = df[df['user'] == user]['item'].values -1
        rates = df[df['user'] == user]['rate'].values

        best_ranked_items = rated_items[np.where(rates > minRate)[0]]
        if len(best_ranked_items) >= k:
            validation_set[user] = best_ranked_items
            
    return validation_set


 Compute precision@k using the recommendations and the validation set.

In [None]:
def precisionAtK(validations_set, recommendations_set, k=5):
    
    res = []
    for user in validations_set.keys():
        
        v = validations_set[user]
        r = recommendations_set[user][:k]
        
        ans = len(np.intersect1d(v, r)) / k
        res.append(ans)

    return np.mean(res)


In [None]:
def compute_precisionAtK_from_recommendations(model, df_test, validation_set=None, nrRecommendations=20, k=10):
    """
    Compute precisionAtK from recommendations and validation set. Generate recommendations applying \
    'model' to dataset 'df_test'. 
    """
    if validation_set is None:
        validation_set = create_validation_set(df_test, minRate=3.5, k=k)
        
    recommendations_set = create_recommendations(model, df_test, \
                                              validation_set.keys(), nrRecommendations=nrRecommendations)
    precision = precisionAtK(validation_set, recommendations_set, k=k)
    
    return precision



In [None]:
def create_recommendations(model, df, listOfUsers, nrRecommendations=20):
    
    recommendations_set = {}
    for user in listOfUsers:
        r = []
        for rr in model.wv.similar_by_vector(create_avg_user_vector(user)):
            try:
                r.append(int(rr[0]))
            except:
                pass
        recommendations_set[user] = r
        
    return recommendations_set

In [None]:
validation_set = create_validation_set(df_train_extended, minRate=3.5)

precision = compute_precisionAtK_from_recommendations(model, df_train_extended, validation_set=validation_set, nrRecommendations=20, k=5)
print("MF: Precision@{} is {}".format(5, precision))

