In [1]:
# 1-get data
# 2-tokenize
# 3-map all tokens to w2v 300d vector
# 4-average of the 300d vectors of the movie = 1 300d vector

import gensim.models as gsm

w2v = gsm.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [3]:
# 1- get data
import pandas as pd 

movies_metadata = pd.read_csv("movies_metadata.csv", low_memory=False)
print movies_metadata.shape


movies_titles_description = movies_metadata.loc[:,["original_title", "overview"]]
movies_titles_description.head()

(45466, 24)


Unnamed: 0,original_title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/dania/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
# 2-tokenize
import nltk
import gensim

stopwords = set(nltk.corpus.stopwords.words("english"))
def tokenize(text):
    return [x for x in gensim.utils.tokenize(text, lowercase=True, deacc=True, errors="ignore")
           if x not in stopwords]


movies_titles_description.loc[:,"original_title_tokens"] = movies_titles_description["original_title"].map(lambda x: tokenize(str(x)))  
movies_titles_description.loc[:,"overview_tokens"] = movies_titles_description["overview"].map(lambda x: tokenize(str(x))) 

movies_titles_description.head()

Unnamed: 0,original_title,overview,original_title_tokens,overview_tokens
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[toy, story]","[led, woody, andy, toys, live, happily, room, ..."
1,Jumanji,When siblings Judy and Peter discover an encha...,[jumanji],"[siblings, judy, peter, discover, enchanted, b..."
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[grumpier, old, men]","[family, wedding, reignites, ancient, feud, ne..."
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[waiting, exhale]","[cheated, mistreated, stepped, women, holding,..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[father, bride, part, ii]","[george, banks, recovered, daughter, wedding, ..."


In [19]:
title_documents = [item for sublist in movies_titles_description["original_title_tokens"] 
                   for item in sublist]
overview_documents = [item for sublist in movies_titles_description["overview_tokens"] 
                      for item in sublist]

documents = title_documents + overview_documents

print len(title_documents)
print len(overview_documents)
print len(documents)

# save only tokens with frequency > 1 into a dictionary
from collections import defaultdict

frequency = defaultdict(int)
for token in documents:
    frequency[token] += 1

frequent_documents = [[token for document in documents if frequency[token] > 1]
         for document in documents]

print len(frequent_documents)

# final dictionary
dictionary = corpora.Dictionary(frequent_documents)
dictionary.save('movies.dict')
print dictionary

103659
1408569
1512228


In [33]:
[w2v.wv[word] for word in movies_titles_description.overview_tokens[0]]

# OOV words: fastText
# https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
# Break the unknown word into smaller character n-grams.
# Assemble the word vector from vectors of these ngrams.
# The intuition: find similarity in the surface form, and assume similarity on the semantic level from that.

KeyError: u"word 'lightyear' not in vocabulary"

In [44]:
# OOV words: skip
import numpy as np

v = [w2v.wv[word] for word in movies_titles_description.overview_tokens[0] if word in w2v.wv.vocab]
print movies_titles_description.overview_tokens[0], len(movies_titles_description.overview_tokens[0])
print len(v), len(v[0])

avg = np.average(v,  axis=0)
print len(avg)

[u'led', u'woody', u'andy', u'toys', u'live', u'happily', u'room', u'andy', u'birthday', u'brings', u'buzz', u'lightyear', u'onto', u'scene', u'afraid', u'losing', u'place', u'andy', u'heart', u'woody', u'plots', u'buzz', u'circumstances', u'separate', u'buzz', u'woody', u'owner', u'duo', u'eventually', u'learns', u'put', u'aside', u'differences'] 33
32 300
300


In [45]:
# 3-map all tokens to w2v 300d vector
# 4-average of the 300d vectors of the movie = 1 300d vector
# OOV words: skip
def avg_word2vec(tokens):
    word2vec_embeddings = [w2v.wv[word] for word in tokens if word in w2v.wv.vocab]
    return(np.average(word2vec_embeddings, axis=0))

movies_titles_description["overview_vector"] = movies_titles_description["overview_tokens"].map(lambda x: avg_word2vec(x))
movies_titles_description["original_title_vector"] = movies_titles_description["original_title_tokens"].map(lambda x: avg_word2vec(x))

movies_titles_description.head()

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,original_title,overview,original_title_tokens,overview_tokens,overview_vector,original_title_vector
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[toy, story]","[led, woody, andy, toys, live, happily, room, ...","[0.066690445, 0.09592056, 0.019028902, 0.05512...","[0.13549805, 0.097717285, -0.06188965, 0.11779..."
1,Jumanji,When siblings Judy and Peter discover an encha...,[jumanji],"[siblings, judy, peter, discover, enchanted, b...","[0.031378668, 0.044890534, -0.028069574, 0.075...",
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[grumpier, old, men]","[family, wedding, reignites, ancient, feud, ne...","[0.043774772, 0.023932962, -0.023906035, 0.102...","[0.102864586, 0.12434896, 0.06526693, 0.038136..."
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[waiting, exhale]","[cheated, mistreated, stepped, women, holding,...","[0.017270016, 0.07522348, -0.007965088, 0.0510...","[0.12060547, 0.0087890625, 0.29052734, 0.05981..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[father, bride, part, ii]","[george, banks, recovered, daughter, wedding, ...","[0.030315053, 0.037719727, -0.0469291, 0.11105...","[-0.0490036, -0.05670166, 0.019943237, 0.01733..."


In [None]:
# OOV words: fastText
ft = gsm.KeyedVectors.load_word2vec_format('~/Downloads/wiki.en.zip', binary=True)
def avg_fastText(tokens):
    fastText_embeddings = [ft.wv[word] for word in tokens]
    return(np.average(fastText_embeddings, axis=0))

movies_titles_description["overview_vector_fastText"] = movies_titles_description["overview_tokens"].map(lambda x: avg_fastText(x))
movies_titles_description["original_title_vector_fastTex"] = movies_titles_description["original_title_tokens"].map(lambda x: avg_fastText(x))

movies_titles_description.head()

In [None]:
movies_titles.to_csv("movies_embbedings.csv", index=0)