In [1]:
import pandas as pd
import numpy as np
# Libraries for text preprocessing
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [2]:
# Load the movies table from the project-relative datasets folder.
df_movies = pd.read_csv('./datasets/movies_imdb.csv')
# Show per-column null counts among the rows that have a plot. The dropna()
# result is not assigned, so df_movies itself still holds any rows whose
# 'plots' value is missing.
df_movies.dropna(subset=['plots']).isna().sum()

cast           281
color          115
directors      267
genres           8
movieId          0
plots            0
producers      986
rating          12
runtimes        54
synopsis     18312
title            0
writers       1260
year             0
dtype: int64

In [3]:
# Flatten every plot into a single string by turning '|' into a space
# (presumably '|' separates several summaries of the same movie -- confirm).
# Missing plots become "" instead of the literal string "nan", so they yield an
# empty token list rather than a bogus 'nan' token. One entry is still produced
# per row, keeping `docs` aligned with `df_movies`.
df_movies['plots'] = df_movies['plots'].apply(
    lambda x: "" if pd.isna(x) else str(x).replace("|", " "))
stop_words = set(stopwords.words("english"))

# Build the NLTK helpers once instead of once per movie.
# `ps` is not used in this cell; it is kept in case a later cell relies on it.
ps = PorterStemmer()
lem = WordNetLemmatizer()

docs = []
# Iterate over the values directly so the loop does not depend on the frame
# having a default 0..n-1 index.
for plot in df_movies['plots']:
    # Replace everything that is not an ASCII letter (digits, punctuation,
    # markup characters) with a space, then lowercase. The former tag-removal
    # and "(\d|\W)+" substitutions ran after this step and could never change
    # the resulting tokens, so they were dropped.
    text = re.sub('[^a-zA-Z]', ' ', plot).lower()

    # Tokenise on whitespace, drop English stop words, lemmatise the rest.
    words = [lem.lemmatize(word) for word in text.split()
             if word not in stop_words]
    docs.append(words)

In [5]:
from gensim.models.phrases import Phrases, Phraser
# Learn bigram collocation statistics from the tokenised plots. min_count=30
# makes gensim ignore words and bigrams collected fewer than 30 times;
# progress_per controls how often progress is logged.
phrases = Phrases(docs, min_count=30, progress_per=10000)

In [6]:
# Inspect the token list produced for the first plot.
docs[0]

['cowboy',
 'doll',
 'profoundly',
 'threatened',
 'jealous',
 'new',
 'spaceman',
 'figure',
 'supplants',
 'top',
 'toy',
 'boy',
 'room',
 'little',
 'boy',
 'named',
 'andy',
 'love',
 'room',
 'playing',
 'toy',
 'especially',
 'doll',
 'named',
 'woody',
 'toy',
 'andy',
 'come',
 'life',
 'woody',
 'belief',
 'life',
 'toy',
 'good',
 'however',
 'must',
 'worry',
 'andy',
 'family',
 'moving',
 'woody',
 'know',
 'andy',
 'birthday',
 'party',
 'woody',
 'realize',
 'andy',
 'mother',
 'gave',
 'action',
 'figure',
 'known',
 'buzz',
 'lightyear',
 'believe',
 'toy',
 'quickly',
 'becomes',
 'andy',
 'new',
 'favorite',
 'toy',
 'woody',
 'consumed',
 'jealousy',
 'try',
 'get',
 'rid',
 'buzz',
 'woody',
 'buzz',
 'lost',
 'must',
 'find',
 'way',
 'get',
 'back',
 'andy',
 'move',
 'without',
 'pas',
 'ruthless',
 'toy',
 'killer',
 'sid',
 'phillips',
 'woody',
 'good',
 'hearted',
 'cowboy',
 'doll',
 'belongs',
 'young',
 'boy',
 'named',
 'andy',
 'see',
 'position',
 'an

In [8]:
# `phrases[docs]` is a lazy view: bigram detection would be re-run on every
# pass over the corpus (vocabulary build, each training epoch, frequency
# counts). Freeze the collocation model into a Phraser, which applies the same
# bigram decisions more cheaply, and materialise the phrased corpus once.
# The result is a plain list with one token list per plot, so indexing
# (`sentences[0]`) and repeated iteration behave as before.
bigram = Phraser(phrases)
sentences = [bigram[doc] for doc in docs]

In [11]:
# Inspect the first plot after bigram merging (joined tokens contain '_').
sentences[0]

['cowboy',
 'doll',
 'profoundly',
 'threatened',
 'jealous',
 'new',
 'spaceman',
 'figure',
 'supplants',
 'top',
 'toy',
 'boy',
 'room',
 'little',
 'boy_named',
 'andy',
 'love',
 'room',
 'playing',
 'toy',
 'especially',
 'doll',
 'named',
 'woody',
 'toy',
 'andy',
 'come',
 'life',
 'woody',
 'belief',
 'life',
 'toy',
 'good',
 'however',
 'must',
 'worry',
 'andy',
 'family',
 'moving',
 'woody',
 'know',
 'andy',
 'birthday_party',
 'woody',
 'realize',
 'andy',
 'mother',
 'gave',
 'action',
 'figure',
 'known',
 'buzz',
 'lightyear',
 'believe',
 'toy',
 'quickly',
 'becomes',
 'andy',
 'new',
 'favorite',
 'toy',
 'woody',
 'consumed',
 'jealousy',
 'try',
 'get_rid',
 'buzz',
 'woody',
 'buzz',
 'lost',
 'must',
 'find_way',
 'get',
 'back',
 'andy',
 'move',
 'without',
 'pas',
 'ruthless',
 'toy',
 'killer',
 'sid',
 'phillips',
 'woody',
 'good',
 'hearted',
 'cowboy',
 'doll',
 'belongs',
 'young_boy',
 'named',
 'andy',
 'see',
 'position',
 'andy',
 'favorite',
 '

In [26]:

# defaultdict is not imported by any cell above; import it here so this cell
# does not raise NameError on a fresh kernel.
from collections import defaultdict

# Count the occurrences of every token (single words and merged bigrams)
# across the phrased corpus.
word_freq = defaultdict(int)
for sent in sentences:
    for token in sent:
        word_freq[token] += 1
# Vocabulary size: number of distinct tokens.
len(word_freq)

75313

In [33]:
from operator import itemgetter
# Ten most frequent tokens, highest count first. The sort is stable, so tokens
# with equal counts keep their insertion order in word_freq.
[token for token, _count in
 sorted(word_freq.items(), key=itemgetter(1), reverse=True)[:10]]


['life',
 'one',
 'find',
 'get',
 'two',
 'family',
 'man',
 'friend',
 'take',
 'father']

In [34]:
import multiprocessing

from gensim.models import Word2Vec

In [35]:
# Number of CPUs reported by the OS; used below to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()
cores

8

In [50]:
from time import time

# Leave one core free for the rest of the system, but never ask for fewer than
# one worker: `cores - 1` is 0 on a single-core machine, which would leave
# Word2Vec without any training thread.
n_workers = max(1, cores - 1)

# Hyperparameters (gensim 3.x argument names):
#   min_count  - ignore tokens with a total frequency below 20
#   window     - maximum distance between the current and the predicted token
#   size       - dimensionality of the word vectors
#   sample     - threshold for down-sampling very frequent tokens
#   alpha      - initial learning rate, decayed linearly down to min_alpha
#   negative   - number of "noise" tokens drawn for negative sampling
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=n_workers)
t = time()

# Scan the corpus once to collect the vocabulary before training.
w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.32 mins


In [51]:
# Restart the timer for the training phase.
t = time()

# Train for 30 passes over the corpus; total_examples reuses the sentence count
# recorded on the model, and report_delay=1 sets the progress-report delay to
# one second.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 8.19 mins


In [52]:
# Precompute L2-normalised word vectors. Per the gensim 3.x docs, replace=True
# discards the original vectors to save memory, so the model cannot be trained
# further after this call.
w2v_model.init_sims(replace=True)

In [55]:
# Sanity check: nearest neighbours of "friend" in the learned embedding space.
w2v_model.wv.most_similar(positive=["friend"])

[('best_friend', 0.5820000171661377),
 ('girlfriend', 0.42186182737350464),
 ('childhood_friend', 0.4153825044631958),
 ('bff', 0.4146702289581299),
 ('boyfriend', 0.408772349357605),
 ('acquaintance', 0.4053356349468231),
 ('pal', 0.39235299825668335),
 ('rhonda', 0.39014139771461487),
 ('close_friend', 0.37696152925491333),
 ('schoolmate', 0.3760896325111389)]

In [68]:
# Look up the vectors of the first plot's tokens, skipping tokens that are not
# in the model vocabulary; the result has one row per retained token.
span_plot = w2v_model.wv[[w for w in sentences[0] if w in w2v_model.wv.vocab]]

In [69]:
# (number of in-vocabulary tokens, embedding dimension)
span_plot.shape

(244, 300)

In [71]:
# NOTE(review): a star import binds every public SymPy name in the notebook
# namespace. SymPy exports a function called `re`, so after this cell the name
# `re` presumably no longer refers to the regex module imported at the top, and
# re-running the preprocessing cell would fail. Only `Matrix` is used by the
# cells visible here; consider `from sympy import Matrix` -- confirm that no
# later cell needs other SymPy names first.
from sympy import *

In [72]:
# Wrap the word-vector array in a SymPy Matrix so rref() can be applied to it.
m = Matrix(span_plot)

In [73]:
# Reduced row echelon form; rref() returns a tuple whose first element is the
# reduced matrix (the next cell takes index 0).
plot2m = m.rref()

In [79]:
# Keep only the reduced matrix from the rref() result and convert it to a
# float64 NumPy array.
# NOTE: this rebinds plot2m from the rref() result to an array, so running the
# cell a second time would index into the array instead.
plot2m = np.array(plot2m[0]).astype(np.float64)

In [80]:
# The reduced matrix keeps the shape of the input matrix.
plot2m.shape

(244, 300)

In [85]:
# Drop the rows that consist entirely of zeros.
plot2m = plot2m[~np.all(plot2m == 0, axis=1)]

In [90]:
# Summing the remaining rows column-wise gives one vector per plot; check its shape.
np.sum(plot2m,axis=0).shape

(300,)

In [129]:
def plot2vecRref(plot):
    """Embed a tokenised plot via the RREF of its word-vector matrix.

    The vectors of the plot's in-vocabulary tokens are placed as the columns
    of a matrix, which is reduced to row echelon form with SymPy. The reduced
    matrix is transposed back (one row per token), truncated to at most
    ``embedding_dim`` rows, stripped of all-zero rows, and summed row-wise.

    Parameters
    ----------
    plot : iterable of str
        Tokens of one plot; tokens missing from ``w2v_model.wv.vocab`` are
        ignored.

    Returns
    -------
    numpy.ndarray
        1-D float64 vector of length ``embedding_dim``. A zero vector is
        returned when none of the tokens is in the vocabulary, instead of
        failing on an empty vector lookup.

    Notes
    -----
    SymPy's ``rref`` is slow on matrices of this size: the notebook measured
    about 1.5 minutes for a single plot.
    """
    known_words = [w for w in plot if w in w2v_model.wv.vocab]
    if not known_words:
        # Nothing to look up: stacking zero vectors would raise, so return a
        # neutral embedding of the right length.
        return np.zeros(w2v_model.wv.vector_size, dtype=np.float64)

    span_plot = w2v_model.wv[known_words]
    embedding_dim = span_plot.shape[1]

    # rref() returns (reduced_matrix, pivot_columns); only the matrix is used.
    reduced, _pivots = Matrix(span_plot.T).rref()

    # Back to one row per token; keep at most `embedding_dim` rows.
    plot2m = np.array(reduced).astype(np.float64).T[:embedding_dim]
    # Discard rows that are entirely zero before summing.
    plot2m = plot2m[~np.all(plot2m == 0, axis=1)]
    return np.sum(plot2m, axis=0)

In [130]:
# RREF-based embedding of the first plot.
plot0rref= plot2vecRref(sentences[0])

In [133]:
# Time a single plot2vecRref call on the second plot to gauge its cost.
t = time()
plot1rref = plot2vecRref(sentences[1])
print('Time to compute vector: {} mins'.format(round((time() - t) / 60, 2)))

Time to compute vector: 1.57 mins


In [132]:
# Cosine similarity between the RREF-based vectors of the first two plots;
# reshape(1, -1) turns each 1-D vector into the single-row 2-D array that
# cosine_similarity expects.
cosine_similarity(plot0rref.reshape(1,-1), plot1rref.reshape(1,-1))

array([[0.46489411]])

In [122]:
def plot2vecSum(plot):
    """Embed a tokenised plot as the sum of its tokens' word vectors.

    Parameters
    ----------
    plot : iterable of str
        Tokens of one plot; tokens missing from ``w2v_model.wv.vocab`` are
        ignored.

    Returns
    -------
    numpy.ndarray
        1-D vector with one entry per embedding dimension. A zero vector is
        returned when none of the tokens is in the vocabulary, instead of
        failing on an empty vector lookup.
    """
    known_words = [w for w in plot if w in w2v_model.wv.vocab]
    if not known_words:
        # No vectors to add up: return a neutral embedding of the right length.
        return np.zeros(w2v_model.wv.vector_size, dtype=np.float32)
    span_plot = w2v_model.wv[known_words]
    return np.sum(span_plot, axis=0)

In [125]:
# Sum-of-word-vectors embeddings of the first two plots.
plot0 = plot2vecSum(sentences[0])
plot1 = plot2vecSum(sentences[1])

In [128]:
 # Cosine similarity between the summed-vector embeddings of the first two plots.
 cosine_similarity(plot0.reshape(1,-1), plot1.reshape(1,-1))

array([[0.555506]], dtype=float32)