In [1]:
# For Word2Vec
import gensim 
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# For text preprocessing
import nltk
from nltk.corpus import stopwords
import string
from string import digits

# Generics
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import json
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import numpy as np
import multiprocessing

In [2]:
# Shared text-normalisation helpers, built once and reused by the preprocessing functions.
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# Stored as a set: preprocess_doc tests every token for membership, and a set
# answers that in constant time instead of scanning the whole stop-word list.
stop_words = set(stopwords.words('english'))

In [18]:
def preprocess_word(word):
    """
    INPUT : String (a single word)
    OUTPUT: String, lemmatized and then stemmed
    Normalise one word in two steps:
    - Lemmatization with the module-level `lemmatizer`
    - Stemming of the resulting lemma with the module-level `stemmer`
    """
    lemma = lemmatizer.lemmatize(word)
    stemmed = stemmer.stem(lemma)
    return stemmed

def preprocess_doc(document):
    """
    INPUT : String (raw document text)
    OUTPUT: List of strings
    Lower-case the text, delete every character listed in
    string.punctuation, split on whitespace, discard stop words and
    normalise each remaining word with preprocess_word.
    """
    # The table only needs to delete punctuation; nothing is remapped.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    words = document.lower().translate(strip_punctuation).split()

    cleaned = []
    for candidate in words:
        if candidate in stop_words:
            continue
        cleaned.append(preprocess_word(candidate))
    return cleaned


def tag(list_of_docs):
    """
    INPUT : Iterable of documents (lists of tokens in this notebook)
    OUTPUT: Generator of TaggedDocument objects
    Lazily pair every document of the training corpus with a
    one-element tag list holding its position in the input.
    """
    position = 0
    for tokens in list_of_docs:
        yield TaggedDocument(tokens, [position])
        position += 1
        
def getvector(query):
    """
    INPUT: String
    OUTPUT: np.array containing the embeddings for query
    """
    # Clean and tokenise the query, then let the model infer its embedding
    tokens = preprocess_doc(query)
    return model.infer_vector(tokens)
        
def query(query, n_results=5):
    """
    INPUT: String to query, number of docs to retrieve
    OUTPUT: Dataframe with the columns Name, Plot and distance for the
            n documents most similar to the query, in the order returned
            by most_similar

    Reads the module-level `model` and `df`; `df` itself is never modified.
    NOTE(review): the 'distance' column holds the score returned by
    most_similar, which looks like a similarity (higher = closer) rather
    than a distance -- confirm against the gensim documentation.
    """
    # Embed the query text
    query_vector = getvector(query)

    # Each match is a (document tag, score) pair
    matches = model.docvecs.most_similar([query_vector], topn=n_results)
    doc_ids = [doc_id for doc_id, _ in matches]
    scores = [score for _, score in matches]

    # Document tags are row positions (tag() numbers documents with
    # enumerate), so rows are selected by position: this stays correct even
    # when df's index labels are not 0..n-1.
    # .assign() builds a new frame instead of writing into a selection of df.
    matched = df.iloc[doc_ids].assign(distance=scores)
    return matched[['Name', 'Plot', 'distance']]

##### Read the preprocessed DF

In [19]:
# Directory the preprocessed DataFrame pickle is read from
path   = "./preprocessed/"
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust
df     = pd.read_pickle(path+"df_preprocessed.pkl")
# Preview the first five rows
df.head(5)

Unnamed: 0,Wiki_ID,Plot,Name,genres,len,Corpus,genres_all
0,0,The nation of Panem consists of a wealthy Capi...,The Hunger Games,Action/Adventure,4559,"[nation, panem, consist, wealthi, capitol, twe...","[Action/Adventure, Science Fiction, Action, Dr..."
1,1,Poovalli Induchoodan is sentenced for six yea...,Narasimham,Musical,3099,"[pooval, induchoodan, sentenc, six, year, pris...","[Musical, Action, Drama, Bollywood]"
2,2,"The Lemon Drop Kid , a New York City swindler,...",The Lemon Drop Kid,Screwball comedy,4917,"[lemon, drop, kid, new, york, citi, swindler, ...","[Screwball comedy, Comedy]"
3,3,Seventh-day Adventist Church pastor Michael Ch...,A Cry in the Dark,Crime Fiction,2425,"[seventhday, adventist, church, pastor, michae...","[Crime Fiction, Drama, Docudrama, World cinema..."
4,4,The president is on his way to give a speech. ...,End Game,Thriller,1937,"[presid, way, give, speech, travel, man, show,...","[Thriller, Action/Adventure, Action, Drama]"


##### Tag every movie (row) with its position and collect the tagged documents in a list that is iterated over during training

In [5]:
# tag() yields lazily; list() materialises every TaggedDocument up front.
# Each entry pairs one row's 'Corpus' token list with that row's position.
Corpus_Tagged = list(tag(df['Corpus'].tolist()))

### Train a Doc2Vec model
----

**Word2Vec** is a more recent model that embeds words in a lower-dimensional vector space using a shallow neural network.
The result is a set of word-vectors where vectors close together in vector space have similar meanings based on context, and word-vectors distant to each other have differing meanings. 

With the Word2Vec model, one can calculate the vectors for each word in a document. But what if we want to calculate a vector for the entire document?

Two possibilities:

- Average the vectors for each word in the document
- Doc2Vec (Le & Mikolov, 2014)


There are two implementations:
- **Distributed Memory (PV-DM)**:  task of predicting a center word based on an average of both context word-vectors and the full document’s doc-vector
- **Distributed Bag of Words (PV-DBOW)**: task of predicting a target word just from the full document’s doc-vector.



In [6]:
# Parameters for the training

dm   = 1                               # Defines the training algorithm.  (dm=1) --> (PV-DM),  (dm=0) --> (PV-DBOW)
size = 100                             # Dimensionality of the feature vectors
window = 5                             # How many neighbours (to the left & to the right) will be analyzed for the current word
min_count = 5                          # Ignore all words with total frequency lower than this.
epochs = 40                            # Number of passes over the training corpus
workers = multiprocessing.cpu_count()  # Faster training with multicore machines

# Every setting above is passed explicitly, so editing a value in this cell
# really changes how the model is trained.
# NOTE(review): training with several workers is generally not bit-for-bit
# reproducible between runs -- confirm against the gensim documentation if
# exact reproducibility matters.
model = Doc2Vec(Corpus_Tagged,
                dm=dm,
                vector_size=size,
                window=window,
                workers=workers,
                min_count=min_count,
                epochs=epochs)

#### Save the model

In [7]:
# Directory the trained model is written to. This rebinds `path`, which the
# data-loading cell used for the preprocessed-data directory.
# NOTE(review): the directory is not created here -- presumably it must already exist; confirm.
path   = "./models/"
model.save(path+ "doc2vec_100_Movies")

#### Load the model

In [4]:
# Load the model from the same location the save cell writes to
# ("./models/" + "doc2vec_100_Movies"), rebinding the global `model`.
model = Doc2Vec.load("./models/doc2vec_100_Movies")

In [5]:
# Replace the index with a fresh 0..n-1 range (the old index is dropped), so
# that row labels coincide with row positions -- the document tags produced
# by tag() are positions.
df = df.reset_index(drop=True)
# Display the frame (the notebook shows a truncated view)
df

Unnamed: 0,Wiki_ID,Plot,Name,genres,len,Corpus,genres_all
0,0,The nation of Panem consists of a wealthy Capi...,The Hunger Games,Action/Adventure,4559,"[nation, panem, consist, wealthi, capitol, twe...","[Action/Adventure, Science Fiction, Action, Dr..."
1,1,Poovalli Induchoodan is sentenced for six yea...,Narasimham,Musical,3099,"[pooval, induchoodan, sentenc, six, year, pris...","[Musical, Action, Drama, Bollywood]"
2,2,"The Lemon Drop Kid , a New York City swindler,...",The Lemon Drop Kid,Screwball comedy,4917,"[lemon, drop, kid, new, york, citi, swindler, ...","[Screwball comedy, Comedy]"
3,3,Seventh-day Adventist Church pastor Michael Ch...,A Cry in the Dark,Crime Fiction,2425,"[seventhday, adventist, church, pastor, michae...","[Crime Fiction, Drama, Docudrama, World cinema..."
4,4,The president is on his way to give a speech. ...,End Game,Thriller,1937,"[presid, way, give, speech, travel, man, show,...","[Thriller, Action/Adventure, Action, Drama]"
...,...,...,...,...,...,...,...
21101,21101,Lucy is working as a dancer in a sleazy strip...,I Don't Want to Be Born,Horror,4801,"[luci, work, dancer, sleazi, strip, joint, sta...",[Horror]
21102,21102,Twenty-something Eun-mo listens to a taxi driv...,Paju,Romantic drama,1406,"[twentysometh, eunmo, listen, taxi, driver, dr...","[Romantic drama, Romance Film, Drama, World ci..."
21103,21103,"In 1928 Hollywood, director Leo Andreyev look...",The Last Command,Silent film,2971,"[1928, hollywood, director, leo, andreyev, loo...","[Silent film, Indie, Black-and-white, Period p..."
21104,21104,"Abdur Rehman Khan , a middle-aged dry fruit se...",Kabuliwala,Drama,1289,"[abdur, rehman, khan, middleag, dri, fruit, se...",[Drama]


In [6]:
def sanity_check(query, id):
    """
    INPUT : String (text to look up), expected document tag
    OUTPUT: Result of comparing the best match's tag with `id`
    Embed the text with getvector, ask the model for its single most
    similar document and check whether that document's tag equals `id`.
    """
    query_vector = getvector(query)
    # most_similar returns a list of matches; the tag is the first field of a match
    top_match = model.docvecs.most_similar([query_vector], topn=1)[0]
    return top_match[0] == id
 
def test_coherence(df, nitem = 500, verbose=0, seed=1):
    df_temp = df.sample(nitem)
    
    #print(df_temp)
    
    ids    =  df_temp['Wiki_ID'].tolist()
    docs    = df_temp['Plot'].tolist()
    titles  = df_temp['Name'].tolist()
    
    wrongs = 0
    for i, doc,t in zip(ids,docs,titles):
        if(sanity_check(doc,i) == False ):
            if verbose == 1:
                print(i)
                print("WRONG: [" + t + "] \n ")
            wrongs += 1
            
    print("\n\n ***** Correctness: [ " + str(((nitem-wrongs)/nitem * 100)) + "% ] ***** with seed "+ str(seed)+" \n \t \t " )    

In [12]:
# Check 300 sampled plots; verbose=1 prints the id and title of every miss
test_coherence(df,300, verbose=1, seed=2)



 ***** Correctness: [ 100.0% ] ***** with seed 2 
 	 	 
