In [None]:
import csv
import json
import pickle
import re

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from gensim.models import KeyedVectors
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer

from nltk import sent_tokenize
from nltk import pos_tag
from nltk import map_tag
from nltk import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Movie metadata: tab-separated, no header row, so the nine columns are named here.
# Only 'movie_id', 'movie_name' and 'genre' are used below; the rest keep integer names.
metadata = pd.read_csv("../data/movie.metadata.tsv", sep = '\t', header = None)
metadata.columns = ["movie_id",1,"movie_name",3,4,5,6,7,"genre"]

# Plot summaries: each row is read as [movie_id, plot].
# The encoding is pinned so decoding does not depend on the platform locale
# (assumes the corpus file is UTF-8 -- TODO confirm), and newline='' is what the
# csv module asks for when it is handed a file object.
# NOTE(review): the 'excel-tab' dialect treats '"' as a quote character, so a plot
# that starts with a double quote may be parsed differently than expected -- verify.
with open("../data/plot_summaries.txt", 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, dialect='excel-tab')
    plots = [row for row in tqdm(reader)]

# extract movie Ids and plot summaries
movie_id = [row[0] for row in plots]
plot = [row[1] for row in plots]

# create dataframe
movies = pd.DataFrame({'movie_id': movie_id, 'plot': plot})

# the ids read from the plot file are strings, so cast the metadata key to match
metadata['movie_id'] = metadata['movie_id'].astype(str)

# merge meta with movies (default inner join: only movies present in both files survive)
movies = pd.merge(movies, metadata[['movie_id', 'movie_name', 'genre']], on = 'movie_id')

# each 'genre' cell is a JSON object; keep only its values as a list
genres = [list(json.loads(raw).values()) for raw in movies['genre']]

# add to 'movies' dataframe
movies['genre_new'] = genres

# Drop movies without any genre. The explicit .copy() makes movies_new an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
movies_new = movies[movies['genre_new'].str.len() != 0].copy()

def clean_text(text):
    """Normalise a raw plot summary into lowercase, single-spaced text.

    Steps, in order:
      1. Replace template markers such as ``{{plot}}`` (and the whitespace
         around them) with a single space, so the words on either side of a
         marker are not glued together.
      2. Remove every apostrophe character.
      3. Lowercase and turn newlines, tabs and non-breaking spaces into spaces.
      4. Collapse runs of whitespace and strip both ends.

    Parameters
    ----------
    text : str
        Raw plot text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing remains.
    """
    # Raw string avoids invalid-escape warnings; braces are escaped so they are
    # unambiguously literal. A space (not "") is substituted because the pattern
    # also consumes the surrounding whitespace.
    text = re.sub(r"\s*\{\{\w*\}\}\s*", " ", text)
    # remove apostrophes ("\'" in a normal string literal is just "'")
    text = text.replace("'", "")

    text = text.lower().replace('\n', ' ').replace('\t', ' ').replace('\xa0', ' ')  # get rid of problem chars

    # split()/join collapses repeated whitespace and trims the ends
    return ' '.join(text.split())

# Add the cleaned plot as a new column. assign() returns a new frame instead of
# writing into movies_new in place, which avoids SettingWithCopyWarning when
# movies_new is a filtered view of another frame, and keeps the cell re-runnable.
movies_new = movies_new.assign(clean_plot=movies_new['plot'].apply(clean_text))

In [None]:
def doc2vec(data_df):
    """Train a Doc2Vec model on the cleaned plots and build feature/label arrays.

    Every row's ``clean_plot`` is split into sentences and then words with NLTK
    and wrapped in a ``TaggedDocument`` whose single tag is the row's
    ``movie_id`` with any whitespace replaced by underscores. A Doc2Vec model
    (300-dimensional vectors, 20 epochs) is trained on those documents, and the
    learned vector of each row's tag becomes that row's feature vector.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain the columns ``movie_id`` (str), ``clean_plot`` (str) and
        ``genre_new`` (a collection of genre labels per row).

    Returns
    -------
    x_data : numpy.ndarray
        One document vector per row of ``data_df``, in row order.
    y_data : array
        Result of ``MultiLabelBinarizer().fit_transform`` on ``genre_new``.
        The fitted binarizer is not returned, so the column-to-genre mapping
        is not available to the caller.
    """
    print("Building TaggedDocuments")
    total = len(data_df)
    tags = []
    data = []
    # Series.tolist() replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    rows = zip(data_df['movie_id'].tolist(), data_df['clean_plot'].tolist())
    for processed, (movie_id, clean_plot) in enumerate(rows, start=1):
        tag = "_".join(movie_id.split())
        words = []
        for sentence in sent_tokenize(clean_plot):
            words.extend(word_tokenize(sentence))
        data.append(TaggedDocument(words, [tag]))
        tags.append(tag)

        if processed % 10000 == 0:
            print(processed, "/", total)

    # `vector_size` is the current name of the old `size` argument (removed in gensim 4).
    model = Doc2Vec(min_count=1, window=10, vector_size=300, sample=1e-5, negative=5, workers=2, epochs=20, min_alpha=0.00025)

    print("Building Vocabulary")
    model.build_vocab(data)

    print("Training starts")

    # The corpus is passed positionally because its keyword name differs between
    # gensim 3.x (`documents`) and 4.x (`corpus_iterable`).
    model.train(data, total_examples=model.corpus_count, epochs=model.epochs)

    # Build doc2vec vectors. gensim 4 exposes them as `dv`; older releases as `docvecs`.
    doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
    x_data = np.array([doc_vectors[tag] for tag in tags])

    # Multi-hot encode the genre lists.
    binarizer = MultiLabelBinarizer()
    y_data = binarizer.fit_transform(data_df['genre_new'].tolist())

    return x_data, y_data


In [None]:
# Train Doc2Vec on the cleaned plots: x_data holds one document vector per movie,
# y_data the binarized genre labels for the same rows.
x_data, y_data = doc2vec(movies_new)

In [None]:
# Cache the features and labels on disk so the Doc2Vec training does not have to
# be repeated in later sessions. (`pickle` is imported in the top import cell.)
with open('doc2vec_data.pkl', 'wb') as f:
    pickle.dump((x_data, y_data), f)

In [None]:
# Reload the cached (x_data, y_data) tuple written by the dump cell.
# NOTE: pickle.load can execute arbitrary code from the file -- only load a
# doc2vec_data.pkl that this notebook produced.
with open('doc2vec_data.pkl', 'rb') as f:
    x_data, y_data = pickle.load(f)