### Description

We have textual dialogue data taken from IMDB. Each dialogue text maps to the genre(s) of its movie. The goal of the model is to predict a movie's genre from a given dialogue text taken from that movie.

The dialogues were pre-processed in a separate notebook. Several models were then built using different libraries; they are presented below.

In [None]:
### Imports ###

import matplotlib
from matplotlib import pyplot as plt
matplotlib.pyplot.style.use('ggplot')

from sklearn import datasets, linear_model, metrics, model_selection, pipeline, preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


import numpy as np
import pandas as pd
import random
import nltk
from bs4 import BeautifulSoup
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re
from gensim.models import doc2vec

%matplotlib inline

In [None]:
### Load the pre-processed train and test data (stemmed and lemmatized variants) ###

train_data_stem, test_data_stem = (
    pd.read_csv(csv_path)
    for csv_path in ('clean_train_stem.csv', 'clean_test_stem.csv')
)
train_data_lemma, test_data_lemma = (
    pd.read_csv(csv_path)
    for csv_path in ('clean_train_lemma.csv', 'clean_test_lemma.csv')
)

In [None]:
def split_and_choose(text, rng=None):
    """Pick one genre at random from a comma-separated genre string.

    Parameters
    ----------
    text : str
        Genres separated by commas, e.g. ``"drama, comedy"``.
    rng : object with a ``choice`` method, optional
        Source of randomness (for example a ``random.Random`` instance).
        Defaults to the global ``random`` module, so ``random.seed(...)``
        controls the outcome.

    Returns
    -------
    str
        One genre with surrounding whitespace removed. Blank fragments
        (e.g. from a trailing comma) are never selected; if the string holds
        nothing but blanks, an empty string is returned.
    """
    chooser = random if rng is None else rng
    # Strip first, then drop empties, so "drama, ," can only yield "drama".
    candidates = [piece.strip() for piece in text.split(",")]
    candidates = [piece for piece in candidates if piece]
    if not candidates:
        return ""
    return chooser.choice(candidates)

# Reduce each row's comma-separated genre string to a single, randomly chosen genre.
# The RNG is seeded before each pass so the labels (and every score below) are
# reproducible across kernel restarts, and each line can be re-run on its own.
random.seed(0)
train_data_stem['genres'] = train_data_stem['genres'].apply(split_and_choose)
random.seed(0)
train_data_lemma['genres'] = train_data_lemma['genres'].apply(split_and_choose)

In [None]:
# Hold out 20% of each labelled frame for validation; random_state is fixed for both splits.
data_stem, holdout_data_stem = model_selection.train_test_split(
    train_data_stem,
    test_size=0.2,
    random_state=1,
)
data_lemma, holdout_data_lemma = model_selection.train_test_split(
    train_data_lemma,
    test_size=0.2,
    random_state=1,
)

## Vectorizer

In [None]:
# Bag-of-words features for the stemmed dialogues.
# `stop_words` must be the *string* 'english' to select scikit-learn's built-in
# stop-word list. The set {'english'} is treated as a custom list containing only
# the token "english" (and is rejected outright by recent scikit-learn releases).
vectorizer = CountVectorizer(min_df=3, stop_words='english')

# Cast every split to unicode (as the TF-IDF section does) so a missing dialogue
# (NaN) cannot crash the tokenizer on one split but not another.
vectorized_train_data = vectorizer.fit_transform(data_stem['dialogue'].values.astype('U'))
vectorized_test_data = vectorizer.transform(holdout_data_stem['dialogue'].values.astype('U'))
vectorized_final_data = vectorizer.transform(test_data_stem['dialogue'].values.astype('U'))

In [None]:
# Baseline classifier: logistic regression on the count features, predicting the genre.
model = linear_model.LogisticRegression(max_iter=1000)
model.fit(X=vectorized_train_data, y=data_stem['genres'])

In [None]:
# Predict genres for the training split, the holdout split and the final test set.
train_preds, test_preds, final_preds = (
    model.predict(features)
    for features in (vectorized_train_data, vectorized_test_data, vectorized_final_data)
)

In [None]:
# Ground-truth genres as NumPy arrays, in the same order as the predictions.
np_true_train, np_true_test = (
    frame['genres'].to_numpy()
    for frame in (data_stem, holdout_data_stem)
)

In [None]:
# Report accuracy on the data the model was fitted on and on the held-out 20%.
print(f'train accuracy: {metrics.accuracy_score(np_true_train, train_preds):.3f}')
print(f'test accuracy: {metrics.accuracy_score(np_true_test, test_preds):.3f}')

In [None]:
# Load the sample submission; it is used as the template for our own submission file.
example = pd.read_csv('sample_submission_most_popular.csv')

In [None]:
# Overwrite the template's genre column with the predictions for the final test set.
# NOTE(review): assumes the template rows are in the same order as test_data_stem — confirm.
example['genres'] = final_preds

In [None]:
# Save the count-vectorizer submission without the DataFrame index column.
example.to_csv('stem.csv', index=False)

## Doc2Vec

In [None]:
def label_sentences(corpus, label_type):
    """Wrap every text in ``corpus`` as a gensim ``TaggedDocument``.

    Each text is split on whitespace and receives a single tag of the form
    ``"<label_type>_<position>"``, where ``position`` is the text's 0-based
    position in ``corpus``. The returned list keeps the order of ``corpus``.
    """
    return [
        doc2vec.TaggedDocument(text.split(), [label_type + '_' + str(position)])
        for position, text in enumerate(corpus)
    ]

# 70/30 split of the stemmed data for the Doc2Vec experiment.
train_doc, test_doc = model_selection.train_test_split(
    train_data_stem,
    test_size=0.3,
    random_state=0,
)
y_train, y_test = train_doc['genres'], test_doc['genres']

# Tag the dialogues ('Train_<i>' / 'Test_<i>') so their vectors can be looked up after training.
X_train = label_sentences(train_doc['dialogue'], 'Train')
X_test = label_sentences(test_doc['dialogue'], 'Test')
all_data = X_train + X_test

In [None]:
# Distributed bag-of-words (dm=0) Doc2Vec trained on the train + holdout documents.
# The vector size is left at gensim's default.
# alpha / min_alpha are the start and end learning rates: gensim decays the rate
# linearly between them over a single train() call, which replaces the previous
# hand-rolled loop (0.065 lowered by 0.002 per epoch for 30 epochs, ending at 0.005).
model_dbow = Doc2Vec(dm=0, negative=5, min_count=1, alpha=0.065, min_alpha=0.005)
model_dbow.build_vocab(all_data)

# all_data is ordered (all train documents, then all test documents), so shuffle it
# once with a fixed seed. train() is called once, as gensim recommends, instead of
# being called repeatedly while mutating model.alpha by hand.
model_dbow.train(
    utils.shuffle(all_data, random_state=0),
    total_examples=len(all_data),
    epochs=30,
)

In [None]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """Collect trained document vectors into a dense 2-D array.

    Parameters
    ----------
    model : trained Doc2Vec-like object
        Must expose its document vectors as ``model.dv`` (gensim >= 4) or
        ``model.docvecs`` (older gensim), indexable by tag.
    corpus_size : int
        Number of documents to fetch; tags ``"<vectors_type>_0"`` up to
        ``"<vectors_type>_<corpus_size - 1>"`` are looked up.
    vectors_size : int
        Dimensionality of each document vector.
    vectors_type : str
        Tag prefix used when the documents were labelled (e.g. ``'Train'``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(corpus_size, vectors_size)``; row ``i`` holds the
        vector tagged ``"<vectors_type>_<i>"``.
    """
    # Prefer the current attribute name and fall back to the deprecated one,
    # so the function works across gensim versions.
    doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(corpus_size):
        vectors[i] = doc_vectors[vectors_type + '_' + str(i)]
    return vectors
    
# Pull the learned document vectors out of the model. The width comes from the
# model itself rather than a hard-coded 100, so it stays correct if the
# Doc2Vec vector size is ever changed.
train_vectors_dbow = get_vectors(model_dbow, len(X_train), model_dbow.vector_size, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), model_dbow.vector_size, 'Test')

In [None]:
# Logistic regression on the Doc2Vec vectors; C=1e5 means very weak regularization.
# The model is fitted once: the previous second, identical fit() call only
# repeated the same work.
logreg = linear_model.LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(train_vectors_dbow, y_train)

In [None]:
# Predict genres for the Doc2Vec training vectors and the held-out vectors.
y_train_pred, y_holdout_pred = (
    logreg.predict(doc_vectors)
    for doc_vectors in (train_vectors_dbow, test_vectors_dbow)
)

In [None]:
# Report accuracy of the Doc2Vec + logistic regression model on the 70% train and 30% holdout parts.
print(f'train accuracy: {metrics.accuracy_score(y_train , y_train_pred):.3f}')
print(f'test accuracy: {metrics.accuracy_score(y_test , y_holdout_pred):.3f}')

## TF-IDF

In [None]:
# Re-create the 80/20 split of the stemmed data (same arguments as the split used
# for the count-vectorizer model). The lemmatized frames are not re-split here.
data_stem, holdout_data_stem = model_selection.train_test_split(
    train_data_stem,
    test_size=0.2,
    random_state=1,
)

In [None]:
# TF-IDF features for the stemmed dialogues.
# `stop_words` must be the *string* 'english' to select scikit-learn's built-in
# stop-word list. The set {'english'} is treated as a custom list containing only
# the token "english" (and is rejected outright by recent scikit-learn releases).
# min_df / max_df are absolute document counts: keep terms appearing in at least 3
# and at most 6000 documents. Tokens are runs of two or more word characters.
tfidf_vect = TfidfVectorizer(strip_accents='unicode', stop_words='english',
                             min_df=3, max_df=6000,
                             analyzer='word', token_pattern=r'\w{2,}')

# Learn the vocabulary and IDF weights on the training split only, then reuse them
# for the holdout and final test sets.
tfidf_train = tfidf_vect.fit_transform(data_stem['dialogue'].values.astype('U'))
tfidf_test = tfidf_vect.transform(holdout_data_stem['dialogue'].values.astype('U'))
tfidf_final = tfidf_vect.transform(test_data_stem['dialogue'].values.astype('U'))

In [None]:
# Number of distinct terms kept by the TF-IDF vectorizer (i.e. the feature count).
len(tfidf_vect.vocabulary_)

In [None]:
# Logistic regression on the TF-IDF features, predicting the genre.
tfidf_model = linear_model.LogisticRegression(max_iter=1000)
tfidf_model.fit(X=tfidf_train, y=data_stem['genres'])

In [None]:
# Predict genres for the training split, the holdout split and the final test set.
train_preds, test_preds, final_preds = (
    tfidf_model.predict(features)
    for features in (tfidf_train, tfidf_test, tfidf_final)
)

In [None]:
# Ground-truth genres as NumPy arrays, in the same order as the predictions.
np_true_train, np_true_test = (
    frame['genres'].to_numpy()
    for frame in (data_stem, holdout_data_stem)
)

In [None]:
# Report accuracy of the TF-IDF model on the data it was fitted on and on the held-out 20%.
print(f'train accuracy: {metrics.accuracy_score(np_true_train, train_preds):.3f}')
print(f'test accuracy: {metrics.accuracy_score(np_true_test, test_preds):.3f}')


In [None]:
# Reload the sample submission as a fresh template for the TF-IDF submission.
example = pd.read_csv('sample_submission_most_popular.csv')

In [None]:
# Overwrite the template's genre column with the TF-IDF model's predictions for the final test set.
# NOTE(review): assumes the template rows are in the same order as test_data_stem — confirm.
example['genres'] = final_preds

In [None]:
# Save the TF-IDF submission without the DataFrame index column.
example.to_csv('tf-idf.csv', index=False)

In [None]:
# Number of distinct genre labels in the training split (i.e. the number of classes).
data_stem['genres'].nunique()