In [1]:
import re
import string

import nltk
import pickle
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA, TruncatedSVD
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

In [2]:
wiki_data_all = pd.read_csv('./data/wiki_movie_plots_deduped.csv')
wiki_data_all = wiki_data_all[~wiki_data_all['Genre'].str.contains('unknown')]
wiki_data_all

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
10,1906,Dream of a Rarebit Fiend,American,Wallace McCutcheon and Edwin S. Porter,,short,https://en.wikipedia.org/wiki/Dream_of_a_Rareb...,The Rarebit Fiend gorges on Welsh rarebit at a...
11,1906,From Leadville to Aspen: A Hold-Up in the Rockies,American,Francis J. Marion and Wallace McCutcheon,,short action/crime western,https://en.wikipedia.org/wiki/From_Leadville_t...,The film features a train traveling through th...
12,1906,Kathleen Mavourneen,American,Edwin S. Porter,,short film,https://en.wikipedia.org/wiki/Kathleen_Mavourn...,Irish villager Kathleen is a tenant of Captain...
...,...,...,...,...,...,...,...,...
34877,2013,Particle (film),Turkish,Erdem Tepegöz,"Jale Arıkan, Rüçhan Caliskur, Özay Fecht, Remz...",drama film,https://en.wikipedia.org/wiki/Particle_(film),"Zeynep lost her job at weaving factory, and he..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


In [3]:
def filter_genre(genre): 
    return [g.split('(')[0].strip().lower() for g in re.split(',|/', genre) if ')' not in g]
    
    
def filter_plot(plot):
    plot = ' '.join(re.split(r'\[\d+\]', plot))
    return ' '.join([s.strip(string.punctuation) for s in plot.encode('ascii', 'ignore').decode().split()])

In [4]:
# wiki_data_all['genre_split'] = wiki_data_all['Genre'].apply(filter_genre)

wiki_data_select = wiki_data_all[~wiki_data_all['Genre'].str.contains(' ')]
top_10_genre = list(wiki_data_select['Genre'].value_counts()[:5].keys())

wiki_data_reduced = wiki_data_select[wiki_data_select['Genre'].isin(top_10_genre)].reset_index(drop=True)
wiki_data_reduced['plot_filtered'] = wiki_data_reduced['Plot'].apply(filter_plot)
wiki_data_reduced['title_filtered'] = wiki_data_reduced['Title'].apply(filter_plot)

wiki_data_reduced.Genre.value_counts()

drama       5964
comedy      4379
horror      1167
action      1098
thriller     966
Name: Genre, dtype: int64

In [5]:
X = wiki_data_reduced['plot_filtered']
y = wiki_data_reduced['Genre']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=123)
X_train, y_train

(9831     Harry England Michael Crawford a British car s...
 2814     Mildred Turner is a patient of a New York psyc...
 4553     An agent of the United States CIA is arrested ...
 7332     In Japanese society it is said a curse is crea...
 7989     Following their successful heist in Brazil Dom...
                                ...                        
 5218     In 2031 Dr Buchanan and his team work to devel...
 12252    Cleopatra is an emotional family film which co...
 1346     James Cagney plays a truck driver named Danny ...
 11646    Krishna is the adopted son of a woman who foun...
 3582     Professional racecar driver Frank Capua Paul N...
 Name: plot_filtered, Length: 9501, dtype: object,
 9831     comedy
 2814     comedy
 4553     comedy
 7332     horror
 7989     action
           ...  
 5218     horror
 12252     drama
 1346      drama
 11646    action
 3582      drama
 Name: Genre, Length: 9501, dtype: object)

## Data Generation

In [6]:
def gen_count(X_train, X_test, ngram_range=(1,1), stop_words=None, max_features=10000):
    count_vect = CountVectorizer(analyzer='word', 
                                 ngram_range=ngram_range, 
                                 stop_words=stop_words, 
                                 max_features=max_features)
    
    X_train_count_trans = count_vect.fit_transform(X_train)
    X_test_count_trans = count_vect.transform(X_test)
    
    return X_train_count_trans, X_test_count_trans
    
    
def gen_tf(X_train_count_trans, X_test_count_trans):
    tf_vect = TfidfTransformer()
    
    X_train_tf_trans = tf_vect.fit_transform(X_train_count_trans)
    X_test_tf_trans = tf_vect.transform(X_test_count_trans)
    
    return X_train_tf_trans, X_test_tf_trans



def train_model(model, X_train, y_train, param_grid):
    grid = GridSearchCV(model, param_grid, scoring='balanced_accuracy', n_jobs=-1, verbose=10)
    
    grid.fit(X_train, y_train)
    
    return grid
    

In [7]:
X_train_count, X_test_count = gen_count(X_train, 
                                        X_test, 
                                        ngram_range=(1,1), 
                                        stop_words=None)
X_train_tf, X_test_tf = gen_tf(X_train_count, X_test_count)

X_train_count_ngram, X_test_count_ngram = gen_count(X_train, 
                                                    X_test, 
                                                    ngram_range=(2,3), 
                                                    stop_words='english')
X_train_tf_ngram, X_test_tf_ngram = gen_tf(X_train_count_ngram, X_test_count_ngram)



### Logistic Regression Pipe

In [11]:
model = LogisticRegression(max_iter=500)
param_grid = {
    'penalty':['l1', 'l2'],
    'C':[0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'saga']
}


lr_grid_count = train_model(model, X_train_count, y_train, param_grid)
lr_grid_tf = train_model(model, X_train_tf, y_train, param_grid)

lr_grid_count_ngram = train_model(model, X_train_count_ngram, y_train, param_grid)
lr_grid_tf_ngram = train_model(model, X_train_tf_ngram, y_train, param_grid)


Fitting 5 folds for each of 16 candidates, totalling 80 fits


 0.53652562 0.54081244        nan 0.53948022 0.53314352 0.54305069
        nan 0.54183923 0.52576747 0.54351406]


Fitting 5 folds for each of 16 candidates, totalling 80 fits


 0.26560879 0.26545925        nan 0.44802309 0.45582085 0.45608783
        nan 0.52930424 0.53867437 0.54003059]


Fitting 5 folds for each of 16 candidates, totalling 80 fits


 0.37345918 0.3758734         nan 0.39320583 0.41341138 0.40982128
        nan 0.41422121 0.40527927 0.41356644]


Fitting 5 folds for each of 16 candidates, totalling 80 fits


 0.21117037 0.21130173        nan 0.29438487 0.31996138 0.32020381
        nan 0.40435687 0.39816146 0.39791485]


### MB Pipeline

In [None]:
pipe = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=107)),
    ('mb', MultinomialNB())
])

# param_grid = {
#     'count__ngram_range':[(1,2)],
#     'count__stop_words':[None, 'english'],
#     'count__max_features':[100000],
#     'mb__alpha':[round(.1*i, 1) for i in range(11)]
# }


param_grid = {
    'count__ngram_range':[(1,1), (1,2), (1,3), (1,4)],
    'count__stop_words':[None, 'english'],
    'count__max_features':[1000, 10000, 100000],
    'mb__alpha':[round(.1*i, 1) for i in range(11)]
}


grid = GridSearchCV(pipe, param_grid, n_jobs=-1, scoring='balanced_accuracy', verbose=100)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 264 candidates, totalling 1320 fits


In [None]:
pred = grid.predict(X_test)
print(classification_report(y_test, pred))


In [19]:
print(classification_report(y_test, grid.best_estimator_.predict(X_test)))


              precision    recall  f1-score   support

      action       0.54      0.58      0.56       351
      comedy       0.70      0.71      0.70      1334
       drama       0.72      0.74      0.73      1754
      horror       0.71      0.81      0.75       344
    thriller       0.39      0.19      0.26       290

    accuracy                           0.68      4073
   macro avg       0.61      0.60      0.60      4073
weighted avg       0.67      0.68      0.67      4073

