In [12]:
import re
import string

import nltk
import pickle
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA, TruncatedSVD
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

In [2]:
wiki_data_all = pd.read_csv('./data/wiki_movie_plots_deduped.csv')
wiki_data_all = wiki_data_all[~wiki_data_all['Genre'].str.contains('unknown')]
wiki_data_all

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
10,1906,Dream of a Rarebit Fiend,American,Wallace McCutcheon and Edwin S. Porter,,short,https://en.wikipedia.org/wiki/Dream_of_a_Rareb...,The Rarebit Fiend gorges on Welsh rarebit at a...
11,1906,From Leadville to Aspen: A Hold-Up in the Rockies,American,Francis J. Marion and Wallace McCutcheon,,short action/crime western,https://en.wikipedia.org/wiki/From_Leadville_t...,The film features a train traveling through th...
12,1906,Kathleen Mavourneen,American,Edwin S. Porter,,short film,https://en.wikipedia.org/wiki/Kathleen_Mavourn...,Irish villager Kathleen is a tenant of Captain...
...,...,...,...,...,...,...,...,...
34877,2013,Particle (film),Turkish,Erdem Tepegöz,"Jale Arıkan, Rüçhan Caliskur, Özay Fecht, Remz...",drama film,https://en.wikipedia.org/wiki/Particle_(film),"Zeynep lost her job at weaving factory, and he..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


In [3]:
def filter_genre(genre): 
    return [g.split('(')[0].strip().lower() for g in re.split(',|/', genre) if ')' not in g]
    
    
def filter_plot(plot):
    plot = ' '.join(re.split(r'\[\d+\]', plot))
    return ' '.join([s.strip(string.punctuation) for s in plot.encode('ascii', 'ignore').decode().split()])

In [4]:
# wiki_data_all['genre_split'] = wiki_data_all['Genre'].apply(filter_genre)

wiki_data_select = wiki_data_all[~wiki_data_all['Genre'].str.contains(' ')]
top_10_genre = list(wiki_data_select['Genre'].value_counts()[:5].keys())

wiki_data_reduced = wiki_data_select[wiki_data_select['Genre'].isin(top_10_genre)].reset_index(drop=True)
wiki_data_reduced['plot_filtered'] = wiki_data_reduced['Plot'].apply(filter_plot)
wiki_data_reduced['title_filtered'] = wiki_data_reduced['Title'].apply(filter_plot)

wiki_data_reduced.Genre.value_counts()

drama       5964
comedy      4379
horror      1167
action      1098
thriller     966
Name: Genre, dtype: int64

In [5]:
X = wiki_data_reduced['plot_filtered']
y = wiki_data_reduced['Genre']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=123)
X_train, y_train

(9831     Harry England Michael Crawford a British car s...
 2814     Mildred Turner is a patient of a New York psyc...
 4553     An agent of the United States CIA is arrested ...
 7332     In Japanese society it is said a curse is crea...
 7989     Following their successful heist in Brazil Dom...
                                ...                        
 5218     In 2031 Dr Buchanan and his team work to devel...
 12252    Cleopatra is an emotional family film which co...
 1346     James Cagney plays a truck driver named Danny ...
 11646    Krishna is the adopted son of a woman who foun...
 3582     Professional racecar driver Frank Capua Paul N...
 Name: plot_filtered, Length: 9501, dtype: object,
 9831     comedy
 2814     comedy
 4553     comedy
 7332     horror
 7989     action
           ...  
 5218     horror
 12252     drama
 1346      drama
 11646    action
 3582      drama
 Name: Genre, Length: 9501, dtype: object)

In [6]:
# count_vectorizer = CountVectorizer(ngram_range=(1,1),stop_words='english')

# X_train_transformed = count_vectorizer.fit_transform(X_train)
# X_test_transformed = count_vectorizer.transform(X_test)

# X_test_transformed

# X = wiki_data_reduced['plot_filtered']
# y = wiki_data_reduced['Genre']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=123)
# X_train, y_train

pipe = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=107)),
    ('lr', LogisticRegression(max_iter=500,
                              n_jobs=-1))
])

In [9]:
param_grid = {
    'count__ngram_range':[(1,1), (1,2), (1,3), (1,4)],
    'count__stop_words':[None, 'english'],
    'count__max_features':[1000, 10000, 100000],
    'lr__penalty':['l1', 'l2', 'elasticnet'],
    'lr__C':[0.01, 0.1, 1, 10],
    'lr__max_iter':[200, 500],
    'lr__fit_intercept':[True, False]
}


grid = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=10)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   30.4s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   40.3s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   48.8s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  3

GridSearchCV(estimator=Pipeline(steps=[('count', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('smote', SMOTE(random_state=107)),
                                       ('lr',
                                        LogisticRegression(max_iter=500,
                                                           n_jobs=-1,
                                                           verbose=3))]),
             n_jobs=-1,
             param_grid={'count__max_features': [1000, 10000, 100000],
                         'count__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
                         'count__stop_words': [None, 'english'],
                         'lr__C': [0.01, 0.1, 1, 10],
                         'lr__fit_intercept': [True, False],
                         'lr__max_iter': [200, 500],
                         'lr__penalty': ['l1', 'l2', 'elasticnet']},
             verbose=10)

In [10]:
grid.best_estimator_

Pipeline(steps=[('count',
                 CountVectorizer(max_features=100000, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('smote', SMOTE(random_state=107)),
                ('lr',
                 LogisticRegression(C=10, fit_intercept=False, max_iter=200,
                                    n_jobs=-1, verbose=3))])

In [14]:
pickle.dump(grid,open('lr_grid.pkl', 'wb'))

In [8]:
class ClfSwitcher(BaseEstimator):

    def __init__(
        self, 
        estimator = LogisticRegression(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """ 

        self.estimator = estimator


    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self


    def predict(self, X, y=None):
        return self.estimator.predict(X)


    def predict_proba(self, X):
        return self.estimator.predict_proba(X)


    def score(self, X, y):
        return self.estimator.score(X, y)

In [26]:
count_pipe = Pipeline([
    ('count', CountVectorizer()),
    ('clf', ClfSwitcher())
])




parameters = [
    {
        'count__ngram_range': [(1,1), (1,2), (1,3)],
        'count__max_features': [None, 1000, 10000],
        'clf__estimator': [LogisticRegression()],
        'clf__estimator__max_iter':(100, 1000),
        'clf__estimator__n_jobs': [-1],
        'clf__estimator__C': [1.0, 0.1, 0.01],
        'clf__estimator__fit_intercept': [True, False],
        'clf__estimator__penalty': ['l2', 'l1'],
        'clf__estimator__solver': ['lbfgs','saga']
    }
]



In [27]:
grid = GridSearchCV(count_pipe, parameters, cv=5, n_jobs=-1, verbose=10)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   35.6s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   49.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   52.2s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   54.7s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   57.8s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   58.7s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   59.3s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  2

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('count', CountVectorizer()),
                                       ('clf', ClfSwitcher())]),
             n_jobs=-1,
             param_grid=[{'clf__estimator': [LogisticRegression(n_jobs=-1)],
                          'clf__estimator__C': [1.0, 0.1, 0.01],
                          'clf__estimator__fit_intercept': [True, False],
                          'clf__estimator__max_iter': (100, 1000),
                          'clf__estimator__n_jobs': [-1],
                          'clf__estimator__penalty': ['l2', 'l1'],
                          'clf__estimator__solver': ['lbfgs', 'saga'],
                          'count__max_features': [None, 1000, 10000],
                          'count__ngram_range': [(1, 1), (1, 2), (1, 3)]}],
             verbose=10)

In [29]:
pred = grid.predict(X_test)
sum(pred == y_test)/len(y_test)

0.44494780382115423