In [12]:
import re
import string

import nltk
import pickle
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA, TruncatedSVD
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, roc_auc_score

import matplotlib.pyplot as plt

## Data Loading and EDA

In [2]:
wiki_data_all = pd.read_csv('./data/wiki_movie_plots_deduped.csv')
wiki_data_all = wiki_data_all[~wiki_data_all['Genre'].str.contains('unknown')]
wiki_data_all

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
10,1906,Dream of a Rarebit Fiend,American,Wallace McCutcheon and Edwin S. Porter,,short,https://en.wikipedia.org/wiki/Dream_of_a_Rareb...,The Rarebit Fiend gorges on Welsh rarebit at a...
11,1906,From Leadville to Aspen: A Hold-Up in the Rockies,American,Francis J. Marion and Wallace McCutcheon,,short action/crime western,https://en.wikipedia.org/wiki/From_Leadville_t...,The film features a train traveling through th...
12,1906,Kathleen Mavourneen,American,Edwin S. Porter,,short film,https://en.wikipedia.org/wiki/Kathleen_Mavourn...,Irish villager Kathleen is a tenant of Captain...
...,...,...,...,...,...,...,...,...
34877,2013,Particle (film),Turkish,Erdem Tepegöz,"Jale Arıkan, Rüçhan Caliskur, Özay Fecht, Remz...",drama film,https://en.wikipedia.org/wiki/Particle_(film),"Zeynep lost her job at weaving factory, and he..."
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


In [27]:
for k, v in wiki_data_all['Genre'].value_counts().items():
    print(k)

drama
comedy
horror
action
thriller
romance
western
crime
adventure
musical
crime drama
romantic comedy
science fiction
film noir
mystery
war
animation
comedy, drama
sci-fi
family
fantasy
animated
musical comedy
comedy-drama
biography
anime
suspense
romantic drama
comedy drama
animated short
drama, romance
social
historical
action thriller
documentary
serial
world war ii
family drama
war drama
drama, crime
comedy, musical
comedy/drama
comedy, romance
romance, drama
biopic
crime thriller
historical drama
black comedy
action comedy
comedy short
superhero
horror comedy
crime comedy
drama, biography
martial arts
action, drama
action, romance
drama, war
action, thriller
romance/comedy
social drama
melodrama
drama, adventure
romance/drama
action, comedy
action drama
biography, drama
comedy, family
drama, musical
drama, family
drama, thriller
comedy, crime
mockumentary
drama / romance
short
comedy / romance
romance, comedy
 
romance drama
tokusatsu
drama 
action, crime, drama
spy
animated fil

In [16]:
print('# Unique Genres:', str(len(wiki_data_all['Genre'].unique())))

# Unique Genres: 2264


## Data Cleaning and splitting

In [3]:
def filter_genre(genre): 
    return [g.split('(')[0].strip().lower() for g in re.split(',|/', genre) if ')' not in g]
    
    
def filter_plot(plot):
    plot = ' '.join(re.split(r'\[\d+\]', plot))
    return ' '.join([s.strip(string.punctuation) for s in plot.encode('ascii', 'ignore').decode().split()])

In [30]:
# wiki_data_all['genre_split'] = wiki_data_all['Genre'].apply(filter_genre)

wiki_data_select = wiki_data_all[~wiki_data_all['Genre'].str.contains(' ')]
top_10_genre = list(wiki_data_select['Genre'].value_counts()[:5].keys())

wiki_data_reduced = wiki_data_select[wiki_data_select['Genre'].isin(top_10_genre)].reset_index(drop=True)
wiki_data_reduced['plot_filtered'] = wiki_data_reduced['Plot'].apply(filter_plot)
wiki_data_reduced['title_filtered'] = wiki_data_reduced['Title'].apply(filter_plot)

wiki_data_reduced.Genre.value_counts()

drama       5964
comedy      4379
horror      1167
action      1098
thriller     966
Name: Genre, dtype: int64

In [32]:
wiki_data_reduced[['plot_filtered', 'Genre']]

Unnamed: 0,plot_filtered,Genre
0,The film is about a family who move to the sub...,comedy
1,Before heading out to a baseball game at a nea...,comedy
2,The plot is that of a black woman going to the...,comedy
3,On a beautiful summer day a father and mother ...,drama
4,A thug accosts a girl as she leaves her workpl...,drama
...,...,...
13569,Hasan is a twelve-year-old boy living with his...,drama
13570,Through the night three cars carry a small gro...,drama
13571,The film opens with a Senegalese boy named Kha...,drama
13572,Two musicians Salih and Grkan described the ad...,comedy


In [5]:
X = wiki_data_reduced['plot_filtered']
y = wiki_data_reduced['Genre']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=123)
X_train, y_train

(9831     Harry England Michael Crawford a British car s...
 2814     Mildred Turner is a patient of a New York psyc...
 4553     An agent of the United States CIA is arrested ...
 7332     In Japanese society it is said a curse is crea...
 7989     Following their successful heist in Brazil Dom...
                                ...                        
 5218     In 2031 Dr Buchanan and his team work to devel...
 12252    Cleopatra is an emotional family film which co...
 1346     James Cagney plays a truck driver named Danny ...
 11646    Krishna is the adopted son of a woman who foun...
 3582     Professional racecar driver Frank Capua Paul N...
 Name: plot_filtered, Length: 9501, dtype: object,
 9831     comedy
 2814     comedy
 4553     comedy
 7332     horror
 7989     action
           ...  
 5218     horror
 12252     drama
 1346      drama
 11646    action
 3582      drama
 Name: Genre, Length: 9501, dtype: object)

## Model Pipelines

### Logistic Regression Pipe

In [9]:
%%time 
pipe = Pipeline([
    ('count', CountVectorizer(max_features=20000)),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=107)),
    ('lr', LogisticRegression(max_iter=500))
])

param_grid = {
    'count__ngram_range':[(1,1), (2,4)],
    'count__stop_words':[None, 'english'],
    'lr__penalty':['l1', 'l2'],
    'lr__C':[0.01, 0.1, 1, 10],
    'lr__solver': ['lbfgs', 'saga']

}


grid = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=10)
grid.fit(X_train, y_train)


pickle.dump(grid,open('./saved_models/lr_grid.pkl', 'wb'))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START count__ngram_range=(1, 1), count__stop_words=english, lr__C=1, lr__max_iter=500, lr__penalty=l2
[CV 1/5; 1/1] END count__ngram_range=(1, 1), count__stop_words=english, lr__C=1, lr__max_iter=500, lr__penalty=l2;, score=0.659 total time=   9.2s
[CV 2/5; 1/1] START count__ngram_range=(1, 1), count__stop_words=english, lr__C=1, lr__max_iter=500, lr__penalty=l2
[CV 2/5; 1/1] END count__ngram_range=(1, 1), count__stop_words=english, lr__C=1, lr__max_iter=500, lr__penalty=l2;, score=0.665 total time=   8.3s
[CV 3/5; 1/1] START count__ngram_range=(1, 1), count__stop_words=english, lr__C=1, lr__max_iter=500, lr__penalty=l2
[CV 3/5; 1/1] END count__ngram_range=(1, 1), count__stop_words=english, lr__C=1, lr__max_iter=500, lr__penalty=l2;, score=0.688 total time=   7.7s
[CV 4/5; 1/1] START count__ngram_range=(1, 1), count__stop_words=english, lr__C=1, lr__max_iter=500, lr__penalty=l2
[CV 4/5; 1/1] END count__ngram_range

GridSearchCV(estimator=Pipeline(steps=[('count',
                                        CountVectorizer(max_features=20000)),
                                       ('tfidf', TfidfTransformer()),
                                       ('smote', SMOTE(random_state=107)),
                                       ('lr',
                                        LogisticRegression(max_iter=500,
                                                           n_jobs=-1))]),
             param_grid={'count__ngram_range': [(1, 1)],
                         'count__stop_words': ['english'], 'lr__C': [1],
                         'lr__max_iter': [500], 'lr__penalty': ['l2']},
             verbose=10)

### Naive Bayes Pipeline

In [11]:
%%time 

pipe = Pipeline([
    ('count', CountVectorizer(max_features=20000)),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=107)),
    ('mb', MultinomialNB())
])


param_grid = {
    'count__ngram_range':[(1,1), (2,4)],
    'count__stop_words':[None, 'english'],
    'mb__alpha':[round(.1*i, 1) for i in range(11)]
}


grid = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=10)
grid.fit(X_train, y_train)

pickle.dump(grid,open('./saved_models/nb_grid.pkl', 'wb'))

Fitting 5 folds for each of 44 candidates, totalling 220 fits
Wall time: 8min 23s


### RandomForest Pipeline

In [None]:
%%time 

pipe = Pipeline([
    ('count', CountVectorizer(max_features=20000)),
    ('tfidf', TfidfTransformer()),
    ('smote', SMOTE(random_state=107)),
    ('rf', RandomForestClassifier())
])



param_grid = {
    'count__ngram_range':[(1,1), (2,4)],
    'count__stop_words':[None, 'english'],
    'rf__n_estimators':[ 80*i for i in range(1, 6)],
    'rf__criterion': ['gini', 'entropy'],
    'rf__max_features':['auto', 'log2']
}

grid = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=10)
grid.fit(X_train, y_train)

pickle.dump(grid,open('./saved_models/rf_grid.pkl', 'wb'))

Fitting 5 folds for each of 80 candidates, totalling 400 fits


## Performances

In [9]:
lr_grid = pickle.load(open('./saved_models/lr_grid.pkl', 'rb'))
nb_grid = pickle.load(open('./saved_models/nb_grid.pkl', 'rb'))
rf_grid = pickle.load(open('./saved_models/rf_grid.pkl', 'rb'))



In [15]:
lr_pred = lr_grid.predict(X_test)
print(classification_report(y_test, lr_pred))

nb_pred = nb_grid.predict(X_test)
print(classification_report(y_test, nb_pred))

rf_pred = rf_grid.predict(X_test)
print(classification_report(y_test, rf_pred))


              precision    recall  f1-score   support

      action       0.61      0.55      0.58       351
      comedy       0.72      0.71      0.71      1334
       drama       0.70      0.77      0.74      1754
      horror       0.75      0.76      0.75       344
    thriller       0.35      0.21      0.26       290

    accuracy                           0.69      4073
   macro avg       0.63      0.60      0.61      4073
weighted avg       0.68      0.69      0.68      4073

              precision    recall  f1-score   support

      action       0.45      0.56      0.50       351
      comedy       0.67      0.71      0.69      1334
       drama       0.72      0.64      0.68      1754
      horror       0.68      0.81      0.74       344
    thriller       0.26      0.22      0.24       290

    accuracy                           0.64      4073
   macro avg       0.56      0.59      0.57      4073
weighted avg       0.64      0.64      0.64      4073

              precisio

In [14]:
roc_auc_score(y_test, lr_pred)

ValueError: could not convert string to float: 'drama'