In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import re
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

# Show full cell contents (e.g. long plot texts) instead of truncating them.
# None is the supported "no limit" value; the old -1 sentinel was deprecated in pandas 1.0.
pd.set_option('display.max_colwidth', None)

### Predict Genre Based on Plot

In [2]:
# Load the OMDb records; lines=True reads the file as one JSON object per line.
omdb = pd.read_json(
    'movies/data/omdb-data.json.gz',
    lines=True,
)

In [3]:
# Keep only the first listed genre per movie (single-label for now; TODO: multi-label).
# Written so that re-running this cell is a no-op: `.str[0]` applied to an
# already-reduced column would slice each genre *string* to its first character.
def _first_genre(value):
    """Return the first item of a genre list (NaN if the list is empty); pass other values through."""
    if isinstance(value, (list, tuple)):
        return value[0] if value else np.nan
    return value

omdb['omdb_genres'] = omdb['omdb_genres'].map(_first_genre)

In [4]:
# Genre labels as a plain array.
genres = omdb.omdb_genres.values

In [5]:
# Hold out 20% of the movies for evaluation. The split is seeded so the
# reported test score is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    omdb.omdb_plot, omdb.omdb_genres, test_size=0.2, random_state=42
)

In [6]:
# Scratch walk-through of the individual steps; superseded by the Pipeline cell below.
# Note: everything here is fitted on the full `omdb` frame, not on the train split.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# Token counts, then TF-IDF weights, for every plot.
X_train_counts = count_vect.fit_transform(omdb['omdb_plot'])
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB()
clf.fit(X_train_tfidf, omdb['omdb_genres'])

# Run a new text through the same fitted transformers before predicting.
plot = ['hi']
X_new_counts = count_vect.transform(plot)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
predicted[0]

'Comedy'

In [7]:
# Bag-of-words counts -> TF-IDF weights -> SGD-trained classifier with hinge loss.
clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])
# fit() returns the fitted pipeline, so scoring on the held-out split can be chained.
print(clf.fit(X_train, y_train).score(X_test, y_test))

0.4994834710743802


In [12]:
# Search for the best parameters using grid search.
# Keys follow the `<pipeline step>__<parameter>` convention of the `clf` pipeline.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
    # "No penalty" must be spelled None; the string 'none' is rejected by
    # current scikit-learn parameter validation.
    'clf__penalty': [None, 'l2', 'l1'],
}
#gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)
#gs_clf = gs_clf.fit(omdb.omdb_plot, omdb.omdb_genres)

In [13]:
# Log of scores obtained with other classifiers:
# svc - 0.4793388429752066 (but takes forever to train)
# knn - 0.303202479338843

In [10]:
#gs_clf.best_score_ 

In [14]:
#for param_name in sorted(parameters.keys()):
#    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

In [17]:
# Keep the triple quotes.
# Place the plot text between the triple quotes to predict its genre.
plot = """When Thor's evil brother, Loki (Tom Hiddleston), gains access to the unlimited power of the energy cube called the Tesseract, Nick Fury (Samuel L. Jackson), director of S.H.I.E.L.D., initiates a superhero recruitment effort to defeat the unprecedented threat to Earth. Joining Fury's "dream team" are Iron Man (Robert Downey Jr.), Captain America (Chris Evans), the Hulk (Mark Ruffalo), Thor (Chris Hemsworth), the Black Widow (Scarlett Johansson) and Hawkeye (Jeremy Renner)."""
predicted = clf.predict([plot])
predicted

array(['Action'], dtype='<U11')