In [141]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import re
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

pd.set_option('display.max_colwidth', -1)

### Predict Genre Based on Plot

In [21]:
omdb = pd.read_json('movies/data/omdb-data.json.gz', lines=True)

In [51]:
plot_words = omdb['omdb_plot'].to_string()
re.sub('(\\n\d*)| (\s{2,}) | ([\,\.])','', plot_words )
len(plot_words)

' '

In [58]:
# keeping first genre only, change later to multiple
omdb['omdb_genres'] = omdb['omdb_genres'].str[0]

In [86]:
genres = omdb['omdb_genres'].values

In [102]:
#X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train, X_test, y_train, y_test = train_test_split(omdb.omdb_plot, omdb.omdb_genres, test_size=0.2)

In [82]:
# ignore this and just use below pipeline
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(omdb['omdb_plot'])
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, omdb['omdb_genres'])
plot = ['hi']
X_new_counts = count_vect.transform(plot)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)
predicted[0]

In [161]:
clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))),
    ('tfidf',TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=None))
])
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.4896694214876033


In [156]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
              'clf__penalty': ['none', 'l2', 'l1']
}
gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(omdb.omdb_plot, omdb.omdb_genres)



In [None]:
# logs
# svc - 0.4793388429752066
# knn - 0.303202479338843

In [157]:
gs_clf.best_score_ 

0.4708557255064076

In [158]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
clf__penalty: 'l2'
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [171]:
predicted = clf.predict(['Sam Bell has a three year contract to work for Lunar Industries. For the contract\'s entire duration, he is the sole employee based at their lunar station. His primary job responsibility is to harvest and periodically rocket back to Earth supplies of helium-3, the current clean and abundant fuel used on Earth. There is no direct communication link available between the lunar station and Earth, so his only direct real-time interaction is with GERTY, the intelligent computer whose function is to attend to his day to day needs. With such little human contact and all of it indirect, he feels that three years is far too long to be so isolated; he knows he is beginning to hallucinate as the end of his three years approaches. All he wants is to return to Earth to be with his wife Tess and their infant daughter Eve, who was born just prior to his leaving for this job. With two weeks to go, he gets into an accident at one of the mechanical harvesters and is rendered unconscious. Injured, he awakens back at the station in the infirmary, he assumes assisted by GERTY. GERTY tells him that a rescue team named Eliza will come to the station to clean up the aftermath of the accident. After his recuperation, he takes an unauthorized trip back to the broken harvester, where he makes an unexpected discovery. Because of his find, he begins to doubt his sanity, then his true identity, then the company and GERTY\'s willingness to do what is best for him. Because of his resulting beliefs, his sole mission becomes how to get back to Earth on his own.'])

In [174]:
predicted

array(['Drama'], dtype='<U11')