In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
imdb_data = pd.read_csv('./imdb_labelled.txt', sep='\t', names=['review', 'label'],
                        converters={'review': str.strip, 'label': int})
imdb_data.head()

Unnamed: 0,review,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [3]:
train_inp, test_inp, train_out, test_out = train_test_split(imdb_data.review, imdb_data.label, test_size=0.2)
print(len(train_inp))
print(len(test_inp))

598
150


In [7]:
from sklearn.naive_bayes import BernoulliNB as NaiveBayes
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
classifier = NaiveBayes()

In [14]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('extractor', vectorizer),
    ('classifier', classifier)
])

pipe.fit(train_inp, train_out)

Pipeline(memory=None,
     steps=[('extractor', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...lary=None)), ('classifier', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))])

In [10]:
train_inp_vectorized = vectorizer.fit_transform(train_inp, train_out)
classifier.fit(train_inp_vectorized, train_out)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [18]:
print(classification_report(pipe.predict(test_inp), test_out))
print(accuracy_score(pipe.predict(test_inp), test_out))


             precision    recall  f1-score   support

          0       0.67      0.83      0.74        58
          1       0.87      0.74      0.80        92

avg / total       0.79      0.77      0.78       150

0.773333333333


In [22]:
def print_top10(vectorizer, clf, class_label):
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    top10 = np.argsort(clf.coef_[0])[-10:]
    print("%s: %s" % (class_label,
          " ".join(feature_names[j] for j in top10)))

In [23]:
print_top10(vectorizer, classifier, 'Positive Review')

Positive Review: just love best characters like great really good movie film


In [37]:
test = np.argsort(classifier.coef_[0])[-10:]
test2 = classifier.coef_[0]
test

array([1217, 1338,  211,  344, 1295,  975, 1755,  959, 1452,  830], dtype=int32)

In [38]:
np(classifier)

TypeError: 'module' object is not callable

In [28]:
feature_names = vectorizer.get_feature_names()
feature_names

['10',
 '12',
 '13',
 '15',
 '15pm',
 '17',
 '1928',
 '1948',
 '1949',
 '1971',
 '1973',
 '1986',
 '1995',
 '1998',
 '20',
 '2005',
 '2006',
 '20th',
 '25',
 '30',
 '50',
 '54',
 '70',
 '70s',
 '80',
 '80s',
 '8pm',
 '90',
 '95',
 'aailiyah',
 'abandoned',
 'ability',
 'absolutely',
 'abstruse',
 'abysmal',
 'academy',
 'accents',
 'acclaimed',
 'accolades',
 'accurate',
 'accurately',
 'accused',
 'achievement',
 'achille',
 'ackerman',
 'act',
 'acted',
 'acting',
 'action',
 'actor',
 'actors',
 'actress',
 'actresses',
 'actually',
 'adams',
 'adaptation',
 'add',
 'added',
 'addition',
 'admins',
 'admiration',
 'admitted',
 'adorable',
 'adrift',
 'adventure',
 'advise',
 'aerial',
 'aesthetically',
 'affected',
 'afraid',
 'africa',
 'afternoon',
 'age',
 'aged',
 'ages',
 'ago',
 'agree',
 'agreed',
 'aimless',
 'air',
 'akasha',
 'akin',
 'alert',
 'alexander',
 'allison',
 'allow',
 'allowing',
 'amateurish',
 'amaze',
 'amazed',
 'amazing',
 'amazingly',
 'america',
 'americ

In [41]:
test = np.argsort(classifier.coef_[0])