In [1]:
import nltk
import sklearn
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
%pprint

Pretty printing has been turned OFF


# Movie Reviews

In [2]:
# Root folder of the review corpus, relative to the working directory.
moviedir = r'movie_reviews/'

In [3]:
# Read the files under moviedir into memory. The loaded object exposes the
# documents (.data), their integer labels (.target), the label names
# (.target_names) and the source paths (.filenames), all of which are
# inspected in the cells below. shuffle=True asks load_files to shuffle
# the documents.
movie_train = load_files(moviedir, shuffle=True)

In [4]:
# Number of documents that were loaded.
len(movie_train.data)

1962

Strange, I expected to get 2000 here; 38 reviews appear to be missing from this copy of the corpus.

In [5]:
# Class names; an integer label is used as an index into this list.
movie_train.target_names

['neg', 'pos']

In [6]:
# Raw text of the first document (a bytes object, as the b"..." output shows).
movie_train.data[0]

b"unfortunately it doesn't get much more formulaic than one tough cop . \nthere's the renegade cop with the loser partner who has to many problems to explain . \nthe renegade has to prove his good name and is trapped between the good guys , the bad guys and some woman who really has nothing to do with the story other than being there for the purpose of providing sex for the hero in the middle of the film . \nbo dietl ( pronounced deedle , baldwin ) is one tough cop , a guy who is being investigated by hardass fbi agents due to his association with the ny mafia . \non top of that problem , he has a drunk , gambling addicted partner ( penn ) who has a penchant for collecting parking tickets ( i guess cops aren't immune ? ) . \nthen , throw into the mix the femme fatale ( gershon ) . \nright there you have plenty of ammo for a decent cop drama . \nbut right in the middle of the film , they throw in a completely un-related plot point . \ndietl and his partner duke try to solve a case that 

In [7]:
# Path of the file the first document was read from.
movie_train.filenames[0]

'movie_reviews/neg\\cv056_14663.txt'

In [8]:
# Integer class label of the first document.
movie_train.target[0]

0

0 = Negative, 1 = Positive

# An Aside

In [9]:
# Three toy sentences used to illustrate tokenization, counting and TF-IDF.
sents = [
    'A rose is a rose is a rose is a rose.',
    'Oh, what a fine day it is.',
    "It ain't over till it's over, I tell you!!",
]

In [10]:
# Use NLTK's word_tokenize instead of CountVectorizer's default token pattern.
# It does NOT remove stop words: it keeps punctuation marks and one-letter
# words as tokens and splits contractions ("ain't" -> "ai" + "n't"), as the
# vocabulary printed below shows.
# min_df=1 keeps every token that occurs in at least one document.
foovec = CountVectorizer(min_df=1, tokenizer=nltk.word_tokenize)

In [11]:
# Learn the vocabulary from the example sentences and count how often each
# token occurs in each sentence.
sents_counts = foovec.fit_transform(sents)
# Mapping from token to its column index in the count matrix.
foovec.vocabulary_

{'a': 4, 'rose': 14, 'is': 9, '.': 3, 'oh': 12, ',': 2, 'what': 17, 'fine': 7, 'day': 6, 'it': 10, 'ai': 5, "n't": 11, 'over': 13, 'till': 16, "'s": 1, 'i': 8, 'tell': 15, 'you': 18, '!': 0}

In [12]:
# Matrix dimensions: (number of sentences, number of distinct tokens).
sents_counts.shape

(3, 19)

In [13]:
# Dense view for display: one row per sentence, columns ordered by the
# indices in foovec.vocabulary_.
sents_counts.toarray()

array([[0, 0, 0, 1, 4, 0, 0, 0, 0, 3, 0, 0, 0, 0, 4, 0, 0, 0, 0],
       [0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0],
       [2, 1, 1, 0, 0, 1, 0, 0, 1, 0, 2, 1, 0, 2, 0, 1, 1, 0, 1]], dtype=int64)

In [14]:
# Convert the raw token counts of the example sentences into TF-IDF weights.
tfidf_transformer = TfidfTransformer()
sents_tfidf = tfidf_transformer.fit_transform(sents_counts)

In [15]:
# Dense view of the TF-IDF weights, laid out like the count matrix above.
sents_tfidf.toarray()

array([[ 0.        ,  0.        ,  0.        ,  0.13650997,  0.54603988,
         0.        ,  0.        ,  0.        ,  0.        ,  0.40952991,
         0.        ,  0.        ,  0.        ,  0.        ,  0.71797683,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.28969526,  0.28969526,  0.28969526,
         0.        ,  0.38091445,  0.38091445,  0.        ,  0.28969526,
         0.28969526,  0.        ,  0.38091445,  0.        ,  0.        ,
         0.        ,  0.        ,  0.38091445,  0.        ],
       [ 0.47282517,  0.23641258,  0.17979786,  0.        ,  0.        ,
         0.23641258,  0.        ,  0.        ,  0.23641258,  0.        ,
         0.35959573,  0.23641258,  0.        ,  0.47282517,  0.        ,
         0.23641258,  0.23641258,  0.        ,  0.23641258]])

# Back to Movies

In [16]:
# Same NLTK tokenizer as in the aside, now applied to the movie reviews.
# min_df=2 drops tokens that occur in fewer than two documents.
movie_vec = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize)
# Learn the vocabulary from all loaded reviews and build the count matrix.
movie_counts = movie_vec.fit_transform(movie_train.data)

In [17]:
# Column index assigned to the token 'screen'; .get() returns None for a
# token that is not in the vocabulary.
movie_vec.vocabulary_.get('screen')

19458

In [18]:
# Column index assigned to the token 'segal'.
# NOTE(review): the sample reviews near the end of the notebook spell the
# name 'Seagal'; confirm which spelling was meant to be looked up here.
movie_vec.vocabulary_.get('segal')

19595

In [19]:
# (number of reviews, vocabulary size after the min_df filter).
movie_counts.shape

(1962, 25072)

In [20]:
# Rebind tfidf_transformer to a fresh instance (the earlier one was fit on the
# example sentences) so the IDF weights are learned from the review counts.
tfidf_transformer = TfidfTransformer()
movie_tfidf = tfidf_transformer.fit_transform(movie_counts)

In [21]:
# Sanity check: the TF-IDF matrix has the same dimensions as movie_counts.
movie_tfidf.shape

(1962, 25072)

In [22]:
# Hold out 20% of the reviews for evaluation; the fixed random_state makes
# the split repeatable.
# NOTE(review): movie_tfidf was built from the whole corpus (the vocabulary
# and the IDF weights were fit before this split), so the held-out reviews
# influenced the features. For a strict evaluation, split the raw reviews
# first and fit the vectorizer/transformer on the training part only.
docs_train, docs_test, y_train, y_test = train_test_split(
    movie_tfidf,
    movie_train.target,
    test_size=0.20,
    random_state=12,
)

In [23]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features and
# labels of the training split.
clf = MultinomialNB().fit(docs_train, y_train)

In [24]:
# Predict labels for the held-out reviews and report the fraction that
# match the true labels.
y_pred = clf.predict(docs_test)
# accuracy_score is imported by name at the top of the notebook rather than
# reached through the bare `sklearn` package, which only resolves when some
# other import has already loaded the sklearn.metrics submodule.
accuracy_score(y_test, y_pred)

0.82951653944020354

In [25]:
# Predicted integer labels for the test split.
y_pred

array([1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0,

In [26]:
# Confusion matrix of true vs. predicted labels on the test split. With
# scikit-learn's convention, rows are the true labels and columns are the
# predicted labels, both in label order (0 = neg, 1 = pos).
cm = confusion_matrix(y_test, y_pred)
cm

array([[149,  39],
       [ 28, 177]], dtype=int64)

In [27]:
# Hand-written one-line reviews used to spot-check the trained classifier.
reviews_new = [
    'This movie was excellent',
    'Absolute joy ride',
    'Steven Seagal was terrible',
    'Steven Seagal shined through.',
    'This was certainly a movie',
    'Two thumbs up',
    'I fell asleep halfway through',
    "We can't wait for the sequel!!",
    '!',
    '?',
    'I cannot recommend this highly enough',
    'instant classic.',
    'Steven Seagal was amazing. His performance was Oscar-worthy.',
]
# Reuse the already-fitted vectorizer and TF-IDF transformer (transform, not
# fit_transform) so the new reviews get the same columns and weights as the
# features the classifier was trained on.
reviews_new_counts = movie_vec.transform(reviews_new)
reviews_new_tfidf = tfidf_transformer.transform(reviews_new_counts)

In [28]:
# Predicted integer label for each of the new reviews.
pred = clf.predict(reviews_new_tfidf)

In [29]:
# Print each new review next to the name of its predicted class; the
# integer prediction is used as an index into movie_train.target_names.
for review, category in zip(reviews_new, pred):
    label = movie_train.target_names[category]
    print('%r => %s' % (review, label))

'This movie was excellent' => pos
'Absolute joy ride' => pos
'Steven Seagal was terrible' => neg
'Steven Seagal shined through.' => neg
'This was certainly a movie' => neg
'Two thumbs up' => neg
'I fell asleep halfway through' => neg
"We can't wait for the sequel!!" => neg
'!' => neg
'?' => neg
'I cannot recommend this highly enough' => pos
'instant classic.' => pos
'Steven Seagal was amazing. His performance was Oscar-worthy.' => neg


I recall that NLTK's document classifier could tell you the top features predicting the label of a document, which is nice because it gives you a bit of insight into what the program is doing. Does scikit-learn have a feature like this?