# Load movie_reviews through sklearn

In [1]:
import sklearn
from sklearn.datasets import load_files

In [2]:
import os

# Directory holding the movie_reviews corpus (one sub-folder per class).
# Set the MOVIE_REVIEWS_DIR environment variable to run this notebook on
# another machine; the original author's local path remains the fallback.
moviedir = os.environ.get(
    "MOVIE_REVIEWS_DIR",
    r"C:\Users\ktlan\Documents\Data_Science\movie_reviews\movie_reviews",
)

In [3]:
# Read every document under moviedir; the sub-folder names become the class
# labels. shuffle=True mixes the classes rather than returning the
# documents folder by folder.
movie_train = load_files(container_path=moviedir, shuffle=True)

In [4]:
# Number of documents loaded.
len(movie_train.data)

2000

In [5]:
movie_train.target_names
# Target names are the class labels; a name's position in this list is the
# integer used for that class in movie_train.target.

['neg', 'pos']

In [6]:
# Peek at the first 500 bytes of the first review (documents are held as raw bytes).
movie_train.data[0][:500]

b"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . \nit's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? \nonce again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . \nin this so cal"

In [7]:
# Source file of the first review; its parent folder is the class label.
movie_train.filenames[0]

'C:\\Users\\ktlan\\Documents\\Data_Science\\movie_reviews\\movie_reviews\\neg\\cv405_21868.txt'

In [8]:
# Integer class of the first review (an index into movie_train.target_names).
movie_train.target[0]

0

# Try out CountVectorizer and TF-IDF

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# %pprint is a toggle: it flips pretty printing each time it runs, so
# executing this cell a second time switches it back ON.
%pprint

Pretty printing has been turned OFF


In [11]:
import nltk
# One-time setup: uncomment to fetch NLTK data. nltk.word_tokenize only needs
# the tokenizer models ('punkt'; newer NLTK releases name it 'punkt_tab' --
# confirm for the installed version), so downloading 'all' is far more than
# this notebook requires.
#nltk.download('all')

In [12]:
# Three toy sentences for exploring the vectorizer: heavy word repetition,
# commas, contractions, and doubled punctuation.
sents = [
    'A rose is a rose is a rose is a rose.',
    'Oh, what a fine day it is.',
    "It ain't over till it's over, I tell you!!",
]

In [13]:
# Use NLTK's word_tokenize in place of CountVectorizer's built-in tokenizer.
# The built-in one keeps only runs of two or more word characters, so it
# drops punctuation and one-letter tokens such as 'a' and 'i'; it does NOT
# remove stop words (stop_words defaults to None).
# token_pattern=None states explicitly that the regex is unused, which avoids
# the "token_pattern will not be used" warning in recent scikit-learn.
# min_df=1 (also the default) keeps every term that occurs in at least one document.
foovec = CountVectorizer(min_df=1, tokenizer=nltk.word_tokenize, token_pattern=None)

In [14]:
# Learn the vocabulary from the toy sentences and build the document-term count matrix.
sents_counts = foovec.fit_transform(sents)
# Mapping of each token to its column index in sents_counts.
foovec.vocabulary_

{'a': 4, 'rose': 14, 'is': 9, '.': 3, 'oh': 12, ',': 2, 'what': 17, 'fine': 7, 'day': 6, 'it': 10, 'ai': 5, "n't": 11, 'over': 13, 'till': 16, "'s": 1, 'i': 8, 'tell': 15, 'you': 18, '!': 0}

In [15]:
# (number of sentences, vocabulary size)
sents_counts.shape

(3, 19)

In [16]:
# Convert to a dense array to view the raw counts (fine here because the example is tiny).
sents_counts.toarray()

array([[0, 0, 0, 1, 4, 0, 0, 0, 0, 3, 0, 0, 0, 0, 4, 0, 0, 0, 0],
       [0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0],
       [2, 1, 1, 0, 0, 1, 0, 0, 1, 0, 2, 1, 0, 2, 0, 1, 1, 0, 1]], dtype=int64)

In [17]:
#TF-IDF (Term Frequency -- Inverse Document Frequency) values
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
# Re-weight the raw counts so that terms appearing in many documents count for less.
sents_tfidf = tfidf_transformer.fit_transform(sents_counts)

In [18]:
# Dense view of the TF-IDF weights; rows and columns line up with sents_counts.
sents_tfidf.toarray()

array([[ 0.        ,  0.        ,  0.        ,  0.13650997,  0.54603988,
         0.        ,  0.        ,  0.        ,  0.        ,  0.40952991,
         0.        ,  0.        ,  0.        ,  0.        ,  0.71797683,
         0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.28969526,  0.28969526,  0.28969526,
         0.        ,  0.38091445,  0.38091445,  0.        ,  0.28969526,
         0.28969526,  0.        ,  0.38091445,  0.        ,  0.        ,
         0.        ,  0.        ,  0.38091445,  0.        ],
       [ 0.47282517,  0.23641258,  0.17979786,  0.        ,  0.        ,
         0.23641258,  0.        ,  0.        ,  0.23641258,  0.        ,
         0.35959573,  0.23641258,  0.        ,  0.47282517,  0.        ,
         0.23641258,  0.23641258,  0.        ,  0.23641258]])

# Transforming Movie Reviews

In [19]:
# Vectorize the full corpus with the NLTK tokenizer. min_df=2 discards
# tokens that occur in only one review, which trims the vocabulary.
# token_pattern=None states explicitly that the built-in regex is unused and
# avoids the "token_pattern will not be used" warning in recent scikit-learn.
movie_vec = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize, token_pattern=None)
movie_counts = movie_vec.fit_transform(movie_train.data)

In [20]:
# Column index assigned to the token 'seagal' (None would mean it is not in the vocabulary).
movie_vec.vocabulary_.get('seagal')

19690

In [21]:
# (number of reviews, vocabulary size after the min_df cut)
movie_counts.shape

(2000, 25313)

In [22]:
# Rebinds tfidf_transformer (previously fit on the toy sentences) to a fresh
# transformer fit on the movie counts; later cells rely on this fitted instance.
tfidf_transformer = TfidfTransformer()
movie_tfidf = tfidf_transformer.fit_transform(movie_counts)

In [23]:
# TF-IDF only re-weights values, so the shape matches movie_counts.
movie_tfidf.shape

(2000, 25313)

# Training and Testing a Naive Bayes Classifier

In [24]:
# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

In [26]:
# Hold out 20% of the reviews for testing; the fixed seed makes the split
# reproducible.
# NOTE(review): movie_tfidf was fit on the whole corpus before this split, so
# the vocabulary and IDF weights have already seen the test reviews. Splitting
# the raw documents first would give a cleaner accuracy estimate.
from sklearn.model_selection import train_test_split
docs_train, docs_test, y_train, y_test = train_test_split(
    movie_tfidf,
    movie_train.target,
    test_size=0.20,
    random_state=12,
)

In [27]:
#Train a Multinomial Naive Bayes Classifier
# fit() returns the estimator itself, so clf is the trained model.
clf = MultinomialNB().fit(docs_train, y_train)

In [28]:
#Predict the test-set labels and compute the accuracy
# Import accuracy_score explicitly: `import sklearn` alone does not guarantee
# that the sklearn.metrics submodule is loaded, so the attribute lookup
# sklearn.metrics.accuracy_score only works if another import pulled it in.
from sklearn.metrics import accuracy_score
y_pred = clf.predict(docs_test)
accuracy_score(y_test, y_pred)

0.82250000000000001

In [29]:
#Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
# Rows are the true classes and columns the predicted classes, both in label
# order (0 = 'neg', 1 = 'pos'); the diagonal holds the correct predictions.
cm = confusion_matrix(y_test, y_pred)
cm

array([[176,  30],
       [ 41, 153]])

# Trying Classifier on Fake Movie Reviews

In [30]:
# Hand-written reviews to probe the trained classifier.
reviews_new = [
    'This movie was excellent',
    'Absolute Joy ride',
    'Steven Seagal was terrible',
    'Steven Seagal shined through.',
    'This was certainly a movie',
    'Two thumbs up',
    'I fell asleep halfway through',
    "We can't wait for the sequel!!",
    '!',
    '?',
    'I cannot recommend this highly enough',
    'instant classic.',
    'Steven Seagal was amazing. His performance was Oscar-worthy.',
]
# Reuse the already-fitted vectorizer and TF-IDF transformer (transform, not
# fit_transform) so the new texts land in the same feature space as the
# training data.
reviews_new_counts = movie_vec.transform(reviews_new)
reviews_new_tfidf = tfidf_transformer.transform(reviews_new_counts)

In [31]:
#Have classifier make a prediction
# pred holds one integer class per review, in the same order as reviews_new.
pred = clf.predict(reviews_new_tfidf)

In [32]:
# Show each review next to the label the classifier assigned to it.
for text, label_idx in zip(reviews_new, pred):
    label = movie_train.target_names[label_idx]
    print('%r => %s' % (text, label))

'This movie was excellent' => pos
'Absolute Joy ride' => pos
'Steven Seagal was terrible' => neg
'Steven Seagal shined through.' => neg
'This was certainly a movie' => neg
'Two thumbs up' => neg
'I fell asleep halfway through' => neg
"We can't wait for the sequel!!" => neg
'!' => neg
'?' => neg
'I cannot recommend this highly enough' => pos
'instant classic.' => pos
'Steven Seagal was amazing. His performance was Oscar-worthy.' => neg
