In [74]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.stem import PorterStemmer

In [75]:
nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\merta\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [76]:
movie_reviews.categories()

['neg', 'pos']

In [77]:
len(movie_reviews.fileids())

2000

## Preprocessing

In [79]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append([movie_reviews.words(fileid), category])
        
print(len(documents))
documents[1]

2000


[['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg']

In [100]:
documents[1200]

[['the', 'first', 'image', 'in', '"', 'final', ...], 'neg']

In [120]:
random.shuffle(documents)

In [121]:
documents[1200]

[['tom', 'dicillo', 'directs', 'this', 'superficial', ...], 'neg']

In [122]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\merta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Removing stopwords

In [123]:
from nltk.corpus import stopwords
stop = stopwords.words("english")

In [124]:
stop2 = [',', '.', "'", '"', '-', ')', '(', ':', '?']
words = []
for word in movie_reviews.words():
    if word not in stop and word not in stop2:
        words.append(word.lower())
    
words_FD = nltk.FreqDist(words) #frequency distribution

In [125]:
words_FD.most_common(10)

[('film', 9517),
 ('one', 5852),
 ('movie', 5771),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2169),
 ('would', 2109),
 ('much', 2049)]

In [126]:
len(words)

717967

In [127]:
len(words_FD)

39608

In [128]:
words_FD['good']

2411

In [129]:
word_features = list(words_FD.keys())[:10000] #when you decrease the word amount, acc drops

In [130]:
len(documents)

2000

In [131]:
documents[0]

[['the', 'police', 'negotiator', 'is', 'the', 'person', ...], 'pos']

In [132]:
featuresets = []
for (review, category) in documents:
    words = set([word for word in review if word not in stop and word not in stop2])
    features = {}
    
    for w in word_features:
        features[w] = w in words    
    featuresets.append([features, category])
len(featuresets)

2000

In [134]:
featuresets[0][0]

{'plot': False,
 'two': True,
 'teen': False,
 'couples': False,
 'go': False,
 'church': False,
 'party': False,
 'drink': False,
 'drive': False,
 'get': True,
 'accident': False,
 'one': True,
 'guys': False,
 'dies': False,
 'girlfriend': False,
 'continues': False,
 'see': True,
 'life': False,
 'nightmares': False,
 'deal': True,
 'watch': False,
 'movie': True,
 'sorta': False,
 'find': False,
 'critique': False,
 'mind': False,
 'fuck': False,
 'generation': False,
 'touches': False,
 'cool': False,
 'idea': False,
 'presents': False,
 'bad': False,
 'package': False,
 'makes': False,
 'review': False,
 'even': True,
 'harder': False,
 'write': False,
 'since': False,
 'generally': False,
 'applaud': False,
 'films': False,
 'attempt': False,
 'break': False,
 'mold': False,
 'mess': False,
 'head': False,
 'lost': False,
 'highway': False,
 '&': False,
 'memento': False,
 'good': True,
 'ways': False,
 'making': False,
 'types': False,
 'folks': False,
 'snag': False,
 'correc

## Train-Test split and model training

In [135]:
training_set = featuresets[:1800] #1800 for training
testing_set = featuresets[1800:] #rest for testing (200)

In [143]:
type(training_set)

list

In [136]:
clf = nltk.NaiveBayesClassifier.train(training_set)

In [137]:
nltk.classify.accuracy(clf, testing_set)

0.845

In [138]:
clf.show_most_informative_features(10)

Most Informative Features
                  seagal = True              neg : pos    =     13.5 : 1.0
                 miscast = True              neg : pos    =     12.9 : 1.0
                  regard = True              pos : neg    =     11.1 : 1.0
                   sucks = True              neg : pos    =     10.1 : 1.0
               insulting = True              neg : pos    =      9.7 : 1.0
                  hudson = True              neg : pos    =      9.6 : 1.0
              unoriginal = True              neg : pos    =      9.6 : 1.0
               ludicrous = True              neg : pos    =      9.5 : 1.0
              accessible = True              pos : neg    =      9.1 : 1.0
                  annual = True              pos : neg    =      9.1 : 1.0


## F1-score evaluation

In [144]:
from sklearn.metrics import classification_report, confusion_matrix

In [145]:
y_test = []
y_pred = []
for i in range(len(testing_set)):
    y_val = 0 if clf.classify(testing_set[i][0]) == 'neg' else 1
    pred = 0 if testing_set[i][1] == 'neg' else 1
    y_test.append(y_val)
    y_pred.append(pred)


In [146]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84        97
           1       0.85      0.85      0.85       103

    accuracy                           0.84       200
   macro avg       0.84      0.84      0.84       200
weighted avg       0.84      0.84      0.84       200

[[81 16]
 [15 88]]
