In [31]:
import sys
import nltk
import sklearn
import random
from nltk.corpus import movie_reviews

In [32]:
# Generating Dataset
documents = [(list(movie_reviews.words(fileid)), category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]

# Shuffle in random order
random.shuffle(documents)

print('Number of docs: {}'.format(len(documents)))
print('First review: {}'.format(documents[0]))

all_words = [w.lower() for w in movie_reviews.words()]

# Arrange words in descending order of frequency
all_words = nltk.FreqDist(all_words)

print('\n Most common words: {}\n'.format(all_words.most_common(20)))
print('Times happy used: {}'.format(all_words['happy']))
print('Times worst used: {}'.format(all_words['worst']))

Number of docs: 2000
First review: (['from', 'a', 'major', 'league', 'baseball', 'radio', 'broadcast', ',', 'featuring', 'play', '-', 'by', '-', 'play', 'man', 'harry', 'canary', 'and', 'color', 'man', 'whitey', 'hashbrown', ',', 'with', 'special', 'guest', 'commentator', 'james', 'berardinelli', '.', 'hc', ':', 'as', 'we', 'go', 'to', 'the', 'top', 'of', 'the', '8th', ',', 'we', "'", 're', 'joined', 'in', 'the', 'booth', 'by', 'film', 'critic', 'james', 'berardinelli', ',', 'who', "'", 's', 'here', 'fresh', 'from', 'seeing', 'the', 'new', 'baseball', 'movie', ',', 'major', 'league', ':', 'back', 'to', 'the', 'minors', ',', 'the', 'third', 'in', 'the', 'popular', 'saga', 'taking', 'a', 'lighter', 'look', 'at', 'the', 'majors', '.', 'nice', 'to', 'see', 'you', ',', 'jim', '.', 'as', 'a', 'big', 'baseball', 'fan', 'and', 'a', 'movie', 'reviewer', ',', 'can', 'you', 'give', 'us', 'the', 'scoop', 'on', 'the', 'new', 'flick', '?', 'jb', ':', 'my', 'opinion', ':', 'the', 'producers', 'should

In [33]:
print(len(all_words))

39768


In [34]:
# Taking 4000 most common words as features
word_features = list(all_words.keys())[:4000]

In [46]:
# Check how may selected features are present in each document
def find_features(doc):
    words = set(doc)
    features = {}
    
    # Get if feature is present in doc
    for w in word_features:
        features[w] = (w in words)
    
    return features

# Example
features = find_features(movie_reviews.words('neg/cv000_29416.txt'))

print([key for (key, value) in features.items() if value == True])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', 'one', 'of', 'the', 'guys', 'dies', 'but', 'his', 'girlfriend', 'continues', 'see', 'him', 'in', 'her', 'life', 'has', 'nightmares', 'what', "'", 's', 'deal', '?', 'watch', 'movie', '"', 'sorta', 'find', 'out', 'critique', 'mind', '-', 'fuck', 'for', 'generation', 'that', 'touches', 'on', 'very', 'cool', 'idea', 'presents', 'it', 'bad', 'package', 'which', 'is', 'makes', 'this', 'review', 'even', 'harder', 'write', 'since', 'i', 'generally', 'applaud', 'films', 'attempt', 'break', 'mold', 'mess', 'with', 'your', 'head', 'such', '(', 'lost', 'highway', '&', 'memento', ')', 'there', 'are', 'good', 'ways', 'making', 'all', 'types', 'these', 'folks', 'just', 'didn', 't', 'snag', 'correctly', 'seem', 'have', 'taken', 'pretty', 'neat', 'concept', 'executed', 'terribly', 'so', 'problems', 'well', 'its', 'main', 'problem', 'simply', 'to

In [48]:
# Creating i/p dataset
feature_set = [(find_features(rev), category) for (rev, category) in documents]

In [49]:
# Generating train and test set
from sklearn import model_selection

train, test = model_selection.train_test_split(feature_set, test_size = 0.25, random_state = 1)

print(len(train))
print(len(test))

1500
500


In [50]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [56]:
# Training the model
model = SklearnClassifier(SVC(kernel = 'sigmoid'))
model.train(train)

<SklearnClassifier(SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))>

In [57]:
# Testing the model
accuracy = nltk.classify.accuracy(model, test)
print('Accuracy: {}'.format(accuracy))

Accuracy: 0.84
