<h1> Movie review classification with NLTK </h1>

In [2]:
import random
import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
cats = movie_reviews.categories()
reviews = []
for cat in cats:
    for fid in movie_reviews.fileids(cat):
        review = (list(movie_reviews.words(fid)),cat)
        reviews.append(review)
random.shuffle(reviews)

In [4]:
all_wd_in_reviews = nltk.FreqDist(wd.lower() for wd in movie_reviews.words())
top_wd_in_reviews = [list(wds) for wds in zip(*all_wd_in_reviews.most_common(2000))][0]

In [5]:
def ext_ft(review,top_words):
    review_wds = set(review)
    ft = {}
    for wd in top_words:
        ft['word_present({})'.format(wd)] = (wd in review_wds)
    return ft

In [6]:
featuresets = [(ext_ft(d,top_wd_in_reviews), c) for (d,c) in reviews]
train_set, test_set = featuresets[200:], featuresets[:200]

In [7]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.81


In [8]:
classifier.show_most_informative_features(20)

Most Informative Features
word_present(outstanding) = True              pos : neg    =     12.7 : 1.0
    word_present(seagal) = True              neg : pos    =      8.1 : 1.0
     word_present(mulan) = True              pos : neg    =      7.7 : 1.0
    word_present(poorly) = True              neg : pos    =      7.6 : 1.0
     word_present(damon) = True              pos : neg    =      6.8 : 1.0
word_present(wonderfully) = True              pos : neg    =      5.8 : 1.0
    word_present(wasted) = True              neg : pos    =      5.8 : 1.0
word_present(ridiculous) = True              neg : pos    =      5.7 : 1.0
     word_present(awful) = True              neg : pos    =      5.7 : 1.0
     word_present(flynt) = True              pos : neg    =      5.7 : 1.0
     word_present(waste) = True              neg : pos    =      5.6 : 1.0
      word_present(zero) = True              neg : pos    =      5.5 : 1.0
      word_present(lame) = True              neg : pos    =      5.4 : 1

In [9]:
dict_vectorizer=None
def get_train_test(train_set,test_set):
    global dict_vectorizer
    dict_vectorizer = DictVectorizer(sparse=False)
    X_train, y_train = zip(*train_set)
    X_train = dict_vectorizer.fit_transform(X_train)
    X_test,y_test = zip(*test_set)
    X_test = dict_vectorizer.transform(X_test)
    return X_train,X_test,y_train,y_test

In [10]:
X_train,X_test,y_train,y_test = get_train_test(train_set,test_set)
rf = RandomForestClassifier(n_estimators=100,n_jobs=4,random_state=10)
rf.fit(X_train,y_train)

RandomForestClassifier(n_jobs=4, random_state=10)

In [11]:
preds = rf.predict(X_test)
print(accuracy_score(y_test,preds))

0.8


In [12]:
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
all_words_in_reviews = nltk.FreqDist(word.lower() for word in movie_reviews.words() if word not in stopwords_list)
top_words_in_reviews = [list(words) for words in zip(*all_words_in_reviews.most_common(2000))][0]

In [13]:
featuresets = [(ext_ft(d,top_words_in_reviews), c) for (d,c) in reviews]
train_set, test_set = featuresets[200:], featuresets[:200]
X_train,X_test,y_train,y_test = get_train_test(train_set,test_set)

In [14]:
rf = RandomForestClassifier(n_estimators=100,n_jobs=4,random_state=10)
rf.fit(X_train,y_train)

RandomForestClassifier(n_jobs=4, random_state=10)

In [15]:
preds = rf.predict(X_test)
print(accuracy_score(y_test,preds))

0.82


In [16]:
features_list = zip(dict_vectorizer.get_feature_names(),rf.feature_importances_)
features_list = sorted(features_list, key=lambda x: x[1], reverse=True)
print(features_list[0:20])    

[('word_present(bad)', 0.01468030405256903), ('word_present(worst)', 0.006503097968587886), ('word_present(boring)', 0.006180445684980895), ('word_present(stupid)', 0.005994795097225717), ('word_present(waste)', 0.005714925317303078), ('word_present(mess)', 0.005273943912705106), ('word_present(ridiculous)', 0.00498755305028451), ('word_present(wasted)', 0.004370299819569839), ('word_present(awful)', 0.004363014979523729), ('word_present(script)', 0.004239315608704159), ('word_present(excellent)', 0.004154720227324847), ('word_present(outstanding)', 0.0040361723850817575), ('word_present(lame)', 0.003662041251954036), ('word_present(plot)', 0.003555496526169104), ('word_present(subtle)', 0.0035096012572137557), ('word_present(dull)', 0.003482828596408981), ('word_present(also)', 0.0034293546728235637), ('word_present(performances)', 0.0033044494215313026), ('word_present(great)', 0.0032897530411859425), ('word_present(supposed)', 0.0032218151776120594)]


