In [1]:
from nltk.corpus import movie_reviews
documents=[]
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))

In [2]:
import random
random.shuffle(documents)

In [3]:
from nltk.corpus import wordnet
from nltk import pos_tag
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
lemmatizer=WordNetLemmatizer()
stops=set(stopwords.words('english'))
punctuations=string.punctuation
stops.update(list(punctuations))
def get_simple_pos_tag(tag):
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
def clean_review(words):
    clean_words=[]
    for word in words:
        if word.lower() not in stops:
            pos=pos_tag([word])[0][1]
            clean_word=lemmatizer.lemmatize(word, pos=get_simple_pos_tag(pos))
            clean_words.append(clean_word.lower())
    return clean_words

In [4]:
docs=[(clean_review(w), category) for w, category in documents]

In [5]:
all_words=[]
for tuplee in docs:
    all_words+=tuplee[0]
len(all_words)

710579

In [6]:
freq=nltk.FreqDist(all_words)
common=freq.most_common(3000)
features=[i[0] for i in common]

In [7]:
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'would',
 'much',
 'come',
 'also',
 'bad',
 'give',
 'life',
 'two',
 'look',
 'way',
 'know',
 'seem',
 'first',
 'end',
 '--',
 'year',
 'work',
 'thing',
 'plot',
 'say',
 'play',
 'really',
 'little',
 'show',
 'people',
 'could',
 'man',
 'star',
 'love',
 'never',
 'try',
 'great',
 'director',
 'best',
 'performance',
 'new',
 'big',
 'many',
 'action',
 'actor',
 'want',
 'u',
 'watch',
 'find',
 'think',
 'role',
 'act',
 'another',
 'back',
 'audience',
 'world',
 'something',
 'turn',
 'still',
 'day',
 'old',
 'set',
 'however',
 'use',
 'every',
 'begin',
 'though',
 'guy',
 'part',
 'comedy',
 'feel',
 'cast',
 'real',
 'enough',
 'around',
 'point',
 'interest',
 'last',
 'run',
 'write',
 'young',
 'may',
 'fact',
 'name',
 'long',
 'funny',
 'script',
 'actually',
 'right',
 'woman',
 'minute',
 'effect',
 'almost',
 'lot'

In [8]:
training_documents=docs[:1500]
testing_documents=docs[1500:]

In [9]:
def get_feature_dict(words):
    current_feature={}
    words_set=set(words)
    for w in features:
        current_feature[w]=w in words_set
    return current_feature

In [10]:
training_data=[(get_feature_dict(words), category) for words, category in training_documents]
testing_data=[(get_feature_dict(words), category) for words, category in testing_documents]

In [11]:
from nltk import NaiveBayesClassifier
clf=NaiveBayesClassifier.train(training_data)
clf

<nltk.classify.naivebayes.NaiveBayesClassifier at 0x2a0f59606a0>

In [12]:
nltk.classify.accuracy(clf, testing_data)

0.812

In [13]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
sklearnclf=LogisticRegression(n_jobs=-1, max_iter=2000, solver='saga')
clf_sklearn=SklearnClassifier(sklearnclf)
clf_sklearn.train(training_data)

<SklearnClassifier(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=-1, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False))>

In [14]:
nltk.classify.accuracy(clf_sklearn, testing_data)

0.854

In [15]:
rfc=RandomForestClassifier(n_jobs=-1, n_estimators=3000)
clf_sklearn1=SklearnClassifier(rfc)
clf_sklearn1.train(training_data)
nltk.classify.accuracy(clf_sklearn1, testing_data)

0.86

In [159]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
count_vec=TfidfVectorizer(max_features=2000, ngram_range=(1, 2), max_df=0.90)
#i have also used count vectorizer and it gave me somewhat similar accuracy.
y=[category for words, category in docs]
x=[" ".join(words) for words, category in docs]

In [160]:
x_train, x_test, y_train, y_test=train_test_split(x, y)

In [167]:
a=count_vec.fit_transform(x_train)
a

<1500x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 260958 stored elements in Compressed Sparse Row format>

In [162]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '2001',
 '30',
 '50',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accompany',
 'accomplish',
 'achieve',
 'across',
 'act',
 'action',
 'action film',
 'action movie',
 'action scene',
 'action sequence',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'ahead',
 'aid',
 'aim',
 'air',
 'al',
 'alan',
 'alex',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'along way',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'amy',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animate',
 'animation',
 'anne',
 'annie',
 'annoy',
 'another',
 'answer',
 'anthony',

In [163]:
x_train_features=a
x_test_features=count_vec.transform(x_test)

In [164]:
classifier=RandomForestClassifier(n_estimators=1000, n_jobs=-1)
classifier.fit(x_train_features, y_train)
classifier.score(x_test_features, y_test)

0.822

In [165]:
classifier=SVC(gamma="scale")
classifier.fit(x_train_features, y_train)
classifier.score(x_test_features, y_test)

0.824

In [166]:
classifier=LogisticRegression(solver='saga', n_jobs=-1, max_iter=2000)
classifier.fit(x_train_features, y_train)
classifier.score(x_test_features, y_test)

0.83