In [35]:
import nltk
from nltk.corpus import movie_reviews
import random
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.corpus import wordnet
import string
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
movie_reviews.categories()
lemmatizer=WordNetLemmatizer()

In [3]:
stops=stopwords.words('english')
punc=list(string.punctuation)
stops=stops+punc

In [4]:
len(movie_reviews.fileids())
#1000 -ve
#1000 +ve       

2000

In [5]:
#it returns file id with positive reviews
pos_rev=movie_reviews.fileids('pos')

In [6]:
#we need to pass the file id here to get words
movie_reviews.words(movie_reviews.fileids())

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [7]:
documents=[]
for category in movie_reviews.categories():
    for fid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fid),category))
documents[1]        

(['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg')

***Shuffling to mix pos/neg so that it becomes easy for train_test_split***

In [8]:
random.shuffle(documents)

In [9]:
def getsimplepos (tag) :
    if tag.startswith( 'J'):
        return wordnet.ADJ
    elif tag.startswith( 'V'):
        return wordnet. VERB
    elif tag.startswith( 'N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet .NOUN

**pos_tag requires an array**

In [10]:
w="better"
pos_tag([w])

[('better', 'RBR')]

**Cleaning the dataset**

In [11]:
def clean_review(words):
    output_words=[]
    for w in words:
        if(w.lower() not in stops):
            pos=pos_tag([w])
            clean_word=lemmatizer.lemmatize(w,pos=getsimplepos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words        
            

In [12]:
documents=[(clean_review(document),category) for document,category in documents]

In [13]:
documents[0]

(['must',
  'admit',
  'tad',
  'skeptical',
  'good',
  'hunt',
  'base',
  'preview',
  'first',
  'fifteen',
  'minute',
  'film',
  'main',
  'character',
  'hunt',
  'matt',
  'damon',
  'mit',
  'janitor',
  'early',
  'twenty',
  'discover',
  'einstein',
  'level',
  'closet',
  'genius',
  'solves',
  'two',
  'extraordinarily',
  'difficult',
  'math',
  'problem',
  'overnight',
  'problem',
  'tough',
  'street',
  'kid',
  'share',
  'run',
  'in',
  'law',
  'long',
  'haul',
  'assault',
  'parking',
  'lot',
  'fight',
  'professor',
  'lambeau',
  'stellan',
  'skarsgard',
  'brought',
  'math',
  'problem',
  'lecture',
  'track',
  'strike',
  'deal',
  'police',
  'release',
  'provide',
  'work',
  'lambeau',
  'math',
  'research',
  'regularly',
  'attends',
  'therapy',
  'session',
  'sound',
  'like',
  'formula',
  'mildly',
  'charm',
  'fluff',
  'good',
  'hunt',
  'rise',
  'fairly',
  'mundane',
  'premise',
  'deliver',
  'poignant',
  'clever',
  'dram

**Now creating dataset with feature as top words**

***Building the feature sets only on training data***

In [14]:
training=documents[:1500]
testing=documents[1500:]

In [15]:
#we will collect all words in an array and find frequency distribution
all_words=[]
for doc in training:
    all_words+=doc[0]

In [16]:
freq=nltk.FreqDist(all_words)
common=freq.most_common(3000)

In [17]:
features=[words for words,count in common]
#features

In [18]:
def get_feature_dict(words):
    current_features={}
    w_set=set(words)
    for w in features:
        current_features[w]=w in w_set
    return current_features    

In [19]:
training_data=[(get_feature_dict(word),category)for word,category in training]
testing_data=[(get_feature_dict(word),category)for word,category in testing]

In [20]:
training_data[0]

({'film': True,
  'movie': True,
  'one': True,
  'make': True,
  'like': True,
  'character': True,
  'get': True,
  'see': True,
  'go': False,
  'time': True,
  'well': True,
  'scene': True,
  'even': True,
  'good': True,
  'story': True,
  'take': True,
  'would': True,
  'much': True,
  'come': True,
  'also': True,
  'bad': False,
  'life': True,
  'give': True,
  'look': False,
  'two': True,
  'know': True,
  'way': True,
  'seem': True,
  'end': True,
  '--': False,
  'first': True,
  'year': True,
  'work': True,
  'thing': True,
  'plot': False,
  'say': True,
  'play': False,
  'really': True,
  'little': True,
  'show': False,
  'people': False,
  'could': True,
  'star': True,
  'man': False,
  'try': False,
  'never': False,
  'love': False,
  'director': True,
  'great': False,
  'best': False,
  'new': False,
  'big': False,
  'many': False,
  'performance': True,
  'want': True,
  'actor': False,
  'watch': False,
  'find': True,
  'think': False,
  'u': True,
  'ac

**Using NaiveBayes classifier in nltk**

In [21]:
from nltk import NaiveBayesClassifier

In [22]:
classifier=NaiveBayesClassifier.train(training_data)

In [23]:
nltk.classify.accuracy(classifier,testing_data)

0.778

In [24]:
classifier.show_most_informative_features()

Most Informative Features
                 idiotic = True              neg : pos    =     16.8 : 1.0
             outstanding = True              pos : neg    =     13.7 : 1.0
                  seagal = True              neg : pos    =     12.0 : 1.0
             beautifully = True              pos : neg    =      9.0 : 1.0
             wonderfully = True              pos : neg    =      8.9 : 1.0
                    anna = True              pos : neg    =      8.7 : 1.0
                    coen = True              pos : neg    =      8.7 : 1.0
                   idiot = True              neg : pos    =      8.1 : 1.0
               stupidity = True              neg : pos    =      7.5 : 1.0
                   mulan = True              pos : neg    =      6.8 : 1.0


**Using Sklearn classifier with the same data**

In [25]:
svc=SVC()
clf=SklearnClassifier(svc)

In [26]:
clf.train(training_data)

<SklearnClassifier(SVC())>

In [27]:
nltk.classify.accuracy(clf,testing_data)

0.836

In [28]:
rfc=RandomForestClassifier()
clf1=SklearnClassifier(rfc)

In [29]:
clf1.train(training_data)

<SklearnClassifier(RandomForestClassifier())>

In [30]:
nltk.classify.accuracy(clf1,testing_data)

0.8

**Now using countVectorizer we will convert our dataset in form of 2d array to be used by sklearn classifier**

***For using Count Vectorizer we need to convert the documents array into text by joining each word***

In [32]:
categories=[category for docs,category in documents]

In [33]:
text_doc=[" ".join(docs) for docs,category in documents]

In [34]:
x_train,x_test,y_train,y_test=train_test_split(text_doc,categories)

In [67]:
cv=CountVectorizer(max_features=2000,ngram_range=(1,2))
x_train_featrues=cv.fit_transform(x_train)
x_test_featrues=cv.transform(x_test)

In [59]:
x_train_featrues.shape,len(y_train)

((1500, 2000), 1500)

In [64]:
cv.get_feature_names_out()

array(['10 10', '10 minute', '10 scale', ..., 'young son', 'young woman',
       'zeta jones'], dtype=object)

**Now our training and testing data is ready and we can apply any sklearn classifier on it**

In [68]:
svc=SVC()
svc.fit(x_train_featrues,y_train)

In [69]:
svc.score(x_test_featrues,y_test)

0.848