# **Download IMDB Dataset**

In [1]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -P data/

--2021-02-26 05:18:05--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘data/aclImdb_v1.tar.gz’


2021-02-26 05:18:10 (16.3 MB/s) - ‘data/aclImdb_v1.tar.gz’ saved [84125825/84125825]



# **Extract the Data**

In [2]:
%%time
!tar -xzf data/aclImdb_v1.tar.gz -C data/

CPU times: user 24.3 ms, sys: 4.18 ms, total: 28.5 ms
Wall time: 7.85 s


# **Create DataFrame using Pandas Library**

In [3]:
import os
import pandas as pd
import numpy as np

In [4]:
folder = 'data/aclImdb/'
labels = {'pos': 1, 'neg': 0} 
df = pd.DataFrame()
revList = list()
for f in ('test', 'train'):    
    for l in ('pos', 'neg'):
        path = os.path.join(folder, f, l)
        for file in os.listdir (path) :
            with open(os.path.join(path, file),'r', encoding='utf-8') as infile:
                txt = infile.read()
                revList.append((txt,labels[l]))
            #df = df.append([[txt, labels[l]]],ignore_index=True)
df = pd.DataFrame.from_records(revList)
df.columns = ['review', 'sentiment']

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,Not sure why the other comment on this film wa...,1
1,This is a well done action movie. There are pl...,1
2,The main reason for watching this picture is t...,1
3,Since the last horrid Astérix film and the fac...,1
4,Farscape totally rules! In my opinion it's ver...,1


In [6]:
df.tail(50)

Unnamed: 0,review,sentiment
49950,I hadn't heard of this film until I read an ar...,0
49951,"""Fever Pitch"" isn't a bad film; it's a terribl...",0
49952,This was a rip-off of the same garbage we had ...,0
49953,"First, I should mention that I really enjoyed ...",0
49954,This was a new alltime low among westerns. The...,0
49955,Note: I couldn't force myself to actually writ...,0
49956,Cornel Wilde and three dumbbells search for su...,0
49957,It seems as if in Science Fiction you have thi...,0
49958,"Every Sunday, a trio of buds get together at a...",0
49959,Revenge on us the viewing public perhaps. I sa...,0


In [7]:
df.loc[27000, 'review']

"I saw this recently on a cable channel. The movie is great; it's one of the few musicals I have seen that doesn't shy away from the light and dark. It portrays some of the splendour of the age along with a lot of the squalor. Some of the set piece dance sequences so much is going on, I didn't know where to look next. One day I shall go and see this on the big screen, just so that I see what's happening. But what really lifts this to another level is Oliver Reed's performance as Bill Sykes. Not only is a thoroughly mean and menacing man but there is something else, some inner demons. He gave me the impression that if you pushed him into a corner, he was capable of anything. It was almost as if the Sykes character was on the edge of madness, just awaiting the trigger. I have seen the Robert Newton's Bill Sykes from the 1948 movie, and I thought he was 'just' a bad egg, but Oliver Reed's performance intimidated me in my own living room."

In [8]:
df.loc[27000, 'sentiment']

1

# **Build Vocabulary**

In [13]:
import nltk

from nltk.tokenize import word_tokenize # import tokenizer
from nltk.corpus import stopwords # import stopwords
from nltk.stem.porter import PorterStemmer #import stemmer
from nltk.stem import WordNetLemmatizer #lemmatizer
from nltk.corpus import wordnet

In [14]:
# Install the nltk component for several tasks
nltk.download('punkt')     
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [15]:
reviews = df.review.str.cat(sep=' ') #function to split text into word
tokens = word_tokenize(reviews)
vocabulary = set(tokens)
print(len(vocabulary))

#frequency_dist = nltk.FreqDist(tokens)
#sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True)[0:50]

199784


# **Remove Stopwords**

In [16]:
stop_words = set(stopwords.words('english'))
vocabulary = [w for w in vocabulary if not w in stop_words]
print (len (vocabulary))

199635


# **Build Classifier**

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer #
from sklearn.feature_extraction.text import TfidfVectorizer # from text to vector
from sklearn.naive_bayes import MultinomialNB #import naive bayes classifier
from sklearn import svm #import SVM classifier
from sklearn.metrics  import accuracy_score # accuracy measure
from sklearn.tree import DecisionTreeClassifier # Decision tree classfier
from sklearn.ensemble import RandomForestClassifier # Random Forest Classfier 

In [18]:
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values
vectorizer = TfidfVectorizer()

v=vectorizer.fit_transform(vocabulary)
#train_vectors=vectorizer.fit_transform(X_train)

train_vectors = vectorizer.transform(X_train)
test_vectors = vectorizer.transform(X_test)

print(train_vectors.shape, test_vectors.shape)

(25000, 101880) (25000, 101880)


# **Using Naive Bayes**

In [19]:
clf = MultinomialNB().fit(train_vectors, y_train)

predicted = clf.predict(test_vectors)
print(accuracy_score(y_test,predicted))

0.84124


# **Using SVM**

In [None]:
clf = svm.SVC().fit(train_vectors, y_train)

In [None]:
predicted = clf.predict(test_vectors)
print(accuracy_score(y_test,predicted))

# **Using Decision Trees**

In [None]:
clf = DecisionTreeClassifier(max_depth=5).fit(train_vectors, y_train)

In [None]:
predicted = clf.predict(test_vectors)
print(accuracy_score(y_test,predicted))

# **Using Random Forest**

In [None]:
clf=RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1).fit(train_vectors, y_train)

In [None]:
predicted = clf.predict(test_vectors)
print(accuracy_score(y_test,predicted))