In [1]:
import pandas as pd
import re


In [2]:
reviews = pd.read_csv('Train.csv')

In [3]:
reviews.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [4]:
reviews.shape

(40000, 2)

In [5]:
reviews.info

<bound method DataFrame.info of                                                   review label
0      mature intelligent and highly charged melodram...   pos
1      http://video.google.com/videoplay?docid=211772...   pos
2      Title: Opera (1987) Director: Dario Argento Ca...   pos
3      I think a lot of people just wrote this off as...   pos
4      This is a story of two dogs and a cat looking ...   pos
...                                                  ...   ...
39995  There are similarities between Ray Lawrence's ...   neg
39996  For starters, I once met the director when he ...   neg
39997  Much of "Over Her Dead Body" is so painfully u...   neg
39998  "Lifeforce" is a truly bizarre adaptation of t...   pos
39999  I saw this movie at a screener and its the bes...   pos

[40000 rows x 2 columns]>

In [6]:
reviews.label.value_counts()

pos    20011
neg    19989
Name: label, dtype: int64

In [7]:
# Replace pos with 1 and neg with 0
reviews.label.replace('pos',1,inplace=True)
reviews.label.replace('neg',0,inplace = True)


In [8]:
reviews.tail()

Unnamed: 0,review,label
39995,There are similarities between Ray Lawrence's ...,0
39996,"For starters, I once met the director when he ...",0
39997,"Much of ""Over Her Dead Body"" is so painfully u...",0
39998,"""Lifeforce"" is a truly bizarre adaptation of t...",1
39999,I saw this movie at a screener and its the bes...,1


### Creating NLTK pipeline

In [9]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [10]:
tokenizer = RegexpTokenizer(r'\w+') # Select all the words
ps = PorterStemmer()
en_sw = set(stopwords.words('english'))

In [11]:
def getStemmedReview(review):
    #convert all to lower case
    review = review.lower()
    #replace all the br with space
    review = review.replace('<br /><br />',' ')
    #Tokenize 
    #Remove the digit contained word as wwll
    review=re.sub(r"[A-Za-z]+\d+|\d+[A-Za-z]+",'',review).strip()
    tokens = tokenizer.tokenize(review)
    new_tokens = [token for token in tokens if token not in en_sw]
    ste_tokens = [ps.stem(token) for token in new_tokens]
    
    cleaned_review = ' '.join(ste_tokens)
    return cleaned_review
    

In [12]:
reviews.review = reviews.review.apply(getStemmedReview)

In [13]:
reviews.head(10)

Unnamed: 0,review,label
0,matur intellig highli charg melodrama unbelive...,1
1,http video googl com videoplay docid 211772166...,1
2,titl opera 1987 director dario argento cast cr...,1
3,think lot peopl wrote anoth one tom cruis weir...,1
4,stori two dog cat look way back home old wise ...,1
5,steve carel come first star role 40 year old v...,1
6,go write requir howev summari put top way word...,0
7,ok riski move rent flick thought noth lose wel...,0
8,cannib pair cinemat refer delicatessen plot st...,1
9,one great modern kung fu film lot review seem ...,1


In [14]:
X = reviews.review.values
y = reviews.label.values

### Spliting the datasets into train and test

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
trainx,testx,trainy,testy = train_test_split(X,y,test_size=0.2,random_state=9)
print("Train shapes : X = {}, y = {}".format(trainx.shape,trainy.shape))
print("Test shapes : X = {}, y = {}".format(testx.shape,testy.shape))

Train shapes : X = (32000,), y = (32000,)
Test shapes : X = (8000,), y = (8000,)


In [17]:
def getStemmedFile(inputFile,outputFile):
    output = open(outputFile,'w',encoding='utf-8')
    
    with open(inputFile,encoding='utf-8') as f:
        reviews = f.readlines()
    for review in reviews:
        cleaned_review = getStemmedReview(review)
        print(cleaned_review,file = output)
    output.close()

In [18]:
#cleaned = reviews.review.apply(getStemmedReview)

# Using Sklearn for creating NLP pipeline and implementing naive bayes

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
cv = CountVectorizer(ngram_range=(1,2))

In [21]:
X = cv.fit_transform(trainx)

In [22]:
xtest = cv.transform(testx)

In [23]:
X.shape

(32000, 1918308)

In [76]:
cv.get_feature_names()

['00',
 '000',
 '0000000000001',
 '00000001',
 '00001',
 '00015',
 '001',
 '003830',
 '006',
 '007',
 '0079',
 '0080',
 '0083',
 '009',
 '0093638',
 '01',
 '0126',
 '0148',
 '02',
 '020410',
 '0230',
 '03',
 '039',
 '04',
 '044',
 '05',
 '050',
 '06',
 '07',
 '08',
 '089',
 '09',
 '10',
 '100',
 '1000',
 '10000',
 '1000000',
 '10000000000',
 '1001',
 '1004',
 '101',
 '102',
 '103',
 '104',
 '1040',
 '105',
 '1050',
 '106',
 '107',
 '108',
 '109',
 '10_',
 '10â',
 '11',
 '110',
 '1100',
 '111',
 '112',
 '113',
 '1138',
 '115',
 '116',
 '117',
 '118',
 '119',
 '12',
 '120',
 '1200',
 '1201',
 '1202',
 '121',
 '122',
 '123',
 '12383499143743701',
 '124',
 '1242',
 '1249',
 '125',
 '128',
 '129',
 '12â',
 '13',
 '130',
 '1300',
 '131',
 '132',
 '134',
 '135',
 '136',
 '137',
 '138',
 '13848',
 '139',
 '14',
 '140',
 '1408',
 '141',
 '1415',
 '1416',
 '142',
 '1433421',
 '144',
 '1453',
 '1454',
 '146',
 '1479',
 '148',
 '149',
 '1492',
 '14ã',
 '15',
 '150',
 '1500',
 '15000',
 '151',
 '15

### Implementing Algorithams

In [24]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB

In [25]:
mnb = MultinomialNB()
bnb = BernoulliNB()
gnb = GaussianNB()

In [26]:
mnb.fit(X,trainy)

MultinomialNB()

In [27]:
mnb.predict(xtest)

array([0, 0, 0, ..., 1, 1, 0], dtype=int64)

In [28]:
mnb.score(xtest,testy)

0.876375

# 85% accuracy in MNB As we create bigrams our accuracy incr to 88%

In [90]:
bnb.fit(X,trainy)

BernoulliNB()

In [91]:
bnb.score(xtest,testy)

0.840875

# 84% accurcy in BNB

In [113]:
# Due to insuuufficient storage i have to take just 100 thats why accuracy decresses

x = X[:100].toarray()
gnb.fit(x,trainy[:100])

GaussianNB()

In [114]:
gnb.predict(xtest[:100].toarray())

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0], dtype=int64)

In [115]:
gnb.score(xtest[:100].toarray(),testy[:100])

0.69

### Predicting Test dataset

In [116]:
test = pd.read_csv('Test.csv')

In [117]:
test.head(10)

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...
5,I usually really enjoy Steven Seagal movies. T...
6,JiÃ¸Ã­ Trnka made his last animated short an i...
7,This is so bad it will be my contribution to t...
8,Watching this hilariously retro but very enter...
9,"Excellent political thriller, played much quie..."


In [119]:
test = test.review.apply(getStemmedReview)

In [120]:
testX = test.values

In [121]:
testX = cv.transform(testX)

In [135]:
Y = mnb.predict(testX)

In [138]:
df = pd.DataFrame(test, columns=['review','label'])

In [140]:
df.label = Y

In [141]:
df.head(10)

Unnamed: 0,review,label
0,rememb old kung fu movi use watch friday satur...,0
1,movi anoth one list movi bother saw 40 year ag...,0
2,world thing like get dvd player home even get ...,0
3,queen damn one best vampir movi ever seen movi...,0
4,caprica episod well done pilot realli episod e...,1
5,usual realli enjoy steven seagal movi usual hi...,0
6,jiã ã trnka made last anim short indict totali...,1
7,bad contribut next bad movi parti go clear sta...,0
8,watch hilari retro entertain career girl tale ...,1
9,excel polit thriller play much quieter slower ...,1


In [145]:
df.to_csv('output.csv',index = False)