In [101]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

In [102]:
# Shared text-preprocessing helpers, created once and reused for every review.
en_stopwords = set(stopwords.words('english'))  # a set, for fast membership tests
ps = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')  # tokens are runs of word characters; punctuation is dropped

In [103]:
def getStemmedReview(review):
    """Normalize one raw review into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, drop HTML line-break tags, drop tokens that mix
    letters and digits, drop remaining digit runs, turn underscores into
    spaces, tokenize on word characters, remove English stop words, and
    Porter-stem whatever is left.

    Parameters
    ----------
    review : object
        Raw review text. It is passed through ``str()`` first, so non-string
        values are accepted.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces ('' if none).
    """
    text = str(review).lower()
    # Strip every spelling of the break tag (<br>, <br/>, <br />), not only the
    # doubled "<br /><br />" form; a lone tag would otherwise leave a junk
    # "br" token behind after tokenization.
    text = re.sub(r'<br\s*/?>', " ", text)
    # Drop words that mix letters and digits (e.g. "2nd", "mp3").
    text = re.sub(r'(?i)\b(?:\d+[a-z]|[a-z]+\d)\w*\b', " ", text)
    # Drop any remaining runs of digits.
    text = re.sub(r'\d+', " ", text)
    # "_" matches \w, so it must be split out explicitly before tokenizing.
    text = text.replace("_", " ")

    stems = [
        ps.stem(token)
        for token in tokenizer.tokenize(text)
        if token not in en_stopwords
    ]
    return ' '.join(stems)
    

In [104]:
def getStemmedDocument(inputFile):
    """Clean every review in an iterable with ``getStemmedReview``.

    Parameters
    ----------
    inputFile : iterable
        Raw reviews, one item per review (for example the lines of a file or
        a list of strings).

    Returns
    -------
    list of str
        One cleaned, stemmed string per input review, in input order. An empty
        input yields an empty list.

    Notes
    -----
    The earlier loop reassigned a single variable on every pass, so only the
    last review's result was returned and an empty input raised
    ``UnboundLocalError``. Collecting the results keeps every review.
    """
    return [getStemmedReview(review) for review in inputFile]

In [105]:
import pandas as pd
import numpy as np

In [106]:
# Read the labelled training data; the preview below shows a `review` text
# column and a `label` column with values such as "pos".
df=pd.read_csv('Train.csv')
df.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [107]:
# Convert the frame to a NumPy array so its columns can be sliced by position.
x=df.values

In [108]:
# Column 0 is the review text and column 1 is the sentiment label.
X, y = x[:, 0], x[:, 1]

In [109]:
# Sanity check: features and labels should have the same length.
print(X.shape,y.shape)

(40000,) (40000,)


In [110]:
from sklearn.model_selection import train_test_split

In [111]:
# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the split is repeatable across runs.
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)


In [112]:
# Confirm the sizes of the training and validation partitions.
print(x_train.shape,y_train.shape,x_test.shape,y_test.shape)

(32000,) (32000,) (8000,) (8000,)


In [113]:
# Clean and stem every training review (one output string per review).
new_cleaned_review = list(map(getStemmedReview, x_train))

In [114]:
# Preview a couple of cleaned reviews; echoing the whole list would dump all
# 32,000 training reviews into the cell output.
new_cleaned_review[:2]

['dvd releas movi hope buy movi name face sandra bullock pictur year film basic cover art back cover art insid cover art pictur name promin shown front cover edg disk first name list star biographi print insid case film must revolv around charact right wrong first movi play minor role watch movi everi role seem like minor role charact dog actual display person less hour watch even rememb name mani charact mayb watch sever time could actual figur plot think would worth effort oh wait rememb funni bit shoot em video game fan get kick doom cam look like first person shooter game hand gun point front camera gener person critic movi may worst movi ever seen kept expect silhouett walk across screen sit start make fun glad money spent use went chariti',
 'probabl littl old movi guy stori littl girli realli thought cool movi overal girl movi hot realli great actress alana adriann deena kimberli way cute girl play alana littl sister rachel huge pop star fan lovesick song thought aaron carter re

In [115]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with scikit-learn's default settings.
cv=CountVectorizer()

In [116]:
# Learn the vocabulary from the cleaned training reviews and build the
# document-term count matrix. The result is kept as the SciPy sparse matrix
# that fit_transform returns: densifying it with .toarray() would allocate a
# 32000 x 58041 array (roughly 15 GB with the default 8-byte integer counts),
# and MultinomialNB.fit accepts sparse input directly.
x_vec = cv.fit_transform(new_cleaned_review)

In [117]:
# Rows are training reviews, columns are vocabulary terms.
print(x_vec.shape)

(32000, 58041)


In [118]:
# Show only the first 100 vocabulary terms instead of the full vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old name on older
# releases so the cell runs on either.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
print(list(feature_names[:100]))

['aa', 'aaa', 'aaaaaaaaaaaahhhhhhhhhhhhhh', 'aaaaaaah', 'aaaaaaahhhhhhggg', 'aaaaagh', 'aaaaah', 'aaaaarrrrrrgggggghhhhhh', 'aaaaaw', 'aaaahhhhhh', 'aaaahhhhhhh', 'aaaarrgh', 'aaaawwwwww', 'aaaggghhhhhhh', 'aaagh', 'aaah', 'aaahhhhhhh', 'aaall', 'aaam', 'aaargh', 'aaarrrgh', 'aaaugh', 'aaawwwwnnn', 'aab', 'aachen', 'aadha', 'aag', 'aaghh', 'aah', 'aahhh', 'aaila', 'aaja', 'aak', 'aaker', 'aakrosh', 'aaliyah', 'aalox', 'aam', 'aamir', 'aamr', 'aan', 'aankh', 'aankhen', 'aaoon', 'aap', 'aapk', 'aardman', 'aardvark', 'aarf', 'aargh', 'aarika', 'aaron', 'aarp', 'aarrrgh', 'aashok', 'aasmaan', 'aasman', 'aatish', 'aaton', 'aau', 'aavjo', 'aawip', 'aaww', 'ab', 'aba', 'aback', 'abadi', 'abagail', 'abanaz', 'abandon', 'abank', 'abas', 'abashidz', 'abat', 'abba', 'abbasi', 'abbey', 'abbi', 'abbot', 'abbott', 'abbrevi', 'abbu', 'abc', 'abcd', 'abcâ', 'abd', 'abdalla', 'abdic', 'abdomen', 'abdomin', 'abdoo', 'abduct', 'abducte', 'abductor', 'abdul', 'abdulrahman', 'abe', 'abecassi', 'abedalla', 

In [119]:
from sklearn.naive_bayes import MultinomialNB

In [120]:
# Multinomial naive Bayes classifier with default hyperparameters.
mnb=MultinomialNB()

In [121]:
# Fit the classifier on the training count vectors and their labels.
mnb.fit(x_vec,y_train)

MultinomialNB()

In [122]:
# Apply the same cleaning/stemming to the held-out validation reviews.
x_test_clean = list(map(getStemmedReview, x_test))

In [67]:
# Encode the validation reviews with the vocabulary fitted on the training set
# (transform only, no refit). Kept sparse: a dense 8000 x 58041 array would
# need roughly 3.7 GB, and MultinomialNB.predict accepts sparse input.
x_test_vec = cv.transform(x_test_clean)

In [70]:
# Predict labels for the held-out validation reviews.
ypred=mnb.predict(x_test_vec)

In [69]:
from sklearn.metrics import confusion_matrix

In [71]:
# Tabulate true labels (rows) against predicted labels (columns).
cnf_matrix=confusion_matrix(y_test,ypred)

In [72]:
# Off-diagonal entries are the misclassified validation reviews.
print(cnf_matrix)

[[3460  492]
 [ 666 3382]]


In [74]:
# Load the unlabelled reviews to be scored; the preview below shows a single
# `review` column.
dfy=pd.read_csv('Test/Test.csv')
dfy.head()

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...


In [75]:
# NumPy array of the frame. It is 2-D with one column (shape (10000, 1)), so
# each row is a one-element array rather than a bare string.
actual_test=dfy.values

In [76]:
# Check the array dimensions.
actual_test.shape

(10000, 1)

In [80]:
# actual_test is 2-D with a single column (shape (10000, 1)), so iterating it
# directly yields one-element arrays, and the str() call inside
# getStemmedReview would stringify the array wrapper (brackets, quotes,
# escape sequences) rather than the review text. Select column 0 so each item
# is the review itself.
clean_actaul_test = [getStemmedReview(i) for i in actual_test[:, 0]]

In [81]:
# Encode with the vocabulary already fitted on the training reviews
# (transform only, no refit); the result stays sparse.
clean_actaul_test_cv=cv.transform(clean_actaul_test)

In [83]:
# Predict labels for the unlabelled reviews.
# NOTE(review): this rebinds `ypred`, replacing the validation predictions
# computed earlier; re-running the confusion-matrix cell after this one would
# fail on mismatched lengths.
ypred=mnb.predict(clean_actaul_test_cv)

In [89]:
# Wrap the predictions in a one-column frame for export.
# NOTE(review): this rebinds `df`, so the training frame loaded from Train.csv
# is no longer reachable under that name; re-running earlier cells that read
# `df` would now see the predictions instead.
df=pd.DataFrame(data=ypred,columns=['label'])

In [94]:
# Write the predictions, including the row index as the first column.
df.to_csv('sub.csv',index=True)

In [95]:
# Preview the first few predictions.
df.head()

Unnamed: 0,label
0,neg
1,neg
2,neg
3,pos
4,pos


In [98]:
# Name the index "Id" so the index column gets that header on export.
# The result is rebound rather than using inplace=True, which avoids mutating
# the frame in place and keeps the cell safe to re-run.
df = df.rename_axis('Id')

In [99]:
# Preview again; the index is now labelled "Id".
df.head()

Unnamed: 0_level_0,label
Id,Unnamed: 1_level_1
0,neg
1,neg
2,neg
3,pos
4,pos


In [100]:
# Export the predictions; to_csv writes the index by default, so the first
# column carries the index name set via rename_axis.
df.to_csv('subb.csv')