In [1]:
import numpy as np
import re
import pickle
from sklearn.datasets import load_files
import pandas as pd

# Load Datasets

In [3]:
train=load_files("E:/python/dataset/train/",encoding='utf-8')

In [7]:
x_train=train.data
y_train=train.target

In [8]:
len(x_train)

1000

In [9]:
len(y_train)

1000

In [10]:
train.target_names

['neg', 'pos']

In [11]:
np.unique(train.target)

array([0, 1])

In [12]:
test=load_files("E:/python/dataset/test/",encoding='utf-8')

In [10]:
x_test=test.data
y_test=test.target

In [11]:
len(x_test)

200

In [12]:
len(y_test)

200

# Clean Data

In [13]:
train.data[2]

'I almost called HBO and demanded my money back for the month just because they\'ve been airing this movie. I can just see the movie execs sitting around going, "Okay, we need to come up with something that\'s just like Home Alone, only we\'ll add a bunch of cash for the kid, hire cut-rate actors, and oh yeah, we\'ll make it a lot less funny!"<br /><br />Okay, maybe not the last part, but that\'s basically what you\'ve got here. Not even worth seeing if someone else rents it. And as a movie for kids? Forget it. I wouldn\'t let my kids see this, not necessarily because of bad-taste jokes, but because I wouldn\'t want them to say, "What were you thinking showing us that lame piece of garbage, Dad?!?!"'

In [14]:
def clean(x):
    x=re.sub(r'<.*>', '',x)
    x=re.sub(r'[^a-zA-Z]'," ",x)
    x=re.sub(r'\s+'," ",x)
    return x.lower()

In [15]:
clean("Hello I <b></b> am 123##$ john")

'hello i am john'

In [16]:
df=pd.DataFrame(x_train,columns=["review"])
df['target']=y_train
df.head()

Unnamed: 0,review,target
0,I am a huge John Denver fan. I have a large co...,1
1,I just read the plot summary and it is the wor...,1
2,I almost called HBO and demanded my money back...,0
3,"Like his earlier film, ""In a Glass Cage"", Agus...",1
4,There are few films that leave me with the fee...,1


In [17]:
df["review"]=df.review.apply(clean)
df.head()

Unnamed: 0,review,target
0,i am a huge john denver fan i have a large col...,1
1,i just read the plot summary and it is the wor...,1
2,i almost called hbo and demanded my money back...,0
3,like his earlier film in a glass cage agust vi...,1
4,there are few films that leave me with the fee...,1


# Convert text into Numeric

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
cv=CountVectorizer(min_df=10,max_df=.6,stop_words="english",max_features=100)

In [21]:
x_new=cv.fit_transform(df.review.values).toarray()

In [None]:
#cv.get_feature_names()

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [24]:
nb = MultinomialNB()

In [25]:
nb.fit(x_new,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
nb.score(x_new,y_train)

0.745

In [27]:
log=LogisticRegression()

In [28]:
log.fit(x_new,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [29]:
log.score(x_new,y_train)

0.756

In [30]:
df_test = pd.DataFrame(x_test,columns=['review'])
df_test['target'] = y_test

In [31]:
df_test.head()

Unnamed: 0,review,target
0,"Formulaic slasher film, only this one stars th...",0
1,"Yes, I am a romantic of sorts who likes musica...",1
2,I went to an advance screening of this movie t...,1
3,Four things intrigued me as to this film - fir...,0
4,Our family (and the entire sold out sneak prev...,1


In [32]:
df_test['review'] = df_test.review.apply(clean)

In [33]:
x_test_new = cv.transform(df_test.review.values).toarray()

In [34]:
x_test_new.shape

(200, 100)

In [35]:
log.score(x_test_new,y_test)

0.725

In [36]:
nb.score(x_test_new,y_test)

0.705

In [37]:
for i in [.0001,.001,.01,.1,10,100,1000]:
    log1 = LogisticRegression(C=i)
    print('when C:',i)
    print('train:',log1.fit(x_new,y_train).score(x_new,y_train))
    print('test:',log1.score(x_test_new,y_test))
    print()

when C: 0.0001
train: 0.655
test: 0.66

when C: 0.001
train: 0.684
test: 0.69

when C: 0.01
train: 0.734
test: 0.72

when C: 0.1
train: 0.759
test: 0.695

when C: 10
train: 0.751
test: 0.725

when C: 100
train: 0.75
test: 0.72

when C: 1000
train: 0.75
test: 0.72





In [38]:
test=["I do not like this movie","I would not recommend this movie",
     "I hate this movie","I love this movie"]

In [39]:
f=[]
for i in test:
    s=clean(i)
    f.append(s)

In [40]:
f

['i do not like this movie',
 'i would not recommend this movie',
 'i hate this movie',
 'i love this movie']

In [41]:
t=cv.transform(f).toarray()

In [42]:
t.shape

(4, 100)

In [43]:
log.predict(t)

array([1, 1, 0, 1])

In [44]:
nb.predict(t)

array([0, 1, 0, 1])

In [45]:
df = pd.read_csv('movie_reviews.csv')

In [46]:
df.head()

Unnamed: 0,review,sentiment
0,This is one of those unfortunate films that su...,1
1,Okay maybe it was because I happen to be in Ya...,1
2,"Although I love this movie, I can barely watch...",1
3,"A man arrives in a strange, beautiful, sterile...",1
4,I'm sitting around going through movie listing...,1


In [47]:
df.sentiment.value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [48]:
df['review'] = df.review.apply(clean)

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
x_train,x_test,y_train,y_test = train_test_split(df.review.values,
                                                 df.sentiment.values,
                                                test_size=10000,
                                                random_state=10)

In [51]:
cv = CountVectorizer(min_df=10,max_df=.6,
                     stop_words='english',
                    max_features=20000)

In [52]:
x_new=cv.fit_transform(x_train).toarray()
x_test_new = cv.transform(x_test).toarray()

In [53]:
log3 = LogisticRegression(C=.1)
log3.fit(x_new,y_train)
log3.score(x_new,y_train)



0.919725

In [54]:
log3.score(x_test_new,y_test)

0.8546

In [55]:
t=cv.transform(f).toarray()

In [56]:
log3.predict(t)

array([0, 1, 0, 1], dtype=int64)

# save the model

In [59]:
with open('review_model.pkl','wb') as f:
    pickle.dump(log3,f)

In [60]:
### save the vectorizer
with open('cv.pkl','wb') as f:
    pickle.dump(cv,f)

In [61]:
with open('review_model.pkl','rb') as f:
    clf=pickle.load(f)

In [62]:
clf.predict(t)

array([0, 1, 0, 1], dtype=int64)