### Text classification 

** Here we have movie review dataset with two classes(neg and pos)** 

In [2]:
import nltk

In [4]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [4]:
import numpy as np
import re
import pickle
#import nltk
#from nltk.corpus import stopwords
from sklearn.datasets import load_files
import pandas as pd
#load_files:Load text files with categories as subfolder names.

### Load datasets

In [5]:
train = load_files('dataset/train',encoding='utf-8')

In [7]:
len(train.data)

1000

In [8]:
X_train = train.data
y_train = train.target

In [9]:
len(X_train)

1000

In [10]:
train.target_names

['neg', 'pos']

In [11]:
np.unique(train.target)

array([0, 1])

In [43]:
test = load_files('dataset/test/',encoding='utf-8')

In [44]:
X_test = test.data
y_test = test.target

### clean data

In [15]:
train.data[2]

'I almost called HBO and demanded my money back for the month just because they\'ve been airing this movie. I can just see the movie execs sitting around going, "Okay, we need to come up with something that\'s just like Home Alone, only we\'ll add a bunch of cash for the kid, hire cut-rate actors, and oh yeah, we\'ll make it a lot less funny!"<br /><br />Okay, maybe not the last part, but that\'s basically what you\'ve got here. Not even worth seeing if someone else rents it. And as a movie for kids? Forget it. I wouldn\'t let my kids see this, not necessarily because of bad-taste jokes, but because I wouldn\'t want them to say, "What were you thinking showing us that lame piece of garbage, Dad?!?!"'

In [16]:
def clean(x):
    #x=re.sub(r'\W',' ',x)
    #x = re.sub(r'[^a-zA-Z]',' ',x)
    
    x = re.sub(r'<.*>', '', x)
    x = re.sub(r'[^a-zA-Z]',' ',x)
    #x=re.sub(r'\s+[a-z]\s+',' ',x)    #remove single char   
    
    x = re.sub(r'\s+',' ',x)          #remove extra space's
    return x.lower()
    
#\W:matches any non-alphanumeric character; 
#this is equivalent to the set [^a-zA-Z0-9_].

In [18]:
clean('I hello  ..!  a 123#hi john <html> </>')

'i hello a hi john '

In [19]:
df = pd.DataFrame(X_train,columns=['review'])
df['target'] = y_train
df.head()

In [21]:
df['review']=df.review.apply(clean)

In [23]:
df.head()

Unnamed: 0,review,target
0,i am a huge john denver fan i have a large col...,1
1,i just read the plot summary and it is the wor...,1
2,i almost called hbo and demanded my money back...,0
3,like his earlier film in a glass cage agust vi...,1
4,there are few films that leave me with the fee...,1


### Convert text into numeric

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
cv = CountVectorizer(min_df=10,max_df=.6,
                     stop_words='english',
                    max_features=1000)

#min_df=10:exclude any word that comes in 10 or less than 10 documents
#max_df=.6:excude any word that comes more than 60% of the documents,

In [31]:
X_new = cv.fit_transform(df.review.values).toarray()

In [33]:
#cv.get_feature_names()

In [34]:
X_new.shape

(1000, 1000)

In [49]:
#cv.get_feature_names()

In [31]:
### use tfidf vectorizor

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [37]:
nb = MultinomialNB()
nb.fit(X_new,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [38]:
log = LogisticRegression()
log.fit(X_new,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [39]:
log.score(X_new,y_train)

0.994

In [40]:
nb.score(X_new,y_train)

0.897

### prediction on testing data

In [45]:
df_test = pd.DataFrame(X_test,columns=['review'])
df_test['target'] = y_test

In [46]:
df_test.head()

Unnamed: 0,review,target
0,"Formulaic slasher film, only this one stars th...",0
1,"Yes, I am a romantic of sorts who likes musica...",1
2,I went to an advance screening of this movie t...,1
3,Four things intrigued me as to this film - fir...,0
4,Our family (and the entire sold out sneak prev...,1


In [47]:
df_test['review'] = df_test.review.apply(clean)

In [48]:
X_test_new = cv.transform(df_test.review.values).toarray()

In [49]:
X_test_new.shape

(200, 1000)

In [50]:
log.score(X_test_new,y_test)

0.795

In [51]:
nb.score(X_test_new,y_test)

0.795

In [52]:
for i in [.001,.01,.1,10,100]:
    log1 = LogisticRegression(C=i)
    print('when C:',i)
    print('train:',log1.fit(X_new,y_train).score(X_new,y_train))
    print('test:',log1.score(X_test_new,y_test))
    print()

when C: 0.001
train: 0.795
test: 0.735

when C: 0.01
train: 0.873
test: 0.785

when C: 0.1
train: 0.946
test: 0.8

when C: 10
train: 0.999
test: 0.74

when C: 100
train: 1.0
test: 0.73



In [53]:
log2 = LogisticRegression(C=.1)
print(log2.fit(X_new,y_train).score(X_new,y_train))
print(log2.score(X_test_new,y_test))

0.946
0.8


In [54]:
test=["I do not like this movie","I would not recommend this movie",
     "I hate this movie","I love this movie"]

In [55]:
f=[]
for i in test:
    s=clean(i)
    f.append(s)

In [56]:
f

['i do not like this movie',
 'i would not recommend this movie',
 'i hate this movie',
 'i love this movie']

In [57]:
t=cv.transform(f).toarray()

In [58]:
t.shape

(4, 1000)

In [59]:
log.predict(t)

array([0, 0, 1, 0])

In [60]:
nb.predict(t)

array([0, 1, 0, 1])

In [61]:
df = pd.read_csv('movie_reviews.csv')

In [62]:
df.head()

Unnamed: 0,review,sentiment
0,This is one of those unfortunate films that su...,1
1,Okay maybe it was because I happen to be in Ya...,1
2,"Although I love this movie, I can barely watch...",1
3,"A man arrives in a strange, beautiful, sterile...",1
4,I'm sitting around going through movie listing...,1


In [63]:
df.shape

(50000, 2)

In [64]:
df.sentiment.value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [65]:
df['review'] = df.review.apply(clean)

In [66]:
from sklearn.model_selection import train_test_split

In [69]:
x_train,x_test,y_train,y_test = train_test_split(df.review.values,
                                                 df.sentiment.values,
                                                test_size=10000,
                                                random_state=10)

In [70]:
x_train.shape

(40000,)

In [84]:
cv = CountVectorizer(min_df=10,max_df=.6,
                     stop_words='english',
                    max_features=20000)


In [85]:
x_new=cv.fit_transform(x_train).toarray()
x_test_new = cv.transform(x_test).toarray()

In [87]:
x_new.shape

(40000, 16814)

In [88]:
x_test_new.shape

(10000, 16814)

In [77]:
log3 = LogisticRegression(C=.1)
log3.fit(x_new,y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [78]:
log3.score(x_new,y_train)

0.909125

In [79]:
log3.score(x_test_new,y_test)

0.8527

In [80]:
f

['i do not like this movie',
 'i would not recommend this movie',
 'i hate this movie',
 'i love this movie']

In [81]:
t=cv.transform(f).toarray()

In [82]:
t.shape

(4, 10000)

In [83]:
log3.predict(t)

array([0, 1, 0, 1], dtype=int64)

### save the model

In [None]:
with open('review_model.pkl','wb') as f:
    pickle.dump(log3,f)

In [None]:
### save the vectorizer
with open('cv.pkl','wb') as f:
    pickle.dump(cv,f)

In [None]:
### load model

In [66]:
with open('review_model.pkl','rb') as f:
    clf=pickle.load(f)

In [67]:
clf.predict(t)

ValueError: X has 23309 features per sample; expecting 16974