# Create an NLP Pipeline to Clean Reviews Data
- Load input files and read reviews
- Tokenize
- Remove stopwords
- Stem the remaining tokens
- Write cleaned data to output file


In [5]:
# Sample raw movie review (note the embedded HTML <br /> tags) used to try out the cleaning function
sample_txt = '''I loved this movie since I was 7 and I saw it on the opening day. It was so touching and beautiful. I strongly recommend seeing for all. It's a movie to watch with your family by far.<br /><br />My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language.'''

# NLTK

In [6]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [7]:
# Shared NLP helpers, created once and reused by getCleanedReview
ps = PorterStemmer()                             # Porter stemmer for reducing words to stems
en_stopwords = set(stopwords.words('english'))   # set -> fast membership tests while filtering
tokenizer = RegexpTokenizer(r'\w+')              # keeps runs of word characters, drops punctuation

In [24]:
 def getCleanedReview(review):
        """Normalise a raw review into a space-separated string of stemmed tokens.

        Steps: lower-case the text, replace HTML line-break tags with spaces,
        tokenize with the module-level ``tokenizer``, drop tokens present in
        ``en_stopwords`` and stem the remaining tokens with ``ps``.

        Parameters
        ----------
        review : str
            Raw review text, possibly containing ``<br>`` / ``<br />`` tags.

        Returns
        -------
        str
            The stemmed, stopword-free tokens joined by single spaces.
        """
        import re  # local import keeps this cell self-contained

        review = review.lower()
        # Remove every line-break tag variant (<br>, <br/>, <br />, single or doubled).
        # Matching only the literal "<br /><br />" let a lone tag leak a "br" token.
        review = re.sub(r"<br\s*/?>", " ", review)

        # Tokenize, filter stopwords and stem in one pass
        stemmed_tokens = [
            ps.stem(token)
            for token in tokenizer.tokenize(review)
            if token not in en_stopwords
        ]

        return ' '.join(stemmed_tokens)

In [25]:
# Run the sample review through the cleaning function
cleaned_review = getCleanedReview(sample_txt)

In [26]:
# Inspect the cleaned text: lower-cased, stopwords removed, tokens stemmed
print(cleaned_review)

love movi sinc 7 saw open day touch beauti strongli recommend see movi watch famili far mpaa rate pg 13 themat element prolong scene disastor nuditi sexual languag


## Multinomial Event Model 

In [27]:
# Toy training corpus: one short review per entry
x = [
    "This was awesome an awesome movie",
    "Great movie! I liked it a lot",
    "Happy Ending! awesome acting by the hero",
    "loved it! truly great",
    "bad not upto the mark",
    "could have been better",
    "Surely a Disappointing movie",
]

# Sentiment label for the review at the same index: 1 - Positive, 0 - Negative Class
y = [1, 1, 1, 1, 0, 0, 0]

In [28]:
# Unlabelled reviews used to try out the trained classifiers
x_test = [
    "I was happy & happy and I loved the acting in the movie",
    "The movie I saw was bad",
]

# 1. Cleaning

In [31]:
# Apply the same cleaning function to every training and test review
x_clean = list(map(getCleanedReview, x))
xt_clean = list(map(getCleanedReview, x_test))

In [32]:
# Show the cleaned training reviews, then the cleaned test reviews, one list per line
print(x_clean, xt_clean, sep="\n")

['awesom awesom movi', 'great movi like lot', 'happi end awesom act hero', 'love truli great', 'bad upto mark', 'could better', 'sure disappoint movi']
['happi happi love act movi', 'movi saw bad']


# 2. Vectorization

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
# Count unigrams and bigrams in the cleaned training reviews
cv = CountVectorizer(ngram_range=(1, 2))
x_vec = cv.fit_transform(x_clean).toarray()  # dense array so the counts can be printed

# Show the count matrix followed by its (n_reviews, n_features) shape
print(x_vec, x_vec.shape, sep="\n")

[[0 0 2 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0]
 [0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0]]
(7, 34)


In [36]:
# List the learned vocabulary (unigrams and bigrams) in column order.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() when the installed version provides it.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()  # ndarray -> plain list of str
else:
    feature_names = cv.get_feature_names()
print(feature_names)

['act', 'act hero', 'awesom', 'awesom act', 'awesom awesom', 'awesom movi', 'bad', 'bad upto', 'better', 'could', 'could better', 'disappoint', 'disappoint movi', 'end', 'end awesom', 'great', 'great movi', 'happi', 'happi end', 'hero', 'like', 'like lot', 'lot', 'love', 'love truli', 'mark', 'movi', 'movi like', 'sure', 'sure disappoint', 'truli', 'truli great', 'upto', 'upto mark']


In [37]:
## Vectorization on the test set
# Use transform (not fit_transform) so the test reviews are encoded with the
# vocabulary already learned from the training reviews.
xt_vec = cv.transform(xt_clean).toarray()
print(xt_vec)
print(xt_vec.shape)

[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]]
(2, 34)


## 3. Multinomial Naive Bayes

In [38]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB, GaussianNB

In [39]:
# Multinomial Naive Bayes with default hyperparameters; printing shows the settings in use
mnb = MultinomialNB()
print(mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


In [40]:
# Training: fit on the n-gram count matrix and the sentiment labels
mnb.fit(x_vec,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [41]:
# Predictions: class label (1 = positive, 0 = negative) for each test review
mnb.predict(xt_vec)

array([1, 0])

In [42]:
# Posterior probability of each class for every test review
# (one row per review; columns follow mnb.classes_, i.e. class 0 then class 1)
mnb.predict_proba(xt_vec)

array([[0.09580319, 0.90419681],
       [0.61972801, 0.38027199]])

In [43]:
# Mean accuracy on the training data itself (x_vec, y) - not a held-out estimate
mnb.score(x_vec,y)

1.0

## 4. Multivariate Bernoulli Event Model Naive Bayes

In [45]:
# Bernoulli NB works on binary features; binarize=0.0 is the threshold used to
# turn the count features into booleans (any count above 0 counts as present)
bnb = BernoulliNB(binarize=0.0)

In [46]:
# Show the hyperparameters the Bernoulli model was configured with
print(bnb)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


In [47]:

# Train on the same count matrix and labels that the multinomial model used
bnb.fit(x_vec,y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [48]:
# Posterior probability of each class for every test review
# (one row per review; columns follow bnb.classes_, i.e. class 0 then class 1)
bnb.predict_proba(xt_vec)

array([[0.10638608, 0.89361392],
       [0.76046221, 0.23953779]])

In [51]:
# Predicted class label (1 = positive, 0 = negative) for each test review
bnb.predict(xt_vec)

array([1, 0])

In [52]:
# Mean accuracy on the training data itself (x_vec, y) - not a held-out estimate
bnb.score(x_vec,y)

1.0