### NLTK

In [4]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [5]:
# Build the tokenizer, stemmer and stop-word set once so every review reuses the same objects.
tokenizer=RegexpTokenizer(r'\w+') # r'...' is a raw string literal (not "regular expression"); the pattern \w+ matches each run of word characters
ps=PorterStemmer()
# NOTE: stopwords.words('english') needs the NLTK 'stopwords' corpus to be available locally (nltk.download('stopwords')).
# Stored as a set so the "token not in en_stopwords" test in getStemmedReview is a hash lookup.
en_stopwords=set(stopwords.words('english'))

In [6]:
def getStemmedReview(review):
    """Turn one raw review into a space-separated string of stemmed tokens.

    Pipeline: lowercase the text, replace HTML line-break tags with a space,
    split into word tokens with the notebook-level ``tokenizer``, drop tokens
    present in ``en_stopwords``, stem the rest with ``ps`` and join them with
    single spaces.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review; an empty string if no token survives.
    """
    # Function-scope import so this cell stays runnable without editing the import cell.
    import re

    review = review.lower()
    # Strip every <br> variant (<br>, <br/>, <br />), not only the exact
    # "<br /><br />" pair: a lone tag would otherwise be tokenized into a
    # spurious "br" token. The text is already lowercase, so <BR> is covered too.
    review = re.sub(r"<br\s*/?>", " ", review)

    # Tokenize, filter stop words, then stem what is left.
    tokens = tokenizer.tokenize(review)
    kept_tokens = [token for token in tokens if token not in en_stopwords]
    stemmed_tokens = [ps.stem(token) for token in kept_tokens]

    return ' '.join(stemmed_tokens)

In [7]:
# Example raw review with "<br /><br />" markup, mixed case and punctuation, used to try out getStemmedReview.
sample_text = """I loved this movie <br /><br /> since I was 7 and I saw it on the opening day. It was so touching and beautiful. I strongly recommend seeing for all. It's a movie to watch with your family by far.<br /><br />My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language."""


In [8]:
# Run the cleaning pipeline on the sample review; the returned string is displayed as the cell output.
getStemmedReview(sample_text)

'love movi sinc 7 saw open day touch beauti strongli recommend see movi watch famili far mpaa rate pg 13 themat element prolong scene disastor nuditi sexual languag'

In [9]:
def getStemmedDocument(inputFile,outputFile):
    """Clean every line of ``inputFile`` and write the results to ``outputFile``.

    Each input line is treated as one review, passed through
    ``getStemmedReview`` and written as one line of the output file.

    Parameters
    ----------
    inputFile : str
        Path of the text file to read, one review per line.
    outputFile : str
        Path of the file to write; it is opened in 'w' mode, so existing
        content is overwritten.

    Returns
    -------
    None
    """
    # One `with` owns both handles, so the output file is closed even if
    # cleaning a line raises. The input is opened first so that a missing or
    # unreadable input does not truncate an existing output file.
    with open(inputFile) as f, open(outputFile,'w') as out:
        # Iterate the file lazily instead of loading every line with readlines().
        for review in f:
            print(getStemmedReview(review),file=out)

In [10]:
# Clean every line of 'imdb_trainX.txt' and write the result to 'cleaned.txt' (paths relative to the working directory; 'cleaned.txt' is overwritten).
getStemmedDocument('imdb_trainX.txt','cleaned.txt')

In [11]:
# Alternative when running this as a script: take the input and output paths from the command line (needs `import sys`)
#inputfile=sys.argv[1]
#outputfile=sys.argv[2]


### Multinomial Naive Bayes Classifier 

In [12]:
# Toy training corpus: x holds the raw review sentences, y the class label of each sentence in the same order.
x = ["This was awesome an awesome movie",
     "Great movie! I liked it a lot",
     "Happy Ending! awesome acting by the hero",
     "loved it! truly great",
     "bad not upto the mark",
     "could have been better",
     "Surely a Disappointing movie"]

y = [1,1,1,1,0,0,0] # 1 - Positive, 0 - Negative Class

In [13]:
# Unlabelled sentences used to try out the trained classifiers.
x_test = ["I was happy & happy and I loved the acting in the movie",
          "The movie I saw was bad"]


In [14]:
# Run the same cleaning pipeline over the training and the test sentences,
# so both are in the form the vectoriser will be fitted on.
x_clean = list(map(getStemmedReview, x))
x_test_clean = list(map(getStemmedReview, x_test))

In [15]:
# Inspect the cleaned training sentences (lowercased, stop words removed, stemmed).
print(x_clean)

['awesom awesom movi', 'great movi like lot', 'happi end awesom act hero', 'love truli great', 'bad upto mark', 'could better', 'sure disappoint movi']


In [16]:
#Vectorisation

from sklearn.feature_extraction.text import CountVectorizer


In [17]:
# Bag-of-words vectoriser with default settings; it has no vocabulary until it is fitted.
cv=CountVectorizer()

In [18]:
# Learn the vocabulary from the cleaned training sentences and build the dense document-term count matrix.
x_vec=cv.fit_transform(x_clean).toarray()
print(x_vec)
print(x_vec.shape)
# Each row is one sentence and each column one vocabulary word; an entry is the number of times
# that word occurs in that sentence (e.g. 'awesom' occurs twice in the first sentence, hence the 2).

[[0 2 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0]
 [1 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
 [0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0]]
(7, 18)


In [19]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns an ndarray, so
# .tolist() is used to keep printing a plain Python list of words.
print(cv.get_feature_names_out().tolist())

['act', 'awesom', 'bad', 'better', 'could', 'disappoint', 'end', 'great', 'happi', 'hero', 'like', 'lot', 'love', 'mark', 'movi', 'sure', 'truli', 'upto']


In [20]:
# transform (not fit_transform): reuse the vocabulary learned from the training
# sentences, so test columns line up with the training columns and words not
# seen in training are ignored.
x_test_vec=cv.transform(x_test_clean).toarray()
print(x_test_vec)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it, and .tolist() keeps the printed form a plain list.
print(cv.get_feature_names_out().tolist())
print(x_test_vec.shape)

[[1 0 0 0 0 0 0 0 2 0 0 0 1 0 1 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]
['act', 'awesom', 'bad', 'better', 'could', 'disappoint', 'end', 'great', 'happi', 'hero', 'like', 'lot', 'love', 'mark', 'movi', 'sure', 'truli', 'upto']
(2, 18)


In [21]:
#training and testing using sklearn 

In [28]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

In [23]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
mnb=MultinomialNB()

In [25]:
# Training: fit the classifier on the word-count vectors and their labels.
mnb.fit(x_vec,y)

MultinomialNB()

In [26]:
# Prediction: most probable class label for each test sentence.
mnb.predict(x_test_vec)

array([1, 0])

In [27]:
# Posterior probability of each class for each test sentence: one row per sentence, one column per class.
mnb.predict_proba(x_test_vec)

array([[0.09332629, 0.90667371],
       [0.61699717, 0.38300283]])

### Multivariate Bernoulli Event Model

In [30]:
# binarize=0.0: every count greater than 0 is mapped to 1, so the model only uses whether a word is present, not how often.
bnb=BernoulliNB(binarize=0.0)

In [32]:
# Show the estimator's repr.
print(bnb)

BernoulliNB()


In [33]:
# Fit the Bernoulli model on the same training vectors and labels used for the multinomial model.
bnb.fit(x_vec,y)

BernoulliNB()

In [34]:
# Most probable class label for each test sentence under the Bernoulli model.
bnb.predict(x_test_vec)

array([1, 0])

In [35]:
# Posterior probability of each class for each test sentence: one row per sentence, one column per class.
bnb.predict_proba(x_test_vec)

array([[0.07647628, 0.92352372],
       [0.68830318, 0.31169682]])

In [36]:
# Mean accuracy on the training data itself (the same x_vec / y the model was fitted on),
# so this value is not an estimate of performance on unseen reviews.
bnb.score(x_vec,y)

1.0