# Naive Bayes 

## Classify document into Cinema or Education
- Sample documents are provided; from them we will build the word dictionary used to classify a document as Cinema or Education.

### Import Libraries

In [5]:
import pandas as pd
import numpy as np

## 1. Importing and Preprocessing Data

### Read the file

In [3]:
# Load the labelled training documents (columns: Document, Class) and preview them.
dataSet = pd.read_csv("example_train.csv")
dataSet.head()

Unnamed: 0,Document,Class
0,Upgrad is a great educational institution.,education
1,Educational greatness depends on ethics,education
2,A story of great ethics and educational greatness,education
3,Sholey is a great cinema,cinema
4,good movie depends on good story,cinema


### Map the education and Cinema to 0 and 1

In [4]:
# Encode the text labels as integers: education -> 0, cinema -> 1.
# The numeric-dtype guard keeps this cell safe to re-run: mapping an
# already-encoded column a second time would turn every label into NaN,
# because 0 and 1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(dataSet["Class"]):
    dataSet["Class"] = dataSet["Class"].map({"education": 0, "cinema": 1})
dataSet.head()

Unnamed: 0,Document,Class
0,Upgrad is a great educational institution.,0
1,Educational greatness depends on ethics,0
2,A story of great ethics and educational greatness,0
3,Sholey is a great cinema,1
4,good movie depends on good story,1


### Convert the dataFrame to numpy array

In [7]:
# Materialise the training DataFrame as a 2-D NumPy array
# (one row per document: [document text, class label]).
trainSet_array = dataSet.to_numpy()
trainSet_array

array([['Upgrad is a great educational institution.', 0],
       ['Educational greatness depends on ethics', 0],
       ['A story of great ethics and educational greatness', 0],
       ['Sholey is a great cinema', 1],
       ['good movie depends on good story', 1]], dtype=object)

### Convert the array to X and y

In [8]:
# Column 0 holds the raw documents, column 1 the encoded class labels.
# The labels come out of the object-dtype array as generic objects, so they
# are cast to integers, which is what sklearn needs for y.
X_train, y_train = trainSet_array[:, 0], trainSet_array[:, 1].astype("int")

print(X_train, y_train, sep="\n")

['Upgrad is a great educational institution.'
 'Educational greatness depends on ethics'
 'A story of great ethics and educational greatness'
 'Sholey is a great cinema' 'good movie depends on good story']
[0 0 0 1 1]


## Creating a Bag-of-Words Representation

### Import Libraries for the Bag-of-Words Representation

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Bag-of-words vectorizer with default settings (no stop-word removal yet).
vec_wordBag = CountVectorizer()

In [12]:
# Learn the vocabulary from the training documents and display the
# resulting word -> column-index mapping (fit() returns the vectorizer itself).
vec_wordBag.fit(X_train).vocabulary_

{'upgrad': 15,
 'is': 9,
 'great': 6,
 'educational': 3,
 'institution': 8,
 'greatness': 7,
 'depends': 2,
 'on': 12,
 'ethics': 4,
 'story': 14,
 'of': 11,
 'and': 0,
 'sholey': 13,
 'cinema': 1,
 'good': 5,
 'movie': 10}

`CountVectorizer()` has converted the documents into a set of unique words, alphabetically sorted and indexed.

### Stop Words

We can see a few trivial words such as `'and'`, `'is'`, `'of'`, etc. These words don't really make any difference in classifying a document. They are called stop words, so we would like to get rid of them.

We can remove them by passing the parameter `stop_words='english'` while instantiating `CountVectorizer()`, as follows:

In [16]:
# Rebuild the vectorizer so that the built-in English stop-word list is
# excluded from the vocabulary.
vec_wordBag = CountVectorizer(stop_words="english")

# Show how many stop words are configured, followed by the words themselves.
english_stop_words = vec_wordBag.get_stop_words()
print("Stop Words", len(english_stop_words), english_stop_words)

# Re-learn the vocabulary from the training documents and display it.
vec_wordBag.fit(X_train).vocabulary_

Stop Words 318 frozenset({'mill', 'once', 'their', 'much', 'thus', 'else', 'him', 'one', 'hereby', 'then', 'down', 'nobody', 'for', 'sixty', 'never', 'fire', 'across', 'call', 'because', 'my', 'becoming', 'while', 'themselves', 'whether', 'us', 'as', 'were', 'however', 'latter', 'describe', 'six', 'off', 'whereas', 'beside', 'who', 'give', 'anywhere', 'become', 'same', 'over', 'that', 'thick', 'often', 'anyone', 'ours', 'until', 'amongst', 'any', 'back', 'wherever', 'even', 'found', 'few', 'she', 'sincere', 'thru', 'take', 'interest', 'or', 'someone', 'out', 'amount', 'be', 'ever', 'somewhere', 'done', 'also', 'herself', 'himself', 'whereupon', 'via', 'thereby', 'first', 'full', 'here', 'under', 'to', 'put', 'nevertheless', 'system', 'can', 'co', 'meanwhile', 'there', 'above', 'throughout', 'is', 'eg', 'only', 'among', 'indeed', 'do', 'becomes', 'with', 'itself', 'might', 'nor', 'you', 'thereafter', 'detail', 'of', 'its', 'see', 'herein', 'nothing', 'too', 'since', 'bottom', 'anything'

{'upgrad': 11,
 'great': 5,
 'educational': 2,
 'institution': 7,
 'greatness': 6,
 'depends': 1,
 'ethics': 3,
 'story': 10,
 'sholey': 9,
 'cinema': 0,
 'good': 4,
 'movie': 8}

Notice that the vocabulary has shrunk from 16 words to 12: the stop words `and`, `is`, `of` and `on` have been dropped. Another way of printing the 'vocabulary' is as follows:

#### Length of features and the name of features

In [17]:
# Fetch the learned feature names once, then report their count and values.
feature_names = vec_wordBag.get_feature_names_out()
print("Length of Features", len(feature_names),
      "\n Feature names", feature_names)

Length of Features 12 
 Feature names ['cinema' 'depends' 'educational' 'ethics' 'good' 'great' 'greatness'
 'institution' 'movie' 'sholey' 'story' 'upgrad']


### Transforming the documents into a matrix of word counts
Every document will be converted into a feature vector holding the number of times each vocabulary word occurs in that document.
Let's convert each of our training documents into a feature vector.

In [18]:
# Encode each training document as a row of word counts over the fitted
# vocabulary; the result is a sparse matrix (documents x vocabulary terms).
X_transformed = vec_wordBag.transform(X_train)
X_transformed

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [19]:
# Printing a sparse matrix lists only its non-zero entries,
# one per line, as "(row, column)  count".
print(X_transformed)

  (0, 2)	1
  (0, 5)	1
  (0, 7)	1
  (0, 11)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 6)	1
  (2, 2)	1
  (2, 3)	1
  (2, 5)	1
  (2, 6)	1
  (2, 10)	1
  (3, 0)	1
  (3, 5)	1
  (3, 9)	1
  (4, 1)	1
  (4, 4)	2
  (4, 8)	1
  (4, 10)	1


### Representing the X_transformed array as a DataFrame

In [21]:
# converting matrix to dataframe
pd.DataFrame(X_transformed.toarray(), 
             columns=vec_wordBag.get_feature_names_out())

Unnamed: 0,cinema,depends,educational,ethics,good,great,greatness,institution,movie,sholey,story,upgrad
0,0,0,1,0,0,1,0,1,0,0,0,1
1,0,1,1,1,0,0,1,0,0,0,0,0
2,0,0,1,1,0,1,1,0,0,0,1,0
3,1,0,0,0,0,1,0,0,0,1,0,0
4,0,1,0,0,2,0,0,0,1,0,1,0


## Load the Test Data

In [22]:
# Load the labelled test document(s); same columns as the training file.
dataFrame_test = pd.read_csv("example_test.csv")
dataFrame_test.head()

Unnamed: 0,Document,Class
0,very good educational institution,education


In [23]:
# Encode the text labels as integers: education -> 0, cinema -> 1.
# The numeric-dtype guard keeps this cell safe to re-run: mapping an
# already-encoded column a second time would turn every label into NaN,
# because 0 and 1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(dataFrame_test["Class"]):
    dataFrame_test["Class"] = dataFrame_test["Class"].map({"education": 0, "cinema": 1})
dataFrame_test

Unnamed: 0,Document,Class
0,very good educational institution,0


### Convert the dataFrame to numpy array

In [24]:
# Materialise the test DataFrame as a 2-D NumPy array
# (one row per document: [document text, class label]).
testSet_array = dataFrame_test.to_numpy()
testSet_array

array([['very good educational institution', 0]], dtype=object)

In [25]:
# Column 0 holds the raw test documents, column 1 the encoded class labels.
X_test = testSet_array[:, 0]

# Cast the labels to integers, exactly as is done for y_train; without the
# cast y_test keeps the object dtype of the mixed-type source array.
y_test = testSet_array[:, 1].astype("int")

In [26]:
# Encode the test document(s) with the vocabulary learned from the training
# set (transform only, no re-fitting), giving a sparse count matrix.
X_test_transformed = vec_wordBag.transform(X_test)
X_test_transformed

<1x12 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

#### Let us summarise all we have done till now:

- `vec_wordBag.fit(X_train)` learns the vocabulary of the training data
- `vec_wordBag.transform(X_train)` uses the fitted vocabulary to build a document-term matrix from the training data
- `vec_wordBag.transform(X_test)` uses the fitted vocabulary to build a document-term matrix from the testing data (and ignores tokens it hasn't seen before)

## 2. Building the Model: Multinomial Naive Bayes

In [27]:
# building a multinomial NB model
from sklearn.naive_bayes import MultinomialNB

#### Create the Multinomial Naive Bayes model
- Create the new model
- Fit the model with training set
- Predict probability with test data set

In [34]:
# creating the new model
mnb = MultinomialNB()

# Fitting the model with training set
# note that we are using the sparse matrix X_transformed, 
# though you can also use the non-sparse version
# mnb.fit(X_transformed.toarray(), y_train)
mnb.fit(X_transformed.toarray(), y_train)


# Predicting the probability
proba = mnb.predict_proba(X_test_transformed.toarray())

In [40]:
# probability of each class (test data)
print("probability of test document belonging to class EDUCATION is -> {:0.2f}".format(proba[:, 0][0] * 100))
print("probability of test document belonging to class CINEMA is -> {:0.2f}".format(proba[:, 1][0] * 100))

probability of test document belonging to class EDUCATION is -> 67.19
probability of test document belonging to class CINEMA is -> 32.81


## 3. Building the Model: Bernoulli Naive Bayes

In [43]:
from sklearn.naive_bayes import BernoulliNB

# Instantiate the Bernoulli Naive Bayes classifier with default settings.
# With the default binarize=0.0, word counts are reduced to
# presence/absence before the model is fitted.
bnb = BernoulliNB()

# Fit on the training document-term matrix. The sparse matrix is accepted
# directly; the dense form, bnb.fit(X_transformed.toarray(), y_train),
# also works.
bnb.fit(X_transformed, y_train)

# Predict the class probabilities of the test document(s) once, keep them
# for the next cell, and display them as this cell's output.
prob_bnb = bnb.predict_proba(X_test_transformed)
prob_bnb


array([[0.7673626, 0.2326374]])

In [44]:
# probability of each class (test data)
print("probability of test document belonging to class EDUCATION is -> {:0.2f}".format(prob_bnb[:, 0][0] * 100))
print("probability of test document belonging to class CINEMA is -> {:0.2f}".format(prob_bnb[:, 1][0] * 100))

probability of test document belonging to class EDUCATION is -> 76.74
probability of test document belonging to class CINEMA is -> 23.26


In [1]:
# A bag A contains 3 Red and 4 Green balls and another bag B contains 4 Red and 6 Green balls. 
# One bag is selected at random and a ball is drawn from it.
# If the ball drawn is found Green , find the probability that the bag chosen was A.

# Solution
# P(E1) = P(E2) = 1/2.
# By hypothesis P(G/E1) = 4/7 and  P(G/E2) = 6/10
# P(G) = ((1/2) * (4/7)) + ((1/2) * (6/10))
# By Bayes theorem P(E1/G) = (P(G/E1) * P(E1)) / P(G)

# P(E1/G) = ((4/7) * (1/2)) / (((1/2) * (4/7)) + ((1/2) * (6/10)))
# P(E1/G) = (4/14) / ((4/14) + (6/20))
# P(E1/G) = 20/41

In [None]:
# The bag A  contain 6 Green, 4 Blue ; B contains 4 Green, 6 Blue and C contains 5 Green, 
# 5 Blue balls respectively. 
# A bag is randomly selected  and a ball is drawn from it. 
# If the ball drawn is Green, find the probability that it is drawn from bag A.

# Solution
# P(E1) = P(E2) = P(E3) = 1/3
# By hypothesis P(G/E1) = 6/10, P(G/E2) = 4/10 and P(G/E3) = 5/10
# P(G) = ((1/3) * (6/10)) + ((1/3) * (4/10) + (1/3 * 5/10))
# By Bayes theorem P(E1/G) = (P(G/E1) * P(E1)) / P(G)
# P(E1/G) = ((6/10) * (1/3)) / (((1/3) * (6/10)) + ((1/3) * (4/10)) + ((1/3) * (5/10)))
# P(E1/G) = 6/30 / (6/30 + 4/30 + 5/30)
# P(E1/G) = (6/30) / (15/30)
# P(E1/G) = 6/30 * 30/15
# P(E1/G) = 6/15 = 2/5