### 1. Read dataset 

In [1]:
import pandas as pd

# Load the spam/ham messages from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="spam.csv")
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### 2. Separate the dependent and independent features

In [2]:
# Separate the independent feature (X: the message text)
# from the dependent feature (y: the ham/spam category).
X, y = df["Message"], df["Category"]

### 3. Split the data into training and testing set 

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Sanity check: the training features and labels must have the same length.
(X_train.shape, y_train.shape)

((4457,), (4457,))

### 4. Convert the text/sentences to numerical values. How?
- TF-IDF
- Count Vectorizer
- Word Embeddings 

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Let's use CountVectorizer with binary=True so each feature only records
# whether a word is present in a message, which is what BernoulliNB models.
# The vocabulary is learned from the training messages only; the test
# messages are merely transformed with that fitted vocabulary.
vectorizer = CountVectorizer(binary=True).fit(X_train)
X_train_bow = vectorizer.transform(X_train)
X_test_bow = vectorizer.transform(X_test)
X_train_bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59275 stored elements and shape (4457, 7701)>

### 5. Train the model (BernoulliNB)

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli Naive Bayes classifier on the binary bag-of-words features.
# fit() returns the estimator itself, so it can be chained onto the constructor.
bnb = BernoulliNB().fit(X_train_bow, y_train)
bnb


0,1,2
,alpha,1.0
,force_alpha,True
,binarize,0.0
,fit_prior,True
,class_prior,


### 6. Evaluate the model

In [10]:
# Predict a ham/spam label for every message in the held-out test set.
y_pred = bnb.predict(X=X_test_bow)
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'],
      shape=(1115,), dtype='<U4')

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

# Compare the true test labels with the BernoulliNB predictions.
# cm  : confusion matrix (rows = actual class, columns = predicted class)
# clf : text report with per-class precision, recall, f1-score and support
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
clf = classification_report(y_true=y_test, y_pred=y_pred)
print("CM:", cm)
print("CR:", clf)


CM: [[966   0]
 [ 22 127]]
CR:               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [12]:
# Classify a brand-new sentence: it has to go through the same fitted
# vectorizer (wrapped in a list, since transform expects a collection of
# documents) before the model can score it.
random_sentence = "I feel very happy and excited today"
random_sentence_bow = vectorizer.transform([random_sentence])
bnb.predict(random_sentence_bow)

array(['ham'], dtype='<U4')

**If we use TF-IDF to convert the words into numerical format, which Naive Bayes algorithm is recommended?** <br>
- _Gaussian_
- _Multinomial_
- _Bernoulli_

_Okay cool!!!! Let's have a try then._ 🌟

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# X_train, X_test, y_train and y_test are already defined above, so we only
# need a TF-IDF vectorizer: learn the vocabulary and weights from the
# training messages, then apply the same fitted transform to the test messages.
tfidf = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)
X_train_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 59275 stored elements and shape (4457, 7701)>

In [15]:
from sklearn.naive_bayes import MultinomialNB

# Fit a Multinomial Naive Bayes classifier on the TF-IDF features.
# fit() returns the estimator itself, so it can be chained onto the constructor.
model = MultinomialNB().fit(X_train_tfidf, y_train)
model


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [17]:
# Predict a ham/spam label for every TF-IDF-encoded test message.
# NOTE: this reuses (and overwrites) the y_pred name from the BernoulliNB section.
y_pred = model.predict(X=X_test_tfidf)
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'],
      shape=(1115,), dtype='<U4')

In [20]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the MultinomialNB model on the test set.
# cm  : confusion matrix (rows = actual class, columns = predicted class)
# clf : text report with per-class precision, recall, f1-score and support
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
clf = classification_report(y_true=y_test, y_pred=y_pred)
print("CM:", cm)
print("CR:", clf)




CM: [[966   0]
 [ 39 110]]
CR:               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.74      0.85       149

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.97      0.97      0.96      1115



In [18]:
# Classify a brand-new message with the TF-IDF + MultinomialNB pipeline:
# encode it with the fitted TF-IDF vectorizer first, then predict.
new_message = "Win a free ticket to the concert now"
new_message_tfidf = tfidf.transform([new_message])
model.predict(new_message_tfidf)

array(['ham'], dtype='<U4')

**Hooray! We just completed the workshop on Naive Bayes and NLP** 🎉🎉🎉