In [1]:
# data cleaning
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# modeling 
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# NLP
from sklearn.feature_extraction.text import CountVectorizer

# 1. Gaussian Naive Bayes
- For continuous features; each feature is assumed to be normally distributed within each class.

**Load data**

In [2]:
from sklearn.datasets import fetch_openml

In [3]:
# Fetch OpenML dataset 1220, then build a float feature frame and an
# integer target series from the returned bunch.
data = fetch_openml(data_id=1220)
x = (
    pd.DataFrame(data.data, columns=data.feature_names)
    .astype(float)
)
y = pd.Series(data.target).astype(int)

In [4]:
# Preview the first rows of the feature frame.
x.head()

Unnamed: 0,impression,ad_id,advertiser_id,depth,position,keyword_id,title_id,description_id,user_id
0,1.0,8343295.0,11700.0,3.0,3.0,21264.0,27892.0,1559.0,0.0
1,1.0,20017077.0,23798.0,1.0,1.0,35498.0,4.0,36476.0,562934.0
2,1.0,21348354.0,36654.0,1.0,1.0,19975.0,36105.0,33292.0,11621116.0
3,1.0,20366086.0,33280.0,3.0,3.0,5942.0,4057.0,4390.0,8778348.0
4,1.0,6803526.0,10790.0,2.0,1.0,60593.0,25242.0,1679.0,12118311.0


In [5]:
# Preview the first rows of the target series.
y.head()

0    0
1    1
2    0
3    0
4    0
Name: click, dtype: int32

In [6]:
# Class balance check: the output shows the classes are imbalanced
# (33220 rows of class 0 vs 6728 rows of class 1).
y.value_counts()

0    33220
1     6728
Name: click, dtype: int64

In [7]:
# Remove the user_id column before modelling (presumably an opaque identifier
# rather than a predictive feature — confirm against the dataset description).
# Rebinding instead of inplace=True, together with errors='ignore', keeps the
# cell idempotent: re-running it no longer raises KeyError once the column
# has already been dropped.
x = x.drop(columns='user_id', errors='ignore')

**Gaussian Naive Bayes Model training**

In [8]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [9]:
# Gaussian Naive Bayes classifier with default settings.
model = GaussianNB()

In [10]:
# Fit the classifier on the training split.
model.fit(X=x_train, y=y_train)

GaussianNB()

In [11]:
# Mean accuracy on the training split.
model.score(X=x_train, y=y_train)

0.8083174793898735

In [12]:
# Mean accuracy on the held-out split.
# NOTE(review): class 0 makes up about 83% of the full dataset
# (33220 of 39948 rows), so read this accuracy against that baseline.
model.score(X=x_test, y=y_test)

0.807349554420747

In [13]:
# Hard class predictions for the held-out rows.
y_pred = model.predict(X=x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
# Predicted class probabilities for the held-out rows.
y_prob = model.predict_proba(X=x_test)

In [15]:
# Display the probabilities: one row per test sample, one column per class.
y_prob

array([[0.89295579, 0.10704421],
       [0.87012604, 0.12987396],
       [0.89094327, 0.10905673],
       ...,
       [0.88074645, 0.11925355],
       [0.8928076 , 0.1071924 ],
       [0.80285587, 0.19714413]])

# 2. Bernoulli Naive Bayes
- For binary (0/1) features, assumed to be conditionally independent given the class.

**Load data**

In [16]:
# Read the play-golf spreadsheet, using its first column as the row index.
df = pd.read_excel(
    'playgolf_data.xlsx',
    index_col=0,
)
df.head()

Unnamed: 0_level_0,Outlook,Temperature,Humidity,Wind,Play Golf
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Sunny,Hot,High,Weak,No
2,Sunny,Hot,High,Strong,No
3,Overcast,Hot,High,Weak,Yes
4,Rain,Mild,High,Weak,Yes
5,Rain,Cool,Normal,Weak,Yes


In [17]:
# (rows, columns) of the play-golf frame.
df.shape

(14, 5)

In [18]:
# Separate the label column from the predictor columns.
# target stays a one-column DataFrame; features holds every other column.
target = df[['Play Golf']]
features = df.drop(columns='Play Golf')

In [19]:
# Display the label column.
target

Unnamed: 0_level_0,Play Golf
Day,Unnamed: 1_level_1
1,No
2,No
3,Yes
4,Yes
5,Yes
6,No
7,Yes
8,No
9,Yes
10,Yes


In [20]:
# One-hot encode the categorical predictors so that every feature is a
# binary indicator column.
features = pd.get_dummies(data=features)
features

Unnamed: 0_level_0,Outlook_Overcast,Outlook_Rain,Outlook_Sunny,Temperature_Cool,Temperature_Hot,Temperature_Mild,Humidity_High,Humidity_Normal,Wind_Strong,Wind_Weak
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,0,1,0,1,0,1,0,0,1
2,0,0,1,0,1,0,1,0,1,0
3,1,0,0,0,1,0,1,0,0,1
4,0,1,0,0,0,1,1,0,0,1
5,0,1,0,1,0,0,0,1,0,1
6,0,1,0,1,0,0,0,1,1,0
7,1,0,0,1,0,0,0,1,1,0
8,0,0,1,0,0,1,1,0,0,1
9,0,0,1,1,0,0,0,1,0,1
10,0,1,0,0,0,1,0,1,0,1


In [21]:
from sklearn.utils.validation import column_or_1d

In [22]:
# Flatten the one-column target frame into a 1-D array of labels.
column_or_1d(target)

array(['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes',
       'Yes', 'Yes', 'Yes', 'No'], dtype=object)

In [23]:
# Encode the string labels as integers; per the outputs shown in this
# notebook, 'No' becomes 0 and 'Yes' becomes 1.
le = LabelEncoder()
target = le.fit_transform(y=column_or_1d(target))

In [24]:
# Display the integer-encoded labels.
target

array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0])

**Bernoulli Naive Bayes model training**

In [25]:
# Bernoulli Naive Bayes on the one-hot encoded golf features.
model1 = BernoulliNB()
model1.fit(X=features, y=target)

BernoulliNB()

In [26]:
# Accuracy on the same rows the model was fitted on, so this is training
# accuracy and not an estimate of performance on unseen data.
model1.score(X=features, y=target)

0.9285714285714286

# 3. Multinomial Naive Bayes
- For count/frequency features (e.g. word counts in document classification).

**Load data**

In [27]:
# Read the spam dataset, decoding the file as latin-1.
# This rebinds df, which previously held the play-golf data.
df = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [28]:
# (rows, columns) of the spam frame.
df.shape

(5572, 5)

In [29]:
# Column v2 holds the message text; it becomes the model input.
x = df.loc[:, 'v2']
x.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object

In [30]:
# Column v1 holds the ham/spam label; it becomes the target.
y = df.loc[:, 'v1']
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: v1, dtype: object

**Data preprocess**

*Process y*

In [31]:
# Encode the labels as integers: ham -> 0, spam -> 1.
# Mapping from df['v1'] instead of from y itself keeps the cell idempotent;
# re-mapping an already-encoded y would turn every value into NaN.
y = df['v1'].map({'ham': 0, 'spam': 1})
# Fail fast if a label outside the mapping is present (map() yields NaN for it).
assert y.notna().all(), 'unexpected label in v1'

In [32]:
# Preview the encoded labels.
y.head()

0    0
1    0
2    1
3    0
4    0
Name: v1, dtype: int64

In [33]:
# Show the encoded labels as a NumPy array (display only; y is unchanged).
y.to_numpy()

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

*Process x*

In [34]:
# Bag-of-words vectorizer with default settings.
count = CountVectorizer()

In [35]:
# Learn the vocabulary from the message texts.
# NOTE(review): this fits on every message, including those later held out
# for testing; fitting on the training split only would avoid that leakage.
count.fit(raw_documents=x)

CountVectorizer()

In [36]:
# Turn each message into a row of token counts.
bag_of_words = count.transform(raw_documents=x)

In [37]:
# Display the sparse document-term matrix summary.
bag_of_words

<5572x8672 sparse matrix of type '<class 'numpy.int64'>'
	with 73916 stored elements in Compressed Sparse Row format>

In [38]:
# Keep the document-term matrix sparse instead of calling .toarray():
# a dense 5572 x 8672 int64 array needs roughly 390 MB, while only 73,916
# entries are non-zero. train_test_split and MultinomialNB both accept
# scipy sparse input, so the cells that consume x work unchanged.
x = bag_of_words

In [39]:
# (documents, vocabulary size) of the feature matrix.
x.shape

(5572, 8672)

In [40]:
# Number of labels; it should match the number of rows in x.
y.shape

(5572,)

**MultinomialNB model training**

In [41]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Multinomial Naive Bayes classifier with default settings.
model2 = MultinomialNB()

In [43]:
# Fit the classifier on the training split of the token counts.
model2.fit(X=x_train, y=y_train)

MultinomialNB()

In [44]:
# Mean accuracy on the training split.
model2.score(X=x_train, y=y_train)

0.9943908458604442

In [45]:
# Mean accuracy on the held-out split.
model2.score(X=x_test, y=y_test)

0.97847533632287