In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB

In [3]:
# Load the balloons dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../datasets/balloons.csv')

In [4]:
# Preview the raw frame (pandas truncates the display for long frames).
data

Unnamed: 0,color,size,act,age,inflated
0,YELLOW,SMALL,STRETCH,ADULT,T
1,YELLOW,SMALL,STRETCH,ADULT,T
2,YELLOW,SMALL,STRETCH,ADULT,T
3,YELLOW,,STRETCH,ADULT,T
4,YELLOW,SMALL,STRETCH,ADULT,T
...,...,...,...,...,...
95,PURPLE,LARGE,DIP,CHILD,F
96,PURPLE,LARGE,DIP,CHILD,F
97,PURPLE,LARGE,DIP,CHILD,F
98,PURPLE,LARGE,DIP,CHILD,F


In [5]:
# Flag, per column, whether at least one value is missing.
data.isna().any()

color       False
size         True
act          True
age          True
inflated    False
dtype: bool

In [6]:
# Count the missing values in each column.
data.isna().sum()

color       0
size        1
act         1
age         1
inflated    0
dtype: int64

In [7]:
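# Abandoned attempt, kept for reference only: with axis=0, drop() removes *row*
# labels, so these calls would not remove the missing values. Rows containing
# NaN are removed with dropna() further down instead.
# data.drop('size', axis=0, inplace=True)
# data.drop('act', axis=0, inplace=True)
# data.drop('age', axis=0, inplace=True)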

In [8]:
# List the column names available in the frame.
data.columns

Index(['color', 'size', 'act', 'age', 'inflated'], dtype='object')

In [9]:
# Inspect the 'size' column, one of the columns reported as having a missing value.
data['size']

0     SMALL
1     SMALL
2     SMALL
3       NaN
4     SMALL
      ...  
95    LARGE
96    LARGE
97    LARGE
98    LARGE
99    LARGE
Name: size, Length: 100, dtype: object

In [14]:
# Re-check which columns still contain missing values before dropping rows.
data.isna().any()

color       False
size         True
act          True
age          True
inflated    False
dtype: bool

In [15]:
# Re-check the per-column missing-value counts before dropping rows.
data.isna().sum()

color       0
size        1
act         1
age         1
inflated    0
dtype: int64

In [16]:
# Discard every row that has at least one missing value, rebinding the name
# instead of mutating the frame in place.
data = data.dropna(axis='index', how='any')

In [17]:
# Confirm that no column contains missing values after the rows were dropped.
data.isna().any()

color       False
size        False
act         False
age         False
inflated    False
dtype: bool

In [18]:
# (rows, columns) remaining once incomplete rows have been removed.
data.shape

(97, 5)

In [19]:
from sklearn.preprocessing import OrdinalEncoder

In [20]:
# Encoder that turns each string category into a numeric code; it is fitted
# on the training split further down.
encoder = OrdinalEncoder()

In [21]:
# Split the frame into predictors (every column except the label) and the
# 'inflated' label column.
X = data.drop(columns='inflated')
Y = data['inflated']

In [22]:
# Remember the predictor column names; they are used to rebuild DataFrames
# from the encoder's output.
feature_names = X.columns

In [24]:
# Encode the target as integers ('T' -> 0, 'F' -> 1) from the original column.
# map() plus an explicit cast replaces the in-place replace() calls, which
# relied on pandas silently downcasting an object column to integers
# (deprecated since pandas 2.2). Re-deriving from `data` keeps the cell
# re-runnable; any unexpected label becomes NaN and makes the cast fail loudly.
Y = data['inflated'].map({'T': 0, 'F': 1}).astype('int64')

In [25]:
# Hold out 25% of the rows for testing; the split is stratified on the label
# and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=123,
    stratify=Y,
)

In [26]:
# Learn the category-to-code mapping from the training split only.
encoder.fit(X_train)

OrdinalEncoder()

In [27]:
# Replace the training features with their encoded values.
# NOTE(review): this overwrites its own input, so re-running the cell would
# feed already-encoded values back into the encoder.
X_train = encoder.transform(X_train)

In [28]:
# Wrap the encoded training values in a DataFrame with the original column names.
# NOTE(review): no index is passed, so the original row labels are not preserved.
X_train = pd.DataFrame(X_train, columns=feature_names)

In [29]:
# Encode the test features with the mapping fitted on the training split.
# NOTE(review): this overwrites its own input, so re-running the cell would
# feed already-encoded values back into the encoder.
X_test = encoder.transform(X_test)

In [30]:
# Wrap the encoded test values in a DataFrame with the original column names.
# NOTE(review): no index is passed, so the original row labels are not preserved.
X_test = pd.DataFrame(X_test, columns=feature_names)

In [31]:
# Inspect the encoded training features.
X_train

Unnamed: 0,color,size,act,age
0,1.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,2.0
3,1.0,1.0,0.0,2.0
4,0.0,1.0,1.0,2.0
...,...,...,...,...
67,0.0,0.0,1.0,0.0
68,0.0,1.0,1.0,0.0
69,1.0,0.0,0.0,0.0
70,0.0,1.0,1.0,0.0


In [32]:
# Categorical naive Bayes classifier with default settings.
model = CategoricalNB()

In [33]:
# Train the classifier on the encoded training features and integer labels.
model.fit(X_train, y_train)

CategoricalNB()

In [34]:
from sklearn.metrics import confusion_matrix, classification_report

In [37]:
def report(y, y_pred, labels=(0, 1), label_names=('T', 'F')):
    """Print a classification report followed by a labelled confusion matrix.

    Parameters
    ----------
    y : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y``.
    labels : sequence, default (0, 1)
        Class values, in the order they should appear in the confusion
        matrix. The default matches this notebook's encoding ('T' -> 0,
        'F' -> 1).
    label_names : sequence of str, default ('T', 'F')
        Display names for ``labels``, used for both rows and columns.

    Returns
    -------
    None
        Everything is written to stdout.

    Raises
    ------
    ValueError
        Propagated from scikit-learn, e.g. when none of ``labels`` occurs
        in ``y``.
    """
    print(classification_report(y, y_pred))
    print('-'*100)
    # Pin the label set explicitly: without it the matrix is sized by whichever
    # classes happen to be present, so a single-class input would not fit the
    # fixed row/column names and other encodings would be silently mislabelled.
    matrix = confusion_matrix(y, y_pred, labels=list(labels))
    # Rows are the true classes, columns the predicted classes.
    names = list(label_names)
    print(pd.DataFrame(matrix, columns=names, index=names))

In [38]:
# Evaluate on the training split (the data the model was fitted on).
report(y_train, model.predict(X_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        44

    accuracy                           1.00        72
   macro avg       1.00      1.00      1.00        72
weighted avg       1.00      1.00      1.00        72

----------------------------------------------------------------------------------------------------
    T   F
T  28   0
F   0  44


In [39]:
# Evaluate on the held-out test split.
report(y_test, model.predict(X_test))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95        10
           1       1.00      0.93      0.97        15

    accuracy                           0.96        25
   macro avg       0.95      0.97      0.96        25
weighted avg       0.96      0.96      0.96        25

----------------------------------------------------------------------------------------------------
    T   F
T  10   0
F   1  14
