In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [3]:
# Load the balloons dataset from the local datasets folder.
csv_path = './datasets/balloons.csv'
data = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows to check the columns and raw values.
data.head(n=5)

Unnamed: 0,color,size,act,age,inflated
0,YELLOW,SMALL,STRETCH,ADULT,T
1,YELLOW,SMALL,STRETCH,ADULT,T
2,YELLOW,SMALL,STRETCH,ADULT,T
3,YELLOW,,STRETCH,ADULT,T
4,YELLOW,SMALL,STRETCH,ADULT,T


In [5]:
# Flag every column that contains at least one missing value.
data.isna().any(axis=0)

color       False
size         True
act          True
age          True
inflated    False
dtype: bool

In [6]:
# Count the missing values in each column.
data.isna().sum(axis=0)

color       0
size        1
act         1
age         1
inflated    0
dtype: int64

In [7]:
# Discard every row that has a missing value in any column (modifies `data` in place).
data.dropna(axis='index', how='any', inplace=True)

In [8]:
# Check how many rows and columns remain after dropping incomplete rows.
data.shape

(97, 5)

In [9]:
# Separate the predictors (every column except 'inflated') from the target column.
X = data.drop(columns=['inflated'])
Y = data['inflated']

In [10]:
# Keep the predictor column names; later cells reuse them to label the encoded arrays.
feature_names = X.columns

In [11]:
# Sanity-check the predictor matrix dimensions (rows, feature columns).
X.shape

(97, 4)

In [12]:
# Encode the target as integers: 'T' -> 0, 'F' -> 1.
# The labels are rebuilt from the source column with an explicit mapping instead of
# mutating a slice of `data` in place, so the cell can be re-run safely. The explicit
# int64 cast avoids depending on `replace` silently downcasting an object column, and
# it fails loudly if a label other than 'T'/'F' is present (unmapped values become NaN).
Y = data['inflated'].map({'T': 0, 'F': 1}).astype('int64')

In [13]:
# Hold out 25% of the rows for testing. The split is stratified on Y and uses a
# fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=456,
    stratify=Y,
)

In [14]:
# Check the size of the training split.
X_train.shape

(72, 4)

In [15]:
from sklearn.preprocessing import OrdinalEncoder

In [16]:
# Encoder (default settings) used to turn the categorical string columns into numeric codes.
encoder = OrdinalEncoder()

In [17]:
# Fit the encoder on the training split only; the test split is transformed with it later.
encoder.fit(X_train)

In [18]:
# Replace the raw training features with their encoded values.
# NOTE(review): X_train is overwritten, so this cell presumably cannot be re-run
# without first re-running the train/test split cell — confirm.
X_train = encoder.transform(X_train)

In [19]:
# Wrap the encoded training values in a DataFrame labelled with the original feature names.
X_train = pd.DataFrame(data=X_train, columns=feature_names)

In [20]:
# Encode the test features with the encoder that was fitted on the training split.
# NOTE(review): X_test is overwritten, so re-running this cell alone presumably fails — confirm.
X_test = encoder.transform(X_test)

In [21]:
# Wrap the encoded test values in a DataFrame labelled with the original feature names.
X_test = pd.DataFrame(data=X_test, columns=feature_names)

In [22]:
# Display the encoded test features to eyeball the numeric codes.
X_test

Unnamed: 0,color,size,act,age
0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0
2,0.0,1.0,1.0,0.0
3,1.0,1.0,0.0,0.0
4,1.0,1.0,0.0,1.0
5,0.0,1.0,1.0,0.0
6,1.0,1.0,0.0,1.0
7,1.0,0.0,0.0,0.0
8,1.0,0.0,1.0,0.0
9,0.0,1.0,0.0,0.0


In [23]:
from sklearn.naive_bayes import CategoricalNB

In [24]:
# Categorical naive Bayes classifier with default settings.
model = CategoricalNB()

In [25]:
# Train the classifier on the encoded training features and their labels.
model.fit(X_train, y_train)

In [26]:
# Predictions on the TRAINING split (despite the generic name); used for the
# training-set confusion matrix and classification report.
y_pred = model.predict(X_train)

In [27]:
# List the distinct target values to confirm the label encoding.
pd.unique(Y)

array([0, 1])

In [31]:
# Confusion matrix for the training split, labelled with the original class names
# (0 was encoded from 'T', 1 from 'F'). Rows are actual classes, columns are predicted.
# labels=[0, 1] pins the class order to the T/F headers and keeps the matrix 2x2
# even if one class is missing from the inputs.
print(pd.DataFrame(confusion_matrix(y_train, y_pred, labels=[0, 1]), columns=['T', 'F'], index=['T', 'F']))

    T   F
T  28   0
F   0  44


In [32]:
# Per-class precision / recall / F1 for the training split.
print(classification_report(y_true=y_train, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        44

    accuracy                           1.00        72
   macro avg       1.00      1.00      1.00        72
weighted avg       1.00      1.00      1.00        72



In [33]:
# Predictions on the held-out test split.
y_test_pred = model.predict(X_test)

In [34]:
# Confusion matrix for the test split, labelled with the original class names
# (0 was encoded from 'T', 1 from 'F'). Rows are actual classes, columns are predicted.
# labels=[0, 1] pins the class order to the T/F headers and keeps the matrix 2x2
# even if one class is missing from the inputs.
print(pd.DataFrame(confusion_matrix(y_test, y_test_pred, labels=[0, 1]), columns=['T', 'F'], index=['T', 'F']))

    T   F
T  10   0
F   0  15


In [35]:
# Per-class precision / recall / F1 for the held-out test split.
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        15

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25



In [36]:
from sklearn.feature_extraction import DictVectorizer

In [37]:
# Vectorizer for dict-style records. sparse=True is the default; it is spelled out
# because the result is densified with .toarray() afterwards.
dv = DictVectorizer(sparse=True)

In [39]:
# DictVectorizer works on an iterable of {feature: value} mappings and has to be
# fitted before it can transform. No fit call is visible in the notebook, so the
# frame is converted to row records and fitted and transformed in one step.
train_records = X_train.to_dict(orient='records')
sparse_matrix = dv.fit_transform(train_records)

# Rebuild a dense DataFrame labelled with the vectorizer's own feature names.
# NOTE(review): the column order follows dv.feature_names_, which may differ from the
# order `model` was trained on — confirm before passing this frame to `model`.
X_train = pd.DataFrame(sparse_matrix.toarray(), columns=dv.feature_names_)
X_train.head()
