In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score,
    recall_score, f1_score, classification_report)

In [2]:
# "None" is not NaN in this scenario
df = pd.read_csv("../data/insurance.csv", na_values=[""], keep_default_na=False)
df.head()

Unnamed: 0.1,Unnamed: 0,GoodStudent,Age,SocioEcon,RiskAversion,VehicleYear,ThisCarDam,RuggedAuto,Accident,MakeModel,...,HomeBase,AntiTheft,PropCost,OtherCarCost,OtherCar,MedCost,Cushioning,Airbag,ILiCost,DrivHist
0,1,False,Adult,Prole,Adventurous,Older,Moderate,EggShell,Mild,Economy,...,City,False,TenThou,Thousand,True,Thousand,Poor,False,Thousand,Many
1,2,False,Senior,Prole,Cautious,Current,,Football,,Economy,...,City,True,Thousand,Thousand,True,Thousand,Good,True,Thousand,Zero
2,3,False,Senior,UpperMiddle,Psychopath,Current,,Football,,FamilySedan,...,City,False,Thousand,Thousand,False,Thousand,Good,True,Thousand,One
3,4,False,Adolescent,Middle,Normal,Older,,EggShell,,Economy,...,Suburb,False,Thousand,Thousand,True,Thousand,Fair,False,Thousand,Zero
4,5,False,Adolescent,Prole,Normal,Older,Moderate,Football,Moderate,Economy,...,City,False,TenThou,Thousand,False,Thousand,Fair,False,Thousand,Many


In [3]:
df = df.drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,GoodStudent,Age,SocioEcon,RiskAversion,VehicleYear,ThisCarDam,RuggedAuto,Accident,MakeModel,DrivQuality,...,HomeBase,AntiTheft,PropCost,OtherCarCost,OtherCar,MedCost,Cushioning,Airbag,ILiCost,DrivHist
0,False,Adult,Prole,Adventurous,Older,Moderate,EggShell,Mild,Economy,Poor,...,City,False,TenThou,Thousand,True,Thousand,Poor,False,Thousand,Many
1,False,Senior,Prole,Cautious,Current,,Football,,Economy,Normal,...,City,True,Thousand,Thousand,True,Thousand,Good,True,Thousand,Zero
2,False,Senior,UpperMiddle,Psychopath,Current,,Football,,FamilySedan,Excellent,...,City,False,Thousand,Thousand,False,Thousand,Good,True,Thousand,One
3,False,Adolescent,Middle,Normal,Older,,EggShell,,Economy,Normal,...,Suburb,False,Thousand,Thousand,True,Thousand,Fair,False,Thousand,Zero
4,False,Adolescent,Prole,Normal,Older,Moderate,Football,Moderate,Economy,Poor,...,City,False,TenThou,Thousand,False,Thousand,Fair,False,Thousand,Many


In [4]:
df.shape

(20000, 27)

In [5]:
y = df.iloc[:, 7].values # .values transforma para array NumPy
X = df.iloc[:, [0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]].values

In [6]:
labelenconder = LabelEncoder()

In [7]:
for i in range(X.shape[1]):
    if X[:, i].dtype == "object":
        X[:, i] = labelenconder.fit_transform(X[:, i])

In [8]:
X

array([[0, 1, 1, ..., 0, 3, 0],
       [0, 2, 1, ..., 1, 3, 2],
       [0, 2, 2, ..., 1, 3, 1],
       ...,
       [0, 2, 2, ..., 1, 3, 2],
       [0, 1, 0, ..., 1, 3, 2],
       [0, 1, 0, ..., 1, 3, 2]], shape=(20000, 26), dtype=object)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
    test_size=0.3, random_state=1)

In [10]:
model = GaussianNB()
model.fit(X_train, y_train)

In [11]:
predicted = model.predict(X_test)

In [12]:
predicted

array(['None', 'None', 'None', ..., 'None', 'Mild', 'None'],
      shape=(6000,), dtype='<U8')

In [13]:
accuracy = accuracy_score(y_test, predicted)
precision = precision_score(y_test, predicted, average=None)
recall = recall_score(y_test, predicted, average="weighted")
f1 = f1_score(y_test, predicted, average="weighted")
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")

Accuracy: 0.8383333333333334, Precision: [0.68739206 0.3510942  0.97525439 1.        ], Recall: 0.8383333333333334, F1: 0.8134258059061705


In [14]:
report = classification_report(y_test, predicted)
print(report)

              precision    recall  f1-score   support

        Mild       0.69      0.73      0.71       542
    Moderate       0.35      0.73      0.47       505
        None       0.98      1.00      0.99      4228
      Severe       1.00      0.06      0.12       725

    accuracy                           0.84      6000
   macro avg       0.75      0.63      0.57      6000
weighted avg       0.90      0.84      0.81      6000

