In [6]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler

In [9]:
#Read the data in and remove any missing values
#Remove any row with a missing value in any of the columns.
filename='./MuhammadOwais_Imran_Dataset_BP.csv'
nan_values=[]
att_type={'BP_Status':'category'}

df=pd.read_csv(filename,dtype=att_type,na_values=nan_values)

print(df[df.isna().any(axis=1)])

print(df.shape)
print(df.isnull().sum())
print()

print(f"shape : {df.shape}")
print(df.isnull().sum())
# df.drop('Gender', axis=1, inplace=True)
# print(df.describe())
# df.head()
print(df.head())

Empty DataFrame
Columns: [Somker, Gender, Age, Height, Weight, BP_Status]
Index: []
(4403, 6)
Somker       0
Gender       0
Age          0
Height       0
Weight       0
BP_Status    0
dtype: int64

shape : (4403, 6)
Somker       0
Gender       0
Age          0
Height       0
Weight       0
BP_Status    0
dtype: int64
          Somker  Gender  Age  Height  Weight BP_Status
0  1- Non-smoker  Female   29   62.50     140    Normal
1  1- Non-smoker  Female   41   59.75     194      High
2    3- Moderate  Female   57   62.25     132      High
3  1- Non-smoker  Female   39   65.75     158    Normal
4  1- Non-smoker  Female   58   61.75     131      High


In [10]:
# Data Transformation

# mapping smoker attribute
map_dict_smoker = {
    "1- Non-smoker": "1",
    "2- Light": "2",
    "3- Moderate": "3",
    "4- Heavy": "4",
    "5- Very Heavy": "5"
}

map_dict_gender = {
    'Male': 0,
    'Female': 1
}
df['Somker'] = df['Somker'].map(map_dict_smoker)
df['Gender'] = df['Gender'].map(map_dict_gender)

scaler = MinMaxScaler()
colToScale =  ["Age", "Weight", "Height"]
df[colToScale] = scaler.fit_transform(df[colToScale])

df.head()

Unnamed: 0,Somker,Gender,Age,Height,Weight,BP_Status
0,1,1,0.0,0.44,0.313305,Normal
1,1,1,0.363636,0.33,0.545064,High
2,3,1,0.848485,0.43,0.27897,High
3,1,1,0.30303,0.57,0.390558,Normal
4,1,1,0.878788,0.41,0.274678,High


In [11]:
print(df.dtypes)
df.head()

Somker         object
Gender          int64
Age           float64
Height        float64
Weight        float64
BP_Status    category
dtype: object


Unnamed: 0,Somker,Gender,Age,Height,Weight,BP_Status
0,1,1,0.0,0.44,0.313305,Normal
1,1,1,0.363636,0.33,0.545064,High
2,3,1,0.848485,0.43,0.27897,High
3,1,1,0.30303,0.57,0.390558,Normal
4,1,1,0.878788,0.41,0.274678,High


In [12]:
# Splitting the dataset
attr = df.drop(columns = ['BP_Status'])  # features
target = df['BP_Status']  # target variable
# attr.head()
# target.head()
attr_train, attr_test, target_train, target_test  = train_test_split(attr, target, test_size = 0.3, random_state = 44, shuffle = True)
target_train.head()
# attr_train.head()
target_test.head()

nb = GaussianNB()
nb.fit(attr_train, target_train)
target_predict = nb.predict(attr_test)
print(f"Model Accuracy: {accuracy_score(target_test, target_predict)}")
print(f"Confusion Matrix:\n {confusion_matrix(target_test, target_predict)}\n")
print(f"Classification Report:\n {classification_report(target_test, target_predict)}\n")

Model Accuracy: 0.6230128690386071
Confusion Matrix:
 [[434 246]
 [252 389]]

Classification Report:
               precision    recall  f1-score   support

        High       0.63      0.64      0.64       680
      Normal       0.61      0.61      0.61       641

    accuracy                           0.62      1321
   macro avg       0.62      0.62      0.62      1321
weighted avg       0.62      0.62      0.62      1321


