We use scikit-learn (`sklearn`) both to split the data into training and test sets and to train a `GaussianNB` classifier, the Naive Bayes variant designed for continuous features.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

We need to separate the columns of the dataset into two parts:
- Features (X): All the columns that are used for prediction.
- Target (y): The column that we want to predict (the label).

In [4]:
# Read the label-encoded disease/symptom table from disk, then preview
# its first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer='../../data/encode/labelEncoded_DiseaseAndSymptoms.csv')
data.head(n=5)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,15,130,99,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130
1,15,99,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130,130
2,15,130,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130,130
3,15,130,99,32,130,130,130,130,130,130,130,130,130,130,130,130,130,130
4,15,130,99,72,130,130,130,130,130,130,130,130,130,130,130,130,130,130


In [7]:
# Define the features (X) and the target (y).
# 'Unnamed: 0' appears to be the row index that was saved into the CSV (it
# shows up as 0, 1, 2, ... in the data.head() preview). It carries no symptom
# information, so it must not be fed to the classifier as a predictor.
# errors='ignore' keeps this cell working if the frame was loaded without
# that column.
X = data.drop(columns=['Disease', 'Unnamed: 0'], errors='ignore')
y = data['Disease']

# Split the data into training (80%) and testing (20%) sets.
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Model training

In [8]:
# Create a Gaussian Naive Bayes estimator with its default settings.
# NOTE(review): GaussianNB treats every feature as a continuous value, while
# the input file is label-encoded (integer codes) — confirm this model family
# is the intended choice for these features.
nb = GaussianNB()

# Fit the estimator on the training split.
nb.fit(X=X_train, y=y_train)

Model predictions and evaluation

In [9]:
# Make predictions on the test set
y_pred = nb.predict(X_test)

# Evaluate the model's performance.
# Precision is undefined for any class the model never predicts. By default
# scikit-learn reports 0 for those classes and emits an UndefinedMetricWarning
# for each affected metric. zero_division=0 reports the same values but makes
# the choice explicit and keeps the cell output free of warning noise.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, zero_division=0)

# Print the evaluation results
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

Accuracy: 0.03581526861451461
Confusion Matrix:
[[3 0 2 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [5 0 0 ... 0 0 0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.05      0.13      0.07        23
           1       0.00      0.00      0.00        22
           2       0.06      0.06      0.06        16
           3       0.00      0.00      0.00        32
           4       0.06      0.03      0.04        31
           5       0.00      0.00      0.00        22
           6       0.05      0.03      0.04        34
           7       0.06      0.04      0.05        25
           8       0.06      0.05      0.06        20
           9       0.07      0.04      0.05        27
          10       0.04      0.15      0.07        20
          11       0.07      0.13      0.09        31
          12       0.06      0.03      0.04        31
          13       0.00      0.00      0.00        2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# Show the dtype pandas assigned to each column of `data`.
# NOTE(review): the saved output of this cell lists no 'Unnamed: 0' column,
# unlike the earlier data.head() preview — confirm `data` was not modified by
# a cell that is no longer in the notebook (Restart Kernel -> Run All).
data.dtypes

Disease       int64
Symptom_1     int64
Symptom_2     int64
Symptom_3     int64
Symptom_4     int64
Symptom_5     int64
Symptom_6     int64
Symptom_7     int64
Symptom_8     int64
Symptom_9     int64
Symptom_10    int64
Symptom_11    int64
Symptom_12    int64
Symptom_13    int64
Symptom_14    int64
Symptom_15    int64
Symptom_16    int64
Symptom_17    int64
dtype: object