In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import random

In [2]:
# Importing the dataset  
dataset = pd.read_csv('2020_bn_nb_data.txt', sep="\t")  
dataset.head()
X = dataset.drop('QP', axis=1)
y = dataset['QP']

In [3]:
X, y

(    EC100 EC160 IT101 IT161 MA101 PH100 PH160 HS101
 0      BC    CC    BB    BC    CC    BC    AA    BB
 1      CC    BC    BB    BB    CC    BC    AB    BB
 2      AB    BB    AB    AB    BB    CC    BC    AB
 3      BC    CC    BB    BB    BB    BB    BC    BB
 4      BC    AB    CD    BC    BC    BC    BC    CD
 ..    ...   ...   ...   ...   ...   ...   ...   ...
 227    BC    BB    BC    CD    CC    BB    BC    AB
 228    BC    BC    AB    BB    BC    CC    AB    BB
 229    CC    BC    BC    BC    CC    CC    BC    BC
 230    CD    DD    BB    BB    BC    CC    CC    CC
 231     F    DD    DD    CD     F     F    CD    CC
 
 [232 rows x 8 columns],
 0      y
 1      y
 2      y
 3      y
 4      y
       ..
 227    y
 228    y
 229    y
 230    y
 231    n
 Name: QP, Length: 232, dtype: object)

In [4]:
X_encoded = X.copy()
label_encoders = {}

In [5]:
for column in X.columns:
        le = LabelEncoder()
        X_encoded[column] = le.fit_transform(X[column])
        label_encoders[column] = le

label_encoders

{'EC100': LabelEncoder(),
 'EC160': LabelEncoder(),
 'IT101': LabelEncoder(),
 'IT161': LabelEncoder(),
 'MA101': LabelEncoder(),
 'PH100': LabelEncoder(),
 'PH160': LabelEncoder(),
 'HS101': LabelEncoder()}

In [6]:
le_qp = LabelEncoder()
y_encoded = le_qp.fit_transform(y)
y_encoded, X_encoded

(array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
        1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
        1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
        0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
        1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
        0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1,
        1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]),
      EC100  EC160  IT101  IT161  MA101  PH100  PH160  HS101
 0        3      4      2      3      4      3      0      2
 1        4      3      2      2      4      3      1      2
 2        1      2      1      

In [16]:
accuracies = []
detailed_reports = []
confusionM = []

In [14]:
print(X_encoded.dtypes)
y_encoded.dtype

EC100    int32
EC160    int32
IT101    int32
IT161    int32
MA101    int32
PH100    int32
PH160    int32
HS101    int32
dtype: object


dtype('int32')

In [None]:
n_iterations = 20

for iteration in range(n_iterations):
    shuffle_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=random.randint(0, 100))

    for train_index, test_index in shuffle_split.split(X_encoded, y_encoded):
        X_train, X_test = X_encoded.iloc[train_index], X_encoded.iloc[test_index]
        y_train, y_test = y_encoded[train_index], y_encoded[test_index]

        nb_classifier = GaussianNB()
        nb_classifier.fit(X_train, y_train)

        y_pred = nb_classifier.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)

        report = classification_report(y_test, y_pred, target_names=le_qp.classes_, output_dict=True)
        detailed_reports.append(report)

        c = confusion_matrix(y_test, y_pred)
        confusionM.append(c)
        

In [33]:
confusionM

[array([[18,  4],
        [ 0, 48]], dtype=int64),
 array([[21,  1],
        [ 0, 48]], dtype=int64),
 array([[21,  1],
        [ 0, 48]], dtype=int64),
 array([[21,  1],
        [ 0, 48]], dtype=int64),
 array([[20,  2],
        [ 0, 48]], dtype=int64),
 array([[20,  2],
        [ 0, 48]], dtype=int64),
 array([[21,  1],
        [ 0, 48]], dtype=int64),
 array([[20,  2],
        [ 0, 48]], dtype=int64),
 array([[22,  0],
        [ 0, 48]], dtype=int64),
 array([[21,  1],
        [ 0, 48]], dtype=int64),
 array([[21,  1],
        [ 0, 48]], dtype=int64),
 array([[21,  1],
        [ 0, 48]], dtype=int64),
 array([[20,  2],
        [ 0, 48]], dtype=int64),
 array([[19,  3],
        [ 0, 48]], dtype=int64),
 array([[21,  1],
        [ 0, 48]], dtype=int64),
 array([[21,  1],
        [ 0, 48]], dtype=int64),
 array([[21,  1],
        [ 0, 48]], dtype=int64),
 array([[21,  1],
        [ 0, 48]], dtype=int64),
 array([[19,  3],
        [ 0, 48]], dtype=int64),
 array([[19,  3],
        [ 0, 

In [25]:
detailed_reports

[{'n': {'precision': 1.0,
   'recall': 0.8636363636363636,
   'f1-score': 0.926829268292683,
   'support': 22.0},
  'y': {'precision': 0.9411764705882353,
   'recall': 1.0,
   'f1-score': 0.9696969696969697,
   'support': 48.0},
  'accuracy': 0.9571428571428572,
  'macro avg': {'precision': 0.9705882352941176,
   'recall': 0.9318181818181819,
   'f1-score': 0.9482631189948263,
   'support': 70.0},
  'weighted avg': {'precision': 0.9596638655462184,
   'recall': 0.9571428571428572,
   'f1-score': 0.9562242635413367,
   'support': 70.0}},
 {'n': {'precision': 1.0,
   'recall': 0.8636363636363636,
   'f1-score': 0.926829268292683,
   'support': 22.0},
  'y': {'precision': 0.9411764705882353,
   'recall': 1.0,
   'f1-score': 0.9696969696969697,
   'support': 48.0},
  'accuracy': 0.9571428571428572,
  'macro avg': {'precision': 0.9705882352941176,
   'recall': 0.9318181818181819,
   'f1-score': 0.9482631189948263,
   'support': 70.0},
  'weighted avg': {'precision': 0.9596638655462184,
   '

In [21]:
print("\nNaive Bayes Classifier Performance Summary:")
print(f"Number of Iterations: {n_iterations}")
print(f"Accuracy Statistics:")
print(f"Mean Accuracy: {np.mean(accuracies):.4f}")
print(f"Standard Deviation: {np.std(accuracies):.4f}")
print(f"Minimum Accuracy: {np.min(accuracies):.4f}")
print(f"Maximum Accuracy: {np.max(accuracies):.4f}")


Naive Bayes Classifier Performance Summary:
Number of Iterations: 20
Accuracy Statistics:
Mean Accuracy: 0.9783
Standard Deviation: 0.0178
Minimum Accuracy: 0.9286
Maximum Accuracy: 1.0000
