In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix

# Path to the local copy of the Dermatology data file.
# NOTE(review): this is an absolute, machine-specific path; update it to point
# at your own copy of the dataset before running.
file_path = r'C:\Users\pc\Desktop\lab 13 AI full\dermatology.data'

# Load the Dermatology dataset.
# Column names are supplied explicitly via `names=`: 33 attribute columns,
# followed by 'age' and the target column 'class'.
column_names = ['erythema', 'scaling', 'definite_borders', 'itching', 'koebner_phenomenon', 'polygonal_papules',
                'follicular_papules', 'oral_mucosal_involvement', 'knee_and_elbow_involvement', 'scalp_involvement',
                'family_history', 'melanin_incontinence', 'eosinophils_infiltrate', 'PNL_infiltrate',
                'fibrosis_of_the_papillary_dermis', 'exocytosis', 'acanthosis', 'hyperkeratosis',
                'parakeratosis', 'clubbing_of_the_rete_ridges', 'elongation_of_the_rete_ridges',
                'thinning_of_the_suprapapillary_epidermis', 'spongiform_pustule', 'munro_microabcess',
                'focal_hypergranulosis', 'disappearance_of_the_granular_layer', 'vacuolisation_and_damage_of_basal_layer',
                'spongiosis', 'saw_tooth_appearance_of_retes', 'follicular_horn_plug', 'perifollicular_parakeratosis',
                'inflammatory_monoluclear_inflitrate', 'band_like_infiltrate', 'age', 'class']
data = pd.read_csv(file_path, names=column_names)

# Preprocessing: separate the feature columns from the target variable
X = data.drop('class', axis=1)
y = data['class']

# Missing entries are marked with the literal string '?'. Replace them with NaN
# and then convert every feature column to a numeric dtype explicitly. Without
# this step any column that contained '?' stays object-typed (numbers stored as
# strings) and the model would have to coerce it implicitly. pd.to_numeric
# raises if some other non-numeric token is present, which surfaces dirty data
# instead of hiding it.
X = X.replace('?', np.nan).apply(pd.to_numeric)

# Impute missing values with the most frequent value of each column.
# NOTE(review): the imputer is fitted on the full dataset, i.e. before any
# train/test split is made, so test rows influence the imputed value.
imputer = SimpleImputer(strategy='most_frequent')
# Re-wrap the imputed array with the original column labels and row index so
# X stays aligned with y.
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# Evaluate a Gaussian Naive Bayes classifier for several train/test proportions.
# For each proportion: stratified split, k-fold cross-validation on the training
# part, then a confusion matrix (raw and row-normalized) on the held-out part.
train_sizes = [0.6, 0.7, 0.8]
k = 5  # Number of cross-validation folds (same for every split, so set once)
for train_size in train_sizes:
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, stratify=y, random_state=42)

    # Naïve Bayes classification
    nb_classifier = GaussianNB()
    nb_classifier.fit(X_train, y_train)

    # Perform k-fold cross-validation on the training portion only.
    # The shuffled folds are seeded so the reported scores are reproducible
    # from run to run, matching the seeded train/test split above.
    cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    cv_scores = cross_val_score(nb_classifier, X_train, y_train, cv=cv)

    # Display cross-validation scores
    print(f"Train/Test Split: {train_size * 100}% train / {100 - train_size * 100}% test")
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Average accuracy: {cv_scores.mean()}")

    # Make predictions on the test set
    y_pred = nb_classifier.predict(X_test)

    # Compute confusion matrix (rows: true class, columns: predicted class)
    confusion = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n{confusion}\n")

    # Normalize each row by its total so entries are per-true-class fractions.
    # A row whose total is zero is left as zeros instead of producing NaN and a
    # divide-by-zero warning.
    row_totals = confusion.sum(axis=1, keepdims=True)
    normalized_confusion = np.divide(
        confusion,
        row_totals,
        out=np.zeros(confusion.shape, dtype=float),
        where=row_totals != 0,
    )
    print(f"Normalized Confusion Matrix:\n{normalized_confusion}\n")

Train/Test Split: 60.0% train / 40.0% test
Cross-validation scores: [0.84090909 0.88636364 0.88636364 0.95454545 0.90697674]
Average accuracy: 0.8950317124735729
Confusion Matrix:
[[45  0  0  0  0  0]
 [ 0  7  0 16  0  1]
 [ 0  0 29  0  0  0]
 [ 1  0  0 19  0  0]
 [ 0  0  0  0 21  0]
 [ 0  0  0  0  0  8]]

Normalized Confusion Matrix:
[[1.         0.         0.         0.         0.         0.        ]
 [0.         0.29166667 0.         0.66666667 0.         0.04166667]
 [0.         0.         1.         0.         0.         0.        ]
 [0.05       0.         0.         0.95       0.         0.        ]
 [0.         0.         0.         0.         1.         0.        ]
 [0.         0.         0.         0.         0.         1.        ]]

Train/Test Split: 70.0% train / 30.0% test
Cross-validation scores: [0.90384615 0.90196078 0.90196078 0.88235294 0.92156863]
Average accuracy: 0.9023378582202112
Confusion Matrix:
[[34  0  0  0  0  0]
 [ 0  5  0 12  0  1]
 [ 0  0 22  0  0  0]
 [ 1

In [None]:
r'C:\Users\pc\Desktop\lab 13 AI full\data_banknote_authentication