In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RepeatedStratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

Konkretne dane -> podział na testowe i treningowe -> trenowanie modelu -> predykcja na danych testowych -> ocena modelu

In [2]:
# Small toy dataset: two integer features and a binary label (11 rows).
df = pd.DataFrame(
    {
        'feature1': [1, 2, 3, 4, 9, 2, 1, 3, 5, 5, 12],
        'feature2': [5, 4, 3, 2, 1, 3, 4, 1, 8, 9, 10],
        'label': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
    }
)

print(df, "\n")

# Separate the feature matrix from the target column.
X = df.loc[:, ['feature1', 'feature2']]
y = df.loc[:, 'label']

# Hold-out split: test_size=0.2 requests a 20% test share; with only 11 rows
# this results in 3 test samples and 8 training samples. random_state pins
# the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Report the shape of every part of the split.
for caption, part in (
    ("X_test = ", X_test),
    ("Y_test = ", y_test),
    ("X_train = ", X_train),
    ("Y_train = ", y_train),
):
    print(caption, part.shape)

    feature1  feature2  label
0          1         5      0
1          2         4      1
2          3         3      0
3          4         2      1
4          9         1      0
5          2         3      1
6          1         4      0
7          3         1      1
8          5         8      0
9          5         9      1
10        12        10      0 

X_test =  (3, 2)
Y_test =  (3,)
X_train =  (8, 2)
Y_train =  (8,)


In [3]:
# k-fold cross-validation
# The training data is split into k (roughly) equal folds. Each fold serves
# exactly once as the validation set, while the remaining k-1 folds, merged
# together, are used to train the model. StratifiedKFold additionally keeps
# the class proportions approximately the same in every fold.

k = 4
cv = StratifiedKFold(n_splits=k)
model = GaussianNB()

# Baseline: accuracy of the default GaussianNB on each validation fold.
cross_val_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')

# Show how the training rows are distributed over the folds. The printed
# arrays are positional indices into X_train (presumably TS = training set,
# VS = validation set in the message below).
for fold, (train, test) in enumerate(cv.split(X_train, y_train)):
    print("Fold {} ({} w TS, {} w VS)".format(fold, len(train), len(test)))
    print(train)

print("\nCross-Validation Scores: \n", cross_val_scores)
print("Mean Cross-Validation Score: ", cross_val_scores.mean())

# Candidate values for the variance-smoothing term of GaussianNB.
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

# Perform Grid Search: every candidate is scored with the same CV splitter.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best hyperparameters
print("\nBest Hyperparameters: \n", grid_search.best_params_)

# With the default refit=True, GridSearchCV has already refitted the best
# configuration on the whole training set, so best_estimator_ is ready to use
# and no additional fit() call is needed.
best_model = grid_search.best_estimator_

# Predictions and evaluation on the test set
y_pred = best_model.predict(X_test)
print("\nTest Set Predictions: \n", y_pred)

# Display learned parameters (specific to Naive Bayes)
print("\nClass Priors: \n", best_model.class_prior_)
print("\nMeans: \n", best_model.theta_)

# Classification report
print("\nClassification Report: \n", classification_report(y_test, y_pred))

# Cross-Validation Scores with the best model.
# NOTE: this runs on the full dataset (X, y), so the hold-out test rows take
# part in these folds; the result is therefore not directly comparable with
# the training-only cross-validation scores printed earlier.
cross_val_scores = cross_val_score(best_model, X, y, cv=cv, scoring='accuracy')
print("\nCross-Validation Scores: \n", cross_val_scores)
print("Mean Cross-Validation Score: ", cross_val_scores.mean())

Fold 0 (6 w TS, 2 w VS)
[1 3 4 5 6 7]
Fold 1 (6 w TS, 2 w VS)
[0 2 4 5 6 7]
Fold 2 (6 w TS, 2 w VS)
[0 1 2 3 6 7]
Fold 3 (6 w TS, 2 w VS)
[0 1 2 3 4 5]

Cross-Validation Scores: 
 [0.5 0.5 0.5 1. ]
Mean Cross-Validation Score:  0.625

Best Hyperparameters: 
 {'var_smoothing': 1e-09}

Test Set Predictions: 
 [0 0 1]

Class Priors: 
 [0.5 0.5]

Means: 
 [[4.75 6.75]
 [2.75 2.5 ]]

Classification Report: 
               precision    recall  f1-score   support

           0       0.50      0.50      0.50         2
           1       0.00      0.00      0.00         1

    accuracy                           0.33         3
   macro avg       0.25      0.25      0.25         3
weighted avg       0.33      0.33      0.33         3


Cross-Validation Scores: 
 [0.66666667 0.66666667 0.66666667 0.5       ]
Mean Cross-Validation Score:  0.625
