In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
# from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, make_scorer, accuracy_score, roc_auc_score
from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

In [None]:
# Read the tab-separated dataset into `data` and preview its first rows.
dataset_path = '148141-imbalanced.txt'
data = pd.read_csv(dataset_path, sep='\t')
# Uncomment to work on a 5% random sample of the rows instead of the full file:
# data = data.sample(frac=0.05)
data.head()

In [None]:
print(data.shape)

# Every column except the last one is a feature; the last column is the label.
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

# Stratify the hold-out split: the classes are imbalanced, so a purely random
# 20% sample could end up with a class ratio far from that of the full data
# (or with hardly any minority samples at all).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
def get_metrics_cv(classifier, X, y):
    """Cross-validate ``classifier`` and summarise accuracy, G-mean and ROC AUC.

    A single 10-fold cross-validation run is used for all three metrics, so
    they are measured on the same fitted models instead of re-fitting once per
    metric. ROC AUC is computed with scikit-learn's built-in ``'roc_auc'``
    scorer, which ranks samples by the estimator's continuous output rather
    than by its hard ``predict`` labels.

    Parameters
    ----------
    classifier : estimator
        Estimator or pipeline to evaluate. Its final step must expose
        ``decision_function`` or ``predict_proba`` for the ROC AUC scorer.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels (binary, as required by the ROC AUC scorer).

    Returns
    -------
    list of tuple
        ``[(mean, std), (mean, std), (mean, std)]`` of the per-fold scores,
        in the order accuracy, G-mean, ROC AUC.
    """
    # Insertion order of this dict defines the order of the returned list.
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'g_mean': make_scorer(geometric_mean_score),
        'roc_auc': 'roc_auc',
    }
    cv_results = cross_validate(classifier, X, y, cv=10, scoring=scoring)
    return [
        (cv_results[f'test_{name}'].mean(), cv_results[f'test_{name}'].std())
        for name in scoring
    ]

In [None]:
# Product of the two class proportions (share of class 1 times share of class 0).
total_rows = len(data)
class1_rows = len(data.loc[data['class'] == 1])
class0_rows = len(data.loc[data['class'] == 0])
(class1_rows / total_rows) * (class0_rows / total_rows)

In [None]:
# Per-column histograms of the dataset (plot call currently disabled).
# NOTE(review): with the hist call commented out, the two pyplot calls below
# have nothing to draw - re-enable the line or drop the cell.
# data.hist(figsize=(50, 30), bins=100)
plt.tight_layout()
plt.show()

In [None]:
# Box plots of all columns except the last one (plot call currently disabled).
# NOTE(review): with the boxplot call commented out, the pyplot calls below
# only style and show an otherwise empty figure - re-enable or drop the cell.
# data.iloc[:,:-1].boxplot(figsize=(70, 10))
plt.xticks(rotation=25)
plt.tight_layout()
plt.show()

In [None]:
# 2-D PCA projection of the features, coloured by class.
# The label column is excluded from the fit: including it would let the target
# itself shape the components and make the classes look more separable than
# the features alone justify.
pca_features = data.drop(columns='class')
pca = PCA(n_components=2)
X2D = pca.fit_transform(pca_features)
print(pca.explained_variance_ratio_)
print(np.sum(pca.explained_variance_ratio_))
plt.scatter(X2D[:, 0], X2D[:, 1], c=data['class'])
plt.title('PCA 2D')
# Lay out after the title is set so it is taken into account.
plt.tight_layout()
plt.show()

In [None]:
# %matplotlib widget
# 3-D PCA projection of the features, coloured by class.
# The label column is excluded from the fit so the target does not leak into
# the components that are being visualised.
pca_features = data.drop(columns='class')
pca = PCA(n_components=3)
# Name kept from the 2-D cell; here it holds three components per sample.
X2D = pca.fit_transform(pca_features)
print(pca.explained_variance_ratio_)
print(np.sum(pca.explained_variance_ratio_))
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(X2D[:, 0], X2D[:, 1], X2D[:, 2], c=data['class'])
ax.set_title('PCA 3D')
fig.tight_layout()
plt.show()

In [None]:
# Cumulative explained variance as a function of the number of components.
# PCA is fitted on the feature columns only (the label column is dropped), with
# one component per feature.
pca_features = data.drop(columns='class')
pca = PCA(n_components=pca_features.shape[1])
pca.fit(pca_features)

cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Start the x axis at 1 so a point reads as "variance kept with k dimensions";
# plotting against the implicit 0-based index would be off by one.
n_dims = np.arange(1, len(cumulative_variance) + 1)
plt.plot(n_dims, cumulative_variance)
plt.xlabel('Liczba wymiarów')
plt.xticks(np.arange(0, len(cumulative_variance) + 1, 10.0))
plt.ylabel('Zachowana wariancja')
plt.grid()
plt.tight_layout()
plt.show()

In [None]:
# Cross-validated metrics for every classifier, each preceded by SMOTE
# oversampling inside the pipeline. The pipelines are passed unfitted:
# fitting them on the full data first only costs time, because the
# cross-validation routine fits its own copies on each training fold.
smote_classifiers = {
    'KNeighborsClassifier': KNeighborsClassifier,
    'DecisionTreeClassifier': DecisionTreeClassifier,
    'RandomForestClassifier': RandomForestClassifier,
    'SVC': SVC,
    'MLPClassifier': MLPClassifier,
    'GaussianNB': GaussianNB,
    'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis,
}

metrics = {}
for model_name, model_cls in smote_classifiers.items():
    metrics[model_name] = get_metrics_cv(make_pipeline(SMOTE(), model_cls()), X, y)

# Baseline: dummy classifier, evaluated without oversampling.
metrics['ZeroR'] = get_metrics_cv(make_pipeline(DummyClassifier()), X, y)


In [None]:
# Print, per model, the (mean, std) pair of every metric rounded to 4 decimals.
for model_name in metrics:
    rounded_stats = [(round(mean, 4), round(std, 4)) for mean, std in metrics[model_name]]
    print(f'{model_name}: {rounded_stats}')

In [None]:
# Grouped bar chart: one group per model, one bar per metric, std as error bar.
models = list(metrics.keys())

fig, ax = plt.subplots(figsize=(10, 6))

bar_width = 0.15
index = range(len(models))

# `slot` is both the metric's position inside each get_metrics_cv result and
# its horizontal offset (in bar widths) within a model's group.
for slot, metric_label in enumerate(['Accuracy', 'G-mean', 'ROC AUC']):
    ax.bar(
        [i + slot * bar_width for i in index],
        [val[slot][0] for val in metrics.values()],
        bar_width,
        yerr=[val[slot][1] for val in metrics.values()],
        label=metric_label,
    )

ax.set_xlabel('Modele')
ax.set_ylabel('Wartości metryk')
ax.set_title('Porównanie modeli do klasyfikacji niezbalansowanych danych')
# Three bars per group start at i, so the group's centre is the middle bar.
ax.set_xticks([i + bar_width for i in index])
ax.set_xticklabels(models, rotation=45, ha='right')
ax.legend(loc='lower left')

# Lay out once everything is drawn so the rotated tick labels are not clipped
# (calling it before the figure exists has no effect on this chart).
fig.tight_layout()
plt.show()


In [None]:
# Same comparison as for `metrics`, with a StandardScaler placed in front of
# SMOTE in every pipeline. The pipelines are passed unfitted: fitting them on
# the full data first only costs time, because the cross-validation routine
# fits its own copies on each training fold.
scaled_classifiers = {
    'KNeighborsClassifier': KNeighborsClassifier,
    'DecisionTreeClassifier': DecisionTreeClassifier,
    'RandomForestClassifier': RandomForestClassifier,
    'SVC': SVC,
    'MLPClassifier': MLPClassifier,
    'GaussianNB': GaussianNB,
    'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis,
}

metrics_norm = {}
for model_name, model_cls in scaled_classifiers.items():
    metrics_norm[model_name] = get_metrics_cv(
        make_pipeline(StandardScaler(), SMOTE(), model_cls()), X, y
    )

# Baseline: dummy classifier on standardised features, without oversampling.
metrics_norm['ZeroR'] = get_metrics_cv(make_pipeline(StandardScaler(), DummyClassifier()), X, y)

In [None]:
# Print, per model, the (mean, std) pair of every metric rounded to 4 decimals.
for model_name in metrics_norm:
    rounded_stats = [(round(mean, 4), round(std, 4)) for mean, std in metrics_norm[model_name]]
    print(f'{model_name}: {rounded_stats}')

In [None]:
# Grouped bar chart for the standardised pipelines: one group per model, one
# bar per metric, std as error bar.
models = list(metrics_norm.keys())

fig, ax = plt.subplots(figsize=(10, 6))

bar_width = 0.15
index = range(len(models))

# `slot` is both the metric's position inside each get_metrics_cv result and
# its horizontal offset (in bar widths) within a model's group.
for slot, metric_label in enumerate(['Accuracy', 'G-mean', 'ROC AUC']):
    ax.bar(
        [i + slot * bar_width for i in index],
        [val[slot][0] for val in metrics_norm.values()],
        bar_width,
        yerr=[val[slot][1] for val in metrics_norm.values()],
        label=metric_label,
    )

ax.set_xlabel('Modele')
ax.set_ylabel('Wartości metryk')
ax.set_title('Porównanie modeli do klasyfikacji niezbalansowanych danych po standaryzacji')
# Three bars per group start at i, so the group's centre is the middle bar.
ax.set_xticks([i + bar_width for i in index])
ax.set_xticklabels(models, rotation=45, ha='right')
ax.legend(loc='lower left')

# Lay out once everything is drawn so the rotated tick labels are not clipped
# (calling it before the figure exists has no effect on this chart).
fig.tight_layout()
plt.show()

In [None]:
# Grouped bar chart of the change in mean score caused by standardisation
# (metrics_norm minus metrics), one group per model and one bar per metric.
models = list(metrics_norm.keys())

fig, ax = plt.subplots(figsize=(10, 6))

bar_width = 0.15
index = range(len(models))

# `slot` is both the metric's position inside each get_metrics_cv result and
# its horizontal offset (in bar widths) within a model's group.
for slot, metric_label in enumerate(['Accuracy', 'G-mean', 'ROC AUC']):
    mean_deltas = [
        metrics_norm[key][slot][0] - metrics[key][slot][0]
        for key in metrics_norm
    ]
    ax.bar(
        [i + slot * bar_width for i in index],
        mean_deltas,
        bar_width,
        label=metric_label,
    )

ax.set_xlabel('Modele')
ax.set_ylabel('Wartości metryk')
ax.set_title('Porównanie modeli po standaryzacji do modeli bez standaryzacji')
# Three bars per group start at i, so the group's centre is the middle bar.
ax.set_xticks([i + bar_width for i in index])
ax.set_xticklabels(models, rotation=45, ha='right')
ax.legend(loc='lower left')

# Lay out once everything is drawn so the rotated tick labels are not clipped
# (calling it before the figure exists has no effect on this chart).
fig.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for the SMOTE + SVC pipeline, scored by G-mean.
# gamma is a kernel coefficient for the rbf and poly kernels; the linear kernel
# does not use it. Crossing gamma with 'linear' would therefore evaluate the
# same linear model four times per C value, so linear gets its own sub-grid.
svc_c_values = [0.1, 1, 10, 100]
param_grid = [
    {
        'svc__kernel': ['rbf', 'poly'],
        'svc__C': svc_c_values,
        'svc__gamma': [1, 0.1, 0.01, 0.001],
    },
    {
        'svc__kernel': ['linear'],
        'svc__C': svc_c_values,
    },
]

grid_search = GridSearchCV(
    estimator=make_pipeline(SMOTE(), SVC()),
    param_grid=param_grid,
    scoring=make_scorer(geometric_mean_score),
    cv=5,
    n_jobs=4,
    verbose=2,
)
grid_search.fit(X, y)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

In [None]:
import seaborn as sns

# Heat map of the mean G-mean test score over (C, gamma) for the poly kernel.
results = grid_search.cv_results_
df_results = pd.DataFrame(results)
poly_results = df_results[df_results['param_svc__kernel'] == 'poly']
heatmap_data = poly_results.pivot(index='param_svc__C', columns='param_svc__gamma', values='mean_test_score')

# Create the figure first and lay it out after drawing; calling tight_layout
# before the figure exists acts on a different (empty) figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(heatmap_data, annot=True, cmap="viridis", fmt=".4g", ax=ax)
ax.set_title('Wartości G-mean w Grid Search CV dla SVC-poly')
ax.set_xlabel('Gamma')
ax.set_ylabel('C')
fig.tight_layout()
plt.show()

In [None]:
import shap

# Model to be explained: SMOTE oversampling followed by an RBF-kernel SVC.
svc_to_explain = SVC(kernel='rbf', C=0.1, gamma=0.1, probability=True)
model_exp = make_pipeline(SMOTE(), svc_to_explain)
model_exp.fit(X_train, y_train)

# Cross-validated metrics of the same pipeline configuration on the full X, y.
cv_summary = get_metrics_cv(model_exp, X, y)
print(cv_summary)

In [None]:
# Build a SHAP explainer around the pipeline's predict function, using the
# test set both as the explainer's data argument and as the samples explained.
shap_feature_names = data.columns[:-1]
explainer = shap.Explainer(
    model_exp.predict,
    X_test,
    feature_names=shap_feature_names,
)
shap_values = explainer(X_test)

In [None]:
# Bar plot of the SHAP values, showing at most 30 entries.
shap_bar_limit = 30
shap.plots.bar(shap_values, max_display=shap_bar_limit)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


# Class-weight settings compared for the decision tree.
class_weights = [
    {0: 0.5, 1: 0.5},
    {0: 0.95, 1: 0.05},
    'balanced',
    {0: 0.99, 1: 0.01},
    {0: 0.995, 1: 0.005},
    {0: 0.999, 1: 0.001},
]

# (weight, confusion matrix) per setting, plus the labels for the error plot.
results = []
weights_labels = []

fig, axs = plt.subplots(3, 2)
fig.set_size_inches(8, 10)

for ax, weight in zip(axs.flat, class_weights):
    # Train the model with the current class weighting.
    clf = DecisionTreeClassifier(class_weight=weight, max_depth=10)
    clf.fit(X_train, y_train)

    # Predict on the held-out test set.
    y_pred = clf.predict(X_test)

    # Confusion matrix; labels are pinned so the matrix is always 2x2 and the
    # [0, 1] / [1, 0] lookups below stay valid even if a class is never
    # predicted or is absent from the test split.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    results.append((weight, cm))
    weights_labels.append(str(weight))

    # Draw the matrix into its own panel of the 3x2 grid.
    ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax)
    ax.set_title(f'Wagi: {weight}')

# NOTE(review): this runs after the loop, so only the classifier built with the
# last weight setting is cross-validated - confirm that is intended.
print(get_metrics_cv(clf, X, y))

# Lay out the confusion-matrix grid explicitly instead of relying on it
# happening to be the current pyplot figure; fig.show() is dropped because the
# final plt.show() renders both figures.
fig.tight_layout()

# Off-diagonal counts of every confusion matrix.
false_positives = [matrix[0, 1] for _weight, matrix in results]
false_negatives = [matrix[1, 0] for _weight, matrix in results]

# FP and FN counts as a function of the class weighting.
fig_err, ax_err = plt.subplots(figsize=(10, 5))
ax_err.plot(weights_labels, false_positives, marker='o', label='FP')
ax_err.plot(weights_labels, false_negatives, marker='o', label='FN')
ax_err.set_xlabel('Wagi')
ax_err.set_ylabel('Ilość pomyłek')
ax_err.set_title('Rodzaje pomyłek w zależności od wag')
ax_err.legend()
ax_err.grid(True)
fig_err.tight_layout()
plt.show()