In [None]:
from datetime import datetime

import keras
import numpy as np
from keras.src.callbacks import EarlyStopping
from matplotlib import pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

from pm_utils import get_data

# One seed shared by every stochastic step so a fresh "Run All" is repeatable.
random_seed = 812

# Seed NumPy's global generator and Keras with the same value.
np.random.seed(random_seed)
keras.utils.set_random_seed(random_seed)

In [None]:
# --- Split configuration ---------------------------------------------------
# Fractions of the data set, consumed by the slicing cell further down.
train_percent, validation_percent = 0.9, 0.1

# --- Data set configuration ------------------------------------------------
# T and T_shift are forwarded to get_data as `t` and `t_shift`.
T, T_shift = 6, 4
# Number of features SelectKBest keeps in the feature-selection step.
k_features = 18
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
path = 'C:/Users/mkedzia/Desktop/praca-magisterska/binance-data/aaa/data/BNB/BNB-merged-data.csv'

columns, input_data, targets = get_data(
    path=path,
    base_m=True,
    trade_metrics=True,
    trade_metrics_m=False,
    google=True,
    t=T,
    t_shift=T_shift,
)

# Binarise the target: True where the raw value is non-negative
# (True is later displayed as 'Long', False as 'Short').
targets = targets >= 0.0

# Mutual information between every feature and the binary target, seeded so
# the ranking is reproducible.
mi = mutual_info_classif(input_data, targets, random_state=random_seed)

# Feature names and scores ordered from the most to the least informative.
sorted_indices = np.argsort(mi)[::-1]
sorted_features = np.array(columns)[sorted_indices]
sorted_mi = mi[sorted_indices]


def seeded_mutual_info(X, y):
    """Score function for SelectKBest: mutual_info_classif with the notebook seed.

    Passing the bare `mutual_info_classif` leaves `random_state=None`, so the
    scores used for selection could differ from the seeded ranking in `mi`
    and change whenever the cell is re-run.
    """
    return mutual_info_classif(X, y, random_state=random_seed)


# Keep the k_features best-scoring columns, using the same seeded estimator
# as `mi` so the selection agrees with the plotted ranking.
selector = SelectKBest(seeded_mutual_info, k=k_features)
selected_features = selector.fit_transform(input_data, targets)
selected_mask = selector.get_support()

# The selection is only reported: the models are trained on all columns.
# NOTE(review): the selector is fitted on every row, including the rows that
# later become X_test. Enabling the line below would leak hold-out
# information into training; fit the selector on the training slice (or put
# it inside the Pipeline) before switching it on.
# input_data = selected_features

selected_features_labels = [col for col, is_selected in zip(columns, selected_mask) if is_selected]
rejected_features_labels = [col for col, is_selected in zip(columns, selected_mask) if not is_selected]

print("Wybrane cechy:", selected_features_labels)
print("Odrzucone cechy:", rejected_features_labels)

# Size of the second axis of input_data (assumes a 2-D samples x features array).
D = input_data.shape[1]

# Bar chart of the mutual-information scores in the pre-sorted order.
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(sorted_features, sorted_mi, color='skyblue')
ax.set_xlabel('Cecha')
ax.set_ylabel('Informacja wzajemna')
ax.set_title('Wartości informacji wzajemnej dla każdej cechy')
# Feature names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Unshuffled split: the first `train_` rows are used for training and
# everything after them is held out as the test set.
n_samples = len(input_data)
train_ = int(n_samples * train_percent)
# End index of a validation slice; computed here but not used by the slicing below.
validation_ = train_ + int(n_samples * validation_percent)

X_train, Y_train = input_data[:train_], targets[:train_]
X_test, Y_test = input_data[train_:], targets[train_:]

In [None]:
# Keras callback that stops training once the monitored loss stops improving.
# It is not used by the scikit-learn search in this cell.
early_stopping = EarlyStopping(monitor='loss', mode='min', verbose=1, patience=5)

# Candidate classifiers with their hyper-parameter search spaces.
# Keys carry the 'clf__' prefix because each model is tuned as the 'clf' step
# of a Pipeline.
class_models = [
    {
        'model': SVC(random_state=random_seed),
        'search_space': {
            'clf__C': [0.1, 1, 10, 100],
            'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'clf__gamma': ['scale', 'auto'],
        }
    },
    {
        'model': LogisticRegression(random_state=random_seed, max_iter=300),
        'search_space': {
            'clf__C': [0.001, 0.01, 0.1, 1, 10],
            'clf__solver': ['liblinear', 'sag', 'saga'],
            'clf__penalty': ['l2'],
            'clf__class_weight': ['balanced']
        }
    },
    {
        'model': RandomForestClassifier(random_state=random_seed),
        'search_space': {
            'clf__n_estimators': [100, 200, 300, 400, 500],
            # 'auto' was dropped: it was an alias of 'sqrt' for classifiers and
            # is rejected by scikit-learn >= 1.3, which would fail every
            # sampled candidate that contains it.
            'clf__max_features': ['sqrt', 'log2'],
            'clf__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
            'clf__min_samples_split': [2, 5, 10],
            'clf__min_samples_leaf': [1, 2, 4]
        }
    },
    {
        'model': GradientBoostingClassifier(random_state=random_seed),
        'search_space': {
            'clf__n_estimators': [100, 200, 300, 400, 500],
            'clf__learning_rate': [0.01, 0.05, 0.1, 0.2],
            'clf__max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
            'clf__min_samples_split': [2, 5, 10],
            'clf__min_samples_leaf': [1, 2, 4]
        }
    }
]

# Pick the model to tune by its position in class_models
# (0 = SVC, 1 = LogisticRegression, 2 = RandomForest, 3 = GradientBoosting).
mdl = class_models[1]
clf, param_grid = mdl['model'], mdl['search_space']

# Metrics recorded for every candidate; 'accuracy' decides the winner (see refit).
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=np.nan, average='binary'),
    'recall': make_scorer(recall_score, zero_division=np.nan, average='binary'),
    'f1_score': make_scorer(f1_score, zero_division=np.nan, average='binary'),
    # The built-in scorer feeds continuous scores (decision_function or
    # predict_proba) to roc_auc_score. make_scorer(roc_auc_score) would pass
    # hard predict() labels, which collapses the ROC curve to a single point.
    'roc_auc': 'roc_auc'
}

# Scale features to [-1, 1] inside the pipeline so the scaler is refitted on
# the training part of every CV fold.
pipeline = Pipeline([
    ('scaler', (MinMaxScaler(feature_range=(-1, 1)))),
    ('clf', clf)
])

# Randomized hyper-parameter search, cross-validated with TimeSeriesSplit.
random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, verbose=0, n_iter=32,
                                   cv=TimeSeriesSplit(n_splits=6),
                                   scoring=scoring, refit='accuracy', return_train_score=False,
                                   random_state=random_seed)
random_result = random_search.fit(X_train, Y_train)

# Winning configuration and its cross-validated accuracy.
print(f"Najlepsze parametry: {random_result.best_params_!s}")
print(f"Najlepsze wyniki dla accuracy: {random_result.best_score_!s}")

# Keep the full CV table and the refitted winner; predict the hold-out set once.
results = random_result.cv_results_
best_model = random_result.best_estimator_
Y_pred = best_model.predict(X_test)

# Mean CV value of every tracked metric for the winning candidate.
best_index = random_result.best_index_
for metric_name in ('accuracy', 'precision', 'recall', 'f1_score', 'roc_auc'):
    mean_value = results['mean_test_' + metric_name][best_index]
    print(f"Mean test {metric_name}: {mean_value:.3f}")

# Label-based metrics on the hold-out set.
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average='binary', zero_division=np.nan)
recall = recall_score(Y_test, Y_pred, average='binary', zero_division=np.nan)
f1 = f1_score(Y_test, Y_pred, average='binary', zero_division=np.nan)

# ROC AUC is a ranking metric and needs continuous scores, not hard labels.
# Use the positive-class probability when the model provides it, otherwise the
# decision-function margin (e.g. SVC without probability=True).
if hasattr(best_model, 'predict_proba'):
    Y_score = best_model.predict_proba(X_test)[:, 1]
else:
    Y_score = best_model.decision_function(X_test)
roc_auc = roc_auc_score(Y_test, Y_score)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")
print(f"ROC/AUC: {roc_auc:.3f}")

In [None]:
# One line per sampled candidate: mean CV accuracy, twice its standard
# deviation, and the hyper-parameters that produced it.
candidate_rows = zip(results['mean_test_accuracy'], results['std_test_accuracy'], results['params'])
for mean_acc, std_acc, candidate in candidate_rows:
    spread = std_acc * 2
    print(f"{mean_acc:.3f} (+/-{spread:.3f}) dla {candidate!r}")
# Confusion matrix of the hold-out predictions. The label order [True, False]
# matches the display labels: True -> 'Long', False -> 'Short'.
# confusion_matrix and ConfusionMatrixDisplay are imported in the notebook's
# top import cell rather than here.
cm = confusion_matrix(Y_test, Y_pred, labels=[True, False])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Long', 'Short'])

disp.plot()
# Replace the default axis captions with the notebook's own.
plt.xlabel('Przewidywane klasy')
plt.ylabel('Prawdziwe klasy')
plt.title('Macierz pomyłek')
plt.show()

# ROC curve of the refitted best pipeline on the hold-out set.
best_model = random_search.best_estimator_

# Continuous scores for the positive class. Not every candidate exposes
# predict_proba (SVC only does with probability=True), so fall back to the
# decision-function margin; roc_curve accepts either. The name Y_proba is kept
# even though it may hold margins in that case.
if hasattr(best_model, 'predict_proba'):
    Y_proba = best_model.predict_proba(X_test)[:, 1]
else:
    Y_proba = best_model.decision_function(X_test)

fpr, tpr, thresholds = roc_curve(Y_test, Y_proba)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line: expected curve of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('Wskaźnik fałszywie pozytywnych')
plt.ylabel('Wskaźnik prawdziwie pozytywnych')
plt.title('Krzywa ROC dla najlepszego modelu')
plt.legend(loc="lower right")
plt.show()