Step: Load in packages

In [None]:
%load_ext lab_black

In [None]:
import xgboost as xgb
import numpy as np
from skimpy import clean_columns
import pandas as pd
from pyprojroot import here
import os
import mlflow
import pandas as pd
import sklearn
from skopt import BayesSearchCV
from joblib import dump, load
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, f1_score
from sklearn.metrics import plot_precision_recall_curve
from sklearn.model_selection import train_test_split
import joblib
from analysis.fun_class_positive_predictions import fun_class_positive_predictions
from sklearn.model_selection import RepeatedKFold
from analysis.fun_classifer_xgboost_bayes import fun_classifer_xgboost_bayes
from analysis.fun_imbalanced_threshold_locator import fun_imbalanced_threshold_locator
from analysis.fun_class_accuracy_calcaulator import fun_class_accuracy_calcaulator
from analysis.fun_drop_high_correlation import fun_drop_high_correlation
import lightgbm as lgb
from analysis.fun_imbalanced_threshold import fun_imbalanced_threshold
from analysis.fun_class_predictions_id import fun_class_predictions_id

# Resolve the project's data folder and make it the working directory, so the
# relative parquet file names used below are read from / written to that folder.
path_data = here("./data")
os.chdir(path_data)
# Load the feature table that every later cell works from.
data_trading_analysis = pd.read_parquet("data_trading_good_features.parquet")

Step: Add an id, because you are lazy and want to use your pre-built function

In [None]:
# reset_index() turns the existing index into a regular column that serves as the
# row id; later cells refer to it as "index" (assumes the index is unnamed — confirm).
data_trading_analysis_id = data_trading_analysis.reset_index()

Step: Remove highly correlated features

In [None]:
# Project helper; presumably drops features that are highly correlated with one
# another while leaving the "result" outcome alone — confirm against
# analysis.fun_drop_high_correlation.
data_trading_analysis_low_corr = fun_drop_high_correlation(
    data=data_trading_analysis_id,
    outcome=["result"],
)

# Persist the filtered table; the relative path resolves against the current
# working directory.
data_trading_analysis_low_corr.to_parquet("data_trading_analysis_low_corr.parquet")

Step: Split test and training

In [None]:
# Separate the predictors from the "result" outcome column.
# NOTE(review): only "result" is dropped here, so any id column still present in
# the filtered table stays in x; the helpers below are told about it via
# id_var / id_vars.
x = data_trading_analysis_low_corr.drop(columns=["result"])
y = data_trading_analysis_low_corr["result"]

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

Step: Get the imbalance measurement

In [None]:
from analysis.fun_scale_pos_weight import fun_scale_pos_weight

# Project helper computed from the training labels only; presumably returns the
# negative/positive class ratio used as a class-imbalance weight — confirm
# against analysis.fun_scale_pos_weight.
scale_pos_weight = fun_scale_pos_weight(y_train=y_train, outcome="result")
# Bare expression: displays the value as the notebook cell output.
scale_pos_weight

Step: Run ML 

In [None]:
# Fit the classifier on the training split through the project helper.
# n_splits / n_repeats are presumably the repeated k-fold cross-validation
# settings and id_var the id column to exclude from the features — confirm
# against analysis.fun_classifer_xgboost_bayes.
results_xgb = fun_classifer_xgboost_bayes(
    scale_pos_weight=scale_pos_weight,
    x_train=x_train,
    y_train=y_train,
    n_splits=3,
    n_repeats=3,
    id_var="index",
)

# Switch the working directory to the project's outputs folder and save the
# fitted result there with joblib.
path_outputs = here("./outputs")
os.chdir(path_outputs)
joblib.dump(results_xgb, "results_xgb.jlib")

Step: Get predictions for all data, the training data, and the test data

In [None]:
# Score three data sets with the fitted model via the project helper, which
# presumably returns the predicted positive-class probability per row and
# ignores the columns listed in id_vars — confirm against
# analysis.fun_class_positive_predictions.

# Full table: it still carries the outcome, so "result" is listed in id_vars
# alongside the id column.
data_prob_all = fun_class_positive_predictions(
    model=results_xgb,
    x_train_or_test=data_trading_analysis_low_corr,
    id_vars=["index", "result"],
)

# Training split (used below to choose the classification threshold).
data_prob_train = fun_class_positive_predictions(
    model=results_xgb,
    x_train_or_test=x_train,
    id_vars=["index"],
)

# Held-out test split.
data_prob_test = fun_class_positive_predictions(
    model=results_xgb,
    x_train_or_test=x_test,
    id_vars=["index"],
)

Step: Get threshold for imbalanced data

In [None]:
from sklearn import metrics

In [None]:
def fun_imbalanced_threshold_balanced_accuracy(y_train, y_train_probability):
    """Return the probability cut-off that maximizes balanced accuracy.

    The ROC operating points are computed from the training labels and the
    training probabilities, so the threshold is chosen without looking at the
    test data. At each candidate threshold, balanced accuracy is the mean of
    sensitivity (TPR) and specificity (1 - FPR).

    Args:
        y_train: True binary labels of the training rows, with 1 as the
            positive class.
        y_train_probability: Predicted positive-class scores for the same
            rows, in the same order.

    Returns:
        The single threshold value, taken from the thresholds reported by
        ``metrics.roc_curve``, with the highest balanced accuracy (the first
        one when several tie).
    """
    fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_probability, pos_label=1)

    # Balanced accuracy averages the true-positive rate with the true-negative
    # rate (1 - FPR). Averaging TPR with FPR itself would reward false
    # positives and always select the lowest threshold.
    balanced_accuracy = (tpr + (1 - fpr)) / 2

    # Locate the index of the largest balanced accuracy.
    ix = np.argmax(balanced_accuracy)
    return thresholds[ix]

In [None]:
# ROC operating points of the model on the training data.
fpr, tpr, thresholds = metrics.roc_curve(y_train, data_prob_train, pos_label=1)
# Balanced accuracy is the mean of sensitivity (TPR) and specificity (1 - FPR);
# averaging TPR with FPR directly would reward false positives.
balanced_accuracy = (tpr + (1 - fpr)) / 2

In [None]:
# Choose the classification threshold from the training labels and training
# probabilities only, so the test data plays no part in the choice.
threshold = fun_imbalanced_threshold_balanced_accuracy(
    y_train=y_train, y_train_probability=data_prob_train
)
# Bare expression: displays the chosen threshold as the cell output.
threshold

Step: Get classification for all data

In [32]:
# Turn the all-data probabilities into class labels keyed by the row id, via the
# project helper.
# NOTE(review): the cut-off is hard-coded to 0.5 here rather than using the
# `threshold` value computed earlier in the notebook — confirm this is intended.
data_trading_classification = fun_class_predictions_id(
    data_id=data_trading_analysis_low_corr["index"],
    outcome_probability=data_prob_all,
    threshold=0.5,
)

Step: Get predicted y for training

In [63]:
def fun_imbalanced_balanced_accuracy_threshold_locator(
    y_train, threshold, y_train_probability=None
):
    """Grid-search the probability cut-off with the best balanced accuracy.

    Candidate cut-offs 0.01, 0.02, ..., 0.99 are each applied to the predicted
    scores (score > cut-off -> class 1), and the chance-adjusted balanced
    accuracy of the resulting labels against ``y_train`` is recorded.

    Args:
        y_train: True binary labels of the training rows.
        threshold: Ignored. Kept only so existing positional callers keep
            working; the candidate grid is always 0.01..0.99 in steps of 0.01.
        y_train_probability: Predicted positive-class scores for the same rows
            as ``y_train``. When omitted, ``y_train`` itself is thresholded,
            which is the historical behaviour of this function; that compares
            the labels with themselves, so callers should pass the predicted
            probabilities.

    Returns:
        pandas.Series: The candidate threshold(s) that reach the maximum
        balanced accuracy, indexed by their position in the candidate grid.
        More than one value is returned when several candidates tie.
    """
    # Threshold the model scores rather than the labels whenever scores are given.
    scores = y_train if y_train_probability is None else y_train_probability
    # Flatten so a single-column frame or column vector behaves like a 1-D array.
    scores = np.asarray(scores).ravel()

    candidates = pd.Series(np.arange(0.01, 1, 0.01))
    balanced_accuracy = [
        balanced_accuracy_score(y_train, np.where(scores > cut, 1, 0), adjusted=True)
        for cut in candidates
    ]

    data_threshold_accuracy = pd.DataFrame(
        {"threshold": candidates, "balanced_accuracy": balanced_accuracy}
    )
    is_best = (
        data_threshold_accuracy["balanced_accuracy"]
        == data_threshold_accuracy["balanced_accuracy"].max()
    )
    return data_threshold_accuracy.loc[is_best, "threshold"]

In [43]:
# Candidate cut-offs 0.01, 0.02, ..., 0.99.
threshold = pd.Series(np.arange(0.01, 1, 0.01))
data_best_accuracy_out = []
# The loop variable is deliberately not called `x`: at notebook scope that name
# already holds the feature matrix, and a loop index named `x` would overwrite it.
for candidate in threshold:
    # Label a row positive when its training probability exceeds the candidate.
    y_threshold = np.where(data_prob_train > candidate, 1, 0)
    results_balanced_accuracy = balanced_accuracy_score(
        y_train, y_threshold, adjusted=True
    )
    data_best_accuracy_out.append(results_balanced_accuracy)
# One row per candidate: the cut-off next to its chance-adjusted balanced accuracy.
threshold_pd = threshold.to_frame(name="threshold")
data_best_accuracy_out_pd = pd.DataFrame({"balanced_accuracy": data_best_accuracy_out})
data_threshold_accuracy = pd.concat([threshold_pd, data_best_accuracy_out_pd], axis=1)

In [58]:
# Rebuild the threshold / balanced-accuracy table from the sweep results and
# display it as the cell output.
threshold_pd = threshold.to_frame(name="threshold")
data_best_accuracy_out_pd = pd.DataFrame({"balanced_accuracy": data_best_accuracy_out})
data_threshold_accuracy = pd.concat(
    [threshold_pd, data_best_accuracy_out_pd], axis="columns"
)
data_threshold_accuracy

Unnamed: 0,threshold,balanced_accuracy
0,0.01,0.000000
1,0.02,0.002094
2,0.03,0.009424
3,0.04,0.015707
4,0.05,0.034555
...,...,...
94,0.95,0.134927
95,0.96,0.094024
96,0.97,0.054980
97,0.98,0.026826


In [62]:
# Keep the threshold(s) of the row(s) whose balanced accuracy equals the column
# maximum, then display them as the cell output.
data_threshold_value = data_threshold_accuracy.loc[
    data_threshold_accuracy["balanced_accuracy"].eq(
        data_threshold_accuracy["balanced_accuracy"].max()
    ),
    "threshold",
]
data_threshold_value

49    0.5
Name: threshold, dtype: float64

In [None]:
# Project helper; presumably picks a threshold from the training labels and
# probabilities and applies it to the test probabilities to produce predicted
# classes — confirm the return value against
# analysis.fun_imbalanced_threshold_locator.
y_predicted = fun_imbalanced_threshold_locator(
    y_train=y_train,
    y_train_probability=data_prob_train,
    y_test_probability=data_prob_test,
)

Step: Get accuracy

In [36]:
# Compare the predicted classes with the held-out test labels via the project helper.
# NOTE(review): the recorded output of this cell is a ValueError reporting
# inconsistent sample counts [1180, 4720], i.e. y_predicted did not have the
# same number of rows as y_test when it ran. Presumably y_predicted held
# training-sized predictions (or stale notebook state) — verify that y_predicted
# is built from the test probabilities before trusting this cell.
results_accuracy = fun_class_accuracy_calcaulator(
    y_test=y_test, y_predicted=y_predicted
)
# Bare expression: displays the accuracy results as the cell output.
results_accuracy

ValueError: Found input variables with inconsistent numbers of samples: [1180, 4720]