In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_selection import RFECV
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, PrecisionRecallDisplay, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.pipeline import make_pipeline
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import matplotlib.pyplot as plt

# Plotly figures are sent to the "browser" renderer instead of the default one,
# so fig.show() opens them outside of the notebook output area.
pio.renderers.default = "browser"

In [None]:
# Prefix prepended to every data/ and models/ path used in this notebook.
fs_prefix = "./"


In [None]:
# Scaler for the model inputs; it is fitted on df_old in a later cell.
scaler = MinMaxScaler()

# df     - prepared report data that the saved models are evaluated on.
# df_old - second data set, used below to fit the scaler.
df = pd.read_csv(f'{fs_prefix}data/last_report/prep_data_target_7.csv')
df_old = pd.read_csv(f'{fs_prefix}data/existing_model_training.csv')

# Index both frames by their 'datetime' values. Passing the Series (rather than
# the column name) keeps 'datetime' available as a regular column as well.
df = df.set_index(df['datetime'])
df_old = df_old.set_index(df_old['datetime'])

# Normalise the temperature column label of df (removes the space after "окр.").
df = df.rename(columns={'Температура окр. среды(C)': 'Температура окр.среды(C)'})

In [None]:
# Model input features, in the order in which they are handed to the scaler.
# Candidate features that are currently switched off stay listed as comments.
required_columns = [
    'Вес на крюке(тс)',
    'Положение крюкоблока(м)',
    'Момент на СВП(кН*м)',
    'Обороты СВП(об/мин)',
    'Расход на входе(л/с)',
    # 'Температура окр.среды(C)',
    # 'Глубина инструмента(м)',
    # 'Нагрузка на долото(тс)',
    # 'Наработка каната(т*км)',
    # 'Давление в манифольде(МПа)'
]

# Display name -> scoring function. The key order is also the column order of
# the results table built from metrics.keys().
metrics = dict(
    [
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
        ('ROC-AUC', roc_auc_score),
    ]
)

In [None]:
# Old -> new column labels: the 'target' column is exposed as 'binary_target'.
column_labels_index = {'target': 'binary_target'}

df = df.rename(columns=column_labels_index)

# Ground-truth labels; the double brackets give a 2-D (n_rows, 1) array.
y_test = df[['binary_target']].to_numpy()

In [None]:
# Fit the scaler on df_old only and apply it to the evaluation frame, so df is
# scaled with statistics that were not derived from df itself.
scaler.fit(df_old[required_columns])
scaled_features = scaler.transform(df[required_columns])

# Class balance of the evaluation labels. Counting the column as a Series gives
# a flat index of class labels; DataFrame.value_counts() would return a
# MultiIndex-keyed Series, where scalar lookups depend on partial indexing.
class_counts = df['binary_target'].value_counts()

# Ratio of class-0 rows to class-1 rows (explicit label lookups via .loc).
# In the cells shown here it is only referenced by the commented-out XGBoost entry.
scale_pos_weight = class_counts.loc[0] / class_counts.loc[1]


In [None]:
# Suffix shared by the pickled model/selector file names and the report file name.
version = '_6_13'
filename = f"Отчет_{version}.txt"

# Model name -> estimator instance. The keys are the names used to locate the
# pickled artefacts; disabled candidates are kept as comments.
models = {
    # 'Decision Tree': DecisionTreeClassifier(random_state=42, class_weight="balanced"),
    # 'SVM': SVC(class_weight="balanced",  random_state=42, probability=True),
    'RandomForest': RandomForestClassifier(
        class_weight="balanced_subsample",
        random_state=42,
        n_jobs=-1,
    ),
    # 'LightGBM': LGBMClassifier(class_weight="balanced", reg_lambda = 0.5, objective='binary', random_state=42, n_jobs = -1),
    # 'XGboost' : xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, reg_lambda = 0.5, objective='binary:logistic', random_state=42, n_jobs = -1),
    'CatBoost': CatBoostClassifier(
        random_state=42,
        silent=True,
        iterations=500,
        loss_function='Logloss',
        eval_metric='Recall',
        early_stopping_rounds=20,
    ),
    # 'HistGB' : HistGradientBoostingClassifier(n_iter_no_change=3, scoring='roc_auc',class_weight='balanced', random_state=42)
}

In [None]:
from sklearn.metrics import confusion_matrix


def confm(y_true, y_pred, title_name=None):
    """Plot the confusion matrix of ``y_pred`` against ``y_true``.

    The x axis shows the actual classes and the y axis the predicted classes.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, row-aligned with ``y_true``.
    title_name : str, optional
        Name shown in the plot title. When omitted, the notebook-level
        ``model_name`` variable is used if it exists (the behaviour existing
        callers rely on), otherwise the name part of the title is left empty.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    if title_name is None:
        title_name = globals().get('model_name', '')

    # Use the labels that were passed in rather than the global y_test.
    conf_matrix = confusion_matrix(y_true, y_pred)

    # confusion_matrix puts true classes on rows and predictions on columns.
    # The transpose puts actual classes on the x axis, as the labels below say.
    shown = conf_matrix.T

    fig, ax = plt.subplots(figsize=(7.5, 7.5))
    ax.matshow(shown, cmap=plt.cm.YlOrRd, alpha=0.5)
    # Image row -> y coordinate, image column -> x coordinate, so each number
    # sits on the cell whose colour encodes the same count.
    for row in range(shown.shape[0]):
        for col in range(shown.shape[1]):
            ax.text(x=col, y=row, s=shown[row, col], va="center", ha="center", size="xx-large")

    ax.set_xlabel("Actuals", fontsize=16)
    ax.set_ylabel("Predictions", fontsize=16)
    ax.set_title(f"Confusion Matrix | {title_name}", fontsize=15)
    plt.show()

# Модели отдельно

In [None]:
data_total = dict()     # model name -> per-row predictions and class probabilities
metrics_total = dict()  # model name -> unrounded metric values
test_df = pd.DataFrame(index=models.keys(), columns=metrics.keys())

# The scaled feature frame is identical for every model, so it is built once.
# Spaces in the column labels are replaced with underscores.
X_scaled = pd.DataFrame(data=scaled_features, columns=df[required_columns].columns)
X_scaled.columns = [s.replace(" ", "_") for s in X_scaled.columns.tolist()]

for model_name in models:
    # NOTE: pickle.load can execute arbitrary code; only load trusted artefacts.
    with open(fs_prefix + f'models/RFECV_{model_name}{version}.pkl', 'rb') as f:
        print(f'Opening RFECV_{model_name}{version}.pkl')
        selector = pickle.load(f)
        print(type(selector))

    # Keep only the columns flagged by the selector's boolean support_ mask.
    selected_features = X_scaled.columns[selector.support_]
    X_test = X_scaled[selected_features]

    with open(fs_prefix + f'models/{model_name}{version}.pkl', 'rb') as f:
        print(f'Opening {model_name}{version}.pkl')
        model = pickle.load(f)

    # predict_proba returns class probabilities (not log-probabilities); the
    # 'log_probs_*' keys are kept because later cells read them.
    log_probs = model.predict_proba(X_test)
    classes = np.asarray(model.classes_)
    print(classes[1])
    data = pd.DataFrame({
        'y_pred': model.predict(X_test),
        'log_probs_0': log_probs[:, 0],
        'log_probs_1': log_probs[:, 1],
        # argmax yields a column position; map it through classes_ so the
        # stored value is an actual class label.
        'class_pred': classes[np.argmax(log_probs, axis=1)],
    })
    data = data.set_index(df['datetime'])
    data_total[model_name] = data

    results_test = {}
    for metric_name, metric_func in metrics.items():
        if metric_name == 'ROC-AUC':
            # Scored on the probability of the second class (classes_[1]).
            roc_auc = metric_func(y_test, data['log_probs_1'])
            results_test[metric_name] = roc_auc
            RocCurveDisplay.from_estimator(model, X_test, y_test)
        elif metric_name == 'Accuracy':
            results_test[metric_name] = metric_func(y_test, data['y_pred'])
        else:
            results_test[metric_name] = metric_func(y_test, data['class_pred'], average='weighted')
    confm(y_test, data['y_pred'])

    # Round while storing: test_df has object-dtype columns, which
    # DataFrame.round() leaves untouched, so rounding the frame did nothing.
    test_df.loc[model_name, :] = pd.Series(results_test).round(4)
    print(test_df.loc[model_name, :])
    # Explicit encoding: the report text is Cyrillic and must not depend on the
    # platform default encoding.
    with open(filename, 'a', encoding='utf-8') as file:
        file.write(f"Метрики на тесте по 11 бригаде: \n{test_df.loc[model_name, :]}\n")

    metrics_total[model_name] = results_test

# Rank the models once, after every row has been filled in.
test_df = test_df.astype(float).sort_values(by='ROC-AUC', ascending=False)

# Комбинированные предикты

In [None]:
# Stacked ensemble of the saved models. Each base estimator is a pipeline of
# its fitted feature selector followed by the fitted model (this is what the
# original TODO about make_pipeline() asked for).

from sklearn.linear_model import LogisticRegression


def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards.

    NOTE: pickle can execute arbitrary code while loading; only use trusted files.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


df_old.rename(columns=column_labels_index, inplace=True)
df_old.columns = [s.replace(" ", "_") for s in df_old.columns.tolist()]

X_test = pd.DataFrame(data=scaled_features, columns=df[required_columns].columns)
X_test.columns = [s.replace(" ", "_") for s in X_test.columns.tolist()]

# Underscore variants of the feature names (df_old was relabelled just above).
feature_columns = [s.replace(" ", "_") for s in required_columns]

# The final estimator must be fitted on features prepared exactly like X_test,
# i.e. passed through the same scaler; the raw df_old values are on a different
# scale than the data the ensemble is evaluated on. The scaler is fed the
# original column labels so the frame matches the labels it was fitted with.
X_train = pd.DataFrame(
    data=scaler.transform(
        pd.DataFrame(df_old[feature_columns].to_numpy(), columns=required_columns)
    ),
    columns=feature_columns,
    index=df_old.index,
)

clf = StackingClassifier(
    estimators=[
        (
            alias,
            make_pipeline(
                _load_pickle(fs_prefix + f'models/RFECV_{name}{version}.pkl'),
                _load_pickle(fs_prefix + f'models/{name}{version}.pkl'),
            ),
        )
        for alias, name in (('rf', 'RandomForest'), ('cb', 'CatBoost'))
    ],
    final_estimator=LogisticRegression(),
    # 'prefit': the base estimators are used as loaded; only the final
    # estimator is fitted here.
    # NOTE(review): if df_old is the data the base models were trained on, the
    # final estimator is fitted on over-optimistic base predictions - confirm.
    cv='prefit'
)

combined_model = clf.fit(X_train, df_old['binary_target'])

In [None]:
# Evaluate the stacked model on the scaled evaluation features.
# predict_proba returns class probabilities (not log-probabilities); the
# 'log_probs_*' keys are kept for consistency with the per-model frames.
log_probs = combined_model.predict_proba(X_test)
classes = np.asarray(combined_model.classes_)
print(classes[1])
data_combined = pd.DataFrame({
    'y_pred': combined_model.predict(X_test),
    'log_probs_0': log_probs[:, 0],
    'log_probs_1': log_probs[:, 1],
    # argmax yields a column position; map it through classes_ so the stored
    # value is an actual class label.
    'class_pred': classes[np.argmax(log_probs, axis=1)],
})

results_test = {}
for metric_name, metric_func in metrics.items():
    if metric_name == 'ROC-AUC':
        # Scored on the probability of the second class (classes_[1]).
        roc_auc = metric_func(y_test, data_combined['log_probs_1'])
        results_test[metric_name] = roc_auc
        RocCurveDisplay.from_estimator(combined_model, X_test, y_test)
    elif metric_name == 'Accuracy':
        results_test[metric_name] = metric_func(y_test, data_combined['y_pred'])
    else:
        results_test[metric_name] = metric_func(y_test, data_combined['class_pred'], average='weighted')
print(results_test)

# The confusion-matrix title is taken from the notebook-level model_name.
# Without this assignment the plot would still carry the name of the last
# model handled by the per-model loop.
model_name = 'Combined'
confm(y_test, data_combined['y_pred'])

In [None]:
# Averaged ensemble over every model stored in data_total (the loop-based form
# the original TODO asked for). Assumes binary 0/1 class labels, as averaging
# the predicted labels only makes sense for those.


def _stack(column):
    """Return a (n_models, n_rows) array with ``column`` of every frame in data_total."""
    return np.array([frame[column] for frame in data_total.values()])


log_probs_0_array = _stack('log_probs_0').mean(axis=0)
log_probs_1_array = _stack('log_probs_1').mean(axis=0)

# Class favoured by the averaged probabilities; used only to break voting ties.
soft_vote = (log_probs_1_array > log_probs_0_array).astype(float)


def _majority(column):
    """Majority vote over ``column``; exact ties fall back to the soft vote.

    np.round rounds halves to the nearest even value, so a 0.5 vote share
    would otherwise always become class 0 whenever the models disagree.
    """
    share = _stack(column).mean(axis=0)
    return np.where(np.isclose(share, 0.5), soft_vote, np.round(share))


y_pred_array = _majority('y_pred')
class_pred_array = _majority('class_pred')

data_mean = pd.DataFrame({
    'y_pred': y_pred_array,
    'log_probs_0': log_probs_0_array,
    'log_probs_1': log_probs_1_array,
    'class_pred': class_pred_array
})

results_test = {}
for metric_name, metric_func in metrics.items():
    if metric_name == 'ROC-AUC':
        # Scored on the averaged probability of the second class.
        roc_auc = metric_func(y_test, log_probs_1_array)
        results_test[metric_name] = roc_auc
        RocCurveDisplay.from_predictions(y_test, log_probs_1_array)
    elif metric_name == 'Accuracy':
        results_test[metric_name] = metric_func(y_test, y_pred_array)
    else:
        results_test[metric_name] = metric_func(y_test, class_pred_array, average='weighted')
print(results_test)

# The confusion-matrix title is taken from the notebook-level model_name.
# Without this assignment the plot would carry the name of a previously
# evaluated model.
model_name = 'Mean'
confm(y_test, y_pred_array)
            

In [None]:
from plotly.subplots import make_subplots

# Row 1: input factors; then one row per individual model; then the stacked
# ("Combined") and the averaged ("Mean") ensembles.
fig = make_subplots(
    rows=len(data_total)+3,
    cols=1,
    subplot_titles=['Факторы'] + list(data_total.keys()) + ['Combined', 'Mean'],
    shared_xaxes=True
    )

for coln in required_columns:
    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df[coln],
            mode='lines',
            name=coln
        ),
        col=1,
        row=1
    )

# Same order as the subplot titles above.
prediction_frames = list(data_total.items()) + [('Combined', data_combined), ('Mean', data_mean)]

for i, (data_name, data_i) in enumerate(prediction_frames, start=2):
    fig.add_trace(
        go.Scatter(
            # Every prediction frame has one row per row of df, so df.index is
            # used as the common x axis. The x axes are shared, and the
            # ensemble frames may not carry the datetime index themselves.
            x=df.index,
            y=data_i['y_pred'],
            mode='lines',
            # Model-specific name so the legend entries can be told apart.
            name=f'{data_name} y_pred'
        ),
        col=1,
        row=i
    )

fig.show()