# Import

In [None]:
import os
import json
import pandas as pd
import numpy as np

# explainability
import shap

# plotting
from statsmodels.graphics.gofplots import ProbPlot
import seaborn as sns
import matplotlib.pyplot as plt

# Init

In [None]:
# Show up to 50 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 50)

# Seaborn look for matplotlib figures. matplotlib 3.6 renamed the bundled
# seaborn styles to 'seaborn-v0_8*' and later releases dropped the old
# 'seaborn' alias, so try the current name first and fall back to the old
# one on installs that do not know the new name yet.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')

# Silence every warning for the rest of the session. This is a deliberate
# blanket filter: it keeps the report output clean, at the cost of also
# hiding deprecation notices from the plotting libraries.
import warnings
warnings.filterwarnings('ignore')

# Inject CSS that sets elements with the `.container` class to 80% width
# (notebook page width). `display` and `HTML` come from the public
# IPython.display module; importing `display` from IPython.core.display
# is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

## additional evaluation funcs

In [None]:
def eval_func(**kwrgs):
    """Placeholder for an additional evaluation metric.

    Accepts arbitrary keyword arguments, ignores all of them, and always
    returns a score of 0.
    """
    return 0

# Load data

In [None]:
# load data


In [None]:
# load results


# Validation

## Scores and Parameters

## predictions

In [None]:
# Side-by-side distributions of the true values and of the predictions.
fig, axes = plt.subplots(1, 2, figsize=(20, 6))

# NOTE: sns.distplot is deprecated since seaborn 0.11 (histplot / displot are
# its replacements); it is kept so the cell still runs on older seaborn.

# left panel: true values
sns.distplot(true_vals, ax=axes[0])
axes[0].set_title('true_vals histogram')

# right panel: predictions (title typo 'predictios' fixed)
sns.distplot(predictions, ax=axes[1])
axes[1].set_title('predictions histogram')

# show plot
plt.show()

### Confusion Matrix

In [None]:
# class labels: passed to confusion_matrix and reused as the row/column
# labels of the confusion-matrix heatmaps below
labels = [0, 1]

#### Counts

In [None]:
# Confusion matrix with raw counts.
# NOTE(review): confusion_matrix is not imported in the visible cells -
# presumably sklearn.metrics.confusion_matrix; confirm.
cm = confusion_matrix(
    experiment_data.has_applicants,
    experiment_data.prediction,
    labels=labels,
)
df_cm = pd.DataFrame(cm, index=labels, columns=labels)

# heatmap annotated with the integer counts ("d" format)
ax = sns.heatmap(df_cm, annot=True, fmt="d")
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')

#### Normalized

In [None]:
# Row-normalised confusion matrix: every row of `cm` is divided by its own
# total, so a row with at least one sample sums to 1.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = cm.astype('float') / row_totals
df_cm_normalized = pd.DataFrame(cm_normalized, index=labels, columns=labels)

# heatmap annotated with the normalised values
ax = sns.heatmap(df_cm_normalized, annot=True)
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')

In [None]:
# Post a message through slack_bot saying the confusion-matrix analysis is done.
# NOTE(review): slack_bot is not defined in the visible cells - confirm where
# it is created before running this cell.
slack_bot.post_msg('Model Validation - {} - confusion matrix analysis was done'.format('logitstic_classifier_xgboost'))

### Precision-Recall Curve

In [None]:
# calculate precision-recall curve
# NOTE(review): precision_recall_curve is not imported in the visible cells -
# presumably sklearn.metrics.precision_recall_curve; confirm.
precision, recall, thresholds = precision_recall_curve(true_vals, prediction_probs)

# No-skill baseline: a classifier that cannot discriminate has a precision
# equal to the share of positive samples, which is 0.5 only for perfectly
# balanced data. Assumes 1 is the positive label, matching labels = [0, 1].
no_skill = np.mean(np.asarray(true_vals) == 1)
plt.plot([0, 1], [no_skill, no_skill], linestyle='--')

# plot the precision-recall curve for the model
plt.plot(recall, precision, marker='.')
plt.xlabel('recall')
plt.ylabel('precision')

# show the plot
plt.show()

### ROC Curve

In [None]:
# ROC curve of the predicted probabilities against the true values.
# NOTE(review): roc_curve is not imported in the visible cells - presumably
# sklearn.metrics.roc_curve; confirm.
fpr, tpr, thresholds = roc_curve(true_vals, prediction_probs)

ax = plt.gca()

# dashed diagonal: no-skill reference
ax.plot([0, 1], [0, 1], linestyle='--')

# model curve, one marker per (fpr, tpr) point
ax.plot(fpr, tpr, marker='.')
ax.set_xlabel('fpr')
ax.set_ylabel('tpr')

# show the plot
plt.show()

## Residuals plot

In [None]:
# Distribution of the residuals on a single wide axes.
fig, axes = plt.subplots(1, 1, figsize=(20, 6))

# Plot the residuals themselves: the original cell passed `true_vals` here,
# which contradicts the plot title and the section it belongs to.
# NOTE: sns.distplot is deprecated since seaborn 0.11; kept for older installs.
sns.distplot(residuals, ax=axes)

axes.set_title('residuals histogram')

# show plot
plt.show()

In [None]:
# summary statistics of the absolute residuals
residuals.abs().describe()

### Residuals vs Fitted

In [None]:
plt.figure(figsize=(20, 8))

# x / y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the former positional call raises a TypeError there.
# Keywords work on older releases too and match the other cells.
# NOTE(review): residplot regresses y on x and plots the residuals of that
# fit, which is not necessarily the model's own `residuals` - confirm this is
# the intended diagnostic.
plot_lm = sns.residplot(x=predictions,
                        y=true_vals,
                        lowess=True,
                        scatter_kws={'alpha': 0.5},
                        line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})

# fixed y-range for the residual axis
plot_lm.set_ylim(-0.3, 0.3)

plot_lm.set_title('Residuals vs Fitted')
plot_lm.set_xlabel('Fitted values')
plot_lm.set_ylabel('Residuals')

# annotations
# NOTE(review): `vals` is not defined in the visible cells. The loop below
# reads its index and the columns 'abs_resid', 'prediction' and 'residuals',
# so it is presumably a DataFrame already sorted by absolute residual,
# largest first - confirm.
abs_resid = vals
abs_resid_top_3 = abs_resid[:3]

# label the first three rows with their absolute residual
for i in abs_resid_top_3.index:
    plot_lm.annotate(abs_resid_top_3.loc[i, 'abs_resid'],
                     xy=(abs_resid_top_3.prediction[i], abs_resid_top_3.residuals[i]))

### Residuals vs Prediction Distributions

In [None]:
# Joint and marginal distributions of predictions vs residuals.
# jointplot is figure-level: it builds its own (square) figure, so a preceding
# plt.figure(figsize=...) only leaves an empty extra figure and its size is
# never applied. The grid is sized through `height` instead.
sns.jointplot(x=predictions, y=residuals, height=8)

### Residuals vs Target Distributions

In [None]:
# Joint and marginal distributions of true values vs residuals.
# jointplot is figure-level: it builds its own (square) figure, so a preceding
# plt.figure(figsize=...) only leaves an empty extra figure and its size is
# never applied. The grid is sized through `height` instead.
sns.jointplot(x=true_vals, y=residuals, height=8)

### Residuals QQ plot

In [None]:
# standardise the residuals: subtract their mean, divide by their std
model_norm_residuals = (residuals - residuals.mean()) / residuals.std()

# Q-Q plot of the standardised residuals with a 45-degree reference line
QQ = ProbPlot(model_norm_residuals)
plot_lm_2 = QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', lw=1)

# figure size: 12 wide, 8 high
plot_lm_2.set_size_inches(12, 8)

qq_ax = plot_lm_2.axes[0]
qq_ax.set_title('Normal Q-Q')
qq_ax.set_xlabel('Theoretical Quantiles')
qq_ax.set_ylabel('Standardized Residuals')

# annotations
# argsort of the absolute standardised residuals, reversed so the largest come
# first; the first three entries are kept (not used further in this cell)
abs_norm_resid = np.flip(np.argsort(np.abs(model_norm_residuals)), 0)
abs_norm_resid_top_3 = abs_norm_resid[:3]

### Outliers

### Features Residuals

In [None]:
# box plot of the residuals, one box per value of `low_card_discrete`
# NOTE(review): low_card_discrete is not defined in the visible cells -
# presumably a low-cardinality discrete feature; confirm
sns.boxplot(x=low_card_discrete, y=residuals)

In [None]:
# point plot of `abs_resid` for each value of `ordinal_discrete`
# NOTE(review): ordinal_discrete is not defined in the visible cells, and
# abs_resid is assigned from `vals` in an earlier cell where it is indexed like
# a DataFrame - confirm it is a 1-D vector of absolute residuals here
plot = sns.catplot(x=ordinal_discrete, y=abs_resid, kind="point", height=5, aspect=2)

In [None]:
# Scatter of a continuous value against the residuals with a fitted
# regression line, drawn on the current axes of a new 20x8 figure.
# NOTE(review): continous_val is not defined in the visible cells - confirm.
plt.figure(figsize=(20, 8))
sns.regplot(x=continous_val, y=residuals, ax=plt.gca())

In [None]:
# Two continuous features against each other, each point coloured by its
# residual; the colour bar maps colours back to residual values.
# NOTE(review): continous_feature1 / continous_feature2 are not defined in the
# visible cells - confirm.
plt.figure(figsize=(24, 10))
s = plt.scatter(
    x=continous_feature1,
    y=continous_feature2,
    c=residuals,
    cmap='brg_r',
    s=10,
    alpha=0.9,
)

plt.colorbar(s)
plt.show()

### Residuals time series

In [None]:
# Line plot of `abs_resid` against `timestamp_val` on one wide, short axes.
# NOTE(review): timestamp_val is not defined in the visible cells - confirm.
fig, axes = plt.subplots(figsize=(20, 4))

sns.lineplot(
    x=timestamp_val,
    y=abs_resid,
    ax=axes,
)

# show plot
plt.show()

## CV Results

## Importance

In [None]:
# Feature-importance table, largest importance first, with a fresh 0..n-1
# index; the bare name on the last line renders the table as cell output.
# NOTE(review): feature_names / features_importances are not defined in the
# visible cells - confirm.
importance_df = (
    pd.DataFrame({'features': feature_names, 'importances': features_importances})
    .sort_values('importances', ascending=False)
    .reset_index(drop=True)
)
importance_df

## Shap

In [None]:
# presumably loads the JavaScript that SHAP's interactive plots need into the
# notebook - confirm against the shap documentation
shap.initjs()


### Force plot

### summarized features effect