# Imbalanced binary classification

## Data Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Allow up to 30 columns in rendered frames so the preview is not truncated.
pd.set_option('display.max_columns', 30)

# Load the training workbook (.xlsb, read through pyxlsb) with the ID column as the index.
df = pd.read_excel('data/Training.xlsb', engine='pyxlsb', index_col='ID')
df.head()

Splitting features into categorical and numeric

In [None]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric.
# The last column is skipped (presumably the target -- TODO confirm it is always last).
num_features, cat_features = [], []
for col in df.columns[:-1]:
    (cat_features if df[col].dtype == 'object' else num_features).append(col)

# Cell output: frame shape plus both feature lists.
df.shape, num_features, cat_features

In [None]:
# Summary statistics of the raw (untransformed) data.
df.describe()

Let's take a look at how well a power transformation scales our data. The result is saved to `scaled` for inspection; the same transformation is later included in the pipeline.

In [None]:
from sklearn.preprocessing import power_transform

# Power-transform the numeric features for exploration.
# power_transform returns a bare array, so the column names AND the original
# ID index are passed back explicitly; without `index=df.index` the frame
# would get a fresh 0..n-1 index and no longer align with `df` by label.
scaled = pd.DataFrame(
    power_transform(df[num_features]),
    columns=num_features,
    index=df.index,
)

In [None]:
# Summary statistics after the power transformation, for comparison with df.describe().
scaled.describe()

The histogram and correlogram below show that the power transformation did a great job of scaling the data

In [None]:
# Distribution of the transformed `L` column (trailing `;` hides the return value in the notebook).
plt.hist(scaled['L']);

In [None]:
# Pairwise scatter/histogram grid over the transformed numeric features,
# leaving out the last entry of `num_features`.
sns.pairplot(scaled[num_features[:-1]]);

The scatter plot below shows how imbalanced the classes are

In [None]:
# Use the first and third transformed columns as the two plot axes.
a, b = scaled.columns[0], scaled.columns[2]

# Colour every point by its MARKER value to visualise the class balance.
plt.scatter(x=scaled[a], y=scaled[b], c=df['MARKER']);

I found out that the features below (`N`, `P`, `V`) are well suited to ordinal encoding

In [None]:
# Columns that will be ordinal-encoded instead of one-hot encoded.
cat_to_ord_encode = ['N', 'P', 'V']

for cat in cat_to_ord_encode:
    # Show the value distribution of the column.
    print(cat)
    print(df[cat].value_counts(), '\n')
    # Take the column out of the one-hot list; a missing entry is not an
    # error, which keeps the cell safe to re-run.
    try:
        cat_features.remove(cat)
    except ValueError:
        pass

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the features.
X, y = df.drop(columns='MARKER'), df['MARKER']

# Hold out 33% of the rows for evaluation. `stratify=y` keeps the class
# ratio the same in both parts; the fixed `random_state` makes the split
# (and therefore every score reported below) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

## Logistic Regression

### Pipeline construction

Since the classes are very imbalanced, accuracy is not a representative score. That's why we'll focus on the sensitivity score and `classification_report_imbalanced`.

In [None]:
from imblearn.metrics import specificity_score
from imblearn.metrics import sensitivity_score
from imblearn.metrics import classification_report_imbalanced

def scoring(y_test, y_pred, display_report=False):
    """
    Print the specificity and sensitivity of a set of predictions and,
    on request, the imbalanced classification report as well.
    ---
    params:
        y_test - iterable, one-dimensional array, actual values.
        y_pred - iterable, one-dimensional array, predicted values.
        display_report - whether to also print the classification report
    """
    specificity = specificity_score(y_test, y_pred)
    print(f"Specificity: {specificity}")

    sensitivity = sensitivity_score(y_test, y_pred)
    print(f"Sensitivity: {sensitivity}")

    # Nothing more to show unless the full report was requested.
    if not display_report:
        return
    print(classification_report_imbalanced(y_test, y_pred))

To balance the classes I will perform over-sampling using the Adaptive Synthetic Sampling (ADASYN) approach. I have chosen over-sampling rather than under-sampling in order to preserve dependencies in the data that might be useful for the learning process.

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN


# Column-wise preprocessing shared by all pipelines in this notebook.
# Both encoders tolerate categories that were not present when the
# transformer was fitted (e.g. a rare value that ends up only in the test
# split or in a CV validation fold); with the defaults such a value makes
# `transform`/`predict` raise.
preprocessing = make_column_transformer(
    # Columns chosen for ordinal encoding; unseen categories are coded as -1.
    (
        OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
        cat_to_ord_encode,
    ),
    # Remaining categorical columns; unseen categories become an all-zero row.
    (OneHotEncoder(handle_unknown='ignore'), cat_features),
    # Numeric columns get the power transformation explored above.
    (PowerTransformer(), num_features)
)

# Preprocess -> over-sample with ADASYN -> logistic regression.
# The imblearn pipeline applies the sampler during fit only.
logreg_pip = make_pipeline(
    preprocessing,
    ADASYN(n_jobs=-1),
    LogisticRegression(n_jobs=-1, max_iter=5000)
)

In [None]:
# Fit the logistic-regression pipeline on the training split,
# then predict labels for the held-out test split.
logreg_pip.fit(X_train, y_train)
y_pred_logreg = logreg_pip.predict(X_test)

### Interpreting results

In [None]:
# Specificity, sensitivity and the full report for the logistic-regression predictions.
scoring(y_test, y_pred_logreg, display_report=True)

## Boosting algorithms

### Pipeline construction

In [None]:
from imblearn.ensemble import EasyEnsembleClassifier, RUSBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBRFClassifier

# Hyper-parameter grids. Keys use the `<step name>__<parameter>` form, where
# make_pipeline names every step after its lower-cased class name.
rus_params = {
    'rusboostclassifier__n_estimators': [50, 100, 200],
    'rusboostclassifier__sampling_strategy': [0.1, 0.3, 0.5, 'auto'],
    'rusboostclassifier__learning_rate': [0.7, 1, 1.3]
}

xgb_params = {
    'xgbrfclassifier__n_estimators': [50, 100, 200],
    'xgbrfclassifier__learning_rate': [0.7, 1, 1.3],
    'xgbrfclassifier__reg_lambda': [1, 0.7, 0.5, 0.25, 0.1]
}

# RUSBoost balances the data itself by random under-sampling, so it is fed
# the original imbalanced training data: no ADASYN step here. Over-sampling
# first would leave the classes roughly balanced, and the float
# `sampling_strategy` ratios in the grid (all below 1) could then not be
# reached by under-sampling.
rus_pip = make_pipeline(
    preprocessing,
    RUSBoostClassifier()
)

# XGBoost random forest: preprocess -> over-sample with ADASYN -> classifier.
xgb_pip = make_pipeline(
    preprocessing,
    ADASYN(n_jobs=-1),
    XGBRFClassifier(
        n_jobs=-1, predictor='cpu_predictor',
        use_label_encoder=False, verbosity=1
    )
)

# Select hyper-parameters by balanced accuracy (the mean of sensitivity and
# specificity) instead of GridSearchCV's default plain accuracy, which is
# not representative on heavily imbalanced classes.
rus_grid = GridSearchCV(
    rus_pip, param_grid=rus_params, scoring='balanced_accuracy', n_jobs=-1
)
xgb_grid = GridSearchCV(
    xgb_pip, param_grid=xgb_params, scoring='balanced_accuracy', n_jobs=-1
)

In [None]:
import warnings

# Run the RUSBoost grid search with every warning silenced for the duration
# of the fit; the previous warning filters are restored when the block exits.
# NOTE(review): the blanket "ignore" presumably also hides warnings about
# parameter combinations whose fit failed -- consider narrowing the filter.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    rus_grid.fit(X_train, y_train);

### Results for RUSBoostClassifier

Since RUSBoost [uses under-sampling](https://imbalanced-learn.org/stable/references/generated/imblearn.ensemble.RUSBoostClassifier.html) to balance data, there is no point in over-sampling

In [None]:
# Predict on the test split with the grid search's refitted estimator,
# print the scores, and show the selected hyper-parameters as the cell output.
y_pred_rus = rus_grid.predict(X_test)
scoring(y_test, y_pred_rus, display_report=True)
rus_grid.best_params_

### Results for XGBRFClassifier

For XGBRFClassifier I used the same preprocessing pipeline as for Logistic Regression.

In [None]:
# Run the grid search for the XGBoost pipeline on the training split.
xgb_grid.fit(X_train, y_train);

In [None]:
# Predict on the test split with the grid search's refitted estimator,
# print the scores, and show the selected hyper-parameters as the cell output.
y_pred_xgb = xgb_grid.predict(X_test)
scoring(y_test, y_pred_xgb, display_report=True)
xgb_grid.best_params_

**Thank you for your time! I would appreciate your feedback and hope for further cooperation.**