In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


# Introduction

This is a follow up to my previous baseline work on [😌 ICR - Naive Approach - XGB, weighted classes](https://www.kaggle.com/code/wamateusz/icr-naive-approach-xgb-weighted-classes)

Most notebooks in this competition:
- make use of the `greeks.csv`
- use ensemble methods
- remove outliers

In this notebook, I want to evaluate the impact of each of the above on model performance.

I also want to elucidate how and why those methods are used.


## Credits
The following notebooks were used as sources for some of the methods:

https://www.kaggle.com/code/gokifujiya/icr-identifying-age-related-conditions-baseline
https://www.kaggle.com/code/vadimkamaev/icr-identify-age



# Import files

In [3]:
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import scipy.stats as stats

from scipy.stats import gaussian_kde, probplot
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.ensemble import VotingClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, brier_score_loss, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, PowerTransformer, FunctionTransformer, Binarizer, OrdinalEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

# 📚 Preparing the dataset

In [4]:
# Load the data: training features, test features, the supplemental Greeks
# metadata and the sample submission file.
train_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
greeks_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')
sample_submission_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')

# Add 'Class' to Greeks
# NOTE(review): `join` aligns on the row index, so this assumes greeks.csv and
# train.csv list their rows in the same order — TODO confirm.
greeks_df = greeks_df.join(train_df.Class)

# What about the Greeks?
## The indirect approach

In the notebook [ICR: The devil is in the Greeks](https://www.kaggle.com/code/sugataghosh/icr-the-devil-is-in-the-greeks), [Sugata Ghosh](https://www.kaggle.com/sugataghosh) discussed the value of `greeks.csv` at length:
- `greeks.csv` supplemental metadata is far more useful in predicting `Class` than data from the actual `train.csv` dataset
- `Alpha` directly determines class (`Alpha = 'A'` => `Class = 0`, `Alpha = 'B'` => `Class = 1`)
    - Therefore, we do not use `Alpha` in our analysis
- Epsilon contains a date

In his model, he suggested training models that use the training features to predict `Beta`, `Gamma` and `Delta` classes, and then using them to predict `Class` (via a model trained on the original `greeks.csv` dataset).

He concluded that models using both the Greeks and the training data have similar performance to a model using just the Greeks, which implies that there is little useful information contained in the *primary* dataset.

I haven't seen this data being used in many high-performing datasets.

## Epsilon
However, a different approach surfaced in a number of other notebooks, eg. in [Vadim Kamenev's](https://www.kaggle.com/vadimkamaev) widely copied [icr-identify-age](https://www.kaggle.com/code/vadimkamaev/icr-identify-age).

Vadim used only `Epsilon`, which is a timestamp datapoint, in the training process.
For the test dataset, he used the maximum training value of Epsilon plus 1 - in other words, he assumed that all test samples were gathered after the training dataset.

This assumption is sound because in the competition's description we are told:

> The date the data for this subject was collected. Note that all of the data in the test set was collected after the training set was collected.

### Diving into Vadim's code

```python
train_pred_and_time = pd.concat((train_df[predictor_columns], times), axis=1)
test_predictors = np.array(test_df[predictor_columns])
test_pred_and_time = np.concatenate((test_predictors, np.zeros((len(test_predictors), 1)) + train_pred_and_time.Epsilon.max() + 1), axis=1)
```

To explain the idiosyncrasies:

- `train_df[predictor_columns]` - is just the training dataset, minus `Class` and `Id` columns
- `test_predictors` - is just the test dataset, minus `Id` column
- `train_pred_and_time` - training dataset plus the epsilon column (converted to ordinals)
- `test_pred_and_time` - test dataset, plus synthetic epsilon

## Rolling mean: Are data points later in time more likely to be of Class 1?

In [5]:
# Mean of `Class` over a trailing 365-day window, ordered by collection date.
rolling_mean_class = (
    greeks_df[["Epsilon", "Class"]]
    # errors="coerce" turns Epsilon values that are not parseable dates into NaT ...
    .assign(Epsilon=pd.to_datetime(greeks_df.Epsilon, errors="coerce"))
    # ... so that those rows are dropped here.
    .dropna()
    # A time-based rolling window needs the date column in increasing order.
    .sort_values(by="Epsilon")
    .rolling(window="365D", on="Epsilon")
    .mean()
)

# Line chart of the rolling class mean against the collection date.
fig = px.line(
    rolling_mean_class,
    x="Epsilon",
    y="Class",
    height=540,
    width=840,
    color_discrete_sequence=["#010D36"],
    symbol_sequence=["x"],
    line_shape="spline",
    markers=True,
    title="Class Trend - Rolling Mean over 365 Days",
)
fig.update_layout(
    title_font_size=18,
)
# Restyle the point markers so they stand out from the line.
fig.update_traces(marker=dict(size=6, color="#FF2079", opacity=0.7))
fig.show()

### Conclusion

Yes, that seems to be the case.

# Meta:
What am I trying to do here?
1. Visualize the rolling mean of class, as a function of time (to determine whether there's a higher likelihood of getting a positive class at a certain point in time).
2. Add Epsilon data to the training dataset (possibly as an ordinal?)
3. Add Epsilon data to the test dataset (as the maximum training Epsilon ordinal, plus 1).
4. Rewrite my preprocessing code from the above as sklearn's preprocessing pipeline.

Points 2. and 3. should be handled by a preprocessing pipeline.

# Data Preprocessing

## Make two datasets

In [6]:
# Make sure values are non-zero.
# If every numeric column of the test frame sums to (approximately) zero, add a tiny
# positive offset (1e-9) to all numeric test values. `test_df` is modified in place.
# presumably this only triggers for an all-zero placeholder test set — TODO confirm
if np.all(np.isclose(test_df.select_dtypes("number").sum(), 0)):
    test_numeric_cols = test_df.select_dtypes("number").columns
    test_df[test_numeric_cols] += 1e-9

In [7]:
def get_times_from_greeks(greeks_df):
    """Convert the `Epsilon` date strings of a Greeks frame to ordinal day numbers.

    Parameters
    ----------
    greeks_df : pandas.DataFrame
        Frame with an `Epsilon` column holding either 'MM/DD/YYYY' strings or the
        literal 'Unknown'.

    Returns
    -------
    pandas.Series
        One value per row: the proleptic Gregorian ordinal of the date, or NaN
        where the date is 'Unknown'.
    """
    def convert_to_ordinal(date_str):
        # Guard clause: missing dates are encoded as NaN.
        if date_str == 'Unknown':
            return np.nan
        parsed = datetime.strptime(date_str, '%m/%d/%Y')
        return parsed.toordinal()

    return greeks_df.Epsilon.map(convert_to_ordinal)

# Ordinal collection dates (NaN where the date is unknown), one per training row.
greeks_times = get_times_from_greeks(greeks_df)

# With Greeks
train_with_greeks_df = train_df.copy()
train_with_greeks_df['Epsilon'] = greeks_times

# Without Greeks
train_no_greeks_df = train_df.copy()

# Test, with Greeks
# Work on a copy: assigning `Epsilon` to an alias of `test_df` would also add the
# column to `test_df` itself and to the "without Greeks" test frame below.
test_with_greeks_df = test_df.copy()
# Every test row gets a synthetic date one day after the latest training date.
test_with_greeks_df['Epsilon'] = train_with_greeks_df.Epsilon.max() + 1

# Test, without Greeks
test_no_greeks_df = test_df.copy()

In [8]:
# Box-Cox requires that values in any column are not identical. 
# This is extremely unlikely to be a case for any of the actual test cases, but is the case for the dummy test data provided in the dataset.

def fix_columns_in_test_ds(df):
    """Break up constant float columns in place by adding 1 to their first row.

    Box-Cox cannot be fitted on a column whose values are all identical, so every
    float column that holds a single repeated value has its first entry bumped by 1.
    Only the constant columns themselves are touched; float columns that already
    vary, and non-float columns, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fix. It is modified in place; nothing is returned.
    """
    if df.empty:
        # Nothing to compare against (and no first row to modify).
        return

    # Use the actual first index label rather than assuming it is 0.
    first_row = df.index[0]

    for column in df.columns:
        if df[column].dtype != float:
            continue
        values = df[column].values
        if np.all(values == values[0]):
            df.loc[first_row, column] += 1

# Apply the constant-column fix to both test variants (each call modifies its frame in place).
fix_columns_in_test_ds(test_with_greeks_df)
fix_columns_in_test_ds(test_no_greeks_df)

## Normalization functions

In [9]:
def scale_epsilon(df):
    """Min-max scale the `Epsilon` column of `df`, leaving other columns untouched.

    The scaler is fitted on `df` itself with the feature range
    (1, Epsilon.max() - Epsilon.min() + 2), so the smallest Epsilon maps to 1.
    Frames without an `Epsilon` column are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Same index and dtypes as `df`; columns are in the order emitted by the
        column transformer.
    """
    if "Epsilon" not in df.columns:
        return df

    epsilon_span = df.Epsilon.max() - df.Epsilon.min()
    epsilon_scaler = MinMaxScaler((1, epsilon_span + 2))

    # Scale only the column(s) matching "Epsilon"; pass everything else through.
    column_transformer = make_column_transformer(
        (epsilon_scaler, make_column_selector("Epsilon")),
        remainder="passthrough",
        verbose_feature_names_out=False,
    )
    scaling_pipeline = make_pipeline(column_transformer)

    scaled_values = scaling_pipeline.fit_transform(df)
    df_scaled = pd.DataFrame(
        scaled_values,
        columns=scaling_pipeline.get_feature_names_out(),
        index=df.index,
    )
    # The transformer output loses the per-column dtypes; restore the originals.
    return df_scaled.astype(df.dtypes)

def get_r2_scores(df):
    """Score how close each numeric feature is to normal under several transforms.

    For every numeric column except `Class`, the squared correlation coefficient of
    a normal probability plot is computed for the raw values and for their log,
    square root, reciprocal, Box-Cox and Yeo-Johnson transforms. Missing values are
    dropped per feature, and `Epsilon` (if present) is rescaled first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `Class` column.

    Returns
    -------
    pandas.DataFrame
        One row per feature, one column per transform, plus a `Winner` column naming
        the transform with the highest score.
    """
    numeric_columns = (
        df.select_dtypes("number").drop("Class", axis=1).columns.tolist()
    )
    df = scale_epsilon(df)

    # (label, transform) pairs, evaluated in this order for each feature.
    candidate_transforms = (
        ("Original", lambda values: values),
        ("Log", np.log),
        ("Sqrt", np.sqrt),
        ("Reciprocal", np.reciprocal),
        ("BoxCox", lambda values: stats.boxcox(values)[0]),
        ("YeoJohnson", lambda values: stats.yeojohnson(values)[0]),
    )
    labels = tuple(label for label, _ in candidate_transforms)

    r2_by_feature = {}
    for feature in numeric_columns:
        observed = df[feature].dropna()
        squared = []
        for _, transform in candidate_transforms:
            # probplot returns ((osm, osr), (slope, intercept, r)); keep r.
            r_value = probplot(transform(observed), rvalue=True)[1][-1]
            squared.append(r_value * r_value)
        r2_by_feature[feature] = tuple(squared)

    r2_scores = pd.DataFrame(r2_by_feature, index=labels).T
    r2_scores["Winner"] = r2_scores.idxmax(axis=1)
    return r2_scores

def get_transform_cols(r2_scores):
    """Group feature names by the transform that won for them.

    Parameters
    ----------
    r2_scores : pandas.DataFrame
        Frame indexed by feature name with a `Winner` column.

    Returns
    -------
    dict
        Maps each of "no_transform", "log_transform", "sqrt_transform",
        "reciprocal_transform", "boxcox_transform" and "yeojohnson_transform" to the
        index of features whose winner is the corresponding transform.
    """
    # Output key -> value of the `Winner` column it collects.
    winner_by_key = (
        ("no_transform", "Original"),
        ("log_transform", "Log"),
        ("sqrt_transform", "Sqrt"),
        ("reciprocal_transform", "Reciprocal"),
        ("boxcox_transform", "BoxCox"),
        ("yeojohnson_transform", "YeoJohnson"),
    )
    return {
        key: r2_scores.query(f"Winner == '{winner}'").index
        for key, winner in winner_by_key
    }

In [10]:
def make_preprocess_pipeline(transform_cols):
    """Build the preprocessing pipeline for one dataset variant.

    Each group of columns gets the transform that was selected for it, followed by
    standardization; object-typed columns are imputed with their most frequent value
    and ordinal-encoded; any remaining column is passed through. Missing values left
    after that are filled by a distance-weighted 10-nearest-neighbours imputer.

    Parameters
    ----------
    transform_cols : dict
        Maps "no_transform", "log_transform", "sqrt_transform",
        "reciprocal_transform", "boxcox_transform" and "yeojohnson_transform" to a
        pandas Index of column names (the output of `get_transform_cols`).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline: column transformer followed by `KNNImputer`.

    Raises
    ------
    KeyError
        If any of the six expected keys is missing from `transform_cols`.
    """
    no_transform_cols = transform_cols["no_transform"]
    log_transform_cols = transform_cols["log_transform"]
    sqrt_transform_cols = transform_cols["sqrt_transform"]
    reciprocal_transform_cols = transform_cols["reciprocal_transform"]
    boxcox_transform_cols = transform_cols["boxcox_transform"]
    yeojohnson_transform_cols = transform_cols["yeojohnson_transform"]

    def func_then_scale(func):
        # Apply an element-wise function, then standardize the result.
        return make_pipeline(
            FunctionTransformer(func=func, feature_names_out="one-to-one"),
            StandardScaler(),
        )

    preprocess_pipeline = make_pipeline(
        make_column_transformer(
            (StandardScaler(), no_transform_cols.to_list()),
            (func_then_scale(np.log), log_transform_cols.to_list()),
            # Columns whose best transform is the square root must use np.sqrt
            # (they were previously log-transformed by mistake).
            (func_then_scale(np.sqrt), sqrt_transform_cols.to_list()),
            (func_then_scale(np.reciprocal), reciprocal_transform_cols.to_list()),
            (
                PowerTransformer(method="box-cox", standardize=True),
                boxcox_transform_cols.to_list(),
            ),
            (
                PowerTransformer(method="yeo-johnson", standardize=True),
                yeojohnson_transform_cols.to_list(),
            ),
            (
                make_pipeline(
                    SimpleImputer(strategy="most_frequent"),
                    # Categories not seen during fit are encoded as -1.
                    OrdinalEncoder(
                        handle_unknown="use_encoded_value", unknown_value=-1
                    ),
                ),
                make_column_selector(dtype_include=object),  # type: ignore
            ),
            remainder="passthrough",
            verbose_feature_names_out=False,
        ),
        KNNImputer(n_neighbors=10, weights="distance"),
    )
    return preprocess_pipeline

In [11]:
# Pick the best normalizing transform per feature, for each dataset variant.
train_with_greeks_transform_cols = get_transform_cols(get_r2_scores(train_with_greeks_df))
train_with_no_greeks_transform_cols = get_transform_cols(get_r2_scores(train_no_greeks_df))

greeks_preprocess_pipeline = make_preprocess_pipeline(train_with_greeks_transform_cols)
no_greeks_preprocess_pipeline = make_preprocess_pipeline(train_with_no_greeks_transform_cols)

# Remember the raw training range of Epsilon before it is rescaled.
train_epsilon_min = train_with_greeks_df.Epsilon.min()
train_epsilon_span = train_with_greeks_df.Epsilon.max() - train_epsilon_min

train_with_greeks_df = scale_epsilon(train_with_greeks_df)

# The test frame must be mapped with the *training* scaling. Fitting a separate
# min-max scaler on the test frame would send its smallest Epsilon to 1, i.e. the
# "collected after training" dates would end up at the start of the training scale.
# scale_epsilon maps x -> 1 + (x - min) * (span + 1) / span on the frame it is given
# (with a factor of 1 when the span is zero); apply that same map, with the
# training min/span, to the test frame.
epsilon_factor = (train_epsilon_span + 1) / train_epsilon_span if train_epsilon_span else 1.0
test_with_greeks_df = test_with_greeks_df.assign(
    Epsilon=1 + (test_with_greeks_df.Epsilon - train_epsilon_min) * epsilon_factor
)
# scale_epsilon emits Epsilon as the first column; keep the test frame in that layout.
test_with_greeks_df = test_with_greeks_df[
    ["Epsilon", *test_with_greeks_df.columns.drop("Epsilon")]
]

## TODO: Add visualization for how well epsilons were regularized

# 🤖 Train the model 

## Helper functions

In [12]:
def get_undersampling_fraction(y_true):
    """Return 1 - N1/N0, where N0 and N1 are the counts of labels 0 and 1 in `y_true`.

    `y_true` must contain exactly the labels 0 and 1 (non-negative integers).
    """
    negatives, positives = np.bincount(y_true)
    ratio = positives / negatives
    return 1 - ratio

def assert_balanced_learning(y_train, n_samples_tol=1):
    """Assert that labels 0 and 1 occur about equally often in `y_train`.

    Raises AssertionError when the two class counts are not close according to
    `np.isclose` with an absolute tolerance of `n_samples_tol` samples.
    """
    class_counts = np.bincount(y_train)
    n_negative, n_positive = class_counts
    counts_match = np.isclose(n_negative, n_positive, atol=n_samples_tol)
    assert counts_match
    
def balanced_log_loss(y_true, y_pred, **kwargs):
    """Competition evaluation metric - balanced logarithmic loss.

    The log loss is averaged within each class first and the two class averages
    are then averaged, so each class is roughly equally important for the score.

    Parameters
    ----------
    y_true : array-like of int
        True labels; must contain both 0 and 1.
    y_pred : array-like of float
        Predicted probability of class 1.
    **kwargs
        `eps` (default 1e-15): predictions are clipped to [eps, 1 - eps] before
        taking logarithms.
    """
    n_negative, n_positive = np.bincount(y_true)

    # 0/1 indicator vectors selecting the members of each class.
    is_negative = np.asarray(y_true == 0).astype(int)
    is_positive = np.asarray(y_true == 1).astype(int)

    eps = kwargs.get("eps", 1e-15)
    clipped = np.clip(y_pred, eps, 1 - eps)
    log_prob_negative = np.log(1 - clipped)
    log_prob_positive = np.log(clipped)

    negative_term = 1 / n_negative * np.sum(is_negative * log_prob_negative)
    positive_term = 1 / n_positive * np.sum(is_positive * log_prob_positive)
    return -(negative_term + positive_term) * 0.5

def average_scores(scores):
    """Average each list of per-fold scores.

    Parameters
    ----------
    scores : dict
        Maps a metric name to a list of numeric values.

    Returns
    -------
    dict
        Maps each metric name to the arithmetic mean of its values, or to None when
        the list is empty.
    """
    return {
        metric: (sum(values) / len(values) if values else None)
        for metric, values in scores.items()
    }

# Individual model hyperparameter fine-tuning 

Have a look at suggested balancing methods: https://www.kaggle.com/competitions/icr-identify-age-related-conditions/discussion/412507

In [24]:
# Shared random seed for every estimator in this notebook.
seed = 42

# LightGBM: search grid, base estimator and the step name used inside the pipeline.
lgbm_params = {
    "boosting_type": ["gbdt", "dart"],
    "n_estimators": [20, 50, 100, 150, 200, 250, 300],
    "reg_alpha": [0, 0.001, 0.01, 0.1],
    "reg_lambda": [0, 0.001, 0.01, 0.1],
}
# Seeded like the XGBoost estimator below so that repeated searches are comparable.
lgbm_clf = LGBMClassifier(class_weight="balanced", random_state=seed)
lgbm_name = "LGBMClassifier"
lgbm_arg = {"clf": lgbm_clf, "clf_name": lgbm_name, "param_grid": lgbm_params}

# XGBoost: search grid, base estimator and pipeline step name.
# NOTE(review): the "gblinear" booster does not use `max_depth` (see the recorded
# "Parameters: { "max_depth" } are not used." warnings), so those grid combinations
# are redundant fits.
xgb_params = {
    "booster": ["gbtree", "gblinear", "dart"],
    "max_depth": [2, 4, 6, 8],
    "lambda": [0, 0.001, 0.01, 0.1, 1],
    "alpha": [0, 0.001, 0.01, 0.1, 1],
}
# presumably 4.71 is the negative/positive class ratio of the training set — TODO confirm
xgb_clf = XGBClassifier(scale_pos_weight=4.71, random_state=seed)
xgb_name = "XGBlassifier"
xgb_arg = {"clf": xgb_clf, "clf_name": xgb_name, "param_grid": xgb_params}


# Everything to iterate over: each classifier is tuned on each dataset variant.
classifier_params = [lgbm_arg, xgb_arg]
dataset_params = [
    {"ds_name": "greeks", "ds": train_with_greeks_df, "pipeline": greeks_preprocess_pipeline}, 
    {"ds_name": "no_greeks", "ds": train_no_greeks_df, "pipeline": no_greeks_preprocess_pipeline},
]

# Filled by the tuning loop: dataset name -> classifier name -> best parameters.
optimal_hyperparameters = {}

def tune_hyperparameters(df, preprocessing_pipeline, classifier, classifier_name, param_grid):
    """Grid-search a classifier's hyperparameters behind the preprocessing pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data including the `Class` target column.
    preprocessing_pipeline : estimator
        Preprocessing step placed in front of the classifier.
    classifier : estimator
        Classifier to tune.
    classifier_name : str
        Name of the classifier step; must not be "preprocessing", which is taken by
        the preprocessing step.
    param_grid : dict
        Maps classifier parameter names to lists of candidate values.

    Returns
    -------
    dict
        Best parameter combination found, keyed as "<classifier_name>__<parameter>".
    """
    assert classifier_name != "preprocessing"

    features = df.drop("Class", axis=1)
    target = df.Class

    # Route every grid entry to the classifier step of the pipeline.
    prefixed_grid = {}
    for param, candidates in param_grid.items():
        prefixed_grid[f"{classifier_name}__{param}"] = candidates

    search = GridSearchCV(
        Pipeline(steps=[("preprocessing", preprocessing_pipeline), (classifier_name, classifier)]),
        prefixed_grid,
        scoring="neg_log_loss",
    )
    search.fit(features, target)

    print(f"Finished hyperparameter fine-tuning in {classifier_name}.")
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)
    return search.best_params_

In [None]:
# Tune every classifier on every dataset variant and keep the best parameters per
# (dataset, classifier) pair.
for ds in dataset_params:
    optimal_hyperparameters[ds["ds_name"]] = {}
    for clf_params in classifier_params:
        print(f"optimizing {clf_params['clf_name']}")
        # Key the result by the classifier's own name; a fixed literal key would let
        # each classifier overwrite the result of the previous one.
        optimal_hyperparameters[ds["ds_name"]][clf_params["clf_name"]] = tune_hyperparameters(
            df=ds["ds"], 
            preprocessing_pipeline=ds["pipeline"], 
            classifier=clf_params["clf"],
            classifier_name=clf_params["clf_name"],
            param_grid=clf_params["param_grid"]
        )

optimizing LGBMClassifier
Finished hyperparameter fine-tuning in LGBMClassifier.
Best parameter (CV score=-0.149):
{'LGBMClassifier__boosting_type': 'dart', 'LGBMClassifier__n_estimators': 300, 'LGBMClassifier__reg_alpha': 0, 'LGBMClassifier__reg_lambda': 0.1}
optimizing XGBlassifier
Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" } are not used.

Parameters: { "max_depth" }

## Optimal hyperparameters are:

## Fine-tune voting

In [None]:
# Candidate soft-voting weights: every (lgbm, xgb) blend in steps of 0.1,
# from (0.0, 1.0) to (1.0, 0.0). The VotingClassifier parameter is named `weights`.
ensamble_params = {
    "weights": [(round(1 - a, 1), round(a, 1)) for a in np.arange(1, -0.1, -0.1)],
}

# The ensemble blends the two base estimators configured for the individual
# searches. The search grids (`lgbm_params`, `xgb_params`) hold lists of candidate
# values and therefore cannot be passed as constructor arguments.
# NOTE(review): the SVC member was left out: `svc_params` is not defined in this
# notebook and the weight grid above is a two-way blend. Add it back together with
# three-element weights if a third estimator is wanted.
ensamble_classifier = VotingClassifier(
    [
        ("lgbm", lgbm_clf),
        ("xgb", xgb_clf),
    ],
    voting="soft",
)
ensamble_name = "VotingClassifier"
ensamble_arg = {"clf": ensamble_classifier, "clf_name": ensamble_name, "param_grid": ensamble_params}

# Dataset name -> best voting weights found for that dataset variant.
optimal_hyperparameters_for_voting = {}

for ds in dataset_params:
    print(f"optimizing {ensamble_arg['clf_name']}")
    optimal_hyperparameters_for_voting[ds["ds_name"]] = tune_hyperparameters(
        df=ds["ds"], 
        preprocessing_pipeline=ds["pipeline"], 
        classifier=ensamble_arg["clf"],
        classifier_name=ensamble_arg["clf_name"],
        param_grid=ensamble_arg["param_grid"]
    )

## Make models

In [None]:
def get_ensamble_pipeline(preprocess_pipeline, seed):
    """Build the full model: preprocessing followed by a soft-voting ensemble.

    The ensemble averages the predicted probabilities of a LightGBM and an XGBoost
    classifier with equal weights.

    NOTE(review): an SVC member was listed here before, but `svc_params` is not
    defined in this notebook and only two voting weights are given, so the ensemble
    could not be fitted with three estimators. It has been left out; add it back
    (with `probability=True` for soft voting and a third weight) if it is wanted.

    Parameters
    ----------
    preprocess_pipeline : estimator
        Preprocessing step placed in front of the ensemble.
    seed : int
        Random seed passed to both classifiers.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline.
    """
    lgbm_params = {
        "max_depth": 4,
        "num_leaves": 9,
        "min_child_samples": 17,
        "n_estimators": 200,
        "learning_rate": 0.15,
        "colsample_bytree": 0.4,
        "min_split_gain": 1e-4,
        "reg_alpha": 1e-2,
        "reg_lambda": 5e-3,
    }

    xgb_params = {
        "max_depth": 2,
        "n_estimators": 200,
        "learning_rate": 0.4,
        "subsample": 0.6,
        "min_child_weight": 0.1,
        "max_delta_step": 0.35,
        "colsample_bytree": 0.3,
        "colsample_bylevel": 0.7,
        "min_split_loss": 1e-4,
        "reg_alpha": 2e-3,
        "reg_lambda": 6e-2,
    }

    # One weight per estimator, in the order they are listed below.
    voting_classifier_weights = (0.5, 0.5)

    current_ensemble = make_pipeline(
        preprocess_pipeline,
        VotingClassifier(
            [
                ("lgbm", LGBMClassifier(random_state=seed, **lgbm_params)),
                ("xgb", XGBClassifier(random_state=seed, **xgb_params)),
            ],
            voting="soft",
            weights=voting_classifier_weights,
        ),
    )
    return current_ensemble

In [None]:
# Ensemble pipeline on top of the Greeks-aware preprocessing; the second argument is the seed.
greeks_ensamble_pipeline = get_ensamble_pipeline(greeks_preprocess_pipeline, 42)

In [None]:
# def train(df, preprocess_pipeline):

df = train_with_greeks_df
preprocess_pipeline = greeks_preprocess_pipeline

seed = 42
n_splits = 10
# Elements of the training function
# Make an Ensemble Classifier (see get_ensamble_pipeline for the estimators it combines)
ensamble_classifier = get_ensamble_pipeline(preprocess_pipeline, seed)

# Data: no need for train / test split, Stratified K-Fold will be used instead
# Split the dataset into X and y
X = df.drop("Class", axis=1)
y = df.Class

# Use Stratified K-Fold
skfold = StratifiedKFold(
    n_splits=n_splits, 
    shuffle=True,
    random_state=seed
)

# Per-fold values of every metric; averaged after the loop.
scores = {
    "balanced_log_loss": [],
    "accuracy": [],
    "precision": [],
    "recall": [],
    "specificity": [],
    "f1": [],
    "roc_auc": [],
}

for fold_idx, (train_idx, test_idx) in enumerate(skfold.split(X, y)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Data: classes need to be balanced. Oversample the minority class inside the
    # training part of the fold only: oversampling before the split would put copies
    # of the same sample on both sides of it and inflate every validation score.
    # assumes the frame's index labels are unique — TODO confirm
    class_counts = Y_train.value_counts()
    n_missing = int(class_counts.max() - class_counts.min())
    if n_missing > 0:
        minority_labels = Y_train.index[Y_train == class_counts.idxmin()].to_series()
        resampled_labels = minority_labels.sample(
            n=n_missing, replace=True, random_state=seed + fold_idx
        ).to_numpy()
        X_train = pd.concat([X_train, X_train.loc[resampled_labels]])
        Y_train = pd.concat([Y_train, Y_train.loc[resampled_labels]])

    ensamble_classifier.fit(X_train, Y_train)
    predicted_probabilities = ensamble_classifier.predict_proba(X_test)[:, 1]
    predicted_classes = ensamble_classifier.predict(X_test)

    scores["balanced_log_loss"].append(balanced_log_loss(Y_test, predicted_probabilities))
    scores["accuracy"].append(accuracy_score(Y_test, predicted_classes))
    scores["precision"].append(precision_score(Y_test, predicted_classes))
    scores["recall"].append(recall_score(Y_test, predicted_classes))
    # Specificity is the recall of the negative class.
    scores["specificity"].append(recall_score(Y_test, predicted_classes, pos_label=0))
    scores["f1"].append(f1_score(Y_test, predicted_classes))
    scores["roc_auc"].append(roc_auc_score(Y_test, predicted_probabilities))

final_scores = average_scores(scores)

In [None]:
# Display the per-metric averages over the cross-validation folds.
final_scores

In [None]:

#         print('---------------------------------------------------------------')
        
        

#         ## GradientBoosting

#         gb_md = GradientBoostingClassifier(n_estimators = 500, 
#                                        max_depth = 7, 
#                                        learning_rate = 0.01,
#                                        min_samples_split = 10, 
#                                        min_samples_leaf = 20).fit(X_train, Y_train) 

#         gb_pred_1 = gb_md.predict_proba(X_test[X_test['generated'] == 1])[:, 1]
#         gb_pred_2 = gb_md.predict_proba(test)[:, 1]

#         gb_score_fold = roc_auc_score(Y_test[X_test['generated'] == 1], gb_pred_1)
#         gb_cv_scores.append(gb_score_fold)
#         gb_preds.append(gb_pred_2)

#         print('Fold', i+1, '==> GradientBoositng oof ROC-AUC score is ==>', gb_score_fold)

#         ## HistGradientBoosting 

#         hist_md = HistGradientBoostingClassifier(l2_regularization = 0.01,
#                                                  early_stopping = False,
#                                                  learning_rate = 0.01,
#                                                  max_iter = 1000,
#                                                  max_depth = 15,
#                                                  max_bins = 255,
#                                                  min_samples_leaf = 30,
#                                                  max_leaf_nodes = 30).fit(X_train, Y_train)

#         hist_pred_1 = hist_md.predict_proba(X_test[X_test['generated'] == 1])[:, 1]
#         hist_pred_2 = hist_md.predict_proba(test)[:, 1]

#         hist_score_fold = roc_auc_score(Y_test[X_test['generated'] == 1], hist_pred_1)
#         hist_cv_scores.append(hist_score_fold)
#         hist_preds.append(hist_pred_2)

#         print('Fold', i+1, '==> HistGradient oof ROC-AUC score is ==>', hist_score_fold)


#         ## LightGBM

#         lgb_md.fit(X_train, Y_train)

#         lgb_pred_1 = lgb_md.predict_proba(X_test[X_test['generated'] == 1])[:, 1]
#         lgb_pred_2 = lgb_md.predict_proba(test)[:, 1]

#         lgb_score_fold = roc_auc_score(Y_test[X_test['generated'] == 1], lgb_pred_1)    
#         lgb_cv_scores.append(lgb_score_fold)
#         lgb_preds.append(lgb_pred_2)

#         print('Fold', i+1, '==> LightGBM oof ROC-AUC score is ==>', lgb_score_fold)

#         ## XGBoost 

#         xgb_md.fit(X_train, Y_train)

#         xgb_pred_1 = xgb_md.predict_proba(X_test[X_test['generated'] == 1])[:, 1]
#         xgb_pred_2 = xgb_md.predict_proba(test)[:, 1]

#         xgb_score_fold = roc_auc_score(Y_test[X_test['generated'] == 1], xgb_pred_1)    
#         xgb_cv_scores.append(xgb_score_fold)
#         xgb_preds.append(xgb_pred_2)

#         print('Fold', i+1, '==> XGBoost oof ROC-AUC score is ==>', xgb_score_fold)

#         ## Ensemble 

#         ens_pred_1 = gb_pred_1 + hist_pred_1 + lgb_pred_1 + xgb_pred_1
#         ens_pred_2 = gb_pred_2 + hist_pred_2 + lgb_pred_2 + xgb_pred_2

#         ens_score_fold = roc_auc_score(Y_test[X_test['generated'] == 1], ens_pred_1)
#         ens_cv_scores.append(ens_score_fold)
#         ens_preds.append(ens_pred_2)

#         print('Fold', i+1, '==> Ensemble oof ROC-AUC score is ==>', ens_score_fold)

In [None]:
# NOTE(review): `train` is not defined anywhere in the visible notebook (the
# `# def train(df, preprocess_pipeline):` line in the training cell is commented out),
# so this cell raises NameError on a fresh kernel. It expects `train` to return a
# pair (averaged probabilities, classifiers).
no_greeks_probability_averaged, no_greeks_classifiers = train(train_no_greeks_df, no_greeks_preprocess_pipeline)
greeks_probability_averaged, greeks_classifiers = train(train_with_greeks_df, greeks_preprocess_pipeline)

In [None]:
# no_greeks_probability_averaged, no_greeks_classifiers = train(train_no_greeks_df)
# greeks_probability_averaged, greeks_classifiers = train(train_with_greeks_df)

def print_scores(df, proba_averaged):
    """Print the balanced log loss and the Brier score of a set of predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose `Class` column holds the true labels.
    proba_averaged : array-like of float
        Predicted probability of class 1 for each row of `df`.
    """
    # Score the arguments that were passed in, not a fixed pair of globals.
    print(f"Balanced Log Loss: {balanced_log_loss(df.Class, proba_averaged):.5f}")
    print(f"Brier Score Loss: {brier_score_loss(df.Class, proba_averaged):.5f}")
    
# Report each dataset variant against its own averaged predictions.
print("No Greeks: ")
print_scores(train_no_greeks_df, no_greeks_probability_averaged)

print("With Greeks: ")
print_scores(train_with_greeks_df, greeks_probability_averaged)

### Evaluate model on test data

In [None]:
def evaluate_model(model, y_actual, y_predicted):
    """Print the log loss and the accuracy of predictions against the true labels.

    Parameters
    ----------
    model : object
        Unused; kept so existing calls keep working. The estimator's `score` method
        expects (features, labels) and therefore cannot be applied to a
        (labels, predictions) pair.
    y_actual : array-like
        True class labels.
    y_predicted : array-like
        Predicted class labels.
    """
    # Local import: log_loss is not part of the notebook's import cell.
    from sklearn.metrics import log_loss

    print('LogLoss: ', log_loss(y_actual, y_predicted))
    print('Accuracy: ', accuracy_score(y_actual, y_predicted))
    
# NOTE(review): `model`, `y_valid` and `y_valid_predictions` are not defined in the
# visible notebook before this cell — confirm where they are meant to come from.
evaluate_model(model, y_valid, y_valid_predictions)

In [None]:
# NOTE(review): `model` and `X_valid` are not defined in the visible notebook before
# this cell — confirm where they are meant to come from.
y_pred = model.predict(X_valid)

def get_classification_report(y_actual, y_predicted):
    """Print the classification report and the confusion matrix for the predictions.

    `y_actual` holds the true labels and `y_predicted` the predicted labels.
    """
    # Create a classification report for the model.
    print('Classification Report:')
    print(classification_report(y_actual, y_predicted))

    # Create a confusion matrix for the model.
    print('Confusion Matrix:')
    print(confusion_matrix(y_actual, y_predicted))
    
# NOTE(review): `y_valid` is not defined in the visible notebook either.
get_classification_report(y_actual=y_valid, y_predicted=y_pred)

In [None]:
# Confusion matrix of the validation predictions.
# NOTE(review): relies on `y_valid`, which is not defined in the visible notebook.
cm = confusion_matrix(y_valid, y_pred)

# Define the class names.
class_names = ['Normal', 'Diseased']

# Create the heatmap with class names as tick labels.
# Cell values are annotated as whole numbers (fmt='.0f').
ax = sns.heatmap(cm, annot = True, fmt = '.0f', cmap = "Blues", annot_kws = {"size": 16},\
           xticklabels = class_names, yticklabels = class_names)

# Set the axis labels.
ax.set_xlabel("Prediction")
ax.set_ylabel("Truth")

# 🏋️‍  Train model for submission
## Train on full data

In [None]:
# NOTE(review): `scaler` and `train_xgb` are not defined anywhere in the visible
# notebook, so this cell cannot run on a fresh kernel — confirm whether it should use
# the preprocessing pipeline and ensemble defined earlier instead.
X_full = scaler.fit_transform(X)
grid_search_for_submission = train_xgb(X_full, y)
model = grid_search_for_submission.best_estimator_
# Predictions on the same data the model was fitted on.
y_full_predictions = model.predict(X_full)

print("Results on full training data:")
evaluate_model(model, y, y_full_predictions)

## Save model

In [None]:
import joblib

# File name the fitted model is written to (relative to the working directory).
model_name = 'xgb_model.joblib'

# Save the model to disk.
joblib.dump(model, model_name)

# Load the saved model from disk.
# NOTE(review): `model_loaded` is not referenced by the later cells shown here; they
# keep using `model`.
model_loaded = joblib.load(model_name)

# 🛫 Submission of naive model

In [None]:
# All test columns except the first one.
# presumably the first column is `Id` — TODO confirm. Any column added to `test_df`
# earlier in the notebook is included as well.
X_submission = test_df.iloc[:, 1:]

In [None]:
# Scale the test features and predict with `model` (not the reloaded `model_loaded`).
# NOTE(review): `scaler` is not defined anywhere in the visible notebook.
X_submission = scaler.transform(X_submission)

# Predict the results.
y_submission_pred = model.predict_proba(X_submission)

y_submission_pred

In [None]:
# Assemble the submission frame: the test Ids plus one probability column per class.
submission = pd.DataFrame(test_df["Id"], columns = ["Id"])
y_pred_df = pd.DataFrame(y_submission_pred, columns = ['0', '1'])

# Assignment aligns on the row index of the two frames.
submission["class_0"] = y_pred_df['0']
submission["class_1"] = y_pred_df['1']

submission

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('submission.csv',index = False)