In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import polars as pl
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

import warnings
# Silence every warning for the rest of the notebook. This also hides deprecation
# and pandas chained-assignment warnings, so real problems can go unnoticed.
warnings.filterwarnings('ignore')

# Root of the input data, relative to the notebook's working directory.
# It is rebound to a pathlib.Path in the configuration cell further down.
ROOT = '../../inputs'

In [2]:
# Record when this notebook run started.
# A naive datetime.now() returns the machine's *local* time, which is only UTC
# when the kernel happens to run in a UTC timezone, so ask for UTC explicitly.
start_time_utc = datetime.datetime.now(datetime.timezone.utc)
print(f'Notebook Start Time (UTC): {start_time_utc}')

# Express the same instant in KST (UTC+9). astimezone keeps the instant and only
# changes the offset, so the two timestamps compare equal.
start_time_kst = start_time_utc.astimezone(datetime.timezone(datetime.timedelta(hours=9)))
print(f"Notebook Start Time (KST): {start_time_kst}")

Notebook Start Time (UTC): 2024-05-23 00:33:50.432665
Notebook Start Time (KST): 2024-05-23 09:33:50.432665


In [3]:
class Pipeline:
    """Stateless polars helpers for dtype casting, date handling and column pruning.

    Every method takes a ``polars.DataFrame`` and returns a new one, so they can
    be chained with ``DataFrame.pipe``. They are static methods, so they work
    whether they are reached through the class or through an instance.
    """

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to a dtype chosen from the column name.

        * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> Int64
        * ``date_decision`` and names ending in ``D`` -> Date
        * names ending in ``P`` or ``A`` -> Float64
        * names ending in ``M`` -> String

        Columns matching none of these rules are left untouched.
        """
        casts = []
        for col in df.columns:
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.Int64))
            elif col == "date_decision":
                casts.append(pl.col(col).cast(pl.Date))
            elif col.endswith(("P", "A")):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col.endswith("M"):
                casts.append(pl.col(col).cast(pl.String))
            elif col.endswith("D"):
                casts.append(pl.col(col).cast(pl.Date))
        # Apply all casts in a single with_columns call instead of creating a
        # new frame for every column.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """Replace every ``*D`` column by its day offset from ``date_decision``.

        Each column whose name ends in ``D`` becomes
        ``(column - date_decision)`` expressed in whole days. The
        ``date_decision`` and ``MONTH`` columns are dropped afterwards, so both
        must be present in ``df``.
        """
        date_cols = [col for col in df.columns if col.endswith("D")]
        if date_cols:
            df = df.with_columns(
                [
                    (pl.col(col) - pl.col("date_decision")).dt.total_days().alias(col)
                    for col in date_cols
                ]
            )
        return df.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(df, max_null_ratio=0.7, max_cardinality=200):
        """Drop uninformative feature columns.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped. Among the
        remaining columns this removes:

        1. columns whose fraction of nulls exceeds ``max_null_ratio``;
        2. String columns that have exactly one distinct value or more than
           ``max_cardinality`` distinct values.

        The defaults reproduce the previously hard-coded thresholds (0.7 / 200).
        """
        protected = ("target", "case_id", "WEEK_NUM")

        too_sparse = []
        for col in df.columns:
            if col in protected:
                continue
            null_ratio = df[col].is_null().mean()
            # mean() yields None on an empty frame; treat that as "not sparse".
            if null_ratio is not None and null_ratio > max_null_ratio:
                too_sparse.append(col)
        if too_sparse:
            df = df.drop(too_sparse)

        bad_cardinality = []
        for col in df.columns:
            if col in protected or df[col].dtype != pl.String:
                continue
            freq = df[col].n_unique()
            if freq == 1 or freq > max_cardinality:
                bad_cardinality.append(col)
        if bad_cardinality:
            df = df.drop(bad_cardinality)

        return df



class Aggregator:
    """Builds the polars aggregation expressions used when grouping by ``case_id``.

    Columns are selected by the last character of their name (or by containing
    ``num_group``), and each selected column yields one expression per
    aggregation, aliased ``<aggregation>_<column>``.
    """
    # Please add or subtract features yourself, be aware that too many features will take up too much space.

    @staticmethod
    def _exprs(cols, aggregations):
        """Return one aliased expression per (aggregation, column) pair.

        Expressions are ordered aggregation-first (all ``max_*``, then all
        ``last_*``, ...), which fixes the column order of the aggregated frame.
        Each aggregation name must be a function of the ``polars`` module
        (``pl.max``, ``pl.last``, ``pl.mean``).
        """
        return [
            getattr(pl, agg)(col).alias(f"{agg}_{col}")
            for agg in aggregations
            for col in cols
        ]

    @staticmethod
    def num_expr(df):
        """max / last / mean of columns whose name ends in ``P`` or ``A``."""
        cols = [col for col in df.columns if col.endswith(("P", "A"))]
        return Aggregator._exprs(cols, ("max", "last", "mean"))

    @staticmethod
    def date_expr(df):
        """max / last / mean of columns whose name ends in ``D``."""
        cols = [col for col in df.columns if col.endswith("D")]
        return Aggregator._exprs(cols, ("max", "last", "mean"))

    @staticmethod
    def str_expr(df):
        """max / last of columns whose name ends in ``M``."""
        cols = [col for col in df.columns if col.endswith("M")]
        return Aggregator._exprs(cols, ("max", "last"))

    @staticmethod
    def other_expr(df):
        """max / last of columns whose name ends in ``T`` or ``L``."""
        cols = [col for col in df.columns if col.endswith(("T", "L"))]
        return Aggregator._exprs(cols, ("max", "last"))

    @staticmethod
    def count_expr(df):
        """max / last of columns whose name contains ``num_group``."""
        cols = [col for col in df.columns if "num_group" in col]
        return Aggregator._exprs(cols, ("max", "last"))

    @staticmethod
    def get_exprs(df):
        """Concatenate all expression groups: numeric, date, string, other, count."""
        return (
            Aggregator.num_expr(df)
            + Aggregator.date_expr(df)
            + Aggregator.str_expr(df)
            + Aggregator.other_expr(df)
            + Aggregator.count_expr(df)
        )

In [4]:
def read_file(path, depth=None):
    """Load one parquet file, normalise its dtypes and optionally aggregate it.

    When ``depth`` is 1 or 2 the table is grouped by ``case_id`` and reduced
    with the expressions from ``Aggregator.get_exprs``; for any other value the
    typed table is returned unaggregated.
    """
    table = Pipeline.set_table_dtypes(pl.read_parquet(path))
    if depth not in (1, 2):
        return table
    return table.group_by("case_id").agg(Aggregator.get_exprs(table))


def read_files(regex_path, depth=None):
    """Load every parquet file matching a glob pattern into one frame.

    Each matching file is read through ``read_file`` (dtype normalisation plus,
    for ``depth`` 1 or 2, aggregation per ``case_id``). The chunks are then
    concatenated and reduced to one row per ``case_id``.

    Args:
        regex_path: Glob pattern (str or Path), e.g. ``".../train_static_0_*.parquet"``.
        depth: Forwarded to ``read_file``.

    Raises:
        FileNotFoundError: If no file matches ``regex_path``.
    """
    # Sort so that chunk order, and therefore the concatenated result, does not
    # depend on the order in which the filesystem happens to list the files.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        # Fail with a clear message instead of an opaque concat error on [].
        raise FileNotFoundError(f"No files match pattern: {regex_path}")

    chunks = [read_file(path, depth) for path in paths]

    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    return df

In [5]:
from polars import selectors as cs


def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Assemble the modelling frame from the base table and the depth tables.

    Adds ``month_decision`` and ``weekday_decision`` (derived from
    ``date_decision``), left-joins every table of ``depth_0``, ``depth_1`` and
    ``depth_2`` on ``case_id`` (clashing names get the suffix ``_<position>``),
    displays the columns whose name ends in ``D``, and finally applies
    ``Pipeline.handle_dates``.
    """
    decision_date = pl.col("date_decision")
    merged = df_base.with_columns(
        month_decision=decision_date.dt.month(),
        weekday_decision=decision_date.dt.weekday(),
    )

    tables = depth_0 + depth_1 + depth_2
    for position, table in enumerate(tables):
        merged = merged.join(table, how="left", on="case_id", suffix=f"_{position}")

    # Show the date columns before they are turned into day offsets.
    display(merged.select(cs.ends_with('D')))
    return Pipeline.handle_dates(merged)


def to_pandas(df_data, cat_cols=None):
    """Convert to pandas and cast the categorical columns to ``category``.

    ``df_data`` must expose ``to_pandas()``. If ``cat_cols`` is None, every
    object-dtype column of the converted frame is treated as categorical.
    Returns ``(pandas_frame, cat_cols)``.
    """
    frame = df_data.to_pandas()
    cat_cols = (
        list(frame.select_dtypes("object").columns)
        if cat_cols is None
        else cat_cols
    )
    frame[cat_cols] = frame[cat_cols].astype("category")
    return frame, cat_cols

In [6]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast numeric columns in place to reduce memory usage.

    Only plain NumPy integer and float columns are touched. Every other dtype
    (category, object, bool, datetime, pandas extension dtypes) is left as is;
    the previous string-prefix check sent bool and unsigned columns into the
    float branch and failed on datetime columns.

    A column is cast to the first candidate dtype that is strictly smaller than
    its current dtype and whose representable range strictly contains the
    column's min and max. Integer candidates are int8/int16/int32, float
    candidates are float16/float32.

    Args:
        df: pandas DataFrame. It is modified in place and also returned.
        allow_float16: If False, floats are downcast no further than float32.
            float16 keeps only about three significant decimal digits, so it can
            noticeably round feature values. Defaults to True (original behavior).

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "u", "f"):
            continue

        if col_type.kind == "f":
            candidates, limits_of = float_candidates, np.finfo
        else:
            candidates, limits_of = int_candidates, np.iinfo

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Candidates are ordered by size; once one is not smaller than the
            # current dtype, no later one can save memory either.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # NaN min/max (empty or all-null column) fail both comparisons, so
            # such columns keep their dtype.
            if c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-sized frame to avoid ZeroDivisionError.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [7]:
# Resolve the data directories from the ROOT configured in the import cell.
ROOT = Path(ROOT)

TRAIN_DIR = ROOT.joinpath("parquet_files", "train")
TEST_DIR = ROOT.joinpath("parquet_files", "test")

# Load every training table. The keys match the parameter names of
# feature_eng, so the dict can be unpacked straight into it.
data_store = {}

# Base table: one row per case, no aggregation.
data_store["df_base"] = read_file(TRAIN_DIR / "train_base.parquet")

# Depth-0 tables are read without aggregation.
data_store["depth_0"] = [
    read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
    read_files(TRAIN_DIR / "train_static_0_*.parquet"),
]

# Depth-1 tables are aggregated per case_id while loading.
data_store["depth_1"] = [
    read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
    read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
    read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
    read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
    read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
    read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
    read_file(TRAIN_DIR / "train_other_1.parquet", 1),
    read_file(TRAIN_DIR / "train_person_1.parquet", 1),
    read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
    read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
]

# Depth-2 tables are aggregated per case_id as well.
data_store["depth_2"] = [
    read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
    read_file(TRAIN_DIR / "train_applprev_2.parquet", 2),
    read_file(TRAIN_DIR / "train_person_2.parquet", 2),
]

In [8]:
# Join all loaded tables onto the base table and convert date columns to day offsets.
df_train = feature_eng(**data_store)
print("train data shape:\t", df_train.shape)
# Drop the dict of per-table frames so their memory can be reclaimed.
del data_store

# NOTE(review): this converts the whole frame to pandas only to count dtypes,
# which is expensive on a frame of this size.
display(df_train.to_pandas().dtypes.value_counts())

# Remove mostly-null columns and constant / very-high-cardinality string columns.
df_train = df_train.pipe(Pipeline.filter_cols)
# Last expression of the cell, so the value returned by gc.collect() is shown as output.
gc.collect()

assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,dateofbirth_337D,dateofbirth_342D,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,datefirstoffer_1144D,datelastinstal40dpd_247D,datelastunpaid_3546854D,dtlastpmtallstes_4499206D,firstclxcampaign_1125D,firstdatedue_489D,lastactivateddate_801D,lastapplicationdate_877D,lastapprdate_640D,lastdelinqdate_224D,lastrejectdate_50D,lastrepayingdate_696D,maxdpdinstldate_3546855D,payvacationpostpone_4187118D,validfrom_1069D,max_approvaldate_319D,max_creationdate_885D,max_dateactivated_425D,max_dtlastpmt_581D,max_dtlastpmtallstes_3545839D,max_employedfrom_700D,max_firstnonzeroinstldate_307D,last_approvaldate_319D,last_creationdate_885D,last_dateactivated_425D,last_dtlastpmt_581D,last_dtlastpmtallstes_3545839D,last_employedfrom_700D,…,mean_numberofoverdueinstlmaxdat_641D,mean_overdueamountmax2date_1002D,mean_overdueamountmax2date_1142D,mean_refreshdate_3813885D,max_contractdate_551D,max_contractmaturitydate_151D,max_lastupdate_260D,last_contractdate_551D,last_contractmaturitydate_151D,last_lastupdate_260D,mean_contractdate_551D,mean_contractmaturitydate_151D,mean_lastupdate_260D,max_birth_259D,max_birthdate_87D,max_empl_employedfrom_271D,last_birth_259D,last_birthdate_87D,last_empl_employedfrom_271D,mean_birth_259D,mean_birthdate_87D,mean_empl_employedfrom_271D,max_contractenddate_991D,max_openingdate_313D,last_contractenddate_991D,last_openingdate_313D,mean_contractenddate_991D,mean_openingdate_313D,max_openingdate_857D,last_openingdate_857D,mean_openingdate_857D,max_pmts_date_1107D,last_pmts_date_1107D,mean_pmts_date_1107D,max_empls_employedfrom_796D,last_empls_employedfrom_796D,mean_empls_employedfrom_796D
date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,…,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,1986-07-01,,2017-09-15,,,,1986-07-01,,2017-09-15,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,1957-08-01,,2008-10-29,,,,1957-08-01,,2008-10-29,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,2013-04-03,,,2013-04-03,,,,,,2013-04-03,,,,2010-02-15,2013-05-04,,2013-04-03,,,,2010-02-15,…,,,,,,,,,,,,,,1974-12-01,,2010-02-15,,,,1974-12-01,,2010-02-15,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,2019-01-07,,,2019-01-07,,,,,,2019-01-07,,,,2018-05-15,2019-02-07,,2019-01-07,,,,2018-05-15,…,,,,,,,,,,,,,,1993-08-01,,2018-05-15,,,,1993-08-01,,2018-05-15,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,2019-01-08,,,,,,,,,2019-01-08,,,,,2019-02-08,,2019-01-08,,,,,…,,,,,,,,,,,,,,1994-01-01,,2014-12-15,,,,1994-01-01,,2014-12-15,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
,,2018-01-11,,1960-01-01,,,,2020-10-19,2013-02-19,,2016-09-15,,2016-05-09,2007-07-13,2019-10-16,2019-10-09,2019-10-09,2016-09-15,2017-11-30,,2016-08-15,,,2019-10-09,2019-10-09,2019-10-16,2020-10-08,2020-10-08,2015-01-15,2019-11-08,2007-06-12,2007-06-12,2007-06-19,,,1998-08-15,…,,2015-11-17,,2020-05-14,,,,,,,,,,1960-01-01,,,1960-01-01,,,1960-01-01,,,,,,,,,,,,,,,,,
,,2005-06-15,,1950-11-01,,,,2020-10-19,2016-03-01,,2020-05-30,,2016-03-01,2015-08-17,2019-10-07,2019-09-30,2019-09-30,2020-05-30,,,2020-05-30,,,2019-09-30,2019-09-30,2019-10-07,2020-09-28,2020-10-13,,2019-10-30,2015-07-17,2015-07-17,2015-07-17,2016-12-17,2016-12-17,,…,,,,2020-05-14,,,,,,,,,,1950-11-01,,,,,,1950-11-01,,,,,,,,,,,,,,,,,
,,,,1977-08-01,,,,2020-10-19,,,2019-03-16,2019-10-21,,2018-10-16,2019-08-23,2020-08-21,2019-08-19,2019-03-16,,,2019-02-16,,,2019-08-19,2020-08-21,2019-08-23,2019-10-21,2019-10-21,2018-02-01,2020-09-21,2018-09-16,2018-09-16,2018-09-24,2019-03-19,2019-03-19,2018-02-01,…,2020-05-13,,2020-05-13,2020-05-14,,,,,,,,,,1977-08-01,,,1977-08-01,,,1977-08-01,,,,,,,,,,,,,,,,,
,,2008-02-15,,1950-02-01,,,,2020-10-17,2014-04-30,,2018-01-15,,2017-06-21,2013-07-29,2019-12-23,2019-12-18,2019-12-18,2018-01-15,2013-06-29,,2015-08-15,,,2019-12-18,2019-12-18,2019-12-23,2019-12-17,2020-10-09,,2020-01-18,,2013-06-29,,,,,…,,2015-06-20,,2020-05-14,,,,,,,,,,1950-02-01,,,,,,1950-02-01,,,2018-05-28,2015-05-29,,2014-08-18,2018-05-28,2015-01-07,2015-05-29,2014-08-18,2015-01-07,,,,,,


train data shape:	 (1526659, 861)


float64    659
object     192
int64        7
int8         2
bool         1
Name: count, dtype: int64

0

In [None]:
# Convert the polars frame to pandas and cast its object columns to `category`.
# NOTE: df_train is rebound here, so this cell is not idempotent. After it runs,
# df_train is a pandas DataFrame, which to_pandas() cannot convert a second time.
df_train, cat_cols = to_pandas(df_train)
# df_train = reduce_mem_usage(df_train)
# print("train data shape:\t", df_train.shape)
# nums=df_train.select_dtypes(exclude='category').columns
# from itertools import combinations, permutations
# #df_train=df_train[nums]
# nans_df = df_train[nums].isna()
# nans_groups={}
# for col in nums:
#     cur_group = nans_df[col].sum()
#     try:
#         nans_groups[cur_group].append(col)
#     except:
#         nans_groups[cur_group]=[col]
# del nans_df; x=gc.collect()

# def reduce_group(grps):
#     use = []
#     for g in grps:
#         mx = 0; vx = g[0]
#         for gg in g:
#             n = df_train[gg].nunique()
#             if n>mx:
#                 mx = n
#                 vx = gg
#             #print(str(gg)+'-'+str(n),', ',end='')
#         use.append(vx)
#         #print()
#     print('Use these',use)
#     return use

# def group_columns_by_correlation(matrix, threshold=0.8):
#     # Compute the pairwise correlation between columns
#     correlation_matrix = matrix.corr()

#     # Group the columns
#     groups = []
#     remaining_cols = list(matrix.columns)
#     while remaining_cols:
#         col = remaining_cols.pop(0)
#         group = [col]
#         correlated_cols = [col]
#         for c in remaining_cols:
#             if correlation_matrix.loc[col, c] >= threshold:
#                 group.append(c)
#                 correlated_cols.append(c)
#         groups.append(group)
#         remaining_cols = [c for c in remaining_cols if c not in correlated_cols]
    
#     return groups

# uses=[]
# for k,v in nans_groups.items():
#     if len(v)>1:
#             Vs = nans_groups[k]
#             #cross_features=list(combinations(Vs, 2))
#             #make_corr(Vs)
#             grps= group_columns_by_correlation(df_train[Vs], threshold=0.8)
#             use=reduce_group(grps)
#             uses=uses+use
#             #make_corr(use)
#     else:
#         uses=uses+v
#     print('####### NAN count =',k)
# print(uses)
# print(len(uses))
# uses=uses+list(df_train.select_dtypes(include='category').columns)
# print(len(uses))
# df_train=df_train[uses]
# # df_train.drop(['requesttype_4525192L_cnt','max_empl_employedtotal_800L_cnt', 'max_empl_industry_691L_cnt'], axis=1, inplace=True)

In [None]:
# Persist the engineered training frame and the names of its categorical columns.
# Neither to_parquet nor np.save creates missing directories, so make sure the
# target directory exists before writing.
Path('../../dataset').mkdir(parents=True, exist_ok=True)

df_train.to_parquet('../../dataset/depth_data_dash.parquet')

categorical_features = df_train.select_dtypes(include='category').columns
# Save the names as a fixed-width string array. An object-dtype array would be
# pickled by np.save, and np.load refuses pickled data unless allow_pickle=True.
np.save(
    '../../dataset/categorical_features_dash.npy',
    np.asarray(categorical_features, dtype=str),
)
print(len(categorical_features))

display(df_train.dtypes.value_counts())

In [None]:
# Split off the label and the week number used for grouped cross-validation.
y = df_train["target"]
weeks = df_train["WEEK_NUM"]

# Out-of-fold prediction table with one row per training row. assign() builds a
# new frame, so the column is not added onto a slice of df_train (the pattern
# that triggers pandas' chained-assignment warning).
oof = df_train[['WEEK_NUM', 'target']].assign(probability=np.zeros(len(df_train)))
display(oof)

# NOTE: df_train is rebound here, so re-running this cell fails because the
# dropped columns no longer exist.
df_train = df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
# df_train, y = SMOTE().fit_resample(df_train, y)

display(df_train)

In [None]:
# Displays the feature frame again; the previous cell already ends with display(df_train).
df_train

In [None]:
n_split = 5
# Folds are grouped by WEEK_NUM (passed as `groups` below), so all rows of one
# week land in the same fold, while the target rate is kept similar across folds.
cv = StratifiedGroupKFold(n_splits=n_split, shuffle=False)

# NOTE: the original dict also contained "sample_weight": 'balanced' and a second
# "verbose": -1 entry. `sample_weight` is not a LightGBM parameter (sample
# weights are an argument of fit()), so it had no effect on training and is
# removed here. If class re-weighting is wanted, LGBMClassifier supports
# class_weight="balanced" - TODO decide and re-validate, as it changes the model.
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "n_estimators": 2000,  # upper bound; early stopping below normally ends sooner
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees": True,
    'num_leaves': 64,
    "device": "cpu",
}

fitted_models = []
cv_scores = []
# Positional index of the prediction column, for row-wise iloc assignment below.
prob_col = oof.columns.get_loc('probability')

for idx_train, idx_valid in cv.split(df_train, y, groups=weeks):
    X_train, y_train = df_train.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = df_train.iloc[idx_valid], y.iloc[idx_valid]
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set = [(X_valid, y_valid)],
        # Log every 200 rounds; stop after 100 rounds without validation improvement.
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)] )
    fitted_models.append(model)
    y_pred_valid = model.predict_proba(X_valid)[:,1]
    # Write the fold's predictions into the *rows* idx_valid of the probability
    # column. `oof[idx_valid] = ...` indexes columns, not rows, so it never
    # filled `probability`.
    oof.iloc[idx_valid, prob_col] = y_pred_valid
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores.append(auc_score)

print("CV AUC scores: ", cv_scores)
print("AVG CV AUC score: ", np.mean(cv_scores))
print("Maximum CV AUC score: ", max(cv_scores))

In [None]:
# Position of the fold with the highest validation AUC (the first one if tied).
best_idx = np.argmax(cv_scores)
best_idx

In [None]:
# Feature importance of the best fold's model, using the "split" importance type.
# The tall figure (10 x 50) leaves room for the feature labels.
lgb.plot_importance(fitted_models[best_idx], importance_type="split", figsize=(10, 50))
plt.show()

In [None]:
import joblib

# Persist the per-fold models so they can be loaded elsewhere.
joblib.dump(fitted_models, 'lgb_models.joblib')

# Metadata stored alongside the models: run start time, the feature columns the
# models were fitted on, and which of them are categorical.
notebook_info = dict(
    notebook_start_time=start_time_kst,
    description='Add notebook info dict to store cols and cat_cols',
    cols=list(df_train.columns),
    cat_cols=cat_cols,
)
joblib.dump(notebook_info, 'notebook_info.joblib')

In [None]:
# List the working directory, where the joblib artifacts above were written.
!ls -al

In [None]:
# Same cross-validation summary as printed at the end of the training cell.
print("CV AUC scores: ", cv_scores)
print("AVG CV AUC score: ", np.mean(cv_scores))
print("Maximum CV AUC score: ", max(cv_scores))

In [None]:
from pathlib import Path
from typing import Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import auc, roc_curve, roc_auc_score


class Evaluator:
    """Diagnostic plots and a weekly Gini stability score for out-of-fold predictions.

    Args:
        oof: DataFrame with the columns ``WEEK_NUM``, ``target`` and
            ``probability``.
        save_path: Optional directory (``Path`` or ``str``). When given, every
            plot is also written there as a PNG. The directory must already exist.
    """

    def __init__(
            self,
            oof: pd.DataFrame,
            save_path: Optional[Path]=None
        ):

        self.oof = oof
        # Normalise to Path so a plain string directory works as well.
        self.save_path = Path(save_path) if save_path is not None else None

        assert 'WEEK_NUM' in self.oof.columns
        assert 'target' in self.oof.columns
        assert 'probability' in self.oof.columns

    def _save_current_figure(self, filename: str) -> None:
        """Write the current matplotlib figure into ``save_path``, if one was given."""
        if self.save_path is not None:
            plt.savefig(self.save_path / filename)

    def plot_pred(self, is_log: bool=False) -> None:
        """Histogram of predicted probabilities, coloured by target.

        Args:
            is_log: Use a logarithmic y axis.
        """
        _, ax = plt.subplots()
        sns.histplot(data=self.oof, x='probability', hue='target', bins=50, ax=ax)
        if is_log:
            ax.set_yscale('log')
        self._save_current_figure('hist_pred.png')
        plt.show()

    def plot_roc(self) -> None:
        """ROC curve of the predictions, with the AUC in the legend."""
        fpr, tpr, _ = roc_curve(self.oof['target'], self.oof['probability'])
        _, ax = plt.subplots()
        ax.plot(fpr, tpr, label=f'ROC curve (AUC = {auc(fpr, tpr):.2f})')
        ax.plot([0, 1], [0, 1], linestyle='--', color='k', label='Random')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.legend()
        self._save_current_figure('roc_curve.png')
        plt.show()

    def plot_gini(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Plot the Gini coefficient per week and compute the stability score.

        For every ``WEEK_NUM`` the Gini coefficient ``2 * AUC - 1`` is computed,
        a straight line ``gini = a * WEEK_NUM + b`` is fitted, and

            stability = mean(gini) + 88.0 * min(0, a) - 0.5 * std(residuals)

        Returns:
            ``(gini_per_week, outcome)``. ``gini_per_week`` has the columns
            ``WEEK_NUM``, ``gini``, ``regression`` and ``residuals``;
            ``outcome`` is a one-row frame with ``stability``, ``slope`` and
            ``intercept``.

        Raises:
            ValueError: From ``roc_auc_score`` if a week holds only one target class.
        """
        gini_per_week = (
            self.oof.
            groupby('WEEK_NUM')
            .apply(
                lambda g: 2 * roc_auc_score(g['target'], g['probability']) - 1,
                include_groups=False
            )
        )
        gini_per_week.name = 'gini'
        gini_per_week = gini_per_week.reset_index().sort_values('WEEK_NUM')

        linear_regression = LinearRegression()
        linear_regression.fit(gini_per_week[['WEEK_NUM']], gini_per_week[['gini']])
        a = linear_regression.coef_[0].item()
        b = linear_regression.intercept_.item()

        gini_per_week['regression'] = a * gini_per_week['WEEK_NUM'] + b
        gini_per_week['residuals'] = gini_per_week['gini'] - gini_per_week['regression']
        # NOTE(review): pandas .std() is the sample standard deviation (ddof=1).
        # If this score must match an external reference implementation,
        # confirm which ddof that implementation uses.
        stability = gini_per_week['gini'].mean() + 88.0 * min(0, a) - 0.5 * gini_per_week['residuals'].std()

        _, ax = plt.subplots()
        ax.scatter(gini_per_week['WEEK_NUM'], gini_per_week['gini'], alpha=0.5, label='Gini coefficient')
        # Draw the fitted line over the weeks actually present, instead of a
        # hard-coded 0..91 range that is misplaced whenever the data covers
        # different weeks.
        ax.plot(
            gini_per_week['WEEK_NUM'],
            gini_per_week['regression'],
            label=f'y = {a:.4f}x + {b:.4f}',
            color='tab:orange'
        )
        ax.set(
            xlabel='WEEK_NUM',
            ylabel='Gini coefficient',
            ylim=[0, 1],
            title='stability: {:.4f}'.format(stability)
        )
        ax.legend()
        self._save_current_figure('gini_weeks.png')
        plt.show()

        outcome = pd.DataFrame([[stability, a, b]], columns=['stability', 'slope', 'intercept'])
        return gini_per_week, outcome

In [None]:
# Evaluate the out-of-fold predictions. Rows whose probability equals -1 are
# excluded; since `oof` is initialised with zeros in this notebook, the filter
# only has an effect if unpredicted rows are marked with -1.
evaluater = Evaluator(oof.query('probability!=-1'))
gini_per_week, outcome = evaluater.plot_gini()
display(gini_per_week)
print(outcome)

# `logger` and `paths` are not defined anywhere in this notebook, so the
# original cell raised NameError on a fresh kernel. Report with print() and
# write the CSV to the working directory, next to the joblib artifacts.
print(
    f'stability: {outcome["stability"].item()}\n'
    + f'slope: {outcome["slope"].item()}\n'
    + f'intercept: {outcome["intercept"].item()}\n'
)

outcome.to_csv(Path('outcome.csv'), index=False)