In [1]:
import os
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Load the competition train/test files, the sample submission and the
# original 20k-row loan dataset from the local data folder.

# Build the folder path with os.path.join so it resolves on every OS.
# The previous literal r".\storage\data" contains backslashes, which are only
# path separators on Windows; on Windows this expression yields the same string.
DATA_DIR=os.path.join(".","storage","data")
# Legacy alias: the notebook originally stored the folder under the name `dir`
# (which shadows the builtin dir()). Kept in case other cells still read it.
dir=DATA_DIR

train=pd.read_csv(os.path.join(DATA_DIR,"train.csv"))
test=pd.read_csv(os.path.join(DATA_DIR,"test.csv"))
sub_=pd.read_csv(os.path.join(DATA_DIR,"sample_submission.csv"))
orig=pd.read_csv(os.path.join(DATA_DIR,"loan_dataset_20000.csv"))

print("train shape :", train.shape)
print("test shape :",test.shape)
print("orig shape :",orig.shape)
train.head()

train shape : (593994, 13)
test shape : (254569, 12)
orig shape : (20000, 22)


Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [3]:
# Name of the label column.
TARGET = 'loan_paid_back'

# Columns listed as categorical.
CATS = [
    'gender',
    'marital_status',
    'education_level',
    'employment_status',
    'loan_purpose',
    'grade_subgrade',
]

# Every column of `train` except the row id and the label, in frame order.
BASE = list(filter(lambda name: name != 'id' and name != TARGET, train.columns))

In [4]:
# Step_1
# Pairwise interaction features: for every unordered pair of base columns, add
# a string column "<value of col1>_<value of col2>" to train, test and orig.
from itertools import combinations

# Names of the interaction columns, in combinations() order.
INTER=[f"{col1}_{col2}" for col1,col2 in combinations(BASE,2)]

for df in [train,test,orig]:
    # Convert each base column to text once per frame. Doing the conversion
    # inside the pair loop repeated it for every pair a column takes part in.
    as_text={col:df[col].astype(str) for col in BASE}
    for col1,col2 in combinations(BASE,2):
        df[f"{col1}_{col2}"]=as_text[col1]+"_"+as_text[col2]

print(f"{len(INTER)} Features.")

55 Features.


In [5]:
# Statistics transferred from the original dataset: for every base column, add
# the mean target and the row count that `orig` has for the row's value of that
# column. Values that do not occur in `orig` get NaN.
ORIG=[]

for col in BASE:
    target_by_value=orig.groupby(col)[TARGET]
    orig_stats={
        f"orig_mean_{col}":target_by_value.mean(),
        f"orig_count_{col}":target_by_value.size(),
    }
    for stat_name,stat_by_value in orig_stats.items():
        # Look the statistic up per row with Series.map instead of merging and
        # rebinding the frame: assigning a column is safe to re-run (a second
        # merge would produce "_x"/"_y" suffixed duplicates and break
        # train[FEATURES]) and avoids copying the whole frame for each feature.
        train[stat_name]=train[col].map(stat_by_value)
        test[stat_name]=test[col].map(stat_by_value)
        ORIG.append(stat_name)

print(f"{len(ORIG) }Features")

22Features


In [6]:
# Full model input: base columns, then orig-derived statistics, then the
# pairwise interaction columns.
FEATURES = [*BASE, *ORIG, *INTER]
print(len(FEATURES), 'Features.')

88 Features.


In [7]:
# Feature matrix and label vector taken from the training frame.
X, y = train[FEATURES], train[TARGET]

In [8]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold,KFold
from lightgbm import LGBMClassifier
import lightgbm as lgb
import random
from sklearn.base import  BaseEstimator,TransformerMixin
import xgboost as xgb
from xgboost import XGBClassifier
import catboost as cat
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [12]:

class TargetEncoder(BaseEstimator,TransformerMixin):
    """Target encoder producing one ``TE_{col}_{agg}`` column per column and aggregate.

    ``fit`` stores, for every encoded column, the requested aggregates of the
    target per category over the whole fitting set; ``transform`` maps those
    onto new data and fills unseen categories with the global aggregate.

    ``fit_transform`` additionally encodes the fitting rows out-of-fold: each
    row's value comes from a mapping computed on the other ``cv - 1`` folds, so
    a row never sees its own target. For the ``"mean"`` aggregate the fold
    mapping is shrunk towards the fold's global mean.

    Parameters
    ----------
    cols_to_encode : list of str
        Columns of ``X`` to encode.
    aggs : list of str
        Aggregation names understood by pandas ``agg`` (e.g. ``"mean"``).
    cv : int
        Number of ``KFold`` splits used by ``fit_transform``.
    smooth : "auto" or number
        Shrinkage weight ``m`` in ``(n * mean + m * global) / (n + m)``.
        ``"auto"`` uses ``mean within-category variance / variance of the
        category means`` (0 when the latter is not positive).
    drop_original : bool
        If true, the source columns are removed from the returned frame.

    Notes
    -----
    ``X`` must be a DataFrame and ``y`` a Series sharing ``X``'s index.

    NOTE(review): smoothing is applied only to the out-of-fold values built by
    ``fit_transform``; ``transform`` uses the raw per-category aggregates
    stored by ``fit``. Confirm this asymmetry is intended.
    """

    def __init__(self,cols_to_encode,aggs=["mean"],cv=5,smooth="auto",drop_original=False):
        self.cols_to_encode=cols_to_encode
        self.aggs=aggs
        self.cv=cv
        self.smooth=smooth
        self.drop_original=drop_original
        # Fitted state: {column: {agg: Series(category -> value)}} and {agg: global value}.
        self.mappings_={}
        self.global_stats={}

    def fit(self,X,y):
        """Learn per-category target aggregates on the full (X, y); returns ``self``."""
        # Start from a clean state so a refit never keeps entries from an earlier fit.
        self.mappings_={}
        self.global_stats={}

        for agg_func in self.aggs:
            self.global_stats[agg_func]=y.agg(agg_func)

        for col in self.cols_to_encode:
            # Group the target directly by the column; no copy of X is needed.
            target_by_category=y.groupby(X[col])
            self.mappings_[col]={
                agg_func:target_by_category.agg(agg_func) for agg_func in self.aggs
            }
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the ``TE_*`` columns from the fitted mappings added."""
        X_transformed=X.copy()
        for col in self.cols_to_encode:
            for agg_func in self.aggs:
                col_name=f"TE_{col}_{agg_func}"
                # Fill before assigning: calling fillna(inplace=True) on a column
                # selection is chained assignment, which does not update the
                # frame under pandas Copy-on-Write.
                X_transformed[col_name]=(
                    X_transformed[col]
                    .map(self.mappings_[col][agg_func])
                    .fillna(self.global_stats[agg_func])
                )
        if self.drop_original:
            X_transformed=X_transformed.drop(columns=self.cols_to_encode)

        return X_transformed

    def _fold_mapping(self,X_train,y_train,col,agg_func,fold_global):
        """Category -> value mapping for one column/aggregate on one training fold.

        Only the ``"mean"`` aggregate is smoothed towards ``fold_global``.
        """
        target_by_category=y_train.groupby(X_train[col])
        mapping=target_by_category.agg(agg_func)
        if agg_func!="mean":
            return mapping

        counts=target_by_category.count()
        m=self.smooth
        if isinstance(m,str) and m=="auto":
            variance_between=mapping.var()
            avg_variance_within=target_by_category.var().mean()
            # A non-positive (or NaN) between-category variance disables smoothing.
            m=avg_variance_within/variance_between if variance_between>0 else 0
        return (counts*mapping+m*fold_global)/(m+counts)

    def fit_transform(self,X,y):
        """Fit on (X, y) and return ``X`` with out-of-fold ``TE_*`` columns added.

        The full-data mappings from ``fit`` are still stored for later
        ``transform`` calls; the values returned here come from the fold-wise
        mappings only.
        """
        self.fit(X,y)
        encoded_features=pd.DataFrame(index=X.index)
        kf=KFold(n_splits=self.cv,shuffle=True,random_state=42)
        for train_idx,val_idx in kf.split(X,y):
            X_train,y_train=X.iloc[train_idx],y.iloc[train_idx]
            X_val=X.iloc[val_idx]

            # The fold-level fallback depends only on the aggregate, not on the column.
            fold_global_stats={agg_func:y_train.agg(agg_func) for agg_func in self.aggs}

            for col in self.cols_to_encode:
                for agg_func in self.aggs:
                    # Everything below runs once per (column, aggregate) pair, so
                    # every requested aggregate gets its own encoded column.
                    col_name=f"TE_{col}_{agg_func}"
                    fold_global=fold_global_stats[agg_func]
                    mapping=self._fold_mapping(X_train,y_train,col,agg_func,fold_global)
                    encoded_values=X_val[col].map(mapping)
                    encoded_features.loc[X_val.index,col_name]=encoded_values.fillna(fold_global)

        X_transformed=pd.concat([X,encoded_features],axis=1)
        if self.drop_original:
            X_transformed=X_transformed.drop(columns=self.cols_to_encode)
        return X_transformed
            

In [9]:
# The fold-wise encoded data (`mapping`) presumably takes a long time to compute,
# so it was computed once and saved to a file; this cell loads that file so the
# mapping can be reused directly instead of being recomputed.
# NOTE(review): this cell needs ./storage/mapping.joblib to exist already; the
# dump call below is commented out, so on a fresh checkout the file has to be
# produced first (confirm the intended order relative to the cell that builds
# `mapping`).
# NOTE(review): joblib files are pickle-based — only load files you created yourself.
import joblib
os.makedirs("./storage",exist_ok=True)
# joblib.dump(mapping, "./storage/mapping.joblib")
mapping=joblib.load("./storage/mapping.joblib")

In [33]:
# Target-encode every outer fold once, up front: Optuna trials and regular
# training can then reuse the encoded frames instead of re-encoding each time.
cv,SEED=5,42
kf=StratifiedKFold(n_splits=cv,shuffle=True,random_state=SEED)

# One entry per outer fold: [(x_train, y_train), (x_val, y_val), x_test].
mapping=[]
for i,(train_idx,val_idx) in enumerate(kf.split(train,train[TARGET])):
    print(f"< {i+1}/{cv} > is Encoding !")

    fold_train,fold_val=train.iloc[train_idx],train.iloc[val_idx]
    x_train,y_train=fold_train[FEATURES],fold_train[TARGET]
    x_val,y_val=fold_val[FEATURES],fold_val[TARGET]

    # The training part is encoded out-of-fold by fit_transform; validation and
    # test rows are encoded with the mappings fitted on the whole training part.
    TE=TargetEncoder(cols_to_encode=INTER,cv=5,smooth="auto",aggs=["mean"],drop_original=True)
    x_train=TE.fit_transform(x_train,y_train)
    x_val,x_test=TE.transform(x_val),TE.transform(test[FEATURES])

    mapping.append([(x_train,y_train),(x_val,y_val),x_test])
print("mapping Created")

< 1/5 > is Encoding !
< 2/5 > is Encoding !
< 3/5 > is Encoding !
< 4/5 > is Encoding !
< 5/5 > is Encoding !
mapping Created
