In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.pipeline import Pipeline

import FeatureGenerator as fg

from lightgbm import LGBMClassifier

In [2]:
# Relative path to the training CSV (resolved against the current working directory).
data_path = "../inputs/train.csv"
# Read the whole training set into a DataFrame.
df_train = pd.read_csv(data_path)
# Preview the first rows as the cell output.
df_train.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,target
0,28104,50000.0,2,1,1,31,1,2,2,0,...,50332.0,29690.0,30246.0,2200.0,4.0,2300.0,1100.0,1400.0,1200.0,1
1,29094,330000.0,2,2,2,59,0,0,0,0,...,80589.0,76180.0,61693.0,20000.0,3500.0,19000.0,15000.0,3000.0,2139.0,0
2,11280,220000.0,2,1,2,41,-1,-1,-2,-2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,28435,50000.0,2,2,1,45,0,0,0,0,...,8469.0,8411.0,8361.0,2124.0,2037.0,1130.0,295.0,302.0,296.0,0
4,10873,480000.0,2,3,1,42,-2,-2,-2,-2,...,0.0,790.0,0.0,0.0,0.0,0.0,790.0,0.0,0.0,0


In [3]:
# Columns to declare to the model as categorical: three demographic fields
# followed by the PAY_* fields (note there is no PAY_1 in this list).
cat_cols = ["SEX", "EDUCATION", "MARRIAGE"] + [
    f"PAY_{suffix}" for suffix in (0, 2, 3, 4, 5, 6)
]

In [4]:
# Instantiate the feature-generation helpers from the FeatureGenerator module.
# The six monthly PAY_AMT* / BILL_AMT* column names are built once; each helper
# receives its own copy so no list object is shared between generators.
pay_amt_cols = [f"PAY_AMT{month}" for month in range(1, 7)]
bill_amt_cols = [f"BILL_AMT{month}" for month in range(1, 7)]

# PAY_AMTn as numerators, BILL_AMTn as denominators.
fbf = fg.FeatureByFeature(
    features_num=list(pay_amt_cols),
    features_denom=list(bill_amt_cols),
)

# One DiffFeatures helper per column family.
diff1 = fg.DiffFeatures(features=list(pay_amt_cols))

diff2 = fg.DiffFeatures(features=list(bill_amt_cols))

# Grouped features keyed on EDUCATION, over LIMIT_BAL and every BILL_AMT column.
agpf = fg.GroupFeatures(
    group_columns=["EDUCATION"],
    features=["LIMIT_BAL", *bill_amt_cols],
)

In [6]:
# Target as a NumPy array; features are everything except the row ID and the target.
y_train = df_train["target"].values
X_train = df_train.drop(columns=["ID", "target"])

In [7]:
# Fit the FeatureByFeature generator on the training frame, then rebind X_train
# to the transformed result.
# NOTE: X_train is overwritten, so re-running this cell on its own feeds the
# already-transformed frame back into the generator.
fbf.fit(X_train)
X_train = fbf.transform(X_train)

In [8]:
# Fit the PAY_AMT* DiffFeatures generator on the current frame and rebind
# X_train to its output.
# NOTE: not idempotent — X_train is replaced by the transformed frame.
diff1.fit(X_train)
X_train = diff1.transform(X_train)

In [9]:
# Fit the BILL_AMT* DiffFeatures generator on the current frame and rebind
# X_train to its output.
# NOTE: not idempotent — X_train is replaced by the transformed frame.
diff2.fit(X_train)
X_train = diff2.transform(X_train)

In [10]:
# Fit the EDUCATION-grouped GroupFeatures generator on the current frame and
# rebind X_train to its output.
# NOTE: not idempotent — X_train is replaced by the transformed frame.
agpf.fit(X_train)
X_train = agpf.transform(X_train)

In [11]:
# Translate each categorical column name into its positional index in X_train,
# which is the form handed to LightGBM to identify categorical columns.
cat_idx = [X_train.columns.get_loc(col_name) for col_name in cat_cols]

In [12]:
# Baseline LightGBM classifier used only to rank features by importance.
# class_weight="balanced" asks the estimator to re-weight the classes, and the
# fixed random_state keeps the run reproducible.
# NOTE(review): LightGBM documents the categorical setting as
# `categorical_feature` (also accepted as an argument of `fit`); verify that the
# `categorical_features` spelling passed here is actually honoured when fitting
# on a pandas DataFrame.
base_estimator = LGBMClassifier(class_weight="balanced",
                               categorical_features=cat_idx,
                               random_state=42)

In [13]:
# Train the baseline model, declaring the categorical columns at fit time.
# `fit(..., categorical_feature=...)` is the documented scikit-learn-API entry
# point. With its default of 'auto' and a pandas DataFrame as input, LightGBM
# takes the categorical columns from the frame's `category` dtypes, so the
# integer-coded columns listed in cat_idx are not reliably treated as
# categorical when only a constructor keyword is supplied.
# NOTE(review): LightGBM treats negative values in categorical features as
# missing; the PAY_* columns contain -1/-2 codes, so consider shifting them to
# non-negative integers if those levels must stay distinct.
base_estimator.fit(X_train, y_train, categorical_feature=cat_idx)

LGBMClassifier(categorical_features=[1, 2, 3, 5, 6, 7, 8, 9, 10],
               class_weight='balanced', random_state=42)

In [17]:
# Pair every training column with the importance reported by the fitted model.
fea_imp_ = pd.DataFrame(
    {
        "cols": X_train.columns,
        "fea_imp": base_estimator.feature_importances_,
    }
)
# Show only the features with non-zero importance, most important first.
fea_imp_[fea_imp_["fea_imp"].gt(0)].sort_values(by=["fea_imp"], ascending=False)

Unnamed: 0,cols,fea_imp
34,BILL_AMT2_minus_BILL_AMT1,121
27,PAY_AMT5/BILL_AMT5,115
28,PAY_AMT6/BILL_AMT6,109
35,BILL_AMT3_minus_BILL_AMT2,104
26,PAY_AMT4/BILL_AMT4,96
...,...,...
46,EDUCATION_LIMIT_BAL_max,4
47,EDUCATION_BILL_AMT1_max,3
40,EDUCATION_BILL_AMT1_mean,3
39,EDUCATION_LIMIT_BAL_mean,2


In [18]:
# Order the table from most to least important and renumber the rows 0..n-1.
fea_imp_ = (
    fea_imp_
    .sort_values(by="fea_imp", ascending=False)
    .reset_index(drop=True)
)

In [19]:
# Add, per row: the grand total of importances, the running sum down the
# (already sorted) table, and that running sum as a fraction of the total.
total_importance = fea_imp_["fea_imp"].sum()
fea_imp_ = fea_imp_.assign(
    total_importance=total_importance,
    cumulative_sum=lambda frame: frame["fea_imp"].cumsum(),
    relative_cumulative_importance=lambda frame: frame["cumulative_sum"] / frame["total_importance"],
)

In [20]:
# Preview the top rows of the importance table, including the cumulative columns.
fea_imp_.head()

Unnamed: 0,cols,fea_imp,total_importance,cumulative_sum,relative_cumulative_importance
0,BILL_AMT2_minus_BILL_AMT1,121,3000,121,0.040333
1,PAY_AMT5/BILL_AMT5,115,3000,236,0.078667
2,PAY_AMT6/BILL_AMT6,109,3000,345,0.115
3,BILL_AMT3_minus_BILL_AMT2,104,3000,449,0.149667
4,PAY_AMT4/BILL_AMT4,96,3000,545,0.181667


In [21]:
# (rows, columns) of the importance table: one row per candidate feature.
fea_imp_.shape

(81, 5)

In [23]:
# Keep the rows whose cumulative share of total importance is at most 0.82,
# i.e. the top-ranked features that together account for up to 82% of it.
within_budget = fea_imp_["relative_cumulative_importance"].le(0.82)
df_selection = fea_imp_.loc[within_budget]

In [24]:
# (rows, columns) of the selection: the row count is the number of features kept.
df_selection.shape

(34, 5)

In [25]:
# Names of the selected features as a plain Python list, in importance order.
final_columns = df_selection["cols"].tolist()
final_columns

['BILL_AMT2_minus_BILL_AMT1',
 'PAY_AMT5/BILL_AMT5',
 'PAY_AMT6/BILL_AMT6',
 'BILL_AMT3_minus_BILL_AMT2',
 'PAY_AMT4/BILL_AMT4',
 'PAY_AMT1/BILL_AMT1',
 'PAY_AMT4_minus_PAY_AMT3',
 'PAY_AMT2/BILL_AMT2',
 'BILL_AMT5_minus_BILL_AMT4',
 'BILL_AMT4_minus_BILL_AMT3',
 'AGE',
 'BILL_AMT6_minus_BILL_AMT5',
 'PAY_AMT3/BILL_AMT3',
 'PAY_AMT2_minus_PAY_AMT1',
 'PAY_AMT6_minus_PAY_AMT5',
 'PAY_AMT5_minus_PAY_AMT4',
 'LIMIT_BAL/EDUCATION_max',
 'PAY_AMT2',
 'PAY_AMT3_minus_PAY_AMT2',
 'LIMIT_BAL/EDUCATION_mean',
 'PAY_AMT1',
 'BILL_AMT5/EDUCATION_min',
 'LIMIT_BAL',
 'PAY_AMT6',
 'PAY_0',
 'PAY_AMT3',
 'PAY_AMT4',
 'BILL_AMT1',
 'PAY_AMT5',
 'BILL_AMT1/EDUCATION_max',
 'BILL_AMT4/EDUCATION_min',
 'BILL_AMT2/EDUCATION_min',
 'BILL_AMT3/EDUCATION_min',
 'BILL_AMT6/EDUCATION_min']