In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.pipeline import Pipeline

import FeatureGenerator as fg

from lightgbm import LGBMClassifier

In [None]:
# Location of the training CSV, relative to the notebook's working directory.
data_path = "../inputs/train.csv"

# Read the raw training data and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer=data_path)
df_train.head(n=5)

In [None]:
# Categorical variables that will be declared as such to the model.
# The PAY_* suffixes are 0 and 2..6 -- the list has no PAY_1 entry.
pay_cols = [f'PAY_{suffix}' for suffix in (0, 2, 3, 4, 5, 6)]
cat_cols = ['SEX', 'EDUCATION', 'MARRIAGE', *pay_cols]

In [None]:
# Initialize the FeatureGenerator classes that will be used for feature generation.
pay_amt_cols = ["PAY_AMT{}".format(i) for i in range(1, 7)]    # PAY_AMT1 .. PAY_AMT6
bill_amt_cols = ["BILL_AMT{}".format(i) for i in range(1, 7)]  # BILL_AMT1 .. BILL_AMT6

# Every constructor gets its own list object, so no two generators share a column list.
fbf = fg.FeatureByFeature(features_num=list(pay_amt_cols),
                          features_denom=list(bill_amt_cols))

diff1 = fg.DiffFeatures(features=list(pay_amt_cols))

diff2 = fg.DiffFeatures(features=list(bill_amt_cols))

# Grouped by EDUCATION, over LIMIT_BAL plus the six BILL_AMT columns.
agpf = fg.GroupFeatures(group_columns=["EDUCATION"],
                        features=["LIMIT_BAL", *bill_amt_cols])

In [None]:
# Predictors: every column except "ID" and the "target" label.
X_train = df_train.drop(columns=["ID", "target"])
# Labels: the "target" column as a plain array.
y_train = df_train["target"].values

In [None]:
# The categorical columns are handed to LightGBM by position, so look up where
# each name in cat_cols sits among X_train's columns.
cat_idx = [X_train.columns.get_loc(col_name) for col_name in cat_cols]

In [None]:
# LightGBM classifier used as the final step of the pipeline.
# NOTE(review): LightGBM documents this setting as `categorical_feature` (the
# scikit-learn wrapper also takes it as a `fit` argument). `categorical_features`
# is kept exactly as written -- confirm the installed version honours it rather
# than warning and ignoring it.
lgbm_params = {
    "class_weight": "balanced",
    "categorical_features": cat_idx,
    "random_state": 42,
}
base_estimator = LGBMClassifier(**lgbm_params)

In [None]:
# Steps run in the listed order: the four feature generators, then the classifier.
pipeline_steps = [
    ("feature_by_feature", fbf),
    ("diff_features_1", diff1),
    ("diff_features_2", diff2),
    ("aggroup_features", agpf),
    ("estimator", base_estimator),
]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the whole pipeline (feature generators, then the classifier) on the training data.
pipe.fit(X_train, y_train)

In [None]:
# Importance of each attribute, as learned by the fitted LightGBM estimator.
# The scikit-learn wrapper (LGBMClassifier) has no `feature_name()` method; the
# names of the features the model was trained on come from the underlying
# Booster, which is available as `booster_` once the estimator has been fitted.
fea_imp_ = pd.DataFrame({
    'cols': base_estimator.booster_.feature_name(),
    'fea_imp': base_estimator.feature_importances_,
})

# Show only the features the model actually used, most important first.
fea_imp_.loc[fea_imp_.fea_imp > 0].sort_values(by=['fea_imp'], ascending=False)

In [None]:
# Fit the group-feature generator on its own, directly on X_train (outside the pipeline).
# NOTE(review): `agpf` is also the "aggroup_features" pipeline step, so this
# presumably replaces whatever state it learned during `pipe.fit` -- confirm that is intended.
agpf.fit(X_train)

In [None]:
# Apply the fitted group-feature generator and rebind X_train to its output.
# NOTE(review): because X_train is overwritten, re-running this cell feeds the
# already-transformed data back into the transformer (the cell is not idempotent).
X_train = agpf.transform(X_train)

In [None]:
# Display the transformed result for inspection.
X_train