In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,ParameterGrid
#import optuna.integration.lightgbm as lgb
from sklearn.model_selection import cross_val_score
import optuna
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import roc_auc_score
import featuretools as ft
import lightgbm as lgb

%matplotlib inline

In [34]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,MinMaxScaler,RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,ParameterGrid
from sklearn.metrics import accuracy_score,f1_score

In [None]:
# Raw-data loading: every table is read in full, ordered by its key column(s)
# and then cut down to a small sample so the rest of the notebook runs quickly.

# Last row label kept per table. `.loc` slices are label-inclusive, so 1000
# keeps rows 0..1000 (1001 rows). Set to None to work with the complete tables.
SAMPLE_LAST_LABEL = 1000


def load_sorted_sample(csv_path, sort_keys, last_label=SAMPLE_LAST_LABEL):
    """Read a CSV, sort it by its key column(s) and return the leading rows.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    sort_keys : str or list of str
        Column name(s) passed to ``DataFrame.sort_values``.
    last_label : int or None, default SAMPLE_LAST_LABEL
        Last (inclusive) row label to keep after the index has been reset.
        ``None`` returns the whole sorted table.

    Returns
    -------
    pandas.DataFrame
        An independent frame with a fresh 0..n index.
    """
    table = pd.read_csv(csv_path).sort_values(sort_keys).reset_index(drop=True)
    if last_label is None:
        return table
    # .copy() detaches the sample from the full table, so adding columns to it
    # later does not raise SettingWithCopyWarning.
    return table.loc[:last_label, :].copy()


app_train = load_sorted_sample('application_train.csv', 'SK_ID_CURR')
app_test = load_sorted_sample('application_test.csv', 'SK_ID_CURR')
bureau_balance = load_sorted_sample('bureau_balance.csv', 'SK_ID_BUREAU')
bureau = load_sorted_sample('bureau.csv', ['SK_ID_CURR', 'SK_ID_BUREAU'])
credit = load_sorted_sample('credit_card_balance.csv', ['SK_ID_CURR', 'SK_ID_PREV'])
installments = load_sorted_sample('installments_payments.csv', ['SK_ID_CURR', 'SK_ID_PREV'])
cash = load_sorted_sample('POS_CASH_balance.csv', ['SK_ID_CURR', 'SK_ID_PREV'])
previous = load_sorted_sample('previous_application.csv', ['SK_ID_CURR', 'SK_ID_PREV'])

In [None]:
# Tag every row with the split it came from so the combined frame can be
# separated into train and test again later.
app_train['set'] = 'train'
app_test['set'] = 'test'
# The test split has no labels; give it an all-NaN TARGET column so both
# frames share the same set of columns.
app_test["TARGET"] = np.nan
# Stack train on top of test with a fresh 0..n index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
app = pd.concat([app_train, app_test], ignore_index=True)

In [None]:
# Create an empty featuretools EntitySet with id 'clients'; the tables and
# their relationships are attached to it in the following cells.
es = ft.EntitySet(id="clients")

In [None]:
# Tables that already carry a unique key column, which is used as the index.
# Note: the dataframe registered as 'app_train' is `app`, i.e. the train and
# test applications combined.
for frame_name, frame, key_column in [
    ('app_train', app, 'SK_ID_CURR'),
    ('bureau', bureau, 'SK_ID_BUREAU'),
    ('previous', previous, 'SK_ID_PREV'),
]:
    es = es.add_dataframe(dataframe_name=frame_name, dataframe=frame, index=key_column)

# Tables without a unique key: make_index=True asks featuretools to create
# an index column under the given name.
for frame_name, frame, new_index_name in [
    ('bureau_balance', bureau_balance, 'bureaubalance_index'),
    ('cash', cash, 'cash_index'),
    ('installments', installments, 'installments_index'),
    ('credit', credit, 'credit_index'),
]:
    es = es.add_dataframe(dataframe_name=frame_name, dataframe=frame,
                          make_index=True, index=new_index_name)

In [None]:
# Parent -> child links between the tables of the entity set.
# NOTE(review): in featuretools >= 1.0 add_relationship appears to return the
# EntitySet itself, so the r_* names below probably all alias `es` - confirm
# before relying on them as relationship objects.

# Applications -> bureau records (joined on SK_ID_CURR)
r_app_bureau = es.add_relationship(
    parent_dataframe_name='app_train', parent_column_name='SK_ID_CURR',
    child_dataframe_name='bureau', child_column_name='SK_ID_CURR',
)
# Bureau records -> bureau balance rows (joined on SK_ID_BUREAU)
r_bureau_balance = es.add_relationship(
    parent_dataframe_name='bureau', parent_column_name='SK_ID_BUREAU',
    child_dataframe_name='bureau_balance', child_column_name='SK_ID_BUREAU',
)
# Applications -> previous applications (joined on SK_ID_CURR)
r_app_previous = es.add_relationship(
    parent_dataframe_name='app_train', parent_column_name='SK_ID_CURR',
    child_dataframe_name='previous', child_column_name='SK_ID_CURR',
)
# Previous applications -> cash, installments and credit rows (joined on SK_ID_PREV)
r_previous_cash = es.add_relationship(
    parent_dataframe_name='previous', parent_column_name='SK_ID_PREV',
    child_dataframe_name='cash', child_column_name='SK_ID_PREV',
)
r_previous_installments = es.add_relationship(
    parent_dataframe_name='previous', parent_column_name='SK_ID_PREV',
    child_dataframe_name='installments', child_column_name='SK_ID_PREV',
)
r_previous_credit = es.add_relationship(
    parent_dataframe_name='previous', parent_column_name='SK_ID_PREV',
    child_dataframe_name='credit', child_column_name='SK_ID_PREV',
)

In [None]:
# Run Deep Feature Synthesis on the application table, aggregating the child
# tables up to two relationship levels deep. features_only=False means the
# feature matrix is computed, not just the feature definitions.
feature_matrix, feature_name = ft.dfs(
    entityset=es,
    target_dataframe_name='app_train',
    agg_primitives=['sum', 'count', 'min', 'max', 'mean', 'mode'],
    max_depth=2,
    features_only=False,
    verbose=True,
)

In [None]:
# Print a summary of the generated feature matrix (size, column dtypes, memory usage)
feature_matrix.info()

In [None]:
# Partition the feature matrix by dtype: numeric columns go to df_num and
# pandas 'category' columns go to df_cat. Columns of any other dtype end up
# in neither frame.
df_num, df_cat = (
    feature_matrix.select_dtypes(include=dtype_group)
    for dtype_group in ('number', 'category')
)

In [None]:
# Column names of the numeric and the categorical feature groups, each as a
# NumPy array taken from the corresponding dtype-specific frame.
numeric_features, categorical_features = df_num.columns.values, df_cat.columns.values

In [42]:
# Build the preprocessing + model pipeline

## Feature lists
# The application table fed to featuretools carries the label (TARGET) and
# the train/test marker (set). Neither may reach the model: TARGET would leak
# the answer and 'set' is bookkeeping only. isin() tolerates their absence.
non_feature_columns = ['TARGET', 'set']
numeric_features = df_num.columns[~df_num.columns.isin(non_feature_columns)].values
categorical_features = df_cat.columns[~df_cat.columns.isin(non_feature_columns)].values

### Numeric columns: fill gaps with the column mean, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # missing-value imputation
    ('scaler', StandardScaler())  # zero mean / unit variance
])

### Categorical columns: fill gaps with the most frequent value, then one-hot encode
categorical_transformer = Pipeline(steps=[
    # The default strategy ('mean') raises on non-numeric data, so the
    # strategy must be set explicitly for categorical columns.
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories unseen during fit are encoded as all zeros instead of raising.
    ('transformer', OneHotEncoder(handle_unknown='ignore'))
])

### Route each feature group through its transformer; unlisted columns are dropped
preprocesser = ColumnTransformer(transformers=[
    ('num_transform', numeric_transformer, numeric_features),
    ('cat_transform', categorical_transformer, categorical_features)
])

# Full pipeline: preprocessing followed by the classifier
pipeline = Pipeline(steps=[
    ('preprocesser', preprocesser),
    ('classifier', lgb.LGBMClassifier())  # hyperparameters are tuned later
])

In [43]:
# Switch scikit-learn's estimator repr to the HTML diagram view, then show
# the pipeline as this cell's output.
set_config(display="diagram")
pipeline