In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,ParameterGrid
#import optuna.integration.lightgbm as lgb
from sklearn.model_selection import cross_val_score
import optuna
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import roc_auc_score
import featuretools as ft
import lightgbm as lgb

%matplotlib inline

In [34]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,MinMaxScaler,RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,ParameterGrid
from sklearn.metrics import accuracy_score,f1_score

In [None]:
app_train = pd.read_csv('application_train.csv').sort_values('SK_ID_CURR').reset_index(drop = True).loc[:1000, :]
app_test = pd.read_csv('application_test.csv').sort_values('SK_ID_CURR').reset_index(drop = True).loc[:1000, :]
bureau_balance = pd.read_csv('bureau_balance.csv').sort_values('SK_ID_BUREAU').reset_index(drop = True).loc[:1000, :]
bureau = pd.read_csv('bureau.csv').sort_values(['SK_ID_CURR', 'SK_ID_BUREAU']).reset_index(drop = True).loc[:1000, :]
credit = pd.read_csv('credit_card_balance.csv').sort_values(['SK_ID_CURR', 'SK_ID_PREV']).reset_index(drop = True).loc[:1000, :]
installments =  pd.read_csv('installments_payments.csv').sort_values(['SK_ID_CURR', 'SK_ID_PREV']).reset_index(drop = True).loc[:1000, :]
cash = pd.read_csv('POS_CASH_balance.csv').sort_values(['SK_ID_CURR', 'SK_ID_PREV']).reset_index(drop = True).loc[:1000, :]
previous = pd.read_csv('previous_application.csv').sort_values(['SK_ID_CURR', 'SK_ID_PREV']).reset_index(drop = True).loc[:1000, :]

In [None]:
# Add identifying column
app_train['set'] = 'train'
app_test['set'] = 'test'
app_test["TARGET"] = np.nan
# Append the dataframes
app = app_train.append(app_test, ignore_index = True)

In [None]:
#Entity set with id applications
es = ft.EntitySet(id = 'clients')

In [None]:
#Entitiy with a unique index
es = es.add_dataframe(dataframe_name = 'app_train', dataframe = app, index = 'SK_ID_CURR')
es = es.add_dataframe(dataframe_name = 'bureau', dataframe = bureau, index = 'SK_ID_BUREAU')
es = es.add_dataframe(dataframe_name = 'previous', dataframe = previous, index = 'SK_ID_PREV')
#Entities that do not have a unique index
es = es.add_dataframe(dataframe_name = 'bureau_balance', dataframe = bureau_balance,make_index = True, index = 'bureaubalance_index')
es = es.add_dataframe(dataframe_name = 'cash', dataframe = cash,make_index = True, index = 'cash_index')
es = es.add_dataframe(dataframe_name = 'installments', dataframe = installments,make_index = True, index = 'installments_index')
es = es.add_dataframe(dataframe_name = 'credit', dataframe = credit, make_index = True, index = 'credit_index')

In [None]:
#Relationship between app and breau
r_app_bureau = es.add_relationship('app_train','SK_ID_CURR','bureau','SK_ID_CURR')
#Relationship between bureau and bureau balance
r_bureau_balance = es.add_relationship('bureau','SK_ID_BUREAU','bureau_balance','SK_ID_BUREAU')
# Relationship between current app and previous apps
r_app_previous = es.add_relationship('app_train','SK_ID_CURR','previous','SK_ID_CURR')
# Relationships between previous apps and cash, installments, and credit
r_previous_cash = es.add_relationship('previous','SK_ID_PREV','cash','SK_ID_PREV')
r_previous_installments = es.add_relationship('previous','SK_ID_PREV','installments','SK_ID_PREV')
r_previous_credit = es.add_relationship('previous','SK_ID_PREV','credit','SK_ID_PREV')

In [79]:
feature_matrix,feature_name = ft.dfs(entityset=es,target_dataframe_name='app_train',
                                     agg_primitives = ['sum', 'count', 'min', 'max', 'mean','mode'],
                                     max_depth = 2, features_only = False, verbose = True)

Built 1211 features
Elapsed: 00:05 | Progress: 100%|██████████


In [45]:
feature_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2002 entries, 100002 to 106873
Columns: 1211 entries, TARGET to SUM(credit.previous.SELLERPLACE_AREA)
dtypes: Int64(6), category(93), float64(1073), int64(39)
memory usage: 17.3 MB


In [123]:
df_num = feature_matrix.select_dtypes(include='number')
df_cat = feature_matrix.select_dtypes(include='category')
df_num1 = df_num.drop(columns=['TARGET'])

In [125]:
numeric_features = []
categorical_features = []  
numeric_features =  df_num.columns.values
#numeric_features
categorical_features = df_cat.columns.values
numeric_features1 = df_num1.columns.values
numeric_features1

array(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', ...,
       'SUM(credit.previous.RATE_INTEREST_PRIMARY)',
       'SUM(credit.previous.RATE_INTEREST_PRIVILEGED)',
       'SUM(credit.previous.SELLERPLACE_AREA)'], dtype=object)

In [163]:
X = pd.DataFrame(feature_matrix.drop(columns=['TARGET']))

y = pd.DataFrame(feature_matrix['TARGET'])
X_train,X_valid,y_train,y_valid = train_test_split(X,y,shuffle=True)

In [165]:
lgb_train = lgb.Dataset(X_train, y_train,
                        categorical_feature=categorical_features,
                        free_raw_data=False)

lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train,
                       categorical_feature=categorical_features,
                       free_raw_data=False)

In [167]:
lgb_train

<lightgbm.basic.Dataset at 0x7fd3706dcd00>

In [172]:
y.value_counts()

TARGET
0.0       931
1.0        70
dtype: int64

In [174]:
y1 = y.fillna(0)
y1.value_counts()

TARGET
0.0       1932
1.0         70
dtype: int64

In [155]:
#パイプライン構築

###特徴量
#numeric_features = df_num.columns.values
categorical_features = df_cat.columns.values 
###数値型
numeric_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),#欠損値補完
    ('scaler',StandardScaler())#標準化
])

###カテゴリ型
categorical_transformer =  Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('transformer',OneHotEncoder(handle_unknown='ignore'))
]) 

###特徴量を変換器にかける
preprocesser = ColumnTransformer(transformers=[
    ('num_transform',numeric_transformer,numeric_features1),
    ('cat_transform',categorical_transformer,categorical_features)
])

pipeline = Pipeline(steps=[#変換器パイプラインから予測器へ
    ('preprocesser',preprocesser),
    ('classifier',lgb.LGBMClassifier())#パラメータを後で調整する
])

In [43]:
set_config(display='diagram')
pipeline

In [169]:
pipeline.fit(X_train,y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [139]:
def objective(trial):
    learning_rate = trial.suggest_loguniform('learning_rate', 0.1,0.2),
    n_estimators, = trial.suggest_int('n_estimators', 20, 200),
    max_depth, = trial.suggest_int('max_depth', 3, 9),
    min_child_weight = trial.suggest_loguniform('min_child_weight', 0.5, 2),
    min_child_samples, = trial.suggest_int('min_child_samples', 5, 20),
    classifier = lgb.LGBMClassifier(learning_rate=learning_rate, 
                                    n_estimators=n_estimators,
                                    max_depth=max_depth, 
                                    min_child_weight=min_child_weight,
                                    min_child_samples=min_child_samples,
                                    subsample=0.8, colsample_bytree=0.8,
                                    verbose=-1, num_leaves=80)
    classifier.fit(X_train, y_train)
    #return classifier.score(X_train, y_train) # 正答率（train） の最適化
    return np.linalg.norm(y_train - classifier.predict_proba(X_train)[:, 1], ord=1) # 尤度の最適化

In [129]:
study = optuna.create_study(direction='minimize') # 最小化

[32m[I 2022-01-05 17:09:48,861][0m A new study created in memory with name: no-name-f21e57fd-e208-48d0-988c-d00fc4699516[0m


In [130]:
study.optimize(objective, n_trials=100)

[33m[W 2022-01-05 17:10:01,656][0m Trial 0 failed because of the following error: ValueError("Input contains NaN, infinity or a value too large for dtype('float64').")[0m
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-128-8d849f51be3f>", line 14, in objective
    classifier.fit(X_train, y_train)
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/sklearn.py", line 922, in fit
    _LGBMAssertAllFinite(y)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 124, in assert_all_finite
    _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 103, in _assert_all_finite
    raise V

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').