In [234]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,ParameterGrid
#import optuna.integration.lightgbm as lgb
from sklearn.model_selection import cross_val_score
import optuna
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import roc_auc_score
import featuretools as ft
import lightgbm as lgb

%matplotlib inline

In [235]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,MinMaxScaler,RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,ParameterGrid
from sklearn.metrics import accuracy_score,f1_score

In [236]:
app_train = pd.read_csv('application_train.csv').sort_values('SK_ID_CURR').reset_index(drop = True).loc[:1000, :]
app_test = pd.read_csv('application_test.csv').sort_values('SK_ID_CURR').reset_index(drop = True).loc[:1000, :]
bureau_balance = pd.read_csv('bureau_balance.csv').sort_values('SK_ID_BUREAU').reset_index(drop = True).loc[:1000, :]
bureau = pd.read_csv('bureau.csv').sort_values(['SK_ID_CURR', 'SK_ID_BUREAU']).reset_index(drop = True).loc[:1000, :]
credit = pd.read_csv('credit_card_balance.csv').sort_values(['SK_ID_CURR', 'SK_ID_PREV']).reset_index(drop = True).loc[:1000, :]
installments =  pd.read_csv('installments_payments.csv').sort_values(['SK_ID_CURR', 'SK_ID_PREV']).reset_index(drop = True).loc[:1000, :]
cash = pd.read_csv('POS_CASH_balance.csv').sort_values(['SK_ID_CURR', 'SK_ID_PREV']).reset_index(drop = True).loc[:1000, :]
previous = pd.read_csv('previous_application.csv').sort_values(['SK_ID_CURR', 'SK_ID_PREV']).reset_index(drop = True).loc[:1000, :]

In [239]:
# Add identifying column
app_train['set'] = 'train'
app_test['set'] = 'test'
app_test["TARGET"] = np.nan

# Append the dataframes
app = app_train.append(app_test, ignore_index = True)

In [240]:
#Entity set with id applications
es = ft.EntitySet(id = 'clients')

In [241]:
#Entitiy with a unique index
es = es.add_dataframe(dataframe_name = 'app', dataframe = app, index = 'SK_ID_CURR')
es = es.add_dataframe(dataframe_name = 'bureau', dataframe = bureau, index = 'SK_ID_BUREAU')
es = es.add_dataframe(dataframe_name = 'previous', dataframe = previous, index = 'SK_ID_PREV')
#Entities that do not have a unique index
es = es.add_dataframe(dataframe_name = 'bureau_balance', dataframe = bureau_balance,make_index = True, index = 'bureaubalance_index')
es = es.add_dataframe(dataframe_name = 'cash', dataframe = cash,make_index = True, index = 'cash_index')
es = es.add_dataframe(dataframe_name = 'installments', dataframe = installments,make_index = True, index = 'installments_index')
es = es.add_dataframe(dataframe_name = 'credit', dataframe = credit, make_index = True, index = 'credit_index')

In [243]:
#Relationship between app and breau
r_app_bureau = es.add_relationship('app','SK_ID_CURR','bureau','SK_ID_CURR')
#Relationship between bureau and bureau balance
r_bureau_balance = es.add_relationship('bureau','SK_ID_BUREAU','bureau_balance','SK_ID_BUREAU')
# Relationship between current app and previous apps
r_app_previous = es.add_relationship('app','SK_ID_CURR','previous','SK_ID_CURR')
# Relationships between previous apps and cash, installments, and credit
r_previous_cash = es.add_relationship('previous','SK_ID_PREV','cash','SK_ID_PREV')
r_previous_installments = es.add_relationship('previous','SK_ID_PREV','installments','SK_ID_PREV')
r_previous_credit = es.add_relationship('previous','SK_ID_PREV','credit','SK_ID_PREV')

In [244]:
feature_matrix,feature_name = ft.dfs(entityset=es,target_dataframe_name='app',
                                     agg_primitives = ['sum', 'count', 'min', 'max', 'mean','mode'],
                                     max_depth = 2, features_only = False, verbose = True)

Built 1211 features
Elapsed: 00:04 | Progress: 100%|██████████


In [245]:
from featuretools import selection

# Remove features with only one unique value
feature_matrix2 = selection.remove_low_information_features(feature_matrix)

print('Removed %d features' % (feature_matrix.shape[1]- feature_matrix2.shape[1]))

Removed 121 features


In [246]:
#Separate out the train and test sets
train = feature_matrix2[feature_matrix2['set'] == 'train']
test = feature_matrix2[feature_matrix2['set'] == 'test']

# One hot encoding
#train = pd.get_dummies(train)
#test = pd.get_dummies(test)

# Align dataframes on the columns
train, test = train.align(test, join = 'inner', axis = 1)
test = test.drop(columns = ['TARGET'])

print('Final Training Shape: ', train.shape)
print('Final Testing Shape: ', test.shape)

Final Training Shape:  (1001, 1090)
Final Testing Shape:  (1001, 1089)


In [313]:
import re
X1 = train.drop(columns=['TARGET']).dropna(axis=1)
y1 = pd.DataFrame(train['TARGET'])

X_train1,X_valid1,y_train1,y_valid1 = train_test_split(X1,y1,shuffle=True)

In [316]:
df_num1 = X1.select_dtypes(include='number')
df_cat1 = y1.select_dtypes(include='category')


In [317]:
numeric_features1 = []
categorical_features1 = []  
numeric_features1 =  df_num1.columns.values
categorical_features1 = df_cat1.columns.values

In [318]:
#パイプライン構築

###特徴量
#numeric_features = df_num.columns.values
categorical_features = df_cat1.columns.values 
numeric_features = df_num1.columns.values
###数値型
numeric_transformer = Pipeline(steps=[
    ('num_imputer',SimpleImputer(strategy='most_frequent')),#欠損値補完
    ('scaler',StandardScaler())#標準化
])

###カテゴリ型
categorical_transformer =  Pipeline(steps=[
    ('cat_imputer',SimpleImputer(strategy='most_frequent')),
    ('transformer',OneHotEncoder(handle_unknown='ignore'))
]) 

###特徴量を変換器にかける
preprocesser = ColumnTransformer(transformers=[
    ('num_transform',numeric_transformer,numeric_features1),
    ('cat_transform',categorical_transformer,categorical_features1)
])

pipeline1 = Pipeline(steps=[#変換器パイプラインから予測器へ
    ('preprocesser',preprocesser),
    ('classifier',lgb.LGBMClassifier())#パラメータを後で調整する
])

In [319]:
pipeline1.fit(X_train1,y_train1)

  return f(*args, **kwargs)


In [321]:
predict = pipeline1.predict()

ValueError: X has 1089 features, but ColumnTransformer is expecting 257 features as input.

In [305]:
y_train1

Unnamed: 0_level_0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,...,SUMcreditpreviousAMT_GOODS_PRICE,SUMcreditpreviousDAYS_DECISION,SUMcreditpreviousDAYS_FIRST_DRAWING,SUMcreditpreviousDAYS_FIRST_DUE,SUMcreditpreviousDAYS_LAST_DUE,SUMcreditpreviousDAYS_LAST_DUE_1ST_VERSION,SUMcreditpreviousDAYS_TERMINATION,SUMcreditpreviousHOUR_APPR_PROCESS_START,SUMcreditpreviousNFLAG_LAST_APPL_IN_DAY,SUMcreditpreviousSELLERPLACE_AREA
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100959,Cash loans,F,Y,Y,2,90000.0,225000.0,12334.5,State servant,Higher education,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100737,Cash loans,F,N,Y,0,112500.0,247275.0,17208.0,Pensioner,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100573,Cash loans,F,Y,N,1,157500.0,961146.0,26559.0,Working,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100876,Cash loans,F,Y,Y,1,139500.0,317178.0,23845.5,Working,Higher education,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100207,Cash loans,F,N,Y,0,157500.0,700830.0,22738.5,Working,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100969,Cash loans,F,N,Y,0,216000.0,485640.0,39069.0,Commercial associate,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100240,Cash loans,M,N,Y,2,135000.0,590337.0,28530.0,Working,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100858,Cash loans,M,Y,Y,2,270000.0,1467612.0,58333.5,Working,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100853,Cash loans,F,N,Y,2,117000.0,276277.5,16825.5,Working,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [220]:
feature_matrix1 = feature_matrix.dropna(subset=['TARGET'])
feature_matrix1

Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,SUM(credit.previous.DAYS_LAST_DUE),SUM(credit.previous.DAYS_LAST_DUE_1ST_VERSION),SUM(credit.previous.DAYS_TERMINATION),SUM(credit.previous.HOUR_APPR_PROCESS_START),SUM(credit.previous.NFLAG_INSURED_ON_APPROVAL),SUM(credit.previous.NFLAG_LAST_APPL_IN_DAY),SUM(credit.previous.RATE_DOWN_PAYMENT),SUM(credit.previous.RATE_INTEREST_PRIMARY),SUM(credit.previous.RATE_INTEREST_PRIVILEGED),SUM(credit.previous.SELLERPLACE_AREA)
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,1.0,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100003,0.0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100004,0.0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100006,0.0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,2191458.0,2191458.0,2191458.0,90.0,0.0,6.0,0.0,0.0,0.0,-6.0
100007,0.0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101153,0.0,Cash loans,F,N,Y,0,225000.0,1113840.0,57001.5,900000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101154,0.0,Cash loans,F,Y,Y,0,144000.0,517536.0,28206.0,432000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101155,0.0,Cash loans,M,N,Y,0,315000.0,1288350.0,37800.0,1125000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101156,0.0,Cash loans,M,Y,Y,2,180000.0,679500.0,27076.5,679500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [221]:
df_num = feature_matrix1.select_dtypes(include='number')
df_cat = feature_matrix1.select_dtypes(include='category')
df_num1 = df_num.drop(columns=['TARGET'])

In [222]:
numeric_features = []
categorical_features = []  
numeric_features =  df_num.columns.values
#numeric_features
categorical_features = df_cat.columns.values
numeric_features1 = df_num1.columns.values
numeric_features1

array(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', ...,
       'SUM(credit.previous.RATE_INTEREST_PRIMARY)',
       'SUM(credit.previous.RATE_INTEREST_PRIVILEGED)',
       'SUM(credit.previous.SELLERPLACE_AREA)'], dtype=object)

In [224]:
X = pd.DataFrame(feature_matrix1.drop(columns=['TARGET']))

y = pd.DataFrame(feature_matrix1['TARGET'])
X_train,X_valid,y_train,y_valid = train_test_split(X,y,shuffle=True)

In [277]:
X

Unnamed: 0_level_0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,SUM(credit.previous.DAYS_LAST_DUE),SUM(credit.previous.DAYS_LAST_DUE_1ST_VERSION),SUM(credit.previous.DAYS_TERMINATION),SUM(credit.previous.HOUR_APPR_PROCESS_START),SUM(credit.previous.NFLAG_INSURED_ON_APPROVAL),SUM(credit.previous.NFLAG_LAST_APPL_IN_DAY),SUM(credit.previous.RATE_DOWN_PAYMENT),SUM(credit.previous.RATE_INTEREST_PRIMARY),SUM(credit.previous.RATE_INTEREST_PRIVILEGED),SUM(credit.previous.SELLERPLACE_AREA)
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100003,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100004,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100006,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,...,2191458.0,2191458.0,2191458.0,90.0,0.0,6.0,0.0,0.0,0.0,-6.0
100007,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101153,Cash loans,F,N,Y,0,225000.0,1113840.0,57001.5,900000.0,Unaccompanied,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101154,Cash loans,F,Y,Y,0,144000.0,517536.0,28206.0,432000.0,Unaccompanied,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101155,Cash loans,M,N,Y,0,315000.0,1288350.0,37800.0,1125000.0,Unaccompanied,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101156,Cash loans,M,Y,Y,2,180000.0,679500.0,27076.5,679500.0,Unaccompanied,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [279]:
a = X.dropna(axis=1)
a

Unnamed: 0_level_0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,...,SUM(credit.previous.DAYS_LAST_DUE),SUM(credit.previous.DAYS_LAST_DUE_1ST_VERSION),SUM(credit.previous.DAYS_TERMINATION),SUM(credit.previous.HOUR_APPR_PROCESS_START),SUM(credit.previous.NFLAG_INSURED_ON_APPROVAL),SUM(credit.previous.NFLAG_LAST_APPL_IN_DAY),SUM(credit.previous.RATE_DOWN_PAYMENT),SUM(credit.previous.RATE_INTEREST_PRIMARY),SUM(credit.previous.RATE_INTEREST_PRIVILEGED),SUM(credit.previous.SELLERPLACE_AREA)
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,Working,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100003,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,State servant,Higher education,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100004,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,Working,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100006,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,Working,Secondary / secondary special,...,2191458.0,2191458.0,2191458.0,90.0,0.0,6.0,0.0,0.0,0.0,-6.0
100007,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,Working,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101153,Cash loans,F,N,Y,0,225000.0,1113840.0,57001.5,Commercial associate,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101154,Cash loans,F,Y,Y,0,144000.0,517536.0,28206.0,Pensioner,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101155,Cash loans,M,N,Y,0,315000.0,1288350.0,37800.0,Commercial associate,Higher education,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101156,Cash loans,M,Y,Y,2,180000.0,679500.0,27076.5,Working,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [172]:
y.value_counts()

TARGET
0.0       931
1.0        70
dtype: int64

In [231]:
pipeline.predict()

ValueError: X has 123 features, but ColumnTransformer is expecting 1210 features as input.

In [226]:
#パイプライン構築

###特徴量
#numeric_features = df_num.columns.values
categorical_features = df_cat.columns.values 
###数値型
numeric_transformer = Pipeline(steps=[
    ('num_imputer',SimpleImputer(strategy='most_frequent')),#欠損値補完
    ('scaler',StandardScaler())#標準化
])

###カテゴリ型
categorical_transformer =  Pipeline(steps=[
    ('cat_imputer',SimpleImputer(strategy='most_frequent')),
    ('transformer',OneHotEncoder(handle_unknown='ignore'))
]) 

###特徴量を変換器にかける
preprocesser = ColumnTransformer(transformers=[
    ('num_transform',numeric_transformer,numeric_features1),
    ('cat_transform',categorical_transformer,categorical_features)
])

pipeline = Pipeline(steps=[#変換器パイプラインから予測器へ
    ('preprocesser',preprocesser),
    ('classifier',lgb.LGBMClassifier())#パラメータを後で調整する
])

In [195]:
set_config(display='diagram')
pipeline

In [227]:
pipeline.fit(X_train,y_train)

  return f(*args, **kwargs)


In [228]:
y_test_pred1 = pipeline.predict(X_valid)
print('accuracy',accuracy_score(y_valid,y_test_pred1))
print('f1',f1_score(y_valid,y_test_pred1))

accuracy 0.9163346613545816
f1 0.08695652173913043


In [139]:
def objective(trial):
    learning_rate = trial.suggest_loguniform('learning_rate', 0.1,0.2),
    n_estimators, = trial.suggest_int('n_estimators', 20, 200),
    max_depth, = trial.suggest_int('max_depth', 3, 9),
    min_child_weight = trial.suggest_loguniform('min_child_weight', 0.5, 2),
    min_child_samples, = trial.suggest_int('min_child_samples', 5, 20),
    classifier = lgb.LGBMClassifier(learning_rate=learning_rate, 
                                    n_estimators=n_estimators,
                                    max_depth=max_depth, 
                                    min_child_weight=min_child_weight,
                                    min_child_samples=min_child_samples,
                                    subsample=0.8, colsample_bytree=0.8,
                                    verbose=-1, num_leaves=80)
    classifier.fit(X_train, y_train)
    #return classifier.score(X_train, y_train) # 正答率（train） の最適化
    return np.linalg.norm(y_train - classifier.predict_proba(X_train)[:, 1], ord=1) # 尤度の最適化

In [129]:
study = optuna.create_study(direction='minimize') # 最小化

[32m[I 2022-01-05 17:09:48,861][0m A new study created in memory with name: no-name-f21e57fd-e208-48d0-988c-d00fc4699516[0m


In [130]:
study.optimize(objective, n_trials=100)

[33m[W 2022-01-05 17:10:01,656][0m Trial 0 failed because of the following error: ValueError("Input contains NaN, infinity or a value too large for dtype('float64').")[0m
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-128-8d849f51be3f>", line 14, in objective
    classifier.fit(X_train, y_train)
  File "/opt/anaconda3/lib/python3.8/site-packages/lightgbm/sklearn.py", line 922, in fit
    _LGBMAssertAllFinite(y)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 124, in assert_all_finite
    _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 103, in _assert_all_finite
    raise V

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').