In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
import math

# Managing Warnings 
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

# Machine learning 
import sklearn.ensemble as ske
from sklearn import datasets, model_selection, tree, preprocessing, metrics, linear_model, neighbors, svm
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold, cross_val_score
from vecstack import stacking
from xgboost import XGBClassifier 
from lightgbm import LGBMClassifier
from ngboost import NGBClassifier
from ngboost.distns import k_categorical, Bernoulli
from catboost import CatBoostClassifier, Pool

ModuleNotFoundError: No module named 'catboost'

In [None]:
def run_pipeline(data_df, target):
    #자동으로 num과 cat 변수 갈라서 df 생성
    data_df_cat = data_df.select_dtypes(include=np.object)
    data_df_num = data_df.select_dtypes(exclude=np.object)
    
    # category df 만들어주기
    data_df_cat = pd.concat([data_df_cat, data_df[['occyp_category', 'child_num']]], axis=1)

    # binary df 만들어주기
    data_df_bi = data_df[['gender','car','reality','work_phone','phone','email','dup']]

    # cat인데 num df에 들어간 변수 num df에서 drop해주기
    data_df_num = data_df_num.drop(columns=['gender','car','reality','work_phone','phone','email','dup', 'child_num','occyp_category'])

    x_train, x_test, y_train, y_test = model_selection.train_test_split(data_df,
                                                                    target,
                                                                   test_size = 0.2,
                                                                   random_state=0)
    
    binary_features = data_df_bi.columns
    
    numeric_features = data_df_num.columns
    numeric_transformer = StandardScaler() # cf) RobustScaler

    categorical_features = data_df_cat.columns
    categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore') # categories='auto' : just for ignoring warning messages

    preprocessor = ColumnTransformer(
        transformers=[ # List of (name, transformer, column(s))
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
             ('bi','passthrough',binary_features)])

    preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])
    preprocessor_pipe.fit(x_train)

    x_train_transformed = preprocessor_pipe.transform(x_train)
    x_test_transformed = preprocessor_pipe.transform(x_test)

    return x_train_transformed, x_test_transformed, y_train, y_test

In [3]:
# train전체를 학습시킬 때
def run_pipeline_train(data_df):
    #자동으로 num과 cat 변수 갈라서 df 생성
    data_df_cat = data_df.select_dtypes(include=np.object)
    data_df_num = data_df.select_dtypes(exclude=np.object)
    
    # category df 만들어주기
    data_df_cat = pd.concat([data_df_cat, data_df[['occyp_category', 'child_num']]], axis=1)

    # binary df 만들어주기
    data_df_bi = data_df[['gender','car','reality','work_phone','phone','email','dup']]

    # cat인데 num df에 들어간 변수 num df에서 drop해주기
    data_df_num = data_df_num.drop(columns=['gender','car','reality','work_phone','phone','email','dup', 'child_num','occyp_category'])
    
    binary_features = data_df_bi.columns
    
    numeric_features = data_df_num.columns
    numeric_transformer = StandardScaler() # cf) RobustScaler

    categorical_features = data_df_cat.columns
    categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore') # categories='auto' : just for ignoring warning messages

    preprocessor = ColumnTransformer(
        transformers=[ # List of (name, transformer, column(s))
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
             ('bi','passthrough',binary_features)])

    preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])
    preprocessor_pipe.fit(data_df)

    x_train_transformed = preprocessor_pipe.transform(data_df)

    return x_train_transformed

## test데이터셋 불러오기

In [None]:
# test 데이터셋
test_df = pd.read_csv('test_df(fill_groupby)_final.csv', index_col=0)
test_df.head()

### test데이터열 파이프라인 처리

In [5]:
test_df_cat = test_df.select_dtypes(include=np.object)
test_df_num = test_df.select_dtypes(exclude=np.object)

# category df 만들어주기
test_df_cat = pd.concat([test_df_cat, test_df[['occyp_category', 'child_num']]], axis=1)

# binary df 만들어주기
test_df_bi = test_df[['gender','car','reality','work_phone','phone','email','dup']]

# cat인데 num df에 들어간 변수 num df에서 drop해주기
test_df_num = test_df_num.drop(columns=['gender','car','reality','work_phone','phone','email', 'child_num','dup','occyp_category'], axis=1)

binary_features = test_df_bi.columns

numeric_features = test_df_num.columns
numeric_transformer = StandardScaler() # cf) RobustScaler

categorical_features = test_df_cat.columns
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore') # categories='auto' : just for ignoring warning messages

preprocessor = ColumnTransformer(
    transformers=[ # List of (name, transformer, column(s))
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
         ('bi','passthrough',binary_features)])

preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])
preprocessor_pipe.fit(test_df) 
test_transformed = preprocessor_pipe.transform(test_df)

In [6]:
print(test_transformed.shape)

(10000, 62)


# 01. XGB로 예측한 submission.csv만들기

## 01) train데이터 split한 경우

In [8]:
data_df = pd.read_csv('pre_credit_dffill_groupby_final.csv')
target = data_df['credit']
data_df = data_df.drop(['credit', 'index'], axis=1)

In [9]:
x_train_transformed, x_test_transformed, y_train, y_test = run_pipeline(data_df, target)

In [10]:
# 모델 학습
random_state = 0
model = XGBClassifier(objective='multi:softprob',num_classes=3,random_state=random_state )
model.fit(x_train_transformed, y_train) # <- x_train_transformed (not x_train)

# test셋에 대해서 예측 정확도 출력
x_pred = model.predict(x_test_transformed)
accuracy = accuracy_score(x_pred, y_test)
print("model score:", round(accuracy, 4))

proba_result = model.predict_proba(x_test_transformed)
print("proba_result :",proba_result)

pro_logloss = log_loss(y_test, proba_result)
print('logloss: ',pro_logloss)

Parameters: { "num_classes" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


model score: 0.7222
proba_result : [[0.08446262 0.3162606  0.5992768 ]
 [0.17053786 0.5742808  0.25518134]
 [0.0771312  0.18216455 0.7407043 ]
 ...
 [0.01597201 0.90770644 0.07632158]
 [0.04547223 0.2579263  0.69660145]
 [0.07891452 0.08106276 0.84002274]]
logloss:  0.7208258075429438


In [27]:
x_train_transformed = run_pipeline_train(data_df, target)

random_state = 0
model = XGBClassifier(objective='multi:softprob',num_classes=3,random_state=random_state )
model.fit(x_train_transformed, target) # <- x_train_transformed (not x_train)

#test데이터셋에 대한 예측 배열 (log_loss값을 구하기 위한 배열)
test_pre_proba = model.predict_proba(test_transformed)
print("test_predict_proba :",test_pre_proba)

Parameters: { "num_classes" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_classes=3, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=None, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

## 02) train데이터 전체를 학습한 model

In [18]:
data_df = pd.read_csv('pre_credit_dffill_groupby_final.csv')
target = data_df['credit']
data_df = data_df.drop(['credit', 'index'], axis=1)

In [19]:
x_train_transformed = run_pipeline_train(data_df)

random_state = 0
model = XGBClassifier(objective='multi:softprob',num_classes=3,random_state=random_state )
model.fit(x_train_transformed, target) # <- x_train_transformed (not x_train)

#test데이터셋에 대한 예측 배열 (log_loss값을 구하기 위한 배열)
test_pre_proba = model.predict_proba(test_transformed)
print("test_predict_proba :",test_pre_proba)

Parameters: { "num_classes" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


test_predict_proba : [[0.03316169 0.05487515 0.9119631 ]
 [0.1765962  0.15395547 0.6694483 ]
 [0.05818312 0.14679845 0.7950184 ]
 ...
 [0.06498508 0.11920122 0.8158137 ]
 [0.11183599 0.10681279 0.78135115]
 [0.03814835 0.20806076 0.7537909 ]]


## 03) submission.csv 파일 저장

In [21]:
# 샘플
sam_sub_df = pd.read_csv('sample_submission.csv', index_col=0)

# 샘플에서 인덱스 추출
sample_index_df = pd.DataFrame({'index':sam_sub_df.index})

# test데이터셋 predict_proba 데이터프레임으로 변환
test_pre_proba_df = pd.DataFrame(test_pre_proba, columns=['0','1','2'])

# 샘플 인덱스와 predict_proba 데이터프레임 합치기, 인덱스 열 지정
test_pre_proba_df = pd.concat([sample_index_df,test_pre_proba_df], axis=1)
test_pre_proba_df.set_index('index',inplace=True)

# 파일로 저장
test_pre_proba_df.to_csv('submission_XGB_fulltrain.csv')

## 아래는 바로 위에꺼 풀어쓴 것

In [31]:
# 샘플 확인
sam_sub_df = pd.read_csv('sample_submission.csv', index_col=0)
sam_sub_df.head()

Unnamed: 0_level_0,0,1,2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
26457,0,0,0
26458,0,0,0
26459,0,0,0
26460,0,0,0
26461,0,0,0


In [32]:
# logloss array 데이터프레임으로 변환
sample_index_df = pd.DataFrame({'index':sam_sub_df.index})
sample_index_df

Unnamed: 0,index
0,26457
1,26458
2,26459
3,26460
4,26461
...,...
9995,36452
9996,36453
9997,36454
9998,36455


In [33]:
test_pre_proba_df = pd.DataFrame(test_pre_proba, columns=['0','1','2'])
test_pre_proba_df

Unnamed: 0,0,1,2
0,0.033162,0.054875,0.911963
1,0.176596,0.153955,0.669448
2,0.058183,0.146798,0.795018
3,0.113115,0.139428,0.747457
4,0.265180,0.187720,0.547099
...,...,...,...
9995,0.136203,0.138176,0.725621
9996,0.048917,0.227120,0.723962
9997,0.064985,0.119201,0.815814
9998,0.111836,0.106813,0.781351


In [34]:
test_pre_proba_df = pd.concat([sample_index_df,test_pre_proba_df], axis=1)
test_pre_proba_df.set_index('index',inplace=True)

In [36]:
test_pre_proba_df.to_csv('submission_XGB.csv')

In [298]:
# 확인
sub_XGB = pd.read_csv('submission_XGB.csv', index_col=0)
print('shape :',sub_XGB.shape)
sub_XGB

shape : (10000, 4)


Unnamed: 0,index,0,1,2
0,26457,0.039926,0.060999,0.899074
1,26458,0.212156,0.219724,0.568121
2,26459,0.069663,0.139518,0.790819
3,26460,0.099860,0.139726,0.760414
4,26461,0.094992,0.180666,0.724342
...,...,...,...,...
9995,36452,0.140914,0.186323,0.672763
9996,36453,0.138043,0.259190,0.602767
9997,36454,0.259089,0.184478,0.556433
9998,36455,0.086660,0.142415,0.770925
