In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## __패키지 설치__

In [None]:
!pip install catboost
!pip install category_encoders

## __라이브러리 구성__

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings, random
warnings.filterwarnings(action='ignore')

from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold

from sklearn.cluster import KMeans
from catboost import CatBoostClassifier, Pool

In [None]:
# e 지수 없이 데이터 출력하는 방법
pd.options.display.float_format = '{:.5f}'.format

## __데이터 불러오기__

In [None]:
train = pd.read_csv('/content/drive/MyDrive/머신러닝 프로젝트/data/train.csv')
test = pd.read_csv('/content/drive/MyDrive/머신러닝 프로젝트/data/test.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/머신러닝 프로젝트/data/sample_submission.csv')

## __데이터 전처리__
1. 결측치 처리

In [None]:
train.fillna('NaN', inplace=True) 
test.fillna('NaN', inplace=True)

2. 이상치 처리


In [None]:
train[train['family_size'] > 7]

# DAYS_BIRTH -17754 >>> 중복된 데이터가 3개 
# DAYS_BIRTH -13827 >>> 중복된 데이터가 2개 
# 다른 중복 데이터는 어떻게 분간하여 삭제할지 ..?!

In [None]:
train[train['credit'] >= 2]['child_num'].value_counts() # 신용도가 2 이상인 사람들의 자식수

0     11785
1      3391
2      1559
3       198
4        27
14        3
7         2
5         2
19        1
Name: child_num, dtype: int64

In [None]:
train['child_num'].value_counts()

0     18340
1      5386
2      2362
3       306
4        47
5        10
14        3
7         2
19        1
Name: child_num, dtype: int64

## __Feature Engineering
1. 의미없는 변수 제거
- index 제거
- FLAG_MOBIL 삭제:모든 값이 1로 동일
- 또 어떤 변수를 더 삭제하면 좋을지 함께 고민

2. 'DAYS_EMPLOYED' : 취업이 된지 몇일지 되었는지
- 음수 값이 중요함
- 양수 값이면 무의미하기에, 0으로 변경

3. 모든 음수로 표현된 값을 양수로 변환 : 'DAYS_EMPLOYED', 'DAYS_BIRTH', 'begin_month'
- 에러 방지
- 양수 값이면 무의미하기에, 0으로 변경

In [None]:
for df in [train,test]:
  # 1. 무의미한 컬럼 삭제
  df.drop(['index', 'FLAG_MOBIL'], axis=1, inplace=True)
  # 2. 무의미한 양수 값 0으로 변경
  df['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].map(lambda x: 0 if x > 0 else x)

# 3. 음수로 표기된 값들 모두 양수로 바꾸기
feats = ['DAYS_BIRTH', 'begin_month', 'DAYS_EMPLOYED']
for feat in feats:
    train[feat]=np.abs(train[feat])
    test[feat]=np.abs(test[feat])

## __새로 생성한 컬럼 리스트__
4. Age 나이관련
* AGE : 만나이, 태어난 후 일수 // 365 
* AGE_col : 나이의 앞자리 수, (예) 22세 -> 20대 -> 2
* AGE_months = 나이를 만 개월 수로 환산 , df['DAYS_BIRTH'] // 30
* AGE_weeks = 나이를 만으로 몇 주인지 환산 , df['DAYS_BIRTH'] // 7


In [None]:
for df in [train,test]:

  df["AGE"] = np.abs(df["DAYS_BIRTH"]) // 365
  df['Age_col'] = 0

  # Age_col : 나이의 앞자리 수, (예) 22세 -> 20대 -> 2
  df.loc[df['AGE']<30, 'AGE_col'] = 2 #20대
  df.loc[(df['AGE']>=30) & (df['AGE']<40), 'AGE_col'] = 3 #30대
  df.loc[(df['AGE']>=40) & (df['AGE']<50), 'AGE_col'] = 4 #40대
  df.loc[(df['AGE']>=50) & (df['AGE']<60), 'AGE_col'] = 5 #50대
  df.loc[df['AGE']>=60, 'AGE_col'] = 6 #60대

  #DAYS_BIRTH 파생변수- Age(나이), 태어난 월, 태어난 주(출생연도의 n주차)
  # df['Age'] = df['DAYS_BIRTH'] // 365
  df['AGE_months'] = df['DAYS_BIRTH'] // 30
  df['AGE_weeks'] = df['DAYS_BIRTH'] // 7

In [None]:
train[['AGE_col']].head(20)

Unnamed: 0,AGE_col
0,3.0
1,3.0
2,5.0
3,4.0
4,4.0
5,3.0
6,4.0
7,4.0
8,4.0
9,4.0


## __5. 가족 인원 수 관련 : child_num, family_size__
* family_size : 가족 수가 5 이상이면 5 로 처리
* child_num : 자녀 수가 3 이상이면 3 으로 처리
* child_exist : 자녀 존재 여부, 자녀 수가 없으면 0, 있으면 1 
* no_parent : 부모 수 (number of parents), 가족 수 - 자녀 수
* single_person_households : 1인 가족인지 사실여부, 1인 가족이면 1, 아니면 0
* 결측치 처리 : 가족 수에서 자녀 수를 뺀 수가 0 또는 미만이면, 가족 인원수를 child_num + 2로 대체

In [None]:
for df in [train,test]:
  # 가족 인원수
  df['family_size'][df['family_size'] >= 5] = 5
  # 자녀 수
  df['child_num'][df['child_num'] >= 3] = 3
  # 자녀 존재 여부
  df['child_exist'] = df['child_num'].map(lambda x: 0 if x == 0 else 1)
  # 부모님 수 
  df['no_parent'] = df['family_size'] - df['child_num']
  # 1인 가족 사실 여부
  df['single_person_households'] = df['family_size'].map(lambda x: 1 if x == 1 else 0)
  # 가족 인원수 결측치 처리
  df['family_size'][df['family_size'] - df['child_num'] <= 0] = df['child_num'] + 2

## __6. 고용 관련 변수 생성__
* 취업 및 무취업 기간 년, 월, 주로 환산


In [None]:
for df in [train,test]:

    # years (근속연수), months (고용된 달), weeks (고용된 주)  
    df['EMPLOYED_years'] = df['DAYS_EMPLOYED'] // 365
    df['EMPLOYED_months'] = df['DAYS_EMPLOYED'] // 30
    df['EMPLOYED_weeks'] = df['DAYS_EMPLOYED'] // 7

    # 태어난 후부터 계산한 무취업 일수
    df['DAYS_UNEMPLOYED'] = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']
    df['UNEMPLOYED_years'] = df['DAYS_UNEMPLOYED'] // 365
    df['UNEMPLOYED_months'] = df['DAYS_UNEMPLOYED'] // 30 
    df['UNEMPLOYED_weeks'] = df['DAYS_UNEMPLOYED'] // 7

## __7. 연봉 관련__

In [None]:
for df in [train,test]:
    # ratio 를 더 만들어보기

    # 한 사람/회원이 대략적으로 평생 번 금액
    df['income_whole_life'] = df['income_total'] * df['EMPLOYED_years']

    # salary 월급
    df['salary'] = df['income_total'] // 12

    # 가족 1인당 연간 사용 가능 한도 금액 = 연간 소득 / 가족 인원수
    df['income_per_person'] = df['income_total'] / df['family_size']
    # 가족 1인당 월간 사용 가능 한도 금액 = 월급 / 가족 인원수
    df['income_pp_month'] = df['salary'] / df['family_size']

    # begin_month, 연봉
   

    #ability: (연봉 * 근무연도) / 근무일수
    df['income_per_day'] = (df['income_total'] * df['EMPLOYED_years']) / df['DAYS_EMPLOYED']

## __8. 회원 각각의 고유한 번호 생성__

In [None]:
for df in [train,test]:
    
    # ID 생성: 각 컬럼의 값들을 더해서 회원의 고유한 번호를 생성
    # 결과를 더 낫게 만드는 ID 조합이 있는지 찾아보기
    df['ID'] = \
    df['income_type'].astype(str) + '_' + df['income_total'].astype(str) + '_' +\
    df['house_type'].astype(str) + '_' + df['occyp_type'].astype(str) + '_' + \
    df['DAYS_BIRTH'].astype(str) + '_' + df['DAYS_EMPLOYED'].astype(str) + '_' +\
    df['work_phone'].astype(str) + '_' + df['phone'].astype(str) + '_' +\
    df['email'].astype(str) + '_' + df['family_size'].astype(str) + '_' +\
    df['gender'].astype(str) + '_' + df['car'].astype(str) + '_' +\
    df['reality'].astype(str) + '_' + df['income_type'].astype(str) + '_' +\
    df['edu_type'].astype(str) + '_' + df['family_type'].astype(str) 

In [None]:
train.head()

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,credit,AGE,Age_col,AGE_col,AGE_months,AGE_weeks,child_exist,no_parent,single_person_households,EMPLOYED_years,EMPLOYED_months,EMPLOYED_weeks,DAYS_UNEMPLOYED,UNEMPLOYED_years,UNEMPLOYED_months,UNEMPLOYED_weeks,income_whole_life,salary,income_per_person,income_pp_month,income_per_day,ID
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,,2.0,6.0,1.0,38,0,3.0,463,1985,0,2.0,0,12,156,672,9190,25,306,1312,2430000.0,16875.0,101250.0,8437.5,516.03313,Commercial associate_202500.0_Municipal apartm...
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,11380,1540,0,0,1,Laborers,3.0,5.0,1.0,31,0,3.0,379,1625,1,2.0,0,4,51,220,9840,26,328,1405,990000.0,20625.0,82500.0,6875.0,642.85714,Commercial associate_247500.0_House / apartmen...
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,19087,4434,0,1,0,Managers,2.0,22.0,2.0,52,0,5.0,636,2726,0,2.0,0,12,147,633,14653,40,488,2093,5400000.0,37500.0,225000.0,18750.0,1217.86198,Working_450000.0_House / apartment_Managers_19...
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,15088,2092,0,1,0,Sales staff,2.0,37.0,0.0,41,0,4.0,502,2155,0,2.0,0,5,69,298,12996,35,433,1856,1012500.0,16875.0,101250.0,8437.5,483.98662,Commercial associate_202500.0_House / apartmen...
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,15037,2105,0,0,0,Managers,2.0,26.0,2.0,41,0,4.0,501,2148,0,2.0,0,5,70,300,12932,35,431,1847,787500.0,13125.0,78750.0,6562.5,374.10926,State servant_157500.0_House / apartment_Manag...


In [None]:
train.groupby(['ID']).count()[['credit']]

Unnamed: 0_level_0,credit
ID,Unnamed: 1_level_1
Commercial associate_103500.0_House / apartment_Accountants_11529_651_0_0_0_2.0_F_N_Y_Commercial associate_Secondary / secondary special_Single / not married,4
Commercial associate_103500.0_House / apartment_Cleaning staff_18826_837_0_0_0_2.0_F_N_Y_Commercial associate_Higher education_Married,4
Commercial associate_103500.0_House / apartment_Cooking staff_13414_1700_1_0_0_3.0_F_Y_Y_Commercial associate_Secondary / secondary special_Separated,1
Commercial associate_103500.0_House / apartment_High skill tech staff_12987_3537_0_1_0_4.0_F_N_N_Commercial associate_Higher education_Married,2
Commercial associate_103500.0_House / apartment_Laborers_18240_1751_0_0_0_2.0_F_N_Y_Commercial associate_Secondary / secondary special_Married,1
...,...
Working_99000.0_Municipal apartment_Laborers_11039_2809_0_0_0_3.0_F_N_Y_Working_Secondary / secondary special_Married,3
Working_99000.0_Municipal apartment_NaN_19030_2629_0_0_0_1.0_F_N_Y_Working_Higher education_Single / not married,1
Working_99000.0_Rented apartment_Managers_9928_1531_0_0_0_2.0_M_Y_Y_Working_Secondary / secondary special_Married,5
Working_99000.0_With parents_Medicine staff_19566_5453_1_0_0_2.0_F_N_N_Working_Secondary / secondary special_Married,1


# __문자열로 이루어진 컬럼은 정수로 변환하는 함수 Encoder 사용__
### Ordinal Encoder (참고 웹사이트 : http://www.gisdeveloper.co.kr/?p=9907 )

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   gender                    26457 non-null  object 
 1   car                       26457 non-null  object 
 2   reality                   26457 non-null  object 
 3   child_num                 26457 non-null  int64  
 4   income_total              26457 non-null  float64
 5   income_type               26457 non-null  object 
 6   edu_type                  26457 non-null  object 
 7   family_type               26457 non-null  object 
 8   house_type                26457 non-null  object 
 9   DAYS_BIRTH                26457 non-null  int64  
 10  DAYS_EMPLOYED             26457 non-null  int64  
 11  work_phone                26457 non-null  int64  
 12  phone                     26457 non-null  int64  
 13  email                     26457 non-null  int64  
 14  occyp_

In [None]:
# 숫자 컬럼만 구분하여 리스트에 컬럼명 저장, 결과치인 credit 은 제외
number_feats = train.dtypes[train.dtypes != "object"].index.tolist() 
number_feats.remove('credit')

# 문자 컬럼만 구분하여 리스트에 컬럼명 저장
string_feats = train.dtypes[train.dtypes == "object"].index.tolist() 

# Encoder 를 사용할 때에도 encoding 모델을 fit 하게 맞춰주는 것이 필요하다. 
# encoder.fit_transform : Encoder 모델을 생성하고, 사용하여 문자 변수를 정수로 변경
# encoder.transform : 만들어진 Encoder 모델을 사용해 문자 변수를 정수로 변경

encode = OrdinalEncoder(string_feats)
train[string_feats] = encode.fit_transform(train[string_feats], train['credit'])
test[string_feats] = encode.transform(test[string_feats])

non_int_feats = train.dtypes[train.dtypes != "int64"].index.tolist() 
non_int_feats.remove('credit') # credit 컬럼은 test set 에 존재하지 않으므로 삭제

for df in [train,test]:
  for feat in non_int_feats:
    df[non_int_feats] = df[non_int_feats].astype('int64')

In [None]:
train.columns

Index(['gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
       'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'work_phone', 'phone', 'email', 'occyp_type', 'family_size',
       'begin_month', 'credit', 'AGE', 'AGE_col', 'AGE_months', 'AGE_weeks',
       'child_exist', 'no_parent', 'single_person_households',
       'EMPLOYED_years', 'EMPLOYED_months', 'EMPLOYED_weeks',
       'DAYS_UNEMPLOYED', 'UNEMPLOYED_years', 'UNEMPLOYED_months',
       'UNEMPLOYED_weeks', 'income_whole_life', 'salary', 'income_per_person',
       'income_pp_month', 'ID', 'cluster'],
      dtype='object')

In [None]:
for df in [test,train]:
  for feat in df.columns:
    df[df.columns] = df[df.columns].astype('int64')

In [None]:
non_int_feats

['income_total',
 'family_size',
 'begin_month',
 'AGE_col',
 'no_parent',
 'income_whole_life',
 'salary',
 'income_per_person',
 'income_pp_month']

In [None]:
# 리스트의 길이 = 컬럼의 갯수
print("Number features count : ", len(number_feats)) 
print("String features count : ", len(string_feats)) 

Number features count :  27
String features count :  9


In [None]:
number_feats

['child_num',
 'income_total',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'work_phone',
 'phone',
 'email',
 'family_size',
 'begin_month',
 'AGE',
 'AGE_col',
 'AGE_months',
 'AGE_weeks',
 'child_exist',
 'no_parent',
 'single_person_households',
 'EMPLOYED_years',
 'EMPLOYED_months',
 'EMPLOYED_weeks',
 'DAYS_UNEMPLOYED',
 'UNEMPLOYED_years',
 'UNEMPLOYED_months',
 'UNEMPLOYED_weeks',
 'income_whole_life',
 'salary',
 'income_per_person',
 'income_pp_month']

In [None]:
string_feats

['gender',
 'car',
 'reality',
 'income_type',
 'edu_type',
 'family_type',
 'house_type',
 'occyp_type',
 'ID']

In [None]:
#Log Scale

#income_total 만 로그변환하는 코드
for df in [train,test]:
  df['income_total'] = np.log1p(df['income_total']) 

# 모든 숫자 컬럼 로그변환하는 코드
# for df in [train,test]:
#    for feat in number_feats:
#      df[number_feats] = np.log1p(df[number_feats]) 

# __Scaling__
- 모든 숫자 컬럼을 스케일할지, 숫자가 큰 컬럼만 할지
- 사용할 수 있는 스케일링 / 정규화
- 옵션 1 : 최소-최대 정규화(Min-Max Normalization)
- 옵션 2 : 로그변환(Log Transformation)
- 옵션 3 : Standard Scaler (하단에 코드 있음)
- 옵션 4 : Z-점수(Z-Score) 정규화 (우리가 수업시간에 하지 않았던 정규화)

In [None]:
train.head()

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,credit,AGE,AGE_col,AGE_months,AGE_weeks,child_exist,no_parent,single_person_households,EMPLOYED_years,EMPLOYED_months,EMPLOYED_weeks,DAYS_UNEMPLOYED,UNEMPLOYED_years,UNEMPLOYED_months,UNEMPLOYED_weeks,income_whole_life,salary,income_per_person,income_pp_month,ID
0,1,1,1,0,12.2185,1,1,1,1,13899,4709,0,0,0,1,2,6,1.0,38,3,463,1985,0,2,0,12,156,672,9190,25,306,1312,2430000,16875,101250,8437,1
1,1,1,2,1,12.41917,1,2,2,2,11380,1540,0,0,1,2,3,5,1.0,31,3,379,1625,1,2,0,4,51,220,9840,26,328,1405,990000,20625,82500,6875,2
2,2,2,2,0,13.01701,2,1,1,2,19087,4434,0,1,0,3,2,22,2.0,52,5,636,2726,0,2,0,12,147,633,14653,40,488,2093,5400000,37500,225000,18750,3
3,1,1,2,0,12.2185,1,2,1,2,15088,2092,0,1,0,4,2,37,0.0,41,4,502,2155,0,2,0,5,69,298,12996,35,433,1856,1012500,16875,101250,8437,4
4,1,2,2,0,11.96719,3,1,1,2,15037,2105,0,0,0,3,2,26,2.0,41,4,501,2148,0,2,0,5,70,300,12932,35,431,1847,787500,13125,78750,6562,5


In [None]:
test.head()

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,AGE,AGE_col,AGE_months,AGE_weeks,child_exist,no_parent,single_person_households,EMPLOYED_years,EMPLOYED_months,EMPLOYED_weeks,DAYS_UNEMPLOYED,UNEMPLOYED_years,UNEMPLOYED_months,UNEMPLOYED_weeks,income_whole_life,salary,income_per_person,income_pp_month,ID
0,2,2,1,0,11.63072,4,2,2,2,21990,4709,0,1,0,1,2,60,60,6,733,3141,0,2,0,12,156,672,17281,47,576,2468,1350000,9375,56250,4687,-1.0
1,1,1,2,0,11.81304,3,1,1,2,18964,1540,0,1,0,6,2,36,51,5,632,2709,0,2,0,4,51,220,17424,47,580,2489,540000,11250,67500,5625,-1.0
2,1,1,2,0,11.14725,2,2,1,2,15887,4434,1,1,0,2,2,40,43,4,529,2269,0,2,0,12,147,633,11453,31,381,1636,832464,5781,34686,2890,-1.0
3,2,2,1,0,11.63072,1,2,1,2,19270,2092,1,0,0,7,2,41,52,5,642,2752,0,2,0,5,69,298,17178,47,572,2454,562500,9375,56250,4687,-1.0
4,1,2,2,0,12.32386,3,1,1,2,17822,2105,1,0,0,3,2,8,48,4,594,2546,0,2,0,5,70,300,15717,43,523,2245,1125000,18750,112500,9375,-1.0


In [None]:
# Standard Scale 코드
# 이미 로그변환을 진행한 income_total을 제외한 나머지 numeric 컬럼만 정규화?

# 나머지 컬럼도 정규화 대신 로그변환을 하면 어떨까?

# numerical_feats.remove('income_total')

# scale = StandardScaler()
# train[number_feats] = scale.fit_transform(train[number_feats])
# test[number_feats] = scale.transform(test[number_feats])

# __개선 가능 사항__
### 0. Scale 을 하는 이유 : 알고리즘이 숫자가 큰 변수를 중요하게 인식한다. 그래서 모든 변수를 스케일하여 맞춰준 후 알고리즘을 돌리면 모든 변수를 동일한 중요도로 인식하고 분석을 시작한다.
### 1. 클러스터링을 공부하고 사용할지 결정
### __2. 중복을 찾고 제거할지, 또는 oversampling을 통해 중복 데이터를 늘려서 더 정확한 예측 결과 유추가 가능한지 확인하기 : https://dacon.io/competitions/official/235713/codeshare/2522?page=1&dtype=vote & https://dacon.io/competitions/official/235713/codeshare/2509?page=1&dtype=vote__ & 강사님 강의 자료 참고
### 3. 다중공선을 공부 후, 파생변수와 다중공선을 보이는 컬럼 삭제할지 결정
### __4. 구간화를 공부 후, 사용할지 결정하기__
### 5. feature importance 확인하고 제거하면 더 좋은 변수들이 있는지 찾아보기 : (ID 생성 후, feature importance 가 낮은 컬럼들도 지워보기... gender, car, etc.) https://dacon.io/competitions/official/235713/codeshare/2746?page=1&dtype=recent
### __6. 정규화와 로그변환의 장단점은? 어떤 상황에 어떤 것이 더 좋은가? 한쪽으로 데이터가 치우친 컬럼의 정규화가 좋다고 배웠던 것 같다...__
### 7. Pycaret 을 사용하여 더 좋은 알고리즘 모델을 찾고, 파라미터 튜닝도 추천받기. 강사님 오후 강의 참고 & 사용법 참고 https://dacon.io/competitions/official/235713/codeshare/2477page=2&dtype=vote
### 8. 여기에 추가로 변수 변환 : https://dacon.io/competitions/official/235713/codeshare/2510?page=2&dtype=vote
### __9. 범주형 데이터와 연속형 데이터(실수)의 차이 : 각각 처리하는 방법 확인. 강사님 자료 : https://colab.research.google.com/drive/1dEzp_wQKtYXK5t5XN4Ob79UJ5iCdDJga__
### 10. Grid Search CV 를 통해서 좋은 알고리즘 파라미터를 추천 받을 수 있다. 

In [None]:
# 클러스터링 구성
# 타겟을 결정짓는 뚜렷한 특징을 갖는 피쳐를 찾지 못해 clustering 시도

# kmeans_train = train.drop(['credit'], axis=1)
# kmeans = KMeans(n_clusters=36, random_state=42).fit(kmeans_train)
# train['cluster'] = kmeans.predict(kmeans_train)
# test['cluster'] = kmeans.predict(test)

In [None]:
# 현재는 중복 제거 하지 않은 상태

In [None]:
#구간화 함수
# def make_bin(df, variable, n):
    
#     data = df
#     count, bin_dividers = np.histogram(data[variable], bins=n)
#     bin_names=[str(i) for i in range(n)]
#     data['%s_bin' % variable] = pd.cut(x=data[variable], bins=bin_dividers, labels=bin_names, include_lowest=True)
#     data['%s_bin' % variable] = pd.factorize(data['%s_bin' % variable])[0]
#     print(data['%s_bin' % variable], '\n\n')
    
#     return data

# # days_birth만 구간화 했을 떄 가장 성능이 좋았음
# train = make_bin(train, 'DAYS_BIRTH', n=10)
# test = make_bin(test, 'DAYS_BIRTH', n=10)

In [None]:
# pivot table 써서 변수 사이의 중요한 관계도 보기 

In [None]:
# feature importance 뽑아보기 

In [None]:
# 파생변수와 다중공선을 보이는 컬럼 삭제

# cols = ['child_num', 'DAYS_BIRTH', 'DAYS_EMPLOYED',]
# train.drop(cols, axis=1, inplace=True)
# test.drop(cols, axis=1, inplace=True)

# __알고리즘 모델로 결과를 내보기__
* ### __우리만의 알고리즘 & 튜닝된 파라미터를 찾아보기.__
* ### __일단은 default 기본 값으로 알고리즘을 돌려보고 파라미터를 어떻게 변경하면 더 좋을지 고민해보아도 좋다.__
* ### Grid Search CV 를 통해서 좋은 파라미터를 추천 받을 수 있다. 

In [None]:
# Modeling - catboost
# private 1위 선택 : 
# fold 수를 5부터 17까지 돌려보고 최적 fold 15로 판단 후 선택
# parameter를 default로 두는 것이 logloss가 가장 낮았음

n_est = 2000
seed = 42
n_fold = 15
n_class = 3

target = 'credit'
X = train.drop(target, axis=1)
y = train[target]
X_test = test

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds=[]
for train_idx, valid_idx in skfold.split(X, y):
        folds.append((train_idx, valid_idx))

cat_pred = np.zeros((X.shape[0], n_class))
cat_pred_test = np.zeros((X_test.shape[0], n_class))
cat_cols = ['income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type', 'ID']
for fold in range(n_fold):
  print(f'\n----------------- Fold {fold} -----------------\n')
  train_idx, valid_idx = folds[fold]
  X_train, X_valid, y_train, y_valid = X.iloc[train_idx], X.iloc[valid_idx], y[train_idx], y[valid_idx]
  train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
  valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

  model_cat = CatBoostClassifier()
  model_cat.fit(train_data, eval_set=valid_data, use_best_model=True, early_stopping_rounds=100, verbose=100)
  
  cat_pred[valid_idx] = model_cat.predict_proba(X_valid)
  cat_pred_test += model_cat.predict_proba(X_test) / n_fold
  print(f'CV Log Loss Score: {log_loss(y_valid, cat_pred[valid_idx]):.6f}')
    
print(f'\tLog Loss: {log_loss(y, cat_pred):.6f}')


----------------- Fold 0 -----------------

Learning rate set to 0.115128
0:	learn: 1.0350914	test: 1.0334635	best: 1.0334635 (0)	total: 155ms	remaining: 2m 34s
100:	learn: 0.7056311	test: 0.6583780	best: 0.6583780 (100)	total: 12.1s	remaining: 1m 47s
200:	learn: 0.6889491	test: 0.6560270	best: 0.6560270 (200)	total: 25.2s	remaining: 1m 40s
300:	learn: 0.6738504	test: 0.6556564	best: 0.6555089 (210)	total: 38.9s	remaining: 1m 30s
400:	learn: 0.6602034	test: 0.6547023	best: 0.6545426 (389)	total: 56.1s	remaining: 1m 23s
500:	learn: 0.6447439	test: 0.6548064	best: 0.6544800 (485)	total: 1m 9s	remaining: 1m 9s
600:	learn: 0.6302123	test: 0.6545517	best: 0.6535082 (549)	total: 1m 23s	remaining: 55.2s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6535082205
bestIteration = 549

Shrink model to first 550 iterations.
CV Log Loss Score: 0.653508

----------------- Fold 1 -----------------

Learning rate set to 0.115128
0:	learn: 1.0350950	test: 1.0343698	best: 1.0343698

In [None]:
train.head()

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,credit,AGE,AGE_col,AGE_months,AGE_weeks,child_exist,no_parent,single_person_households,EMPLOYED_years,EMPLOYED_months,EMPLOYED_weeks,DAYS_UNEMPLOYED,UNEMPLOYED_years,UNEMPLOYED_months,UNEMPLOYED_weeks,income_whole_life,salary,income_per_person,income_pp_month,ID,cluster
0,1,1,1,0,12.2185,1,1,1,1,13899,4709,0,0,0,1,2,6,1.0,38,3,463,1985,0,2,0,12,156,672,9190,25,306,1312,2430000,16875,101250,8437,1,31
1,1,1,2,1,12.41917,1,2,2,2,11380,1540,0,0,1,2,3,5,1.0,31,3,379,1625,1,2,0,4,51,220,9840,26,328,1405,990000,20625,82500,6875,2,4
2,2,2,2,0,13.01701,2,1,1,2,19087,4434,0,1,0,3,2,22,2.0,52,5,636,2726,0,2,0,12,147,633,14653,40,488,2093,5400000,37500,225000,18750,3,17
3,1,1,2,0,12.2185,1,2,1,2,15088,2092,0,1,0,4,2,37,0.0,41,4,502,2155,0,2,0,5,69,298,12996,35,433,1856,1012500,16875,101250,8437,4,4
4,1,2,2,0,11.96719,3,1,1,2,15037,2105,0,0,0,3,2,26,2.0,41,4,501,2148,0,2,0,5,70,300,12932,35,431,1847,787500,13125,78750,6562,5,27


In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold

In [None]:
# 참고 자료 : https://dacon.io/competitions/official/235713/codeshare/2476?page=1&dtype=vote

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))

random.seed(42)
lgb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    lgb = LGBMClassifier(n_estimators=1000)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=30,
           verbose=100)
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.656654	valid_1's multi_logloss: 0.746321
[200]	training's multi_logloss: 0.57079	valid_1's multi_logloss: 0.721837
[300]	training's multi_logloss: 0.508088	valid_1's multi_logloss: 0.712119
[400]	training's multi_logloss: 0.45802	valid_1's multi_logloss: 0.707127
Early stopping, best iteration is:
[439]	training's multi_logloss: 0.440734	valid_1's multi_logloss: 0.705744


Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.652139	valid_1's multi_logloss: 0.7619
[200]	training's multi_logloss: 0.56928	valid_1's multi_logloss: 0.744749
[300]	training's multi_logloss: 0.506149	valid_1's multi_logloss: 0.739089
[400]	training's multi_logloss: 0.454916	valid_1's multi_logloss: 0.736064
Early stopping, best iteration is:
[398]	training's multi_logloss: 0.455956	valid_1's multi_logloss: 0.736008


Training until validation scores don't improve for 30 rou