In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings, random
warnings.filterwarnings(action='ignore')

from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold

from sklearn.cluster import KMeans
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier

In [2]:
# Load the labelled data once; the first 18 columns (after dropping 'index')
# are the features and the last column is the target.
train_raw = pd.read_csv('train.csv', encoding='cp949').drop(columns='index')
train = train_raw.iloc[:, 0:18]
target = train_raw.iloc[:, -1]
# The test file is reduced to the same first 18 columns.
test = pd.read_csv('test.csv', encoding='cp949').drop(columns='index').iloc[:, 0:18]

In [3]:
# Stack train rows on top of test rows so the feature engineering below is
# applied to both at once. The original row labels are kept, so labels repeat.
df = pd.concat(objs=[train, test], axis=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36457 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         36457 non-null  object 
 1   car            36457 non-null  object 
 2   reality        36457 non-null  object 
 3   child_num      36457 non-null  int64  
 4   income_total   36457 non-null  float64
 5   income_type    36457 non-null  object 
 6   edu_type       36457 non-null  object 
 7   family_type    36457 non-null  object 
 8   house_type     36457 non-null  object 
 9   DAYS_BIRTH     36457 non-null  int64  
 10  DAYS_EMPLOYED  36457 non-null  int64  
 11  FLAG_MOBIL     36457 non-null  int64  
 12  work_phone     36457 non-null  int64  
 13  phone          36457 non-null  int64  
 14  email          36457 non-null  int64  
 15  occyp_type     25134 non-null  object 
 16  family_size    36457 non-null  float64
 17  begin_month    36457 non-null  float64
dtypes: floa

In [4]:
df['1인당 수익']=df['income_total']/df['family_size']

In [5]:
df = df.fillna({'occyp_type':'No job'})

In [6]:
# Age in whole years: absolute DAYS_BIRTH divided by 365, truncated to int.
df['Age'] = (df['DAYS_BIRTH'].abs() / 365).astype('int')

In [7]:
# Positive DAYS_EMPLOYED values are mapped to 0; the rest are converted to
# whole years (absolute value / 365, truncated to int).
employed_days = df['DAYS_EMPLOYED'].clip(upper=0)
df['Employed'] = (employed_days.abs() / 365).astype('int')

In [8]:
# Absolute value of begin_month, stored as an integer.
df['Month'] = df['begin_month'].abs().astype('int')

In [9]:
# Build an ID by joining the column values below, to identify unique people.
# begin_month is left out on purpose, because one person may have opened
# several cards.
id_cols = [
    'child_num', 'income_total',
    'DAYS_BIRTH', 'DAYS_EMPLOYED',
    'work_phone', 'phone',
    'email', 'family_size',
    'gender', 'car',
    'reality', 'income_type',
    'edu_type', 'family_type',
    'house_type', 'occyp_type',
]
id_parts = [df[col].astype(str) for col in id_cols]

# Concatenate left to right with '_' between consecutive parts.
person_id = id_parts[0]
for part in id_parts[1:]:
    person_id = person_id + '_' + part
df['ID'] = person_id

In [10]:
# Split the combined frame back into train and test by position.
# The train rows come first, and there is exactly one target value per train
# row, so the boundary is derived from `target` instead of a hard-coded count.
n_train = len(target)
# .copy() makes both parts independent frames, so the column assignments in
# later cells do not write into slices of `df`.
train = df.iloc[:n_train].copy()
test = df.iloc[n_train:].copy()

In [11]:
# Partition the columns by dtype: object columns are categorical, the rest numeric.
is_object = train.dtypes == "object"
numeric_feats = is_object.index[~is_object].tolist()
categorical_feats = is_object.index[is_object].tolist()

# Reorder both frames so numeric columns come first, then categorical ones.
ordered_cols = numeric_feats + categorical_feats
train = train[ordered_cols]
test = test[ordered_cols]

In [12]:
# Log-transform income_total (log1p) into a new column on both frames.
# The loop variable is deliberately not called `df`, so the combined frame
# built earlier is not rebound to `test` when the loop finishes.
for frame in (train, test):
    frame['income_total_log'] = np.log1p(frame['income_total'])

In [13]:
# Ordinal-encode the categorical columns.
# `cols` is passed by keyword: the first positional parameter of
# category_encoders' OrdinalEncoder is `verbose`, so a positional list would
# be taken as the verbosity setting rather than the columns to encode.
encoder = OrdinalEncoder(cols=categorical_feats)
# The mapping is fitted on train only and then applied unchanged to test.
train[categorical_feats] = encoder.fit_transform(train[categorical_feats], target)
test[categorical_feats] = encoder.transform(test[categorical_feats])

In [14]:
# Store the encoded ID as int64 in both frames.
for frame in (train, test):
    frame['ID'] = frame['ID'].astype('int64')

In [15]:
# income_total is excluded from standardisation. The membership check keeps
# this cell re-runnable: an unguarded list.remove() raises ValueError on the
# second run, once the name has already been removed.
if 'income_total' in numeric_feats:
    numeric_feats.remove('income_total')

# Fit the scaler on train only and reuse its statistics for test.
scaler = StandardScaler()
train[numeric_feats] = scaler.fit_transform(train[numeric_feats])
test[numeric_feats] = scaler.transform(test[numeric_feats])

In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26457 entries, 0 to 26456
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   child_num         26457 non-null  float64
 1   income_total      26457 non-null  float64
 2   DAYS_BIRTH        26457 non-null  float64
 3   DAYS_EMPLOYED     26457 non-null  float64
 4   FLAG_MOBIL        26457 non-null  float64
 5   work_phone        26457 non-null  float64
 6   phone             26457 non-null  float64
 7   email             26457 non-null  float64
 8   family_size       26457 non-null  float64
 9   begin_month       26457 non-null  float64
 10  1인당 수익            26457 non-null  float64
 11  Age               26457 non-null  float64
 12  Employed          26457 non-null  float64
 13  Month             26457 non-null  float64
 14  gender            26457 non-null  int32  
 15  car               26457 non-null  int32  
 16  reality           26457 non-null  int32 

In [17]:
# Run configuration.
# NOTE(review): n_est is not referenced by any cell visible here — confirm
# whether it is still needed.
n_est, seed, n_fold, n_class = 2000, 42, 10, 3

# Aliases used by the training loop: features, labels, and test features.
X, y, X_test = train, target, test

In [18]:
# Materialise the stratified (train_idx, valid_idx) index pairs for all folds.
skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds = list(skfold.split(X, y))

# cat_pred holds out-of-fold class probabilities for every training row.
# cat_pred_test accumulates the average of the per-fold test predictions.
cat_pred = np.zeros((X.shape[0], n_class))
cat_pred_test = np.zeros((X_test.shape[0], n_class))
# Columns passed to CatBoost as categorical features.
cat_cols = ['income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type', 'ID']

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'\n----------------- Fold {fold} -----------------\n')
    # split() yields positional indices, so rows are selected with .iloc on
    # both X and y. Plain y[...] would be a label lookup, which is only
    # correct while y happens to carry a default 0..n-1 index.
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
    valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

    # Train with early stopping monitored on this fold's validation split.
    model_cat = CatBoostClassifier()
    model_cat.fit(train_data, eval_set=valid_data, early_stopping_rounds=100, verbose=100)

    # Store this fold's validation predictions and add its share of the
    # test-set average.
    cat_pred[valid_idx] = model_cat.predict_proba(X_valid)
    cat_pred_test += model_cat.predict_proba(X_test) / n_fold
    print(f'CV Log Loss Score: {log_loss(y_valid, cat_pred[valid_idx]):.6f}')

# Overall log loss across all out-of-fold predictions.
print(f'\tLog Loss: {log_loss(y, cat_pred):.6f}')


----------------- Fold 0 -----------------

Learning rate set to 0.115044
0:	learn: 1.0350336	test: 1.0335538	best: 1.0335538 (0)	total: 239ms	remaining: 3m 58s
100:	learn: 0.7095550	test: 0.6648015	best: 0.6648015 (100)	total: 6.73s	remaining: 59.9s
200:	learn: 0.6899477	test: 0.6621906	best: 0.6621885 (199)	total: 13.5s	remaining: 53.6s
300:	learn: 0.6748948	test: 0.6619215	best: 0.6616722 (270)	total: 20.5s	remaining: 47.7s
400:	learn: 0.6586927	test: 0.6623264	best: 0.6614311 (311)	total: 28s	remaining: 41.8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.661431102
bestIteration = 311

Shrink model to first 312 iterations.
CV Log Loss Score: 0.661431

----------------- Fold 1 -----------------

Learning rate set to 0.115044
0:	learn: 1.0346423	test: 1.0350978	best: 1.0350978 (0)	total: 62.6ms	remaining: 1m 2s
100:	learn: 0.7082401	test: 0.6552487	best: 0.6552487 (100)	total: 6.83s	remaining: 1m
200:	learn: 0.6918239	test: 0.6531127	best: 0.6529125 (191)	total

In [19]:
# Reuse an existing submission file as the template, then overwrite every
# column after the first with the fold-averaged class probabilities.
sub = pd.read_csv('submission0523_Final.csv')
prob_slice = slice(1, None)
sub.iloc[:, prob_slice] = cat_pred_test
sub

Unnamed: 0,index,0,1,2
0,26457,0.112329,0.169766,0.717905
1,26458,0.338342,0.235390,0.426268
2,26459,0.039730,0.070789,0.889481
3,26460,0.052322,0.078142,0.869536
4,26461,0.083276,0.208260,0.708464
...,...,...,...,...
9995,36452,0.077298,0.228723,0.693979
9996,36453,0.322380,0.181016,0.496604
9997,36454,0.046259,0.098111,0.855629
9998,36455,0.137392,0.306600,0.556008


In [20]:
sub.to_csv('catboost-1.csv', index=False)

PermissionError: [Errno 13] Permission denied: 'catboost-1.csv'