# <font color='CC3D3D'> Data Split Cluster
    
### 1. Data Merge
- <span style="color:green">**Master_df_v2_cluster.csv** </span>와 <span style="color:green">**log_feature_clansing.csv** </span>를 Merge
    
### 2. Data Split
- Merge한 데이터를 Train Test로 나눔
- Train 데이터를 Train, Validation 데이터로 나눔
- Validation 데이터를 Public, Private로 데이터를 나눔

### 3. Deployment CSV
- 나눠진 데이터를 <span style="color:blue">**X_train_cluster.csv, X_public_cluster.csv, X_private_cluster.csv, X_test_cluster.csv** </span>를 생성

# Import

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Data Load

In [2]:
master_df = pd.read_csv('../Data/Master_df_v2_cluster.csv')

In [3]:
log_feature = pd.read_csv('../Data/log_feature_clansing.csv')

In [4]:
master_df.isna().sum()

application_id                               0
bank_id                                      0
product_id                                   0
loan_limit                                   0
loan_rate                                    0
user_id                                      0
credit_score                                 0
yearly_income                                0
income_type                                  0
employment_type                              0
houseown_type                                0
desired_amount                               0
purpose                                      0
personal_rehabilitation_yn                   0
personal_rehabilitation_complete_yn          0
existing_loan_cnt                            0
existing_loan_amt                            0
enter_birth                                  0
gender                                       0
k_lending_rate                               0
us_lending_rate                              0
btc_price    

In [5]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13527250 entries, 0 to 13527249
Data columns (total 56 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   application_id                       int64  
 1   bank_id                              int64  
 2   product_id                           int64  
 3   loan_limit                           float64
 4   loan_rate                            float64
 5   user_id                              float64
 6   credit_score                         float64
 7   yearly_income                        float64
 8   income_type                          object 
 9   employment_type                      object 
 10  houseown_type                        object 
 11  desired_amount                       float64
 12  purpose                              int64  
 13  personal_rehabilitation_yn           float64
 14  personal_rehabilitation_complete_yn  float64
 15  existing_loan_cnt             

In [6]:
log_feature.isna().sum()

event       0
loan_cnt    0
duration    0
visit       0
segment     0
dtype: int64

In [7]:
log_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13527090 entries, 0 to 13527089
Data columns (total 5 columns):
 #   Column    Dtype  
---  ------    -----  
 0   event     float64
 1   loan_cnt  float64
 2   duration  float64
 3   visit     float64
 4   segment   float64
dtypes: float64(5)
memory usage: 516.0 MB


# Data Mergem

In [8]:
master_df = pd.concat([master_df,log_feature],axis=1)

In [9]:
master_df.isna().sum()

application_id      0
bank_id             0
product_id          0
loan_limit          0
loan_rate           0
                 ... 
event             160
loan_cnt          160
duration          160
visit             160
segment           160
Length: 61, dtype: int64

In [10]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13527250 entries, 0 to 13527249
Data columns (total 61 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   application_id                       int64  
 1   bank_id                              int64  
 2   product_id                           int64  
 3   loan_limit                           float64
 4   loan_rate                            float64
 5   user_id                              float64
 6   credit_score                         float64
 7   yearly_income                        float64
 8   income_type                          object 
 9   employment_type                      object 
 10  houseown_type                        object 
 11  desired_amount                       float64
 12  purpose                              int64  
 13  personal_rehabilitation_yn           float64
 14  personal_rehabilitation_complete_yn  float64
 15  existing_loan_cnt             

# Missing Value

In [11]:
master_df.segment.value_counts()

0.0    4116922
4.0    3812611
1.0    2635122
3.0    1134990
5.0    1077342
6.0     596628
2.0     153475
Name: segment, dtype: int64

In [12]:
master_df.event.fillna(master_df.event.mean(),inplace=True)
master_df.loan_cnt.fillna(master_df.loan_cnt.mean(),inplace=True)
master_df.duration.fillna(master_df.duration.mean(),inplace=True)
master_df.visit.fillna(master_df.visit.mean(),inplace=True)
master_df.segment.fillna(0,inplace=True)

In [13]:
master_df.isna().sum()

application_id    0
bank_id           0
product_id        0
loan_limit        0
loan_rate         0
                 ..
event             0
loan_cnt          0
duration          0
visit             0
segment           0
Length: 61, dtype: int64

In [14]:
master_target = master_df['is_applied']
master_df.drop(['is_applied'],axis=1,inplace=True)
master_df['is_applied'] = master_target

In [15]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13527250 entries, 0 to 13527249
Data columns (total 61 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   application_id                       int64  
 1   bank_id                              int64  
 2   product_id                           int64  
 3   loan_limit                           float64
 4   loan_rate                            float64
 5   user_id                              float64
 6   credit_score                         float64
 7   yearly_income                        float64
 8   income_type                          object 
 9   employment_type                      object 
 10  houseown_type                        object 
 11  desired_amount                       float64
 12  purpose                              int64  
 13  personal_rehabilitation_yn           float64
 14  personal_rehabilitation_complete_yn  float64
 15  existing_loan_cnt             

# Data Split

## Train Test

In [16]:
train_data = master_df[master_df.is_applied.notna()].reset_index(drop=True)
test_data = master_df[master_df.is_applied.isna()].reset_index(drop=True)

In [17]:
train_data.isna().sum()

application_id    0
bank_id           0
product_id        0
loan_limit        0
loan_rate         0
                 ..
loan_cnt          0
duration          0
visit             0
segment           0
is_applied        0
Length: 61, dtype: int64

In [18]:
test_data.isna().sum()

application_id          0
bank_id                 0
product_id              0
loan_limit              0
loan_rate               0
                   ...   
loan_cnt                0
duration                0
visit                   0
segment                 0
is_applied        3257239
Length: 61, dtype: int64

In [19]:
columns = train_data.columns.tolist()

In [20]:
train_data

Unnamed: 0,application_id,bank_id,product_id,loan_limit,loan_rate,user_id,credit_score,yearly_income,income_type,employment_type,...,weekday_cos,hour_sin,hour_cos,foreign,event,loan_cnt,duration,visit,segment,is_applied
0,2157865,54,235,20000000.0,16.5,346970.0,540.0,32000000.0,EARNEDINCOME,정규직,...,1.0,0.866025,-0.500000,0,8.0,3.0,191.0,1.0,1.0,1.0
1,576643,54,235,11000000.0,16.5,545882.0,580.0,72000000.0,EARNEDINCOME,정규직,...,1.0,0.500000,-0.866025,0,6.0,3.0,147.0,1.0,4.0,0.0
2,576643,11,118,3000000.0,20.0,545882.0,580.0,72000000.0,EARNEDINCOME,정규직,...,1.0,0.500000,-0.866025,0,6.0,3.0,147.0,1.0,4.0,0.0
3,2136706,42,216,10000000.0,13.5,558819.0,740.0,39000000.0,EARNEDINCOME,정규직,...,1.0,0.500000,-0.866025,0,78.0,37.0,3823.0,5.0,2.0,0.0
4,2136706,25,169,22000000.0,15.9,558819.0,740.0,39000000.0,EARNEDINCOME,정규직,...,1.0,0.500000,-0.866025,0,78.0,37.0,3823.0,5.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10270006,1969227,2,7,30000000.0,13.6,109899.0,600.0,37000000.0,EARNEDINCOME,정규직,...,1.0,-0.500000,-0.866025,0,8.0,3.0,324.0,1.0,0.0,0.0
10270007,1969227,33,110,9000000.0,14.4,109899.0,600.0,37000000.0,EARNEDINCOME,정규직,...,1.0,-0.500000,-0.866025,0,8.0,3.0,324.0,1.0,0.0,0.0
10270008,1969227,50,142,3000000.0,11.2,109899.0,600.0,37000000.0,EARNEDINCOME,정규직,...,1.0,-0.500000,-0.866025,0,8.0,3.0,324.0,1.0,0.0,0.0
10270009,1969227,22,100,4000000.0,15.3,109899.0,600.0,37000000.0,EARNEDINCOME,정규직,...,1.0,-0.500000,-0.866025,0,8.0,3.0,324.0,1.0,0.0,0.0


In [21]:
test_data

Unnamed: 0,application_id,bank_id,product_id,loan_limit,loan_rate,user_id,credit_score,yearly_income,income_type,employment_type,...,weekday_cos,hour_sin,hour_cos,foreign,event,loan_cnt,duration,visit,segment,is_applied
0,1748340,7,191,42000000.0,13.6,430982.0,620.0,24000000.0,EARNEDINCOME,정규직,...,0.623490,-2.588190e-01,-0.965926,0,18.000000,12.000000,861.000000,3.000000,0.0,
1,1748340,25,169,24000000.0,17.9,430982.0,620.0,24000000.0,EARNEDINCOME,정규직,...,0.623490,-2.588190e-01,-0.965926,0,18.000000,12.000000,861.000000,3.000000,0.0,
2,1748340,2,7,24000000.0,18.5,430982.0,620.0,24000000.0,EARNEDINCOME,정규직,...,0.623490,-2.588190e-01,-0.965926,0,18.000000,12.000000,861.000000,3.000000,0.0,
3,1748340,4,268,29000000.0,10.8,430982.0,620.0,24000000.0,EARNEDINCOME,정규직,...,0.623490,-2.588190e-01,-0.965926,0,18.000000,12.000000,861.000000,3.000000,0.0,
4,1748340,11,118,5000000.0,16.4,430982.0,620.0,24000000.0,EARNEDINCOME,정규직,...,0.623490,-2.588190e-01,-0.965926,0,18.000000,12.000000,861.000000,3.000000,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257234,1428218,62,200,3000000.0,14.8,478802.0,610.0,45000000.0,EARNEDINCOME,정규직,...,-0.900969,1.224647e-16,-1.000000,0,11.103804,5.275847,453.384047,1.578388,0.0,
3257235,1428218,2,7,40000000.0,11.8,478802.0,610.0,45000000.0,EARNEDINCOME,정규직,...,-0.900969,1.224647e-16,-1.000000,0,11.103804,5.275847,453.384047,1.578388,0.0,
3257236,1428218,32,257,15000000.0,7.2,478802.0,610.0,45000000.0,EARNEDINCOME,정규직,...,-0.900969,1.224647e-16,-1.000000,0,11.103804,5.275847,453.384047,1.578388,0.0,
3257237,1428218,33,110,44000000.0,13.5,478802.0,610.0,45000000.0,EARNEDINCOME,정규직,...,-0.900969,1.224647e-16,-1.000000,0,11.103804,5.275847,453.384047,1.578388,0.0,


# Train Validation Split

In [22]:
data = train_data.drop(['is_applied'],axis=1)
target = train_data['is_applied']

In [23]:
target

0           1.0
1           0.0
2           0.0
3           0.0
4           0.0
           ... 
10270006    0.0
10270007    0.0
10270008    0.0
10270009    0.0
10270010    0.0
Name: is_applied, Length: 10270011, dtype: float64

In [24]:
data

Unnamed: 0,application_id,bank_id,product_id,loan_limit,loan_rate,user_id,credit_score,yearly_income,income_type,employment_type,...,weekday_sin,weekday_cos,hour_sin,hour_cos,foreign,event,loan_cnt,duration,visit,segment
0,2157865,54,235,20000000.0,16.5,346970.0,540.0,32000000.0,EARNEDINCOME,정규직,...,0.0,1.0,0.866025,-0.500000,0,8.0,3.0,191.0,1.0,1.0
1,576643,54,235,11000000.0,16.5,545882.0,580.0,72000000.0,EARNEDINCOME,정규직,...,0.0,1.0,0.500000,-0.866025,0,6.0,3.0,147.0,1.0,4.0
2,576643,11,118,3000000.0,20.0,545882.0,580.0,72000000.0,EARNEDINCOME,정규직,...,0.0,1.0,0.500000,-0.866025,0,6.0,3.0,147.0,1.0,4.0
3,2136706,42,216,10000000.0,13.5,558819.0,740.0,39000000.0,EARNEDINCOME,정규직,...,0.0,1.0,0.500000,-0.866025,0,78.0,37.0,3823.0,5.0,2.0
4,2136706,25,169,22000000.0,15.9,558819.0,740.0,39000000.0,EARNEDINCOME,정규직,...,0.0,1.0,0.500000,-0.866025,0,78.0,37.0,3823.0,5.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10270006,1969227,2,7,30000000.0,13.6,109899.0,600.0,37000000.0,EARNEDINCOME,정규직,...,0.0,1.0,-0.500000,-0.866025,0,8.0,3.0,324.0,1.0,0.0
10270007,1969227,33,110,9000000.0,14.4,109899.0,600.0,37000000.0,EARNEDINCOME,정규직,...,0.0,1.0,-0.500000,-0.866025,0,8.0,3.0,324.0,1.0,0.0
10270008,1969227,50,142,3000000.0,11.2,109899.0,600.0,37000000.0,EARNEDINCOME,정규직,...,0.0,1.0,-0.500000,-0.866025,0,8.0,3.0,324.0,1.0,0.0
10270009,1969227,22,100,4000000.0,15.3,109899.0,600.0,37000000.0,EARNEDINCOME,정규직,...,0.0,1.0,-0.500000,-0.866025,0,8.0,3.0,324.0,1.0,0.0


In [25]:
x_train, x_valid, y_train, y_valid = train_test_split(data, target, test_size=0.3, shuffle=True, stratify=target, random_state=34)

In [26]:
print(x_train.shape)
print(y_train.shape)

(7189007, 60)
(7189007,)


In [27]:
print(x_valid.shape)
print(y_valid.shape)

(3081004, 60)
(3081004,)


In [28]:
y_train.isna().sum()

0

In [29]:
y_valid.isna().sum()

0

# Public Private Split

In [30]:
x_public, x_private, y_public, y_private = train_test_split(x_valid, y_valid, test_size=0.7, shuffle=True, stratify=y_valid, random_state=34)

In [31]:
print(x_public.shape)
print(y_public.shape)

(924301, 60)
(924301,)


In [32]:
print(x_private.shape)
print(y_private.shape)

(2156703, 60)
(2156703,)


In [33]:
y_public.isna().sum()

0

In [34]:
y_private.isna().sum()

0

# Data Structure

## Train

In [35]:
x_train['is_applied'] = y_train
x_train = x_train.reset_index(drop=True)
x_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train['is_applied'] = y_train


Unnamed: 0,application_id,bank_id,product_id,loan_limit,loan_rate,user_id,credit_score,yearly_income,income_type,employment_type,...,weekday_cos,hour_sin,hour_cos,foreign,event,loan_cnt,duration,visit,segment,is_applied
0,1200139,14,197,1000000.0,19.9,572156.0,580.0,48000000.0,EARNEDINCOME,정규직,...,-0.900969,5.000000e-01,8.660254e-01,0,13.0,7.0,496.0,1.0,0.0,0.0
1,1047176,62,48,13000000.0,7.0,406806.0,690.0,22000000.0,EARNEDINCOME,계약직,...,-0.900969,2.588190e-01,9.659258e-01,0,6.0,5.0,24.0,1.0,3.0,0.0
2,454447,21,196,3000000.0,12.5,319964.0,630.0,36000000.0,EARNEDINCOME,정규직,...,-0.900969,1.224647e-16,-1.000000e+00,0,6.0,3.0,349.0,1.0,4.0,0.0
3,2065029,56,5,2000000.0,16.8,853386.0,650.0,24000000.0,EARNEDINCOME,정규직,...,0.623490,1.224647e-16,-1.000000e+00,0,7.0,2.0,206.0,1.0,0.0,1.0
4,1549164,30,121,36000000.0,10.8,127040.0,940.0,45000000.0,EARNEDINCOME,정규직,...,0.623490,1.224647e-16,-1.000000e+00,0,11.2,6.6,290.2,1.4,6.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7189002,1144832,64,254,25000000.0,13.2,733411.0,634.0,32000000.0,EARNEDINCOME,정규직,...,-0.900969,5.000000e-01,-8.660254e-01,0,20.0,18.0,1385.0,3.0,0.0,0.0
7189003,1987234,36,60,80000000.0,12.4,767866.0,950.0,110000000.0,EARNEDINCOME,정규직,...,-0.900969,8.660254e-01,-5.000000e-01,0,17.0,5.0,615.0,1.0,0.0,0.0
7189004,15269,58,175,25000000.0,11.8,763418.0,590.0,67000000.0,EARNEDINCOME,계약직,...,-0.222521,5.000000e-01,-8.660254e-01,0,8.0,3.0,179.0,1.0,1.0,0.0
7189005,398762,12,68,26000000.0,11.2,864179.0,950.0,30000000.0,EARNEDINCOME,정규직,...,-0.900969,-1.000000e+00,-1.836970e-16,0,19.0,9.0,1987.0,3.0,0.0,0.0


## Public

In [36]:
x_public['is_applied'] = y_public
x_public = x_public.reset_index(drop=True)
x_public

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_public['is_applied'] = y_public


Unnamed: 0,application_id,bank_id,product_id,loan_limit,loan_rate,user_id,credit_score,yearly_income,income_type,employment_type,...,weekday_cos,hour_sin,hour_cos,foreign,event,loan_cnt,duration,visit,segment,is_applied
0,486082,1,61,3000000.0,9.9,872922.0,680.0,30000000.0,FREELANCER,기타,...,-0.222521,-1.000000,-1.836970e-16,0,6.0,3.0,186.0,1.0,4.0,0.0
1,581973,46,227,41000000.0,10.7,99259.0,778.0,40000000.0,EARNEDINCOME,정규직,...,0.623490,-0.500000,8.660254e-01,1,9.0,3.0,300.0,1.0,1.0,0.0
2,742581,28,217,2000000.0,19.4,355159.0,880.0,3000000.0,EARNEDINCOME,정규직,...,0.623490,0.866025,-5.000000e-01,0,8.0,4.6,284.6,1.8,6.0,0.0
3,1959959,35,29,3000000.0,14.8,738241.0,850.0,20000000.0,OTHERINCOME,기타,...,-0.900969,-0.500000,-8.660254e-01,0,13.0,3.0,376.0,2.0,4.0,0.0
4,1140131,22,124,11000000.0,14.9,177013.0,652.0,42000000.0,EARNEDINCOME,정규직,...,1.000000,0.258819,-9.659258e-01,0,8.0,5.0,214.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
924296,154520,23,108,36000000.0,11.5,367686.0,790.0,36000000.0,EARNEDINCOME,정규직,...,-0.222521,-0.707107,7.071068e-01,0,11.0,7.0,236.0,1.0,0.0,0.0
924297,1239968,3,152,26000000.0,18.9,653167.0,660.0,50000000.0,OTHERINCOME,기타,...,-0.222521,0.965926,-2.588190e-01,0,15.0,7.0,427.0,2.0,0.0,0.0
924298,1840353,1,61,7000000.0,12.4,301742.0,590.0,24000000.0,EARNEDINCOME,정규직,...,-0.900969,0.500000,-8.660254e-01,0,7.0,3.0,252.0,1.0,1.0,0.0
924299,1062797,6,36,39000000.0,17.7,739658.0,630.0,48000000.0,EARNEDINCOME,정규직,...,0.623490,0.500000,-8.660254e-01,0,8.0,5.0,215.0,2.0,3.0,0.0


## Private

In [37]:
x_private['is_applied'] = y_private
x_private = x_private.reset_index(drop=True)
x_private

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_private['is_applied'] = y_private


Unnamed: 0,application_id,bank_id,product_id,loan_limit,loan_rate,user_id,credit_score,yearly_income,income_type,employment_type,...,weekday_cos,hour_sin,hour_cos,foreign,event,loan_cnt,duration,visit,segment,is_applied
0,1313998,3,205,3000000.0,15.9,391198.0,960.0,45000000.0,EARNEDINCOME,정규직,...,0.623490,-0.500000,-8.660254e-01,0,10.0,3.0,1651.0,1.0,4.0,0.0
1,629526,29,265,40000000.0,8.6,563552.0,750.0,70000000.0,OTHERINCOME,기타,...,0.623490,0.965926,2.588190e-01,0,33.0,21.0,1400.0,4.0,5.0,1.0
2,1035132,37,206,12000000.0,19.9,716291.0,680.0,38000000.0,EARNEDINCOME,정규직,...,-0.900969,-0.866025,-5.000000e-01,0,31.0,18.0,520.0,3.0,0.0,0.0
3,1691942,28,217,2000000.0,12.0,540935.0,960.0,40000000.0,EARNEDINCOME,계약직,...,1.000000,-0.500000,-8.660254e-01,0,17.0,6.0,913.0,3.0,0.0,1.0
4,1807902,35,29,3000000.0,15.9,460253.0,920.0,72000000.0,EARNEDINCOME,정규직,...,-0.900969,0.965926,-2.588190e-01,0,4.0,0.0,13.0,1.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2156698,757029,22,221,20000000.0,15.5,778905.0,920.0,40000000.0,EARNEDINCOME,정규직,...,-0.900969,0.258819,-9.659258e-01,0,12.0,7.0,1099.0,1.0,1.0,0.0
2156699,1635604,50,142,38000000.0,12.6,847826.0,716.0,38000000.0,EARNEDINCOME,정규직,...,0.623490,0.707107,-7.071068e-01,0,5.4,3.2,154.2,1.2,6.0,0.0
2156700,123699,11,170,52000000.0,13.3,720454.0,648.0,35000000.0,EARNEDINCOME,정규직,...,-0.900969,1.000000,6.123234e-17,0,7.0,4.0,727.0,1.0,3.0,1.0
2156701,1086790,43,58,40000000.0,14.8,573302.0,770.0,53000000.0,EARNEDINCOME,정규직,...,-0.222521,-0.707107,7.071068e-01,0,8.0,5.0,386.0,1.0,3.0,0.0


## Test

In [38]:
test_data

Unnamed: 0,application_id,bank_id,product_id,loan_limit,loan_rate,user_id,credit_score,yearly_income,income_type,employment_type,...,weekday_cos,hour_sin,hour_cos,foreign,event,loan_cnt,duration,visit,segment,is_applied
0,1748340,7,191,42000000.0,13.6,430982.0,620.0,24000000.0,EARNEDINCOME,정규직,...,0.623490,-2.588190e-01,-0.965926,0,18.000000,12.000000,861.000000,3.000000,0.0,
1,1748340,25,169,24000000.0,17.9,430982.0,620.0,24000000.0,EARNEDINCOME,정규직,...,0.623490,-2.588190e-01,-0.965926,0,18.000000,12.000000,861.000000,3.000000,0.0,
2,1748340,2,7,24000000.0,18.5,430982.0,620.0,24000000.0,EARNEDINCOME,정규직,...,0.623490,-2.588190e-01,-0.965926,0,18.000000,12.000000,861.000000,3.000000,0.0,
3,1748340,4,268,29000000.0,10.8,430982.0,620.0,24000000.0,EARNEDINCOME,정규직,...,0.623490,-2.588190e-01,-0.965926,0,18.000000,12.000000,861.000000,3.000000,0.0,
4,1748340,11,118,5000000.0,16.4,430982.0,620.0,24000000.0,EARNEDINCOME,정규직,...,0.623490,-2.588190e-01,-0.965926,0,18.000000,12.000000,861.000000,3.000000,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257234,1428218,62,200,3000000.0,14.8,478802.0,610.0,45000000.0,EARNEDINCOME,정규직,...,-0.900969,1.224647e-16,-1.000000,0,11.103804,5.275847,453.384047,1.578388,0.0,
3257235,1428218,2,7,40000000.0,11.8,478802.0,610.0,45000000.0,EARNEDINCOME,정규직,...,-0.900969,1.224647e-16,-1.000000,0,11.103804,5.275847,453.384047,1.578388,0.0,
3257236,1428218,32,257,15000000.0,7.2,478802.0,610.0,45000000.0,EARNEDINCOME,정규직,...,-0.900969,1.224647e-16,-1.000000,0,11.103804,5.275847,453.384047,1.578388,0.0,
3257237,1428218,33,110,44000000.0,13.5,478802.0,610.0,45000000.0,EARNEDINCOME,정규직,...,-0.900969,1.224647e-16,-1.000000,0,11.103804,5.275847,453.384047,1.578388,0.0,


# Info

In [39]:
x_train.isna().sum()

application_id    0
bank_id           0
product_id        0
loan_limit        0
loan_rate         0
                 ..
loan_cnt          0
duration          0
visit             0
segment           0
is_applied        0
Length: 61, dtype: int64

In [40]:
x_public.isna().sum()

application_id    0
bank_id           0
product_id        0
loan_limit        0
loan_rate         0
                 ..
loan_cnt          0
duration          0
visit             0
segment           0
is_applied        0
Length: 61, dtype: int64

In [41]:
x_private.isna().sum()

application_id    0
bank_id           0
product_id        0
loan_limit        0
loan_rate         0
                 ..
loan_cnt          0
duration          0
visit             0
segment           0
is_applied        0
Length: 61, dtype: int64

In [42]:
test_data.isna().sum()

application_id          0
bank_id                 0
product_id              0
loan_limit              0
loan_rate               0
                   ...   
loan_cnt                0
duration                0
visit                   0
segment                 0
is_applied        3257239
Length: 61, dtype: int64

# Deployment CSV

In [43]:
x_train.to_csv('../Data/X_train_cluster.csv',index=False)
x_public.to_csv('../Data/X_public_cluster.csv',index=False)
x_private.to_csv('../Data/X_private_cluster.csv',index=False)
test_data.to_csv('../Data/X_test_cluster.csv',index=False)