# <font color='#CC3D3D'> 0.6_4 Undersampling+SMOTE </font>
    
- 딥러닝 수행 전 label 불균형을 해소하기 위해, 데이터의 40%를 label 비율을 유지한 채(stratified) undersampling한 뒤 SMOTE 알고리즘으로 희소 label의 샘플을 늘려 label 균형이 맞춰진 데이터를 생성하는 코드
- 불균형 해소 데이터 <span style="color:blue"> **oversample_train.csv** </span> 생성

# Import

In [None]:
!pip install -q imblearn
print("Done!")

Done!


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from tqdm import tqdm
import os
import pickle
import warnings
import gc

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Seed passed to both the stratified split and SMOTE further down.
RANDOM_STATE = 4158
# Number of CSV rows read per chunk by loaddata.
CHUNKSIZE = 50000

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# NOTE(review): no visible cell defines PATH, so a fresh kernel raised
# NameError on the open() below. Fall back to the directory the CSV files are
# read from; this assumes dtypes.pkl lives next to them -- confirm.
if 'PATH' not in globals():
    PATH = "../Data"

# dtypes.pkl holds the mapping that is later passed as `dtype` to loaddata.
# pickle.load can execute arbitrary code; only load files you trust.
with open(f"{PATH}/dtypes.pkl", 'rb') as f:
    dtypes = pickle.load(f)

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
def loaddata(fname:str, chunksize:int, dtype:dict=None, columns:list=None):
    """Read a CSV file chunk by chunk and return it as one DataFrame.

    Parameters
    ----------
    fname : str
        Path of the CSV file to read.
    chunksize : int
        Number of rows per chunk handed to ``pd.read_csv``.
    dtype : dict, optional
        Forwarded to ``pd.read_csv`` as ``dtype``.
    columns : list, optional
        Subset of columns to read, forwarded to ``pd.read_csv`` as
        ``usecols``. ``None`` (the default) reads every column.

    Returns
    -------
    pd.DataFrame
        All chunks concatenated row-wise; an empty DataFrame when the reader
        yields no chunk.
    """
    reader = pd.read_csv(
        fname,
        engine='python',
        low_memory=True,
        chunksize=chunksize,
        dtype=dtype,
        usecols=columns,
    )

    # Collect the chunks and concatenate once: re-concatenating the growing
    # frame on every iteration copies all previously read rows each time.
    chunks = list(tqdm(reader))

    # pd.concat raises on an empty list, so keep the empty-frame result.
    if not chunks:
        return pd.DataFrame()

    return pd.concat(chunks, axis=0)

# Data Load

In [None]:
# Read master_train_data.csv into `train`, CHUNKSIZE rows at a time, using the
# column dtypes loaded from dtypes.pkl.
train = loaddata("../Data/master_train_data.csv", chunksize=CHUNKSIZE, dtype=dtypes)
# Read master_public_data.csv into `valid` the same way.
valid = loaddata("../Data/master_public_data.csv", chunksize=CHUNKSIZE, dtype=dtypes)

144it [09:18,  3.88s/it]
19it [01:07,  3.53s/it]


# Undersampling & SMOTE

In [None]:
# Keep a stratified 40% sample of `train` (the "test" side of the split), so
# the is_applied class ratio of the sample matches the full data.
# Indexing with [1] instead of unpacking into `_` avoids keeping the unused
# 60% split alive and avoids overwriting IPython's `_` (last output).
under = train_test_split(
    train,
    test_size=0.4,
    shuffle=True,
    random_state=RANDOM_STATE,
    stratify=train['is_applied'],
)[1]

In [None]:
# (rows, columns) of the 40% sample.
under.shape

(2875603, 72)

In [None]:
# Share of each is_applied value in the sample: per-class counts divided by
# the sum of those counts.
under['is_applied'].value_counts().pipe(lambda counts: counts / counts.sum())

False    0.94544
True     0.05456
Name: is_applied, dtype: float64

In [None]:
# Same per-class share for the full training data, to compare with the sample.
train['is_applied'].value_counts().pipe(lambda counts: counts / counts.sum())

False    0.94544
True     0.05456
Name: is_applied, dtype: float64

In [None]:
# Separate the is_applied target from the remaining feature columns.
y_under = under['is_applied']
X_under = under.drop(columns='is_applied')

In [None]:
from sklearn.neighbors import NearestNeighbors

# The `n_jobs` argument of SMOTE was deprecated in imbalanced-learn 0.10 and
# removed in later releases; parallelism is now configured on the
# nearest-neighbours estimator passed as `k_neighbors`.
# n_neighbors=6 reproduces the default k_neighbors=5: given an int, SMOTE
# builds the estimator with one extra neighbour.
smote = SMOTE(
    random_state=RANDOM_STATE,
    k_neighbors=NearestNeighbors(n_neighbors=6, n_jobs=-1),
)
# Oversample the sampled data with SMOTE; the class counts of y_over are
# inspected in a later cell.
X_over, y_over = smote.fit_resample(X_under, y_under)

In [None]:
# (rows, columns) of the feature matrix after SMOTE.
X_over.shape

(5437418, 71)

In [None]:
# Per-class row counts after SMOTE.
y_over.value_counts()

False    2718709
True     2718709
Name: is_applied, dtype: int64

In [None]:
# Put the resampled target back beside the resampled features, column-wise.
master_train_oversample = pd.concat(
    (X_over, y_over),
    axis="columns",
)

In [None]:
# Preview the first rows of the resampled frame.
master_train_oversample.head()

Unnamed: 0,loan_limit,loan_rate,credit_score,yearly_income,income_type,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt,enter_birth,gender,k_lending_rate,us_lending_rate,btc_price,btc_variance,eth_price,eth_variance,lunc_price,lunc_variance,Close,Open,High,Low,Volume,Change,실업자,실업률,현재가계부채CSI,가계부채전망CSI,소비심리지수,loanapply_insert_time_rank,loan_limit_avg_sub,loan_rate_avg_sub,bank_diversity,product_diversity,missing_value_cnt,working_year,working_month,age,holiday,weekday,hour,weekday_sin,weekday_cos,hour_sin,hour_cos,foreign,event,loan_cnt,duration,visit,segment,product_cnt,loan_limit_mean_x,loan_rate_mean_x,loan_limit_min_x,loan_rate_min_x,loan_limit_max_x,loan_rate_max_x,bank_label,loan_limit_min_y,loan_rate_min_y,loan_limit_max_y,loan_rate_max_y,loan_limit_mean_y,loan_rate_mean_y,product_label,is_applied
0,14000000,9.0,710,28000000,0,3,3,40000000,7,False,False,4,67000000,29,True,1.5,1.0,39120756,-0.026001,2640599,-0.080688,0.000322,-0.999512,2550.0,2570.0,2592.0,2546.0,900330000,-0.016296,76,2.300781,102,101,94.0,10,-4161290.0,-1.806641,22,31,0,2,9,31,False,4,11,-0.433838,-0.900879,0.258789,-0.96582,False,10,5,141,1,1,4,16687270.0,10.851562,1000000.0,5.898438,100000000.0,16.296875,2,6000000.0,9.0,20000000.0,10.0,13556320.0,9.054688,1,False
1,9000000,6.601562,580,36000000,1,3,3,3000000,7,False,False,5,78000000,49,False,1.5,0.5,50162048,-0.027496,3790690,-0.0298,81.518517,-0.071899,2716.0,2720.0,2720.0,2702.0,665350000,0.0001,47,1.599609,102,101,91.8125,1,0.0,-0.657227,7,7,2,2,13,51,False,4,4,-0.433838,-0.900879,0.866211,0.5,True,6,2,354,1,4,4,16687270.0,10.851562,1000000.0,5.898438,100000000.0,16.296875,2,2000000.0,6.5,20000000.0,10.0,12126460.0,6.613281,1,False
2,1000000,12.5,780,25000000,0,3,3,40000000,7,False,False,3,237000000,33,False,1.25,0.25,48605860,0.0033,3289621,0.013397,87.549965,0.050507,2714.0,2736.0,2736.0,2702.0,765680000,-0.012199,65,3.099609,103,102,93.8125,1,-8391304.0,1.521484,20,23,2,2,16,35,True,6,22,-0.781738,0.623535,-0.5,0.866211,False,9,3,301,1,6,4,16687270.0,10.851562,1000000.0,5.898438,100000000.0,16.296875,2,1000000.0,5.898438,100000000.0,16.296875,20307260.0,13.828125,0,False
3,40000000,7.898438,990,40000000,5,3,2,270000000,3,False,False,1,55000000,35,True,1.5,1.0,37052276,-0.055389,2468705,-0.083191,6.1e-05,-0.195312,2626.0,2634.0,2644.0,2618.0,870270000,0.0021,65,1.700195,103,100,92.5,7,11333330.0,-2.546875,18,21,2,15,165,50,False,3,14,0.433838,-0.900879,-0.5,-0.866211,False,10,5,503,1,1,7,24648460.0,12.179688,0.0,4.800781,150000000.0,19.90625,2,0.0,4.800781,150000000.0,19.90625,24183660.0,12.664062,0,False
4,6000000,15.796875,910,24000000,0,3,2,10000000,7,False,False,12,111000000,40,False,1.75,1.0,39541500,0.057404,2493979,0.082214,0.000127,0.234253,2670.0,2662.0,2672.0,2654.0,469800000,0.012001,71,2.699219,102,101,93.3125,14,-7500000.0,4.148438,26,36,1,9,100,49,False,1,18,0.781738,0.623535,-1.0,-0.0,False,2,0,900,1,3,2,18613160.0,13.226562,0.0,7.199219,50000000.0,19.90625,2,0.0,13.398438,50000000.0,19.90625,21210150.0,15.804688,0,False


## Deployment

In [None]:
# Write the resampled data to oversample_train.csv; index=False leaves the
# row index out of the file.
master_train_oversample.to_csv("../Data/oversample_train.csv", index=False)