## 1.데이터 생성

In [1]:
from sklearn.datasets import make_classification
import pandas as pd
import numpy as np

# Both synthetic datasets share every generator setting except the number of
# features. The sample count is 20000 / 0.7 / 0.9, i.e. taking 90% and then
# 70% of the rows leaves roughly 20,000 samples.
total_samples = int(20000/ 0.7 / 0.9)
shared_options = dict(
    n_samples=total_samples,
    n_redundant=0,
    n_classes=2,
    weights=[0.9, 0.1],  # about 90% of samples in class 0, 10% in class 1
    random_state=0,
)

# Every feature is informative: 16 features for dataset 1, 8 for dataset 2.
X1, y1 = make_classification(n_features=16, n_informative=16, **shared_options)
X2, y2 = make_classification(n_features=8, n_informative=8, **shared_options)

for array_name, array in (('X1', X1), ('X2', X2)):
    print(f'{array_name} shape = {array.shape}')

X1 shape = (31746, 16)
X2 shape = (31746, 8)


In [2]:
import copy

def contamination(X, y, ratio, weight):
    """Blend sampled abnormal rows into normal rows to simulate noisy normals.

    Rows labelled 0 are treated as normal and rows labelled 1 as abnormal.
    ``int(ratio * n_abnormal)`` normal rows are drawn without replacement and
    the features of each one are replaced by
    ``(1 - weight) * normal_features + weight * abnormal_features``, using an
    equally sized random draw of abnormal rows. Labels are not modified, so
    the blended rows keep label 0.

    Sampling always uses a generator seeded with 0, so repeated calls with the
    same inputs select the same rows. The generator is private to the call:
    NumPy's global random state is left untouched.

    Args:
        X: 2-D feature data (anything ``pd.DataFrame`` accepts).
        y: Labels aligned with the rows of ``X``; 0 = normal, 1 = abnormal.
        ratio: Number of contaminated rows, as a fraction of the abnormal rows.
        weight: Share of the abnormal features in the blend (0 leaves the
            normal row unchanged, 1 copies the abnormal features).

    Returns:
        A tuple ``(num_contamination, result_df, normal_idx)``:
        the number of contaminated rows; a DataFrame holding all normal rows
        followed by all abnormal rows (features plus a trailing ``label``
        column); and the positions of the contaminated rows among the normal
        rows, which are also their positions in ``result_df`` because the
        normal rows come first.

    Raises:
        ValueError: Raised by the sampler when more rows are requested than
            there are normal (or abnormal) rows.
    """
    frame = pd.DataFrame(X)
    frame['label'] = y
    normal = frame.loc[frame['label'] == 0, :].reset_index(drop=True)
    abnormal = frame.loc[frame['label'] == 1, :].reset_index(drop=True)

    num_contamination = int(ratio * len(abnormal))

    # A local RandomState(0) yields the same draws as np.random.seed(0)
    # followed by np.random.choice, without reseeding the global generator.
    rng = np.random.RandomState(0)
    normal_idx = rng.choice(np.arange(len(normal)), size=num_contamination, replace=False)
    abnormal_idx = rng.choice(np.arange(len(abnormal)), size=num_contamination, replace=False)

    # Blend the feature columns only; the last column is the label.
    # The arithmetic already produces new arrays, so no explicit copy is needed.
    blended = (
        normal.iloc[normal_idx, :-1].values * (1 - weight)
        + abnormal.iloc[abnormal_idx, :-1].values * weight
    )
    normal.iloc[normal_idx, :-1] = blended

    result_df = pd.concat([normal, abnormal]).reset_index(drop=True)
    return num_contamination, result_df, normal_idx

In [3]:
import pickle

# # # save
# # with open('data.pickle', 'wb') as f:
# #     pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

# # # load
# # with open('data.pickle', 'rb') as f:
# #     data = pickle.load(f)

In [6]:
# Save the clean version of dataset 1 (features plus a 'label' column).
normal_df1 = pd.DataFrame(X1)
normal_df1['label'] = y1
normal_df1.to_csv('./sim_data/normal1.csv', index=False)

# Tags embedded in the output file names for each contamination setting.
# NOTE(review): '001' / '010' presumably stand for the resulting ~1% / ~10%
# train noise ratio printed below — confirm.
ratio_tags = {0.07: '001', 0.7: '010'}
weight_tags = {0.1: '01', 0.5: '05', 0.9: '09'}

for ratio in [0.07, 0.7]:
    for weight in [0.1, 0.5, 0.9]:

        # Features and labels must come from the same dataset: X1 goes with y1.
        X = X1.copy()
        y = y1.copy()
        # The noise is present in both the train and validation data.
        num_dirty_1, dirty_df_1, choice_idx_1 = contamination(X, y, ratio, weight)
        # Contaminated rows relative to 90% * 70% of all rows.
        train_noise = num_dirty_1  / (X1.shape[0] * 0.9 * 0.7)

        print(f'noise ratio : {ratio}, noise weight : {weight}, train noise ratio = {train_noise:.3f}')

        _ratio = ratio_tags[ratio]
        _weight = weight_tags[weight]

        # Contaminated dataset for this (ratio, weight) setting.
        dirty_df_1.to_csv('./sim_data/df1_' + _ratio + '_' + _weight + '.csv', index=False)

        # Positions of the contaminated rows, so they can be identified later.
        with open('./sim_data/noise_idx1_' + _ratio + '_' + _weight + '.pickle', 'wb') as f:
            pickle.dump(choice_idx_1, f, pickle.HIGHEST_PROTOCOL)

noise ratio : 0.07, noise weight : 0.1, train noise ratio = 0.012
noise ratio : 0.07, noise weight : 0.5, train noise ratio = 0.012
noise ratio : 0.07, noise weight : 0.9, train noise ratio = 0.012
noise ratio : 0.7, noise weight : 0.1, train noise ratio = 0.115
noise ratio : 0.7, noise weight : 0.5, train noise ratio = 0.115
noise ratio : 0.7, noise weight : 0.9, train noise ratio = 0.115


In [5]:
# Save the clean version of dataset 2 (features plus a 'label' column).
normal_df2 = pd.DataFrame(X2)
normal_df2['label'] = y2
normal_df2.to_csv('./sim_data/normal2.csv', index=False)

# Tags embedded in the output file names for each contamination setting.
# NOTE(review): '001' / '010' presumably stand for the resulting ~1% / ~10%
# train noise ratio printed below — confirm.
ratio_tags = {0.07: '001', 0.7: '010'}
weight_tags = {0.1: '01', 0.5: '05', 0.9: '09'}

for ratio in [0.07, 0.7]:
    for weight in [0.1, 0.5, 0.9]:

        # The noise is present in both the train and validation data.
        num_dirty_2, dirty_df_2, choice_idx_2 = contamination(X2, y2, ratio, weight)
        # Contaminated rows relative to 90% * 70% of all rows.
        train_noise = num_dirty_2  / (X2.shape[0] * 0.9 * 0.7)

        print(f'noise ratio : {ratio}, noise weight : {weight}, train noise ratio = {train_noise:.3f}')

        _ratio = ratio_tags[ratio]
        _weight = weight_tags[weight]

        # Write dataset 2's contaminated frame (not dataset 1's) to the df2_* file.
        dirty_df_2.to_csv('./sim_data/df2_' + _ratio + '_' + _weight + '.csv', index=False)

        # Positions of the contaminated rows, so they can be identified later.
        with open('./sim_data/noise_idx2_' + _ratio + '_' + _weight + '.pickle', 'wb') as f:
            pickle.dump(choice_idx_2, f, pickle.HIGHEST_PROTOCOL)

noise ratio : 0.07, noise weight : 0.1, train noise ratio = 0.012
noise ratio : 0.07, noise weight : 0.5, train noise ratio = 0.012
noise ratio : 0.07, noise weight : 0.9, train noise ratio = 0.012
noise ratio : 0.7, noise weight : 0.1, train noise ratio = 0.115
noise ratio : 0.7, noise weight : 0.5, train noise ratio = 0.115
noise ratio : 0.7, noise weight : 0.9, train noise ratio = 0.115
