In [1]:
!export TF_CPP_MIN_LOG_LEVEL=2

# Import Modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy

import os
import tensorflow as tf
from tensorflow import keras
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import tempfile
from pandas.core.common import SettingWithCopyWarning
from sklearn.exceptions import DataConversionWarning
import warnings
warnings.simplefilter(action = 'ignore', category = SettingWithCopyWarning)
warnings.simplefilter(action = 'ignore', category = FutureWarning)
warnings.simplefilter(action = 'ignore', category = DataConversionWarning)

from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA, FastICA
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer


from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import IsolationForest

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm as SVM
from sklearn.ensemble import GradientBoostingClassifier

# Define Functions

### Feature / Label 분리 및 병합

In [5]:
def featureLabelSplit(dataframe, label = 'Pass/Fail'):
  X = dataframe.drop([label],axis = 1)
  y = dataframe[label]
  return (X,y)

def featureLabelMerge(feature_df, label_df, label_index = 'Pass/Fail'):
  feature_df[label_index] = label_df
  return feature_df

def passFailSplit(dataframe):
  pass_data = dataframe[dataframe['Pass/Fail']==-1]
  fail_data = dataframe[dataframe['Pass/Fail']== 1]
  return (pass_data,fail_data)
  
def passFailMerge(pass_df,fail_df):
  return pd.concat([pass_df,fail_df])

### 일정 상관관계 이상의 Feature 추출

In [6]:
def get_high_corr_indicies(dataframe, correaltion_score = 0.05):
  corr = dataframe.corr(method = 'pearson')
  corr_label = abs(corr["Pass/Fail"])
  high_corr_features = corr_label[corr_label > correaltion_score]
  return high_corr_features

### 한가지 값의 Feature 제거

In [7]:
def removeUnique(dataframe, except_index_list = []):
  df_fill = dataframe.fillna(method = 'ffill')
  df_fill = df_fill.fillna(method = 'bfill')
  single_unique_columns = []
  for i in dataframe.columns:
    if ((len(df_fill[i].unique()) == 1)  and  ((except_index_list.index.isin([i])).any() == False)):
      single_unique_columns.append(i)
  removed_df = dataframe.drop(single_unique_columns,1)
  return removed_df

### 일정 비율 이상 Null값을 가지는 Column 제거

In [8]:
def dropNullDominant(dataframe,except_index_list = [],ratio = 0.5):
  null_dominant_columns = []
  for i in dataframe.columns:
    if dataframe[i].isna().sum() > ratio * (dataframe.shape[0])and ((except_index_list.index.isin([i])).any() == False):
      null_dominant_columns.append(i)
  drop_null_dominant_df = dataframe.drop(null_dominant_columns, axis = 1)
  return drop_null_dominant_df

### Feature Scaling

In [9]:
def robustScaler(dataframe):
  RS = RobustScaler()
  rs_data = RS.fit_transform(dataframe)
  rs_df = pd.DataFrame(data = rs_data, columns = dataframe.columns)
  return rs_df

### PowerTransform

In [10]:
def powerTransform(dataframe):
  PT = PowerTransformer(standardize = False,method = 'yeo-johnson')
  pt_data = PT.fit_transform(dataframe)
  pt_df = pd.DataFrame(data = pt_data, columns = dataframe.columns)
  return pt_df

### 평균으로 Missing Value 보간

In [11]:
def imputeNull(dataframe):
  for i in dataframe.columns:
    dataframe[i].fillna(dataframe[i].mean(), inplace = True)
    # ddataframef[i].fillna(0, inplace = True)
  return dataframe

### PCA를 통한 차원 축소

In [12]:
def decomposition(feature_df, method = 'pca', decom_rs = 0):
  # feature_df, label_df = featureLabelSplit(dataframe)
  if method == 'pca':
    pca = PCA(n_components = None)
    principalComponents = pca.fit_transform(feature_df) #### DataFrame
    df = pd.DataFrame(principalComponents)
  elif method == 'ica':
    ica = FastICA(n_components=50,random_state=decom_rs,max_iter = 1000000)
    principalComponents = ica.fit_transform(feature_df)
    df = pd.DataFrame(principalComponents)
  return df

# Main

### Preprocessing

In [13]:
## Load Data
raw_df = pd.read_csv("./uci-secom.csv")
raw_df.drop(['Time'], axis = 1, inplace = True)

## Split Target
feats, label = featureLabelSplit(raw_df)

## 상관관계 0.03이상인 feature 추출
high_corr_features = get_high_corr_indicies(raw_df, 0.03)

## 값이 하나인 피쳐 제거
unique_feats = removeUnique(feats, high_corr_features)

## 결측치 많은 피쳐 제거
less_null_feats = dropNullDominant(unique_feats, high_corr_features, ratio=0.0)

## 피쳐 스케일링
scaled_feats = robustScaler(less_null_feats)

## 피쳐 정규분포화
power_transformed_feats = powerTransform(scaled_feats)

## 전체 평균으로 결측치 대체
mean_df = imputeNull(power_transformed_feats)

## 차원 축소
dec_df = decomposition(mean_df,'pca')

## feature label 합체
target_df = featureLabelMerge(dec_df,label,'Pass/Fail')

## 높은 상관관계(0.039) 가진 애들만 추출
high_corr_features = get_high_corr_indicies(target_df, 0.039)
high_corr_data = target_df[high_corr_features.index]

## neg / pos 비율 계산
raw_dff = raw_df.copy()
raw_dff['Pass/Fail'] = raw_df['Pass/Fail'].replace({-1:0})
pos, neg = np.bincount(raw_dff['Pass/Fail'])
total = neg + pos

### Define Model

In [15]:
## Early Stopping
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc', 
    verbose=0,
    patience=10,
    mode='max',
    restore_best_weights=True)

## Metrics
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
  ]

## Model 선언
def make_model(metrics=METRICS, output_bias=None):
  if output_bias is not None:
    output_bias = tf.keras.initializers.Constant(output_bias)
  model = keras.Sequential([
      keras.layers.Dense(512 , input_shape=(46,), activation= tf.keras.layers.LeakyReLU(alpha=0.5),),
      keras.layers.BatchNormalization(),
      keras.layers.Dense(1, activation='sigmoid',
                         bias_initializer=output_bias),
  ])
  model.compile(
      optimizer=keras.optimizers.Adam(lr=1e-3),
      loss=keras.losses.BinaryCrossentropy(),
      metrics=metrics)

  return model

## Learning parameter
EPOCHS = 300
BATCH_SIZE = 2048

## initial bias
initial_bias = np.log([pos/neg])

### Learning

In [16]:
## 학습 진행
sum = 0
for i in range(1,101): 
  train_df, test_df = train_test_split(high_corr_data, test_size=0.2, random_state = i , shuffle =True, stratify = high_corr_data['Pass/Fail'])
  train_df, val_df = train_test_split(train_df, test_size=0.2, random_state = 1 , shuffle = True, stratify = train_df['Pass/Fail'])
  
  train_features = train_df.drop("Pass/Fail",1) 
  val_features = val_df.drop("Pass/Fail",1)
  test_features = test_df.drop("Pass/Fail",1)
  train_labels = train_df['Pass/Fail'] 
  val_labels = val_df['Pass/Fail'] 
  test_labels = test_df['Pass/Fail']
  train_labels= train_labels.replace({-1:0})
  val_labels= val_labels.replace({-1:0})
  test_labels= test_labels.replace({-1:0})
  
  weight_for_0 = (1 / neg)*(total)/2.0 
  weight_for_1 = (1 / pos)*(total)/2.0

  class_weights = {0: weight_for_0, 1: weight_for_1}
  
  test_model = make_model(output_bias=initial_bias)
  #test_model.load_weights(initial_weights)
  weighted_history = test_model.fit(
      train_features,
      train_labels,
      batch_size=BATCH_SIZE,
      epochs=EPOCHS,
      callbacks=[early_stopping],
      validation_data=(val_features, val_labels),
      class_weight=class_weights,
      verbose = 0)
  train_predictions = test_model.predict(train_features, batch_size=BATCH_SIZE)
  test_predictions = test_model.predict(test_features, batch_size=BATCH_SIZE)
  results = test_model.evaluate(test_features, test_labels,batch_size=BATCH_SIZE, verbose=0)
  sum += results[-1]
  print("{:2}\tauc = {:.4f}\t----avg auc ={:.4f}".format(i, results[-1],sum/i))
print(sum/100)

 1	auc = 0.8401	----avg auc =0.8401
 2	auc = 0.9185	----avg auc =0.8793
 3	auc = 0.8276	----avg auc =0.8621
 4	auc = 0.8176	----avg auc =0.8509
 5	auc = 0.8181	----avg auc =0.8444
 6	auc = 0.7193	----avg auc =0.8235
 7	auc = 0.8292	----avg auc =0.8243


KeyboardInterrupt: ignored