In [1]:
import pandas as pd
import numpy  as np
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix


from sklearn.linear_model import LogisticRegression

In [3]:
# 평가함수
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)    # 오차행렬
    accuracy = accuracy_score(y_test , pred)      # 정확도
    precision = precision_score(y_test , pred)    # 정밀도
    recall = recall_score(y_test , pred)          # 재현율
    f1 = f1_score(y_test,pred)
                    
    print('오차 행렬')
    print(confusion)
    print('정확도: {:.4f}, 정밀도: {:.4f}, 재현율: {:.4f}, F1: {:.4f}'.format(accuracy, precision, recall, f1))

In [4]:
def make_df_warning_area(df) :
    
    df_new = df.copy()
    
    # 경계수준 인덱스
    w1_idx_list = df_new[(df_new['MELT_WEIGHT'] < 21000) & (df_new['MELT_TEMP'] < 600)].index  
    
    # 위험수준 인덱스
    w2_idx_list = df_new[(df_new['MELT_WEIGHT'] > 15000) & (df_new['MELT_WEIGHT'] < 20000) & (df_new['MELT_TEMP'] < 480)].index
    
    # 데이터프레임['warning_area'] 초기화
    df_new['warning_area'] = 0
    
    # 위험군 값 입력
    df_new.loc[w1_idx_list, 'warning_area'] = 0.5
    df_new.loc[w2_idx_list, 'warning_area'] = 2
    
    
    return df_new

In [7]:
df = pd.read_csv('data/melting_tank.csv')
df['TAG'] = df['TAG'].apply(lambda x : 0 if x =='OK' else 1 )

In [8]:
df_std = df.drop(['STD_DT','NUM'], axis=1)

In [9]:
df_added = make_df_warning_area(df_std)

In [10]:
df_added.head()

Unnamed: 0,MELT_TEMP,MOTORSPEED,MELT_WEIGHT,INSP,TAG,warning_area
0,489,116,631,3.19,0,0.5
1,433,78,609,3.19,0,0.5
2,464,154,608,3.19,0,0.5
3,379,212,606,3.19,0,0.5
4,798,1736,604,3.21,0,0.0


## train, test셋 분리

In [11]:
def get_train_test(df, ratio):
    split_date = int(df.shape[0]*ratio)
    train = df_std[:split_date]
    test = df_std[split_date:]
    return train, test

In [40]:
train, test = get_train_test(df_added, 0.5)

## train셋 성능 확인_스모트 사용

In [41]:
X = train.drop('TAG', axis='columns')
y = train['TAG']

# 데이터 표준화 작업
sc = MinMaxScaler()
sc.fit(X)

# 표준화된 데이터셋
X_sc = sc.transform(X)

In [42]:
X_train, X_valid, y_train, y_valid = train_test_split(X_sc, y, test_size=0.3, random_state=0, stratify=y)

In [43]:
# SMOTE 객체 생성
smote = SMOTE(random_state=0)

# 샘플링
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [44]:
log_reg = LogisticRegression()
log_reg.fit(X_train_resampled, y_train_resampled)
predict = log_reg.predict(X_valid)

get_clf_eval(y_valid,predict)

오차 행렬
[[59049 29896]
 [ 8065 28270]]
정확도: 0.6970, 정밀도: 0.4860, 재현율: 0.7780, F1: 0.5983


## test셋의 70%를 기존 모델로 예측

In [45]:
Xt = test.drop('TAG', axis='columns')
yt = test['TAG']

# 데이터 표준화 작업
sc = MinMaxScaler()
sc.fit(Xt)

# 표준화된 데이터셋
Xt_sc = sc.transform(Xt)

In [46]:
Xt_train, Xt_valid, yt_train, yt_valid = train_test_split(Xt_sc, yt, test_size=0.3, random_state=0,stratify=y)

In [48]:
# 기존에 학습된 모델로 test셋 앞0.7 예측
predicted_label = log_reg.predict(Xt_train)

In [49]:
# 예측된 label을 Xt_train와 합침
predicted_np = np.c_[Xt_train,predicted_label]

## 기존 train셋과 예측된 test셋을 합침

In [50]:
train_7_np = np.c_[X_train,y_train]
merged_np= np.r_[train_7_np,predicted_np]

## 합친 데이터로 model 재학습

In [51]:
Xm = merged_np[:,:-1]
ym = merged_np[:,-1]

In [52]:
# SMOTE 객체 생성
smote = SMOTE(random_state=0)

# 샘플링
Xm_train_resampled, ym_train_resampled = smote.fit_resample(Xm, ym)

In [53]:
# 스모트 함 수도레이블링O
improved_log_reg = LogisticRegression()
improved_log_reg.fit(Xm_train_resampled, ym_train_resampled)

LogisticRegression()

### test셋 valid를 기존 모델과 재학습한 모델 결과 비교

In [54]:
#기존 모델
predict = log_reg.predict(Xt_valid)
get_clf_eval(yt_valid,predict)

오차 행렬
[[64037 44538]
 [  452 16253]]
정확도: 0.6409, 정밀도: 0.2674, 재현율: 0.9729, F1: 0.4195


In [55]:
#재학습 모델
predict = improved_log_reg.predict(Xt_valid)
get_clf_eval(yt_valid,predict)

오차 행렬
[[67734 40841]
 [ 1725 14980]]
정확도: 0.6602, 정밀도: 0.2684, 재현율: 0.8967, F1: 0.4131


### train셋 valid를 기존 모델과 재학습한 모델 결과 비교

In [56]:
#기존 모델
predict = log_reg.predict(X_valid)
get_clf_eval(y_valid,predict)

오차 행렬
[[59049 29896]
 [ 8065 28270]]
정확도: 0.6970, 정밀도: 0.4860, 재현율: 0.7780, F1: 0.5983


In [57]:
#재학습 모델
predict = improved_log_reg.predict(X_valid)
get_clf_eval(y_valid,predict)

오차 행렬
[[61737 27208]
 [ 9707 26628]]
정확도: 0.7053, 정밀도: 0.4946, 재현율: 0.7328, F1: 0.5906


## 로지스틱 확률 조정해서 결과 보기

In [58]:
def cut_off(prd_proba, threshold) :
    res = prd_proba.copy()  # 대문자 Y를 새로운 변수로 하여 기존의 y값에 영향이 가지 않도록 한다.
    res = np.delete(res, 0, 1)
    res[res>=threshold] = 1
    res[res<threshold] = 0
    return res.reshape(1,len(res))[0]

def get_result_by_cf(model, threshold, x_val, y_val):
    predict_prb = model.predict_proba(Xt_train)
    cut_off_predict = cut_off(predict_prb,threshold)
    cf_predicted_np= np.c_[Xt_train, cut_off_predict]
    merged_cf_np = np.r_[train_7_np,cf_predicted_np]
    
    X = merged_cf_np[:,:-1]
    y = merged_cf_np[:,-1]
    
    # SMOTE 객체 생성
    smote = SMOTE(random_state=0)

    # 샘플링
    X_train_over, y_train_over = smote.fit_resample(X, y)
    
    cf_log_reg = LogisticRegression()
    cf_log_reg.fit(X_train_over, y_train_over)

    predict = cf_log_reg.predict(x_val)
    get_clf_eval(y_val,predict)

### test셋 valid- 확률에 따른 성능 확인

In [60]:
get_result_by_cf(log_reg, 0.5, Xt_valid, yt_valid)

오차 행렬
[[67734 40841]
 [ 1725 14980]]
정확도: 0.6602, 정밀도: 0.2684, 재현율: 0.8967, F1: 0.4131


In [63]:
get_result_by_cf(log_reg, 0.45, Xt_valid, yt_valid)

오차 행렬
[[64506 44069]
 [  708 15997]]
정확도: 0.6426, 정밀도: 0.2663, 재현율: 0.9576, F1: 0.4167


In [62]:
get_result_by_cf(log_reg, 0.40, Xt_valid, yt_valid)

오차 행렬
[[57460 51115]
 [   50 16655]]
정확도: 0.5916, 정밀도: 0.2458, 재현율: 0.9970, F1: 0.3943


### train셋 valid- 확률에 따른 성능 확인

In [65]:
get_result_by_cf(log_reg, 0.5, X_valid, y_valid)

오차 행렬
[[61737 27208]
 [ 9707 26628]]
정확도: 0.7053, 정밀도: 0.4946, 재현율: 0.7328, F1: 0.5906


In [67]:
get_result_by_cf(log_reg, 0.45, X_valid, y_valid)

오차 행렬
[[59239 29706]
 [ 8352 27983]]
정확도: 0.6962, 정밀도: 0.4851, 재현율: 0.7701, F1: 0.5952


In [66]:
get_result_by_cf(log_reg, 0.4, X_valid, y_valid)

오차 행렬
[[53779 35166]
 [ 6465 29870]]
정확도: 0.6677, 정밀도: 0.4593, 재현율: 0.8221, F1: 0.5893
