# 특성공학 종합 실습 (분류)

In [1]:
import pandas as pd

In [2]:
df1 = pd.read_csv('preprocessing_data.csv')

In [3]:
df1['target_binom'].value_counts()

target_binom
0.0    688
1.0     64
Name: count, dtype: int64

In [4]:
Y = df1['target_binom']
X = df1[['Vapor', 'resist_target', 'Line_CD', 'UV_type', 'Etching_rate',
         'thickness', 'Temp_Etching']]

In [29]:
X

Unnamed: 0,Vapor,resist_target,Line_CD,UV_type,Etching_rate,thickness,Temp_Etching
0,O2,1.211940,30.959,H,2.75950,699.443,70.878
1,O2,0.887720,29.653,H,2.72775,696.792,69.561
2,O2,1.113156,28.063,I,2.67000,705.471,70.968
3,O2,0.882195,31.556,I,2.74825,710.772,70.146
4,O2,0.834001,31.969,I,2.74625,716.975,71.174
...,...,...,...,...,...,...,...
747,H2O,0.923802,35.404,I,2.67450,708.586,70.859
748,H2O,0.837348,31.011,I,2.72725,712.936,71.294
749,H2O,0.859869,32.525,I,2.72275,715.498,71.550
750,H2O,0.914315,28.001,I,2.69150,707.179,70.718


In [5]:
# Imbalanced Data Sampling 수행 라이브러리
!pip install --user imblearn



In [6]:
# 학습 데이터와 검증데이터 분할
from sklearn.model_selection import train_test_split
# 특성공학 + 학습 Pipe 구성
# 연속형 데이터와 범주형 데이터를 따로 처리하는 Pipe
from sklearn.compose import make_column_transformer
# 각각의 처리기법을 파이프 형태로 구성
from imblearn.pipeline import make_pipeline
# 연속형 데이터 전처리
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
# 범주형 데이터 전처리
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
# Imbalanced Data Sampling
from imblearn.over_sampling import SMOTE
# 알고리즘으로 학습
from sklearn.tree import DecisionTreeClassifier
# 교차검증 + HyperParameter Tuning
from sklearn.model_selection import GridSearchCV
# 평가 지표
from sklearn.metrics import classification_report

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 1234)

In [8]:
numeric_list = ['resist_target', 'Line_CD', 'Etching_rate', 'thickness', 'Temp_Etching']
category_list = ['Vapor', 'UV_type']

In [9]:
numeric_pipe = make_pipeline(KNNImputer(), MinMaxScaler())
category_pipe = make_pipeline(SimpleImputer(strategy = 'most_frequent'), OneHotEncoder())

In [10]:
preprocessing_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                             (category_pipe, category_list))

In [11]:
pipe_model = make_pipeline(preprocessing_pipe, SMOTE(), DecisionTreeClassifier())
pipe_model

In [12]:
hyperparameter = {'decisiontreeclassifier__max_depth' : range(5, 11),
                  'decisiontreeclassifier__min_samples_split' : range(5, 11),
                  'decisiontreeclassifier__min_samples_leaf' : range(5, 11)}
grid_model = GridSearchCV(pipe_model, param_grid = hyperparameter, cv = 3,
                          n_jobs = -1, scoring = 'f1')
grid_model.fit(X_train, Y_train)

In [13]:
best_model = grid_model.best_estimator_

In [14]:
def evaluation_cla(model):
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)
    print('학습 능력 평가 : ')
    print(classification_report(Y_train, Y_train_pred))
    print('일반화 능력 평가 : ')
    print(classification_report(Y_test, Y_test_pred))

In [15]:
evaluation_cla(best_model)

학습 능력 평가 : 
              precision    recall  f1-score   support

         0.0       0.98      0.84      0.91       516
         1.0       0.33      0.81      0.46        48

    accuracy                           0.84       564
   macro avg       0.65      0.83      0.69       564
weighted avg       0.92      0.84      0.87       564

일반화 능력 평가 : 
              precision    recall  f1-score   support

         0.0       0.94      0.78      0.85       172
         1.0       0.16      0.44      0.23        16

    accuracy                           0.76       188
   macro avg       0.55      0.61      0.54       188
weighted avg       0.87      0.76      0.80       188



**모델을 파일 형태로 저장**

In [16]:
import pickle

In [17]:
pickle.dump(best_model, open('model.sav', 'wb'))

In [26]:
x1 = input('Vapor 값을 입력하시오 : ')
x2 = input('resist_target 값을 입력하시오 : ')
x3 = input('Line_CD 값을 입력하시오 : ')
x4 = input('UV_type 값을 입력하시오 : ')
x5 = input('Etching_rate 값을 입력하시오 : ')
x6 = input('thickness 값을 입력하시오 : ')
x7 = input('Temp_Etching 값을 입력하시오 : ')

Vapor 값을 입력하시오 :  O2
resist_target 값을 입력하시오 :  1.5
Line_CD 값을 입력하시오 :  32
UV_type 값을 입력하시오 :  H
Etching_rate 값을 입력하시오 :  23
thickness 값을 입력하시오 :  520
Temp_Etching 값을 입력하시오 :  72


In [27]:
X_list = ['Vapor', 'resist_target', 'Line_CD', 'UV_type', 'Etching_rate',
          'thickness', 'Temp_Etching']
input_data = pd.DataFrame([[x1, x2, x3, x4, x5, x6, x7]], columns = X_list)
input_data

Unnamed: 0,Vapor,resist_target,Line_CD,UV_type,Etching_rate,thickness,Temp_Etching
0,O2,1.5,32,H,23,520,72


In [20]:
# 파일형태로 저장한 모델을 새로운 스크립트에 불러오기
model = pickle.load(open('model.sav', 'rb'))

In [28]:
model.predict(input_data)

array([1.])