In [1]:
import pandas as pd

In [3]:
df1 = pd.read_csv('preprocessing_data.csv')
df1.head()

Unnamed: 0.1,Unnamed: 0,Ox_Chamber,process,type,Temp_OXid,Vapor,ppm,Pressure,Oxid_time,thickness,...,Flux480s,Flux840s,input_Energy,Temp_implantation,Furance_Temp,RTA_Temp,Target,Error_message,target_binom,Chamber_Route
0,0,1,Oxidation,dry,1138.979159,O2,32.8,0.2,62.0,699.443,...,3.002593e+17,6.000007e+17,31574.41,102.847,885.0,154,96,none,0.0,route_11133
1,1,1,Oxidation,dry,1218.184551,O2,31.86,0.194,137.0,696.792,...,3.017903e+17,6.000012e+17,31580.213,104.323,919.0,154,102,none,0.0,route_11222
2,2,1,Oxidation,dry,1062.467808,O2,39.51,0.217,128.0,705.471,...,2.994231e+17,6.000002e+17,32162.414,100.605,916.0,155,95,none,0.0,route_11311
3,3,1,Oxidation,dry,1114.704773,O2,32.88,0.201,90.0,710.772,...,2.991354e+17,6.000003e+17,32874.925,101.739,911.0,156,117,none,0.0,route_12111
4,4,1,Oxidation,dry,989.411946,O2,38.11,0.204,98.0,716.975,...,3.005576e+17,6.000013e+17,30985.928,106.422,872.0,155,143,none,0.0,route_12222


In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 752 entries, 0 to 751
Data columns (total 60 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           752 non-null    int64  
 1   Ox_Chamber           752 non-null    int64  
 2   process              752 non-null    object 
 3   type                 752 non-null    object 
 4   Temp_OXid            752 non-null    float64
 5   Vapor                752 non-null    object 
 6   ppm                  752 non-null    float64
 7   Pressure             752 non-null    float64
 8   Oxid_time            752 non-null    float64
 9   thickness            752 non-null    float64
 10  No_Die               752 non-null    object 
 11  Reinforcement        752 non-null    bool   
 12  Unnamed: 0_x         752 non-null    int64  
 13  photo_soft_Chamber   752 non-null    int64  
 14  process 2            752 non-null    object 
 15  resist_target        752 non-null    flo

In [5]:
Y = df1['Target'] # Wafer 내 불량 Chip 개수
X = df1[['Temp_OXid', 'Oxid_time', 'thickness', 'Line_CD',
         'Etching_rate', 'Temp_Etching']]

In [6]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 1234)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(564, 6)
(188, 6)
(564,)
(188,)


In [10]:
from sklearn.linear_model import LinearRegression

In [11]:
model = LinearRegression()
model.fit(X_train, Y_train)

In [12]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [13]:
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

In [14]:
r2_score(Y_train, Y_train_pred)

0.34582379022991816

In [15]:
r2_score(Y_test, Y_test_pred)

0.29396801142541706

In [16]:
mean_squared_error(Y_train, Y_train_pred)

np.float64(2684.9553213362337)

In [17]:
mean_squared_error(Y_test, Y_test_pred)

np.float64(2089.478577922479)

In [25]:
def evaluation_reg(model):
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)
    print("학습 데이터 결정계수 :", r2_score(Y_train, Y_train_pred))
    print("검증 데이터 결정계수 :", r2_score(Y_test, Y_test_pred))
    print("학습 데이터 MSE :", mean_squared_error(Y_train, Y_train_pred))
    print("검증 데이터 MSE :", mean_squared_error(Y_test, Y_test_pred))
    print("학습 데이터 MAE :", mean_absolute_error(Y_train, Y_train_pred))
    print("검증 데이터 MAE :", mean_absolute_error(Y_test, Y_test_pred))

In [26]:
evaluation_reg(model)

학습 데이터 결정계수 : 0.34582379022991816
검증 데이터 결정계수 : 0.29396801142541706
학습 데이터 MSE : 2684.9553213362337
검증 데이터 MSE : 2089.478577922479
학습 데이터 MAE : 37.2300116180249
검증 데이터 MAE : 34.71724975119037


# 특성공학 기법 적용
- Pipe Line : 학습과 특성공학 기법을 동시에 사용하여 데이터를 한꺼번에 처리하면서 학습을 수행하는 함수

In [27]:
from sklearn.pipeline import Pipeline # 학습 + 특성공학
from sklearn.impute import SimpleImputer # 결측값 처리 함수

In [42]:
pipe_list = [('imputer', SimpleImputer()),
             ('model', LinearRegression())]
pipe_model = Pipeline(pipe_list)
pipe_model

In [43]:
pipe_model.fit(X_train, Y_train)

In [44]:
evaluation_reg(pipe_model)

학습 데이터 결정계수 : 0.34582379022991816
검증 데이터 결정계수 : 0.29396801142541706
학습 데이터 MSE : 2684.9553213362337
검증 데이터 MSE : 2089.478577922479
학습 데이터 MAE : 37.2300116180249
검증 데이터 MAE : 34.71724975119037


**새로운 값을 직접 입력하여 예측**

In [45]:
X

Unnamed: 0,Temp_OXid,Oxid_time,thickness,Line_CD,Etching_rate,Temp_Etching
0,1138.979159,62.0,699.443,30.959,2.75950,70.878
1,1218.184551,137.0,696.792,29.653,2.72775,69.561
2,1062.467808,128.0,705.471,28.063,2.67000,70.968
3,1114.704773,90.0,710.772,31.556,2.74825,70.146
4,989.411946,98.0,716.975,31.969,2.74625,71.174
...,...,...,...,...,...,...
747,1280.687973,21.0,708.586,35.404,2.67450,70.859
748,1275.153349,22.0,712.936,31.011,2.72725,71.294
749,1275.182502,21.0,715.498,32.525,2.72275,71.550
750,1268.105427,22.0,707.179,28.001,2.69150,70.718


In [46]:
x1 = input('Temp_Oxid을 입력하시오 : ')
x2 = input('Oxid_time을 입력하시오 : ')
x3 = input('thickness를 입력하시오 : ')
x4 = input('Line_CD를 입력하시오 : ')
x5 = input('Etching_rate를 입력하시오 : ')
x6 = input('Temp_Etching을 입력하시오 : ')

Temp_Oxid을 입력하시오 :  100
Oxid_time을 입력하시오 :  200
thickness를 입력하시오 :  50
Line_CD를 입력하시오 :  50
Etching_rate를 입력하시오 :  30
Temp_Etching을 입력하시오 :  40


In [49]:
input_new_data = pd.DataFrame([[x1, x2, x3, x4, x5, x6]], columns = X.columns.tolist())

In [50]:
pipe_model.predict(input_new_data)

array([32248.38608327])

**결측값이 있어도, Pipe Line 함수로 인해 오류 없이 예측값이 계산되어 출력**

In [51]:
import numpy as np

In [52]:
input_new_data2 = pd.DataFrame([[x1, np.nan, x3, x4, x5, x6]], columns = X.columns.tolist())
input_new_data2

Unnamed: 0,Temp_OXid,Oxid_time,thickness,Line_CD,Etching_rate,Temp_Etching
0,100,,50,50,30,40


In [53]:
pipe_model.predict(input_new_data2)

array([32241.76629099])

**Scaling & Encoding**
- Scaling -> 연속형
- Encoding -> 범주형
- Imputation -> 연속 / 범주

In [54]:
df1.columns

Index(['Unnamed: 0', 'Ox_Chamber', 'process', 'type', 'Temp_OXid', 'Vapor',
       'ppm', 'Pressure', 'Oxid_time', 'thickness', 'No_Die', 'Reinforcement',
       'Unnamed: 0_x', 'photo_soft_Chamber', 'process 2', 'resist_target',
       'N2_HMDS', 'pressure_HMDS', 'temp_HMDS', 'temp_HMDS_bake',
       'time_HMDS_bake', 'spin1', 'spin2', 'spin3', 'photoresist_bake',
       'temp_softbake', 'time_softbake', 'lithography_Chamber', 'Line_CD',
       'UV_type', 'Wavelength', 'Resolution', 'Energy_Exposure', 'Range_check',
       'Unnamed: 0_y', 'Etching_Chamber', 'Process 3', 'Temp_Etching',
       'Source_Power', 'Selectivity', 'Thin Film 4', 'Thin Film 3',
       'Thin Film 2', 'Thin Film 1', 'Etching_rate', 'Chamber_Num', 'process4',
       'Flux60s', 'Flux90s', 'Flux160s', 'Flux480s', 'Flux840s',
       'input_Energy', 'Temp_implantation', 'Furance_Temp', 'RTA_Temp',
       'Target', 'Error_message', 'target_binom', 'Chamber_Route'],
      dtype='object')

In [55]:
Y = df1['Target']
X = df1[['Oxid_time', 'thickness', 'Vapor', 'resist_target', 'Line_CD',
         'Etching_rate', 'UV_type']]

In [56]:
X

Unnamed: 0,Oxid_time,thickness,Vapor,resist_target,Line_CD,Etching_rate,UV_type
0,62.0,699.443,O2,1.211940,30.959,2.75950,H
1,137.0,696.792,O2,0.887720,29.653,2.72775,H
2,128.0,705.471,O2,1.113156,28.063,2.67000,I
3,90.0,710.772,O2,0.882195,31.556,2.74825,I
4,98.0,716.975,O2,0.834001,31.969,2.74625,I
...,...,...,...,...,...,...,...
747,21.0,708.586,H2O,0.923802,35.404,2.67450,I
748,22.0,712.936,H2O,0.837348,31.011,2.72725,I
749,21.0,715.498,H2O,0.859869,32.525,2.72275,I
750,22.0,707.179,H2O,0.914315,28.001,2.69150,I


In [57]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 1234)

In [58]:
# 문자데이터와 숫자데이터를 구분하여 처리하는 프로세스를 구축할 때 사용하는 함수
from sklearn.compose import make_column_transformer
# 특성공학 + 학습
from sklearn.pipeline import make_pipeline
# Scaling & Encoding
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [59]:
# 연속형 데이터 이름을 리스트로 선언
numeric_list = ['Oxid_time', 'thickness', 'resist_target', 'Line_CD', 'Etching_rate']
category_list = ['Vapor', 'UV_type']

In [61]:
# 연속형 데이터가 처리되는 Process
# 결측값을 평균으로 대치한 다음, Scale을 최소 0 / 최대 1로 변환
numeric_pipe = make_pipeline(SimpleImputer(strategy = 'mean'), MinMaxScaler())
numeric_pipe

In [63]:
# 범주형 데이터가 처리되는 Process
# 결측값을 최빈값으로 대치한 다음, 문자데이터를 OneHot 인코딩 변환
category_pipe = make_pipeline(SimpleImputer(strategy = 'most_frequent'), OneHotEncoder())
category_pipe

In [67]:
# 연속형과 범주형을 나누어 처리하는 Pipe를 구성
preprocessing_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                             (category_pipe, category_list))
preprocessing_pipe

In [65]:
pipe_model = make_pipeline(preprocessing_pipe, LinearRegression())
pipe_model

In [66]:
pipe_model.fit(X_train, Y_train)

In [68]:
evaluation_reg(pipe_model)

학습 데이터 결정계수 : 0.3804084568229765
검증 데이터 결정계수 : 0.2871965092882167
학습 데이터 MSE : 2543.0084219858154
검증 데이터 MSE : 2109.5186170212764
학습 데이터 MAE : 36.37145390070922
검증 데이터 MAE : 35.170212765957444


**교차 검증 Cross Validation (CV)**

In [69]:
from sklearn.model_selection import GridSearchCV

In [71]:
# 5회의 교차검증을 실시하며 학습을 수행 (Pipe Model의 Process를 따라가며 학습)
grid_model = GridSearchCV(pipe_model, param_grid = {}, cv = 5)
grid_model.fit(X_train, Y_train)

In [75]:
# 교차검증 이후 가장 성능이 좋은 Model 선택
best_model = grid_model.best_estimator_

In [79]:
evaluation_reg(best_model)

학습 데이터 결정계수 : 0.3804084568229765
검증 데이터 결정계수 : 0.2871965092882167
학습 데이터 MSE : 2543.0084219858154
검증 데이터 MSE : 2109.5186170212764
학습 데이터 MAE : 36.37145390070922
검증 데이터 MAE : 35.170212765957444


**Hyper Parameter Tuning**

In [81]:
# 의사결정나무 알고리즘
from sklearn.tree import DecisionTreeRegressor

In [82]:
# 연속형과 범주형을 나누어 처리하는 Pipe를 구성
preprocessing_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                             (category_pipe, category_list))
pipe_model = make_pipeline(preprocessing_pipe, DecisionTreeRegressor())

In [84]:
# depth : 5 ~ 10 6개 / split : 1 ~ 10 10개 / leaf : 5 ~ 10 6개 -> 360가지 경우의 수
hyperparameter = {'decisiontreeregressor__max_depth' : range(5, 11),
                 'decisiontreeregressor__min_samples_split' : range(1, 11),
                 'decisiontreeregressor__min_samples_leaf' : range(5, 11)}

grid_model = GridSearchCV(pipe_model, param_grid = hyperparameter, cv = 5)
grid_model.fit(X_train, Y_train)

180 fits failed out of a total of 1800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\magne\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\magne\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\magne\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **la

In [85]:
# 가장 적절한 모델을 선택
best_model = grid_model.best_estimator_
best_model

In [86]:
evaluation_reg(best_model)

학습 데이터 결정계수 : 0.6071143811638606
검증 데이터 결정계수 : -0.14097311763387732
학습 데이터 MSE : 1612.532399093697
검증 데이터 MSE : 3376.6726237072885
학습 데이터 MAE : 30.07200540661487
검증 데이터 MAE : 40.95643622461593
