# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [2]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

### 데이터 읽어오기


In [3]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
train_data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000,,,...,117,,,1,,,0,,,Normal


In [4]:
# Drop columns with more than half of the values missing
drop_cols = []
for column in train_data.columns:
    if (train_data[column].notnull().sum() // 2) < train_data[
        column
    ].isnull().sum():
        drop_cols.append(column)
train_data = train_data.drop(drop_cols, axis=1)

train_data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION ? Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240,2.5,-90,...,50.0,91.8,270,50,114.612,19.9,7,127,1,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240,2.5,-90,...,91.8,270.0,50,85,19.600,7.0,185,1,0,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000,12.5,90,...,50.0,91.8,270,50,114.612,19.8,10,73,1,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000,12.5,90,...,91.8,270.0,50,85,19.900,12.0,268,1,0,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240,2.5,-90,...,91.8,270.0,50,85,19.700,8.0,121,1,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240,2.5,-90,...,91.8,270.0,50,85,19.200,1.0,318,1,0,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000,12.5,90,...,50.0,91.8,270,50,114.612,20.5,14,197,1,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240,2.5,-90,...,50.0,91.8,270,50,85.000,19.7,1,27,1,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000,12.5,90,...,91.8,270.0,50,85,20.100,13.0,117,1,0,Normal


In [5]:
# 각 열의 NaN 값 개수 계산
nan_counts = train_data.isnull().sum()

# NaN 값이 있는 열들만 필터링
nan_columns = nan_counts[nan_counts > 0]

nan_columns

HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam      12766
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1    12766
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2    12766
dtype: int64

In [6]:
# 각 열의 NaN 값이 있는 행을 표시
nan_dam = train_data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].isnull()
nan_fill1 = train_data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].isnull()
nan_fill2 = train_data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].isnull()

# 세 열의 NaN 값이 모두 같은 행에서 발생했는지 확인
nan_same_rows = nan_dam & nan_fill1 & nan_fill2

# NaN 값이 동일한 행의 개수 확인
num_same_nan_rows = nan_same_rows.sum()

# 결과 출력
print(f"세 열에서 NaN 값이 동일한 행의 개수: {num_same_nan_rows}")
print(f"세 열에서 NaN 값이 동일한 행:\n{train_data[nan_same_rows]}")

세 열에서 NaN 값이 동일한 행의 개수: 12766
세 열에서 NaN 값이 동일한 행:
      Wip Line_Dam Process Desc._Dam     Equipment_Dam Model.Suffix_Dam  \
0          IVI-OB6     Dam Dispenser  Dam dispenser #1      AJX75334505   
12         IVI-OB6     Dam Dispenser  Dam dispenser #2      AJX75334501   
13         IVI-OB6     Dam Dispenser  Dam dispenser #2      AJX75334501   
18         IVI-OB6     Dam Dispenser  Dam dispenser #1      AJX75334501   
24         IVI-OB6     Dam Dispenser  Dam dispenser #1      AJX75334507   
...            ...               ...               ...              ...   
40485      IVI-OB6     Dam Dispenser  Dam dispenser #1      AJX75334501   
40491      IVI-OB6     Dam Dispenser  Dam dispenser #2      AJX75334501   
40492      IVI-OB6     Dam Dispenser  Dam dispenser #2      AJX75334501   
40500      IVI-OB6     Dam Dispenser  Dam dispenser #2      AJX75334501   
40502      IVI-OB6     Dam Dispenser  Dam dispenser #2      AJX75334501   

       Workorder_Dam  Insp. Seq No._Dam Insp Judg

In [7]:
# 열 목록을 정의
required_columns = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2',
    'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
    'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave'
]

# 존재하는 열들만 필터링
existing_columns = [col for col in required_columns if col in train_data.columns]

# 존재하는 열들 출력
print("존재하는 열들:", existing_columns)

존재하는 열들: []


In [8]:
# Model.Suffix의 값이 공정에 따라 같은지 다른지 비교할 열 리스트
suffix_columns = [
    'Model.Suffix_Dam',
    'Model.Suffix_Fill1',
    'Model.Suffix_Fill2',
    'Model.Suffix_AutoClave'
]

# 각 행에 대해 모든 비교 열의 값이 동일한지 확인
all_equal = train_data[suffix_columns].nunique(axis=1) == 1

# 동일한 값을 가진 행만 필터링하여 네 개의 열만 선택
equal_rows = train_data.loc[all_equal, suffix_columns]

# 동일한 값을 가진 행의 개수와 결과 출력
print(f"동일한 값을 가진 행의 개수: {len(equal_rows)}")
print(equal_rows)

동일한 값을 가진 행의 개수: 40506
      Model.Suffix_Dam Model.Suffix_Fill1 Model.Suffix_Fill2  \
0          AJX75334505        AJX75334505        AJX75334505   
1          AJX75334505        AJX75334505        AJX75334505   
2          AJX75334501        AJX75334501        AJX75334501   
3          AJX75334501        AJX75334501        AJX75334501   
4          AJX75334501        AJX75334501        AJX75334501   
...                ...                ...                ...   
40501      AJX75334501        AJX75334501        AJX75334501   
40502      AJX75334501        AJX75334501        AJX75334501   
40503      AJX75334501        AJX75334501        AJX75334501   
40504      AJX75334501        AJX75334501        AJX75334501   
40505      AJX75334501        AJX75334501        AJX75334501   

      Model.Suffix_AutoClave  
0                AJX75334505  
1                AJX75334505  
2                AJX75334501  
3                AJX75334501  
4                AJX75334501  
...                   

In [9]:
# Workorder의 값이 공정에 따라 같은지 다른지 비교할 열 리스트
suffix_columns = [
    'Workorder_Dam',
    'Workorder_Fill1',
    'Workorder_Fill2',
    'Workorder_AutoClave'
]

# 각 행에 대해 모든 비교 열의 값이 동일한지 확인
all_equal = train_data[suffix_columns].nunique(axis=1) == 1

# 동일한 값을 가진 행만 필터링하여 네 개의 열만 선택
equal_rows = train_data.loc[all_equal, suffix_columns]

# 동일한 값을 가진 행의 개수와 결과 출력
print(f"동일한 값을 가진 행의 개수: {len(equal_rows)}")
print(equal_rows)

동일한 값을 가진 행의 개수: 40506
      Workorder_Dam Workorder_Fill1 Workorder_Fill2 Workorder_AutoClave
0        4F1XA938-1      4F1XA938-1      4F1XA938-1          4F1XA938-1
1        3KPM0016-2      3KPM0016-2      3KPM0016-2          3KPM0016-2
2        4E1X9167-1      4E1X9167-1      4E1X9167-1          4E1X9167-1
3        3K1X0057-1      3K1X0057-1      3K1X0057-1          3K1X0057-1
4        3HPM0007-1      3HPM0007-1      3HPM0007-1          3HPM0007-1
...             ...             ...             ...                 ...
40501    3J1XF434-2      3J1XF434-2      3J1XF434-2          3J1XF434-2
40502    4E1XC796-1      4E1XC796-1      4E1XC796-1          4E1XC796-1
40503    4C1XD438-1      4C1XD438-1      4C1XD438-1          4C1XD438-1
40504    3I1XA258-1      3I1XA258-1      3I1XA258-1          3I1XA258-1
40505    3G1XA501-1      3G1XA501-1      3G1XA501-1          3G1XA501-1

[40506 rows x 4 columns]


In [10]:
# 삭제할 열 리스트
columns_to_drop = [
        'Model.Suffix_Fill1', 'Model.Suffix_Fill2', 'Model.Suffix_AutoClave', 
        'Workorder_Fill1', 'Workorder_Fill2', 'Workorder_AutoClave'
]

# 열 삭제
train_data = train_data.drop(columns=columns_to_drop)

# 열 이름 변경
train_data = train_data.rename(columns={
    'Model.Suffix_Dam': 'Model.Suffix',
    'Workorder_Dam': 'Workorder'
})

train_data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix,Workorder,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION ? Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240,2.5,-90,...,50.0,91.8,270,50,114.612,19.9,7,127,1,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240,2.5,-90,...,91.8,270.0,50,85,19.600,7.0,185,1,0,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000,12.5,90,...,50.0,91.8,270,50,114.612,19.8,10,73,1,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000,12.5,90,...,91.8,270.0,50,85,19.900,12.0,268,1,0,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240,2.5,-90,...,91.8,270.0,50,85,19.700,8.0,121,1,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240,2.5,-90,...,91.8,270.0,50,85,19.200,1.0,318,1,0,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000,12.5,90,...,50.0,91.8,270,50,114.612,20.5,14,197,1,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240,2.5,-90,...,50.0,91.8,270,50,85.000,19.7,1,27,1,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000,12.5,90,...,91.8,270.0,50,85,20.100,13.0,117,1,0,Normal


In [11]:
# 모든 행이 동일한 값을 가지는 열을 찾고, 그 값을 함께 저장
constant_columns = {col: train_data[col].iloc[0] for col in train_data.columns if train_data[col].nunique() == 1}

# 데이터 프레임에서 해당 열 삭제
train_data = train_data.drop(columns=constant_columns.keys())

# 삭제된 열의 개수 출력
print(f"삭제된 열의 개수: {len(constant_columns)}")

# 삭제된 열과 그 값을 한 줄에 하나씩 출력
print("삭제된 열과 값:")
for col, value in constant_columns.items():
    print(f"{col}: {value}")

train_data

삭제된 열의 개수: 35
삭제된 열과 값:
Wip Line_Dam: IVI-OB6
Process Desc._Dam: Dam Dispenser
Insp. Seq No._Dam: 1
Insp Judge Code_Dam: OK
CURE STANDBY POSITION X Collect Result_Dam: 1150
CURE STANDBY POSITION Z Collect Result_Dam: 33.5
CURE STANDBY POSITION ? Collect Result_Dam: 0
CURE START POSITION Z Collect Result_Dam: 33.5
Wip Line_AutoClave: IVI-OB6
Process Desc._AutoClave: Auto Clave Out
Equipment_AutoClave: Auto Clave Out
Insp. Seq No._AutoClave: 1
Insp Judge Code_AutoClave: OK
1st Pressure Judge Value_AutoClave: OK
2nd Pressure Judge Value_AutoClave: OK
3rd Pressure Judge Value_AutoClave: OK
Wip Line_Fill1: IVI-OB6
Process Desc._Fill1: Fill1 Dispenser
Insp. Seq No._Fill1: 1
Insp Judge Code_Fill1: OK
Wip Line_Fill2: IVI-OB6
Process Desc._Fill2: Fill2 Dispenser
Insp. Seq No._Fill2: 1
Insp Judge Code_Fill2: OK
CURE END POSITION ? Collect Result_Fill2: -90
CURE STANDBY POSITION X Collect Result_Fill2: 1020
CURE STANDBY POSITION ? Collect Result_Fill2: 0
CURE START POSITION ? Collect Result_Fill2

Unnamed: 0,Equipment_Dam,Model.Suffix,Workorder,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION ? Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION ? Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,Dam dispenser #1,AJX75334505,4F1XA938-1,240,2.5,-90,100,1030,-90,16,...,50.0,91.8,270,50,114.612,19.9,7,127,1,Normal
1,Dam dispenser #1,AJX75334505,3KPM0016-2,240,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,19.600,7.0,185,1,0,Normal
2,Dam dispenser #2,AJX75334501,4E1X9167-1,1000,12.5,90,85,280,90,16,...,50.0,91.8,270,50,114.612,19.8,10,73,1,Normal
3,Dam dispenser #2,AJX75334501,3K1X0057-1,1000,12.5,90,70,280,90,10,...,91.8,270.0,50,85,19.900,12.0,268,1,0,Normal
4,Dam dispenser #1,AJX75334501,3HPM0007-1,240,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,19.700,8.0,121,1,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,Dam dispenser #1,AJX75334501,3J1XF434-2,240,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,19.200,1.0,318,1,0,Normal
40502,Dam dispenser #2,AJX75334501,4E1XC796-1,1000,12.5,90,100,280,90,16,...,50.0,91.8,270,50,114.612,20.5,14,197,1,Normal
40503,Dam dispenser #1,AJX75334501,4C1XD438-1,240,2.5,-90,100,1030,-90,16,...,50.0,91.8,270,50,85.000,19.7,1,27,1,Normal
40504,Dam dispenser #2,AJX75334501,3I1XA258-1,1000,12.5,90,70,280,90,10,...,91.8,270.0,50,85,20.100,13.0,117,1,0,Normal


In [12]:
# 열 이름 정의
target_column = 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'

# 열의 값들 확인
if target_column not in train_data.columns:
    raise ValueError(f"열 '{target_column}'이(가) 데이터프레임에 없습니다.")

# 고유한 값 추출
unique_values = train_data[target_column].unique()

# 고유한 값들을 포함한 DataFrame 생성
unique_values_df = pd.DataFrame(unique_values, columns=[target_column])

# 결측치와 문자열 포함 모든 고유 값 출력
print(f"'{target_column}' 열의 고유한 값들:")
print(unique_values_df)

# 추가로 결측치와 특정 값('OK')의 존재 여부 확인
nan_count = train_data[target_column].isna().sum()
ok_count = (train_data[target_column] == 'OK').sum()

print(f"\n결측치(NaN) 개수: {nan_count}")
print(f"'OK' 값 개수: {ok_count}")

'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam' 열의 고유한 값들:
  HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam
0                                                NaN      
1                                              550.3      
2                                                 OK      
3                                              162.4      
4                                                549      
5                                              549.5      
6                                                550      
7                                              548.5      

결측치(NaN) 개수: 12766
'OK' 값 개수: 11293


In [13]:
data = train_data.copy()

In [14]:
# # 제거할 열 리스트
columns_to_drop = ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam', 
                   'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1', 
                   'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2']
#   'WorkMode Collect Result_Dam', 'WorkMode Collect Result_Fill1', 'WorkMode Collect Result_Fill2'
# # 지정한 열 제거
data = data.drop(columns=columns_to_drop)

In [15]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

# 범주형 열을 선택하여 인코딩 수행
categorical_features = ['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2', 'Chamber Temp. Judge Value_AutoClave', 'Model.Suffix', 'Workorder']

# OrdinalEncoder 인스턴스 생성
encoder = OrdinalEncoder()

# 범주형 열에 대해서만 인코딩 수행
data[categorical_features] = encoder.fit_transform(data[categorical_features])

# 인코딩된 데이터프레임 확인
data

Unnamed: 0,Equipment_Dam,Model.Suffix,Workorder,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION ? Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION ? Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,0.0,3.0,657.0,240,2.5,-90,100,1030,-90,16,...,50.0,91.8,270,50,114.612,19.9,7,127,1,Normal
1,0.0,3.0,283.0,240,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,19.600,7.0,185,1,0,Normal
2,1.0,0.0,589.0,1000,12.5,90,85,280,90,16,...,50.0,91.8,270,50,114.612,19.8,10,73,1,Normal
3,1.0,0.0,251.0,1000,12.5,90,70,280,90,10,...,91.8,270.0,50,85,19.900,12.0,268,1,0,Normal
4,0.0,0.0,142.0,240,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,19.700,8.0,121,1,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,0.0,0.0,238.0,240,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,19.200,1.0,318,1,0,Normal
40502,1.0,0.0,643.0,1000,12.5,90,100,280,90,16,...,50.0,91.8,270,50,114.612,20.5,14,197,1,Normal
40503,0.0,0.0,540.0,240,2.5,-90,100,1030,-90,16,...,50.0,91.8,270,50,85.000,19.7,1,27,1,Normal
40504,1.0,0.0,164.0,1000,12.5,90,70,280,90,10,...,91.8,270.0,50,85,20.100,13.0,117,1,0,Normal


In [16]:
# 각 범주에 대한 인코딩 값 확인
for feature, categories in zip(categorical_features, encoder.categories_):
    print(f"Feature: {feature}")
    for category, index in zip(categories, range(len(categories))):
        print(f"  {category}: {index}")

Feature: Equipment_Dam
  Dam dispenser #1: 0
  Dam dispenser #2: 1
Feature: Equipment_Fill1
  Fill1 dispenser #1: 0
  Fill1 dispenser #2: 1
Feature: Equipment_Fill2
  Fill2 dispenser #1: 0
  Fill2 dispenser #2: 1
Feature: Chamber Temp. Judge Value_AutoClave
  NG: 0
  OK: 1
Feature: Model.Suffix
  AJX75334501: 0
  AJX75334502: 1
  AJX75334503: 2
  AJX75334505: 3
  AJX75334506: 4
  AJX75334507: 5
  AJX75334508: 6
Feature: Workorder
  3F1X5847-2: 0
  3F1X9643-1: 1
  3F1X9644-1: 2
  3F1X9648-1: 3
  3F1X9648-2: 4
  3F1XA350-1: 5
  3F1XA351-1: 6
  3F1XB560-1: 7
  3F1XC376-1: 8
  3F1XC414-1: 9
  3F1XC414-2: 10
  3F1XC600-1: 11
  3F1XC781-1: 12
  3FPM0081-1: 13
  3FPM0085-1: 14
  3FPXX064-0003: 15
  3G1X4501-1: 16
  3G1X4502-2: 17
  3G1X4503-1: 18
  3G1X8290-1: 19
  3G1X8293-1: 20
  3G1X8295-1: 21
  3G1X8296-1: 22
  3G1X8297-1: 23
  3G1X8297-2: 24
  3G1X8298-1: 25
  3G1X8298-2: 26
  3G1X8299-1: 27
  3G1X8300-1: 28
  3G1X8300-2: 29
  3G1X8303-1: 30
  3G1X8646-1: 31
  3G1X8646-2: 32
  3G1X8647-1

In [25]:
# 문자열 타입인 열들의 이름과 데이터 타입 출력
string_columns = data.select_dtypes(include='object')
string_columns.dtypes

target    object
dtype: object

##  train/val split,  df_abnormal 구분, 언더샘플링,  장비 구분

In [26]:
from sklearn.model_selection import train_test_split

# 랜덤 시드 설정
RANDOM_STATE = 42

# train/val split
df_train, df_val = train_test_split(
    data,
    test_size=0.3,
    stratify=data["target"],
    random_state=RANDOM_STATE,
)

In [27]:
categorical_features = ['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2']
# 세 가지 조건: 열 간 값이 다르면 불량으로 분류
condition_receip = (
    (df_train['Receip No Collect Result_Dam'] != df_train['Receip No Collect Result_Fill1']) |
    (df_train['Receip No Collect Result_Dam'] != df_train['Receip No Collect Result_Fill2']) |
    (df_train['Receip No Collect Result_Fill1'] != df_train['Receip No Collect Result_Fill2'])
)

condition_production_qty = (
    (df_train['Production Qty Collect Result_Dam'] != df_train['Production Qty Collect Result_Fill1']) |
    (df_train['Production Qty Collect Result_Dam'] != df_train['Production Qty Collect Result_Fill2']) |
    (df_train['Production Qty Collect Result_Fill1'] != df_train['Production Qty Collect Result_Fill2'])
)

# 장비가 다르면 불량으로 분류
equip1 = df_train[categorical_features].eq(0).all(axis=1)
equip2 = df_train[categorical_features].eq(1).all(axis=1)
condition_equip_different = ~equip1 & ~equip2

# 네 조건 중 하나라도 만족하면 불량으로 분류
condition_abnormal = condition_receip | condition_production_qty | condition_equip_different

# 불량 데이터와 정상 데이터로 분리
df_abnormal = df_train[condition_abnormal]  # 불량으로 분류할 데이터
df_filtered = df_train[~condition_abnormal]  # 모델1, 2로 학습할 데이터


# 결과 확인
print(f"불량 데이터: {len(df_abnormal)} 행")
print(f"정상 데이터: {len(df_filtered)} 행")

불량 데이터: 62 행
정상 데이터: 28292 행


In [28]:
df_abnormal['target'].value_counts()

AbNormal    62
Name: target, dtype: int64

In [29]:
print(df_train.shape)
print(df_filtered.shape)

(28354, 137)
(28292, 137)


In [31]:
normal_ratio = 4.0  # 1.0 means 1:1 ratio

df_normal = df_filtered[df_filtered["target"] == "Normal"]
df_abnormal = df_filtered[df_filtered["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 26709, AbNormal: 1583


target
Normal      6332
AbNormal    1583
dtype: int64

In [32]:
# 정상 데이터에서 장비별로 데이터 분리
df_model1 = df_concat[df_concat[categorical_features].eq(0).all(axis=1)]
df_model2 = df_concat[df_concat[categorical_features].eq(1).all(axis=1)]

print(f"Equipment #1 데이터: {len(df_model1)} 행")
print(f"Equipment #2 데이터: {len(df_model2)} 행")

Equipment #1 데이터: 4946 행
Equipment #2 데이터: 2969 행


In [33]:
# 데이터와 타겟 분리
X_model1 = df_model1.drop(columns='target')
y_model1 = df_model1['target']
X_model2 = df_model2.drop(columns='target')
y_model2 = df_model2['target']

##  RandomForest

In [34]:
# 모델 훈련
model1 = RandomForestClassifier(random_state=42)
model1.fit(X_model1, y_model1)

model2 = RandomForestClassifier(random_state=42)
model2.fit(X_model2, y_model2)

RandomForestClassifier(random_state=42)

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [100, 200, 500],  # 적절한 트리 수
    'max_depth': [30, 50, None],  # 깊이를 넓게 설정
    'min_samples_split': [10, 20],  # 분할을 더 자유롭게 설정
    'min_samples_leaf': [1, 2, 4, 8, 16],  # 리프 노드 최소 샘플 수
    'criterion': ['gini']
    #'max_features': ['auto', 'sqrt', 'log2']  # 다양한 특징 선택 방법
}

model1 = RandomForestClassifier(random_state=RANDOM_STATE)

model1_grid_search = GridSearchCV(estimator=model1, param_grid=param_grid, 
                           scoring='f1_weighted', cv=5, verbose=2, n_jobs=-1)

model1_grid_search.fit(X_model1, y_model1)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:  4.5min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [30, 50, None],
                         'min_samples_leaf': [1, 2, 4, 8, 16],
                         'min_samples_split': [10, 20],
                         'n_estimators': [100, 200, 500]},
             scoring='f1_weighted', verbose=2)

In [36]:
print("Best1 parameters found: ", model1_grid_search.best_params_)
print("Best1 parameters score: ", model1_grid_search.best_score_)

Best1 parameters found:  {'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}
Best1 parameters score:  0.7289238554869744


In [37]:
model2 = RandomForestClassifier(random_state=RANDOM_STATE)
                                
model2_grid_search = GridSearchCV(estimator=model2, param_grid=param_grid, 
                           scoring='f1_weighted', cv=5, verbose=2, n_jobs=-1)

model2_grid_search.fit(X_model2, y_model2)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   58.4s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:  2.9min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [30, 50, None],
                         'min_samples_leaf': [1, 2, 4, 8, 16],
                         'min_samples_split': [10, 20],
                         'n_estimators': [100, 200, 500]},
             scoring='f1_weighted', verbose=2)

In [38]:
print("Best2 parameters found: ", model2_grid_search.best_params_)
print("Best2 parameters score: ", model2_grid_search.best_score_)

Best2 parameters found:  {'criterion': 'gini', 'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best2 parameters score:  0.7692859456319903


In [39]:
model1 = RandomForestClassifier(**model1_grid_search.best_params_, class_weight='balanced')
model1.fit(X_model1, y_model1)
model2 = RandomForestClassifier(**model2_grid_search.best_params_, class_weight='balanced')
model2.fit(X_model2, y_model2)

RandomForestClassifier(class_weight='balanced', max_depth=50,
                       min_samples_split=10)

###  RandomForest 모델 평가

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

y_val = df_val["target"]

# 세 가지 조건: 열 간 값이 다르면 불량으로 분류
condition_receip = (
    (df_val['Receip No Collect Result_Dam'] != df_val['Receip No Collect Result_Fill1']) |
    (df_val['Receip No Collect Result_Dam'] != df_val['Receip No Collect Result_Fill2']) |
    (df_val['Receip No Collect Result_Fill1'] != df_val['Receip No Collect Result_Fill2'])
)

condition_production_qty = (
    (df_val['Production Qty Collect Result_Dam'] != df_val['Production Qty Collect Result_Fill1']) |
    (df_val['Production Qty Collect Result_Dam'] != df_val['Production Qty Collect Result_Fill2']) |
    (df_val['Production Qty Collect Result_Fill1'] != df_val['Production Qty Collect Result_Fill2'])
)

val_equip1 = df_val[categorical_features].eq(0).all(axis=1)
val_equip2 = df_val[categorical_features].eq(1).all(axis=1)
condition_equip_different = ~val_equip1 & ~val_equip2 

# 조건들을 묶어서 필터링
val_condition = ( condition_receip | condition_production_qty | condition_equip_different )

val_abnormal = df_val[val_condition]
# abnormal로 제외하고 남은 데이터

val_filtered = df_val[~val_condition]
# 장비가 0 또는 1의 값으로 나누기 (label encoding으로 Equipment #1: 0 , Equipment #2: 1)

val_model1 = val_filtered[val_filtered[categorical_features].eq(0).all(axis=1)]
val_model2 = val_filtered[val_filtered[categorical_features].eq(1).all(axis=1)]

In [41]:
val_model1_train = val_model1.drop(columns = 'target')
val_model2_train = val_model2.drop(columns = 'target')

In [42]:
val_model1_predictions = model1.predict(val_model1_train)
val_model2_predictions = model2.predict(val_model2_train)

# 예측을 위한 데이터 인덱스
val_model1_index = val_model1.index
val_model2_index = val_model2.index

# 검증 데이터의 전체 예측을 위한 결과 결합
val_predictions = []

for idx in df_val.index:
    if idx in val_model1_index:
        val_predictions.append(val_model1_predictions[val_model1_index.get_loc(idx)])
    elif idx in val_model2_index:
        val_predictions.append(val_model2_predictions[val_model2_index.get_loc(idx)])
    else:
        val_predictions.append('AbNormal')


# 정확도 계산
val_accuracy = accuracy_score(y_val, val_predictions)

# F1 스코어 계산
val_f1_score = f1_score(y_val, val_predictions, pos_label="AbNormal")

# 결과 출력
print(f"검증 데이터 정확도: {val_accuracy:.4f}")
print(f"검증 데이터 F1 스코어: {val_f1_score:.4f}")

검증 데이터 정확도: 0.8943
검증 데이터 F1 스코어: 0.2015


In [43]:
# 혼동 행렬 생성
cm = confusion_matrix(y_val, val_predictions)

print(f"F1 스코어: {val_f1_score}")
print(f"혼동 행렬:\n{cm}")# 혼동 행렬 생성

F1 스코어: 0.20149253731343283
혼동 행렬:
[[  162   543]
 [  741 10706]]


## XGBoost

In [None]:
pip install xgboost

### 모델 나누어서

In [44]:
# 데이터와 타겟 분리
X_model1 = df_model1.drop(columns='target')
y_model1 = df_model1['target']
X_model2 = df_model2.drop(columns='target')
y_model2 = df_model2['target']

y_model1_encoded = y_model1.replace({'Normal': 0, 'AbNormal': 1})
y_model2_encoded = y_model2.replace({'Normal': 0, 'AbNormal': 1})

In [45]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# 모델 훈련
model1 = XGBClassifier(random_state=42)
model1.fit(X_model1, y_model1_encoded)

model2 = XGBClassifier(random_state=42)
model2.fit(X_model2, y_model2_encoded)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)

In [46]:
param_grid = {
    'max_depth': [3, 5, 7],
    'eta': [0.01, 0.1, 0.3],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0.5, 1, 2]
}

model1 = XGBClassifier(random_state=42)

model1_grid_search = GridSearchCV(estimator=model1, param_grid=param_grid, 
                           scoring='f1_weighted', cv=5, verbose=2, n_jobs=-1)

model1_grid_search.fit(X_model1, y_model1_encoded)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   59.0s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 3645 out of 3645 | elapsed: 12.4min finished


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     grow_policy=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     multi_strategy=None, n_estimators=None,
                                     n_jobs=None, num

In [47]:
model2 = XGBClassifier(random_state=42)
                                
model2_grid_search = GridSearchCV(estimator=model2, param_grid=param_grid, 
                           scoring='f1_weighted', cv=5, verbose=2, n_jobs=-1)

model2_grid_search.fit(X_model2, y_model2_encoded)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   46.0s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 3645 out of 3645 | elapsed:  9.2min finished


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     grow_policy=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     multi_strategy=None, n_estimators=None,
                                     n_jobs=None, num

In [48]:
model1_xgb = XGBClassifier(**model1_grid_search.best_params_, class_weight='balanced')
model1_xgb.fit(X_model1, y_model1_encoded)
model2_xgb = XGBClassifier(**model2_grid_search.best_params_, class_weight='balanced')
model2_xgb.fit(X_model2, y_model2_encoded)

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.



XGBClassifier(base_score=None, booster=None, callbacks=None,
              class_weight='balanced', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=1.0, device=None,
              early_stopping_rounds=None, enable_categorical=False, eta=0.3,
              eval_metric=None, feature_types=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None, ...)

In [49]:
print("Best1 parameters found: ", model1_grid_search.best_params_)
print("Best1 parameters score: \n", model1_grid_search.best_score_)

print("Best1 parameters found: ", model2_grid_search.best_params_)
print("Best1 parameters score: \n", model2_grid_search.best_score_)

Best1 parameters found:  {'colsample_bytree': 1.0, 'eta': 0.3, 'max_depth': 7, 'reg_alpha': 0, 'reg_lambda': 0.5, 'subsample': 1.0}
Best1 parameters score: 
 0.7425957230246111
Best1 parameters found:  {'colsample_bytree': 1.0, 'eta': 0.3, 'max_depth': 5, 'reg_alpha': 0.1, 'reg_lambda': 2, 'subsample': 1.0}
Best1 parameters score: 
 0.7746897517775111


In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

y_val = df_val["target"]
y_val_encoded = y_val.replace({'Normal': 0, 'AbNormal': 1})

# 세 가지 조건: 열 간 값이 다르면 불량으로 분류
condition_receip = (
    (df_val['Receip No Collect Result_Dam'] != df_val['Receip No Collect Result_Fill1']) |
    (df_val['Receip No Collect Result_Dam'] != df_val['Receip No Collect Result_Fill2']) |
    (df_val['Receip No Collect Result_Fill1'] != df_val['Receip No Collect Result_Fill2'])
)

condition_production_qty = (
    (df_val['Production Qty Collect Result_Dam'] != df_val['Production Qty Collect Result_Fill1']) |
    (df_val['Production Qty Collect Result_Dam'] != df_val['Production Qty Collect Result_Fill2']) |
    (df_val['Production Qty Collect Result_Fill1'] != df_val['Production Qty Collect Result_Fill2'])
)

val_equip1 = df_val[categorical_features].eq(0).all(axis=1)
val_equip2 = df_val[categorical_features].eq(1).all(axis=1)
condition_equip_different = ~val_equip1 & ~val_equip2 

# 조건들을 묶어서 필터링
val_condition = ( condition_receip | condition_production_qty | condition_equip_different )

val_abnormal = df_val[val_condition]
# abnormal로 제외하고 남은 데이터

val_filtered = df_val[~val_condition]
# 장비가 0 또는 1의 값으로 나누기 (label encoding으로 Equipment #1: 0 , Equipment #2: 1)

val_model1 = val_filtered[val_filtered[categorical_features].eq(0).all(axis=1)]
val_model2 = val_filtered[val_filtered[categorical_features].eq(1).all(axis=1)]

val_model1_train = val_model1.drop(columns = 'target')
val_model2_train = val_model2.drop(columns = 'target')

In [51]:
val_model1_predictions = model1_xgb.predict(val_model1_train)
val_model2_predictions = model2_xgb.predict(val_model2_train)

# 예측을 위한 데이터 인덱스
val_model1_index = val_model1.index
val_model2_index = val_model2.index

# 검증 데이터의 전체 예측을 위한 결과 결합
val_predictions = []

for idx in df_val.index:
    if idx in val_model1_index:
        val_predictions.append(val_model1_predictions[val_model1_index.get_loc(idx)])
    elif idx in val_model2_index:
        val_predictions.append(val_model2_predictions[val_model2_index.get_loc(idx)])
    else:
        val_predictions.append(1)


# 정확도 계산
val_accuracy = accuracy_score(y_val_encoded, val_predictions)

# F1 스코어 계산
val_f1_score = f1_score(y_val_encoded, val_predictions, pos_label=1)

# 결과 출력
print(f"검증 데이터 정확도: {val_accuracy:.4f}")
print(f"검증 데이터 F1 스코어: {val_f1_score:.4f}")

검증 데이터 정확도: 0.9022
검증 데이터 F1 스코어: 0.1761


# SVC

In [67]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

model1_svm = SVC(kernel = 'rbf',random_state=RANDOM_STATE)

parameters = {'C': [0.1, 1, 10, 50, 100],
             'gamma':[0.001, 0.01, 0.1, 1]}

model1_svm_grid = GridSearchCV(model1_svm,
                      param_grid = parameters, cv = 5)

model1_svm_grid.fit(X_model1, y_model1)

GridSearchCV(cv=5, estimator=SVC(random_state=42),
             param_grid={'C': [1, 10, 50, 100], 'gamma': [0.001, 0.01, 0.1]})

In [68]:
result1 = pd.DataFrame(model1_svm_grid.cv_results_['params'])
result1['mean_test_score'] = model1_svm_grid.cv_results_['mean_test_score']
result1.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,C,gamma,mean_test_score
2,1,0.1,0.798423
0,1,0.001,0.798423
5,10,0.1,0.792559
8,50,0.1,0.792559
11,100,0.1,0.792559
1,1,0.01,0.791345
4,10,0.01,0.752122
10,100,0.01,0.749898
7,50,0.01,0.749898
3,10,0.001,0.74545


In [69]:
model2_svm = SVC(kernel = 'rbf',random_state=RANDOM_STATE)
model2_svm_grid = GridSearchCV(model2_svm,
                      param_grid = parameters, cv = 5)

model2_svm_grid.fit(X_model2, y_model2)

GridSearchCV(cv=5, estimator=SVC(random_state=42),
             param_grid={'C': [1, 10, 50, 100], 'gamma': [0.001, 0.01, 0.1]})

In [70]:
result2 = pd.DataFrame(model2_svm_grid.cv_results_['params'])
result2['mean_test_score'] = model2_svm_grid.cv_results_['mean_test_score']
result2.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,C,gamma,mean_test_score
0,1,0.001,0.816436
1,1,0.01,0.8161
5,10,0.1,0.808689
8,50,0.1,0.808689
11,100,0.1,0.808689
2,1,0.1,0.805321
4,10,0.01,0.775008
7,50,0.01,0.773997
10,100,0.01,0.773997
3,10,0.001,0.760528


In [78]:
model1_svm = SVC(**model1_svm_grid.best_params_, kernel = 'rbf',random_state=RANDOM_STATE)
model2_svm = SVC(**model2_svm_grid.best_params_, kernel = 'rbf',random_state=RANDOM_STATE)
model1_svm.fit(X_model1, y_model1)
model2_svm.fit(X_model2, y_model2)

SVC(C=1, gamma=0.001, random_state=42)

In [79]:
val_model1_predictions = model1_svm.predict(val_model1_train)
val_model2_predictions = model2_svm.predict(val_model2_train)

# 예측을 위한 데이터 인덱스
val_model1_index = val_model1.index
val_model2_index = val_model2.index

# 검증 데이터의 전체 예측을 위한 결과 결합
val_predictions = []

for idx in df_val.index:
    if idx in val_model1_index:
        val_predictions.append(val_model1_predictions[val_model1_index.get_loc(idx)])
    elif idx in val_model2_index:
        val_predictions.append(val_model2_predictions[val_model2_index.get_loc(idx)])
    else:
        val_predictions.append('AbNormal')


# 정확도 계산
val_accuracy = accuracy_score(y_val, val_predictions)

# F1 스코어 계산
val_f1_score = f1_score(y_val, val_predictions, pos_label='AbNormal')

# 결과 출력
print(f"검증 데이터 정확도: {val_accuracy:.4f}")
print(f"검증 데이터 F1 스코어: {val_f1_score:.4f}")

검증 데이터 정확도: 0.9380
검증 데이터 F1 스코어: 0.1253


## LGBM

In [123]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

# LightGBM 모델 정의
model1_lgbm = LGBMClassifier()
model2_lgbm = LGBMClassifier()

# 하이퍼파라미터 그리드 설정
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'max_depth': [-1, 10, 15],
    'num_leaves': [31, 50, 100],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

model1_lgbm_grid_search = GridSearchCV(estimator=model1_lgbm, param_grid=param_grid,
                                       cv=5, verbose=1, n_jobs = -1)

# 모델 학습
model1_lgbm_grid_search.fit(X_model1, y_model1)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 75.4min
[Parallel(n_jobs=-1)]: Done 2430 out of 2430 | elapsed: 81.2min finished


GridSearchCV(cv=5, estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'colsample_bytree': [0.8, 1.0],
                         'learning_rate': [0.01, 0.05, 0.1],
                         'max_depth': [-1, 10, 15],
                         'n_estimators': [100, 200, 500],
                         'num_leaves': [31, 50, 100],
                         'subsample': [0.6, 0.8, 1.0]},
             verbose=1)

In [124]:
print(model1_lgbm_grid_search.best_params_)
print(model1_lgbm_grid_search.best_score_)

{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 200, 'num_leaves': 50, 'subsample': 0.6}
0.8008489342361941


In [147]:
model2_lgbm_grid_search = GridSearchCV(estimator=model2_lgbm, param_grid=param_grid,
                                       cv=5, verbose=1, n_jobs = -1)

# 모델 학습
model2_lgbm_grid_search.fit(X_model2, y_model2)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 2430 out of 2430 | elapsed: 25.0min finished


GridSearchCV(cv=5,
             estimator=LGBMClassifier(class_weight='balanced',
                                      learning_rate=0.01, max_depth=15,
                                      n_estimators=200, num_leaves=50,
                                      random_state=42, subsample=0.6),
             n_jobs=-1,
             param_grid={'colsample_bytree': [0.8, 1.0],
                         'learning_rate': [0.01, 0.05, 0.1],
                         'max_depth': [-1, 10, 15],
                         'n_estimators': [100, 200, 500],
                         'num_leaves': [31, 50, 100],
                         'subsample': [0.6, 0.8, 1.0]},
             verbose=1)

In [148]:
print(model2_lgbm_grid_search.best_params_)
print(model2_lgbm_grid_search.best_score_)

{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 500, 'num_leaves': 50, 'subsample': 0.6}
0.7894924512125187


In [149]:
model1_lgbm = LGBMClassifier(**model1_lgbm_grid_search.best_params_, class_weight='balanced')
model2_lgbm = LGBMClassifier(**model2_lgbm_grid_search.best_params_, class_weight='balanced')

model1_lgbm.fit(X_model1, y_model1)
model2_lgbm.fit(X_model2, y_model2)

LGBMClassifier(class_weight='balanced', colsample_bytree=0.8, n_estimators=500,
               num_leaves=50, subsample=0.6)

In [150]:
val_model1_predictions = model1_lgbm.predict(val_model1_train)
val_model2_predictions = model2_lgbm.predict(val_model2_train)

# 예측을 위한 데이터 인덱스
val_model1_index = val_model1.index
val_model2_index = val_model2.index

# 검증 데이터의 전체 예측을 위한 결과 결합
val_predictions = []

for idx in df_val.index:
    if idx in val_model1_index:
        val_predictions.append(val_model1_predictions[val_model1_index.get_loc(idx)])
    elif idx in val_model2_index:
        val_predictions.append(val_model2_predictions[val_model2_index.get_loc(idx)])
    else:
        val_predictions.append('AbNormal')


# 정확도 계산
val_accuracy = accuracy_score(y_val, val_predictions)

# F1 스코어 계산
val_f1_score = f1_score(y_val, val_predictions, pos_label='AbNormal')

# 결과 출력
print(f"검증 데이터 정확도: {val_accuracy:.4f}")
print(f"검증 데이터 F1 스코어: {val_f1_score:.4f}")

검증 데이터 정확도: 0.7530
검증 데이터 F1 스코어: 0.1793


In [151]:
# 혼동 행렬 생성
cm = confusion_matrix(y_val, val_predictions)

print(f"F1 스코어: {val_f1_score}")
print(f"혼동 행렬:\n{cm}")# 혼동 행렬 생성

F1 스코어: 0.17933296883542918
혼동 행렬:
[[ 328  377]
 [2625 8822]]


# 모델1

In [152]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

scale_pos_weight = len(y_model1[y_model1 == 'AbNormal']) / len(y_model1[y_model1 == 'Normal'])
RANDOM_STATE = 42

model1_rf = RandomForestClassifier(**{'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}, class_weight='balanced', random_state=RANDOM_STATE)
model1_xgb = XGBClassifier(**{'colsample_bytree': 1.0, 'eta': 0.3, 'max_depth': 7, 'reg_alpha': 0, 'reg_lambda': 0.5, 'subsample': 1.0}, scale_pos_weight=scale_pos_weight, random_state=RANDOM_STATE)
model1_svm = SVC(C=1, gamma=0.1, kernel = 'rbf',random_state=RANDOM_STATE, probability=True)
model1_lgbm = LGBMClassifier(**{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 20, 'n_estimators': 200, 'num_leaves': 50, 'subsample': 0.6}, class_weight='balanced', random_state=RANDOM_STATE)

# 앙상블 모델 정의 (투표 기반)
model1_voting = VotingClassifier(
    estimators=[('rf', model1_rf), ('xgb', model1_xgb), ('svm', model1_svm), ('lgbm', model1_lgbm)],
    voting='soft'
)

# 모델 학습
model1_voting.fit(X_model1, y_model1)

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced',
                                                     max_depth=30,
                                                     min_samples_split=10,
                                                     n_estimators=500,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=1.0, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False, eta=...
                                            min_child_weight=None

# 모델2

In [153]:
scale_pos_weight = len(y_model2[y_model2 == 'AbNormal']) / len(y_model2[y_model2 == 'Normal'])
RANDOM_STATE = 42

model2_rf = RandomForestClassifier(** {'criterion': 'gini', 'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}, class_weight='balanced', random_state=RANDOM_STATE)
model2_xgb = XGBClassifier(**{'colsample_bytree': 1.0, 'eta': 0.3, 'max_depth': 5, 'reg_alpha': 0.1, 'reg_lambda': 2, 'subsample': 1.0}, scale_pos_weight=scale_pos_weight, random_state=RANDOM_STATE)
model2_svm = SVC(C=1, gamma=0.001, kernel = 'rbf',random_state=RANDOM_STATE, probability=True)
model2_lgbm = LGBMClassifier(**{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 500, 'num_leaves': 50, 'subsample': 0.6}, class_weight='balanced', random_state=RANDOM_STATE)

# 앙상블 모델 정의 (투표 기반)
model2_voting = VotingClassifier(
    estimators=[('rf', model2_rf), ('xgb', model2_xgb), ('svm', model2_svm), ('lgbm', model2_lgbm)],
    voting='soft'
)

# 모델 학습
model2_voting.fit(X_model2, y_model2)

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(class_weight='balanced',
                                                     max_depth=50,
                                                     min_samples_split=10,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=1.0, device=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False, eta=0.3,
                                            eval_metric=N...
                                            max_leaves=None,
             

In [140]:
from sklearn.metrics import accuracy_score, f1_score

y_val = df_val["target"]

# 세 가지 조건: 열 간 값이 다르면 불량으로 분류
condition_receip = (
    (df_val['Receip No Collect Result_Dam'] != df_val['Receip No Collect Result_Fill1']) |
    (df_val['Receip No Collect Result_Dam'] != df_val['Receip No Collect Result_Fill2']) |
    (df_val['Receip No Collect Result_Fill1'] != df_val['Receip No Collect Result_Fill2'])
)

condition_production_qty = (
    (df_val['Production Qty Collect Result_Dam'] != df_val['Production Qty Collect Result_Fill1']) |
    (df_val['Production Qty Collect Result_Dam'] != df_val['Production Qty Collect Result_Fill2']) |
    (df_val['Production Qty Collect Result_Fill1'] != df_val['Production Qty Collect Result_Fill2'])
)

val_equip1 = df_val[categorical_features].eq(0).all(axis=1)
val_equip2 = df_val[categorical_features].eq(1).all(axis=1)
condition_equip_different = ~val_equip1 & ~val_equip2 

# 조건들을 묶어서 필터링
val_condition = ( condition_receip | condition_production_qty | condition_equip_different )

val_abnormal = df_val[val_condition]
# abnormal로 제외하고 남은 데이터

val_filtered = df_val[~val_condition]
# 장비가 0 또는 1의 값으로 나누기 (label encoding으로 Equipment #1: 0 , Equipment #2: 1)

val_model1 = val_filtered[val_filtered[categorical_features].eq(0).all(axis=1)]
val_model2 = val_filtered[val_filtered[categorical_features].eq(1).all(axis=1)]

In [141]:
val_model1_predictions = model1_voting.predict(val_model1_train)
val_model2_predictions = model2_voting.predict(val_model2_train)

# 예측을 위한 데이터 인덱스
val_model1_index = val_model1.index
val_model2_index = val_model2.index

# 검증 데이터의 전체 예측을 위한 결과 결합
val_predictions = []

for idx in df_val.index:
    if idx in val_model1_index:
        val_predictions.append(val_model1_predictions[val_model1_index.get_loc(idx)])
    elif idx in val_model2_index:
        val_predictions.append(val_model2_predictions[val_model2_index.get_loc(idx)])
    else:
        val_predictions.append('AbNormal')


# 정확도 계산
val_accuracy = accuracy_score(y_val, val_predictions)

# F1 스코어 계산
val_f1_score = f1_score(y_val, val_predictions, pos_label='AbNormal')

# 결과 출력
print(f"검증 데이터 정확도: {val_accuracy:.4f}")
print(f"검증 데이터 F1 스코어: {val_f1_score:.4f}")

검증 데이터 정확도: 0.8998
검증 데이터 F1 스코어: 0.1803


In [142]:
# 혼동 행렬 생성
cm = confusion_matrix(y_val, val_predictions)

print(f"F1 스코어: {val_f1_score}")
print(f"혼동 행렬:\n{cm}")# 혼동 행렬 생성

F1 스코어: 0.180349932705249
혼동 행렬:
[[  134   571]
 [  647 10800]]


# 4. 제출하기

### 테스트 데이터 예측


테스트 데이터 불러오기


In [52]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
test_data

Unnamed: 0,Set ID,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,0001be084fbc4aaa9d921f39e595961b,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3J1XF767-1,1,OK,1000.0,,...,195,,,1,,,0,,,
1,0005bbd180064abd99e63f9ed3e1ac80,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4B1XD472-2,1,OK,1000.0,,...,14,,,256,,,1,,,
2,000948934c4140d883d670adcb609584,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3H1XE355-1,1,OK,240.0,,...,98,,,1,,,0,,,
3,000a6bfd02874c6296dc7b2e9c5678a7,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3L1XA128-1,1,OK,1000.0,,...,14,,,0,,,1,,,
4,0018e78ce91343678716e2ea27a51c95,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4A1XA639-1,1,OK,240.0,,...,1,,,215,,,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,ffea508b59934d689b540f95eb3fa730,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1XB597-1,1,OK,1000.0,,...,14,,,131,,,1,,,
17357,ffed8923c8a448a98afc641b770be153,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4A1XB974-1,1,OK,1000.0,,...,12,,,279,,,1,,,
17358,fff1e73734da40adbe805359b3efb462,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3L1XA998-1,1,OK,240.0,,...,4,,,66,,,1,,,
17359,fff8e38bdd09470baf95f71e92075dec,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1XC376-1,1,OK,240.0,,...,117,,,1,,,0,,,


In [53]:
# Drop columns with more than half of the values missing
drop_cols = []
for column in test_data.columns:
    if (test_data[column].notnull().sum() // 2) < test_data[
        column
    ].isnull().sum():
        drop_cols.append(column)
test_data = test_data.drop(drop_cols, axis=1)

test_data

Unnamed: 0,Set ID,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,...,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
0,0001be084fbc4aaa9d921f39e595961b,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3J1XF767-1,1,OK,1000.0,12.5,...,50,91.8,270.0,50,85,19.8,13.0,195,1,0
1,0005bbd180064abd99e63f9ed3e1ac80,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4B1XD472-2,1,OK,1000.0,12.5,...,119,50.0,91.8,270,50,85.0,19.8,14,256,1
2,000948934c4140d883d670adcb609584,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3H1XE355-1,1,OK,240.0,2.5,...,50,91.8,270.0,50,85,19.7,1.0,98,1,0
3,000a6bfd02874c6296dc7b2e9c5678a7,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3L1XA128-1,1,OK,1000.0,12.5,...,119,50.0,91.8,270,50,85.0,20.0,14,0,1
4,0018e78ce91343678716e2ea27a51c95,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4A1XA639-1,1,OK,240.0,2.5,...,119,50.0,91.8,270,50,85.0,19.8,1,215,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,ffea508b59934d689b540f95eb3fa730,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1XB597-1,1,OK,1000.0,12.5,...,119,50.0,91.8,270,50,85.0,19.5,14,131,1
17357,ffed8923c8a448a98afc641b770be153,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4A1XB974-1,1,OK,1000.0,12.5,...,119,50.0,91.8,270,50,85.0,19.8,12,279,1
17358,fff1e73734da40adbe805359b3efb462,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3L1XA998-1,1,OK,240.0,2.5,...,119,50.0,91.8,270,50,85.0,20.5,4,66,1
17359,fff8e38bdd09470baf95f71e92075dec,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1XC376-1,1,OK,240.0,2.5,...,50,91.8,270.0,50,85,18.9,1.0,117,1,0


In [54]:
# 각 열의 NaN 값 개수 계산
nan_counts = test_data.isnull().sum()

# NaN 값이 있는 열들만 필터링
nan_columns = nan_counts[nan_counts > 0]

nan_columns

HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam      5468
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1    5468
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2    5468
dtype: int64

In [55]:
# 각 열의 NaN 값이 있는 행을 표시
nan_dam = test_data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].isnull()
nan_fill1 = test_data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].isnull()
nan_fill2 = test_data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].isnull()

# 세 열의 NaN 값이 모두 같은 행에서 발생했는지 확인
nan_same_rows = nan_dam & nan_fill1 & nan_fill2

# NaN 값이 동일한 행의 개수 확인
num_same_nan_rows = nan_same_rows.sum()

# 결과 출력
print(f"세 열에서 NaN 값이 동일한 행의 개수: {num_same_nan_rows}")
print(f"세 열에서 NaN 값이 동일한 행:\n{test_data[nan_same_rows]}")

세 열에서 NaN 값이 동일한 행의 개수: 5468
세 열에서 NaN 값이 동일한 행:
                                 Set ID Wip Line_Dam Process Desc._Dam  \
3      000a6bfd02874c6296dc7b2e9c5678a7      IVI-OB6     Dam Dispenser   
8      00297b6c93e44d49ac534758a23dc74e      IVI-OB6     Dam Dispenser   
9      002d904240d84b188d410d16383a9c3a      IVI-OB6     Dam Dispenser   
10     002fdfad651c4a98b6f0c4046976aac3      IVI-OB6     Dam Dispenser   
13     0039b02541014d678d5b0e5a3cb3797a      IVI-OB6     Dam Dispenser   
...                                 ...          ...               ...   
17349  ffd71ad9aac547a4a367d538c6e3ee30      IVI-OB6     Dam Dispenser   
17354  ffe5c71ef7b045868d177023be7f364b      IVI-OB6     Dam Dispenser   
17355  ffe77be574b24429b1bf6a69b5c2c2ef      IVI-OB6     Dam Dispenser   
17356  ffea508b59934d689b540f95eb3fa730      IVI-OB6     Dam Dispenser   
17358  fff1e73734da40adbe805359b3efb462      IVI-OB6     Dam Dispenser   

          Equipment_Dam Model.Suffix_Dam Workorder_Dam  Insp. 

In [56]:
# 삭제할 열 리스트
columns_to_drop = [
        'Model.Suffix_Fill1', 'Model.Suffix_Fill2', 'Model.Suffix_AutoClave', 
        'Workorder_Fill1', 'Workorder_Fill2', 'Workorder_AutoClave'
]

# 열 삭제
test_data = test_data.drop(columns=columns_to_drop)

# 열 이름 변경
test_data = test_data.rename(columns={
    'Model.Suffix_Dam': 'Model.Suffix',
    'Workorder_Dam': 'Workorder'
})

test_data

Unnamed: 0,Set ID,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix,Workorder,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,...,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
0,0001be084fbc4aaa9d921f39e595961b,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3J1XF767-1,1,OK,1000.0,12.5,...,50,91.8,270.0,50,85,19.8,13.0,195,1,0
1,0005bbd180064abd99e63f9ed3e1ac80,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4B1XD472-2,1,OK,1000.0,12.5,...,119,50.0,91.8,270,50,85.0,19.8,14,256,1
2,000948934c4140d883d670adcb609584,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3H1XE355-1,1,OK,240.0,2.5,...,50,91.8,270.0,50,85,19.7,1.0,98,1,0
3,000a6bfd02874c6296dc7b2e9c5678a7,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3L1XA128-1,1,OK,1000.0,12.5,...,119,50.0,91.8,270,50,85.0,20.0,14,0,1
4,0018e78ce91343678716e2ea27a51c95,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4A1XA639-1,1,OK,240.0,2.5,...,119,50.0,91.8,270,50,85.0,19.8,1,215,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,ffea508b59934d689b540f95eb3fa730,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1XB597-1,1,OK,1000.0,12.5,...,119,50.0,91.8,270,50,85.0,19.5,14,131,1
17357,ffed8923c8a448a98afc641b770be153,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4A1XB974-1,1,OK,1000.0,12.5,...,119,50.0,91.8,270,50,85.0,19.8,12,279,1
17358,fff1e73734da40adbe805359b3efb462,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3L1XA998-1,1,OK,240.0,2.5,...,119,50.0,91.8,270,50,85.0,20.5,4,66,1
17359,fff8e38bdd09470baf95f71e92075dec,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1XC376-1,1,OK,240.0,2.5,...,50,91.8,270.0,50,85,18.9,1.0,117,1,0


In [57]:
# 모든 행이 동일한 값을 가지는 열을 찾고, 그 값을 함께 저장
constant_columns = {col: test_data[col].iloc[0] for col in test_data.columns if test_data[col].nunique() == 1}

# 데이터 프레임에서 해당 열 삭제
test_data = test_data.drop(columns=constant_columns.keys())

# 삭제된 열의 개수 출력
print(f"삭제된 열의 개수: {len(constant_columns)}")

# 삭제된 열과 그 값을 한 줄에 하나씩 출력
print("삭제된 열과 값:")
for col, value in constant_columns.items():
    print(f"{col}: {value}")

test_data

삭제된 열의 개수: 35
삭제된 열과 값:
Wip Line_Dam: IVI-OB6
Process Desc._Dam: Dam Dispenser
Insp. Seq No._Dam: 1
Insp Judge Code_Dam: OK
CURE STANDBY POSITION X Collect Result_Dam: 1150
CURE STANDBY POSITION Z Collect Result_Dam: 33.5
CURE STANDBY POSITION Θ Collect Result_Dam: 0
CURE START POSITION Z Collect Result_Dam: 33.5
Wip Line_AutoClave: IVI-OB6
Process Desc._AutoClave: Auto Clave Out
Equipment_AutoClave: Auto Clave Out
Insp. Seq No._AutoClave: 1
Insp Judge Code_AutoClave: OK
1st Pressure Judge Value_AutoClave: OK
2nd Pressure Judge Value_AutoClave: OK
3rd Pressure Judge Value_AutoClave: OK
Wip Line_Fill1: IVI-OB6
Process Desc._Fill1: Fill1 Dispenser
Insp. Seq No._Fill1: 1
Insp Judge Code_Fill1: OK
Wip Line_Fill2: IVI-OB6
Process Desc._Fill2: Fill2 Dispenser
Insp. Seq No._Fill2: 1
Insp Judge Code_Fill2: OK
CURE END POSITION Θ Collect Result_Fill2: -90
CURE STANDBY POSITION X Collect Result_Fill2: 1020
CURE STANDBY POSITION Θ Collect Result_Fill2: 0
CURE START POSITION Θ Collect Result_Fill2

Unnamed: 0,Set ID,Equipment_Dam,Model.Suffix,Workorder,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,...,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
0,0001be084fbc4aaa9d921f39e595961b,Dam dispenser #2,AJX75334501,3J1XF767-1,1000.0,12.5,90,70,280,90,...,50,91.8,270.0,50,85,19.8,13.0,195,1,0
1,0005bbd180064abd99e63f9ed3e1ac80,Dam dispenser #2,AJX75334501,4B1XD472-2,1000.0,12.5,90,70,280,90,...,119,50.0,91.8,270,50,85.0,19.8,14,256,1
2,000948934c4140d883d670adcb609584,Dam dispenser #1,AJX75334501,3H1XE355-1,240.0,2.5,-90,70,1030,-90,...,50,91.8,270.0,50,85,19.7,1.0,98,1,0
3,000a6bfd02874c6296dc7b2e9c5678a7,Dam dispenser #2,AJX75334501,3L1XA128-1,1000.0,12.5,90,70,280,90,...,119,50.0,91.8,270,50,85.0,20.0,14,0,1
4,0018e78ce91343678716e2ea27a51c95,Dam dispenser #1,AJX75334501,4A1XA639-1,240.0,2.5,-90,70,1030,-90,...,119,50.0,91.8,270,50,85.0,19.8,1,215,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,ffea508b59934d689b540f95eb3fa730,Dam dispenser #2,AJX75334501,3K1XB597-1,1000.0,12.5,90,70,280,90,...,119,50.0,91.8,270,50,85.0,19.5,14,131,1
17357,ffed8923c8a448a98afc641b770be153,Dam dispenser #2,AJX75334501,4A1XB974-1,1000.0,12.5,90,70,280,90,...,119,50.0,91.8,270,50,85.0,19.8,12,279,1
17358,fff1e73734da40adbe805359b3efb462,Dam dispenser #1,AJX75334501,3L1XA998-1,240.0,2.5,-90,70,1030,-90,...,119,50.0,91.8,270,50,85.0,20.5,4,66,1
17359,fff8e38bdd09470baf95f71e92075dec,Dam dispenser #1,AJX75334501,3F1XC376-1,240.0,2.5,-90,70,1030,-90,...,50,91.8,270.0,50,85,18.9,1.0,117,1,0


In [58]:
# 변환할 열들의 리스트
columns_to_convert = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2']

# 각 열을 float로 변환
for col in columns_to_convert:
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')

test_data[columns_to_convert].dtypes

HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam      float64
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1    float64
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2    float64
dtype: object

In [59]:
# # 제거할 열 리스트
columns_to_drop = ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam', 
                   'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1', 
                   'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2']
#   'WorkMode Collect Result_Dam', 'WorkMode Collect Result_Fill1', 'WorkMode Collect Result_Fill2'
# # 지정한 열 제거
test_data = test_data.drop(columns=columns_to_drop)

In [60]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

# 범주형 열을 선택하여 인코딩 수행
categorical_features = ['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2', 'Chamber Temp. Judge Value_AutoClave', 'Model.Suffix', 'Workorder']

# OrdinalEncoder 인스턴스 생성
encoder = OrdinalEncoder()

# 범주형 열에 대해서만 인코딩 수행
test_data[categorical_features] = encoder.fit_transform(test_data[categorical_features])

# 인코딩된 데이터프레임 확인
test_data

Unnamed: 0,Set ID,Equipment_Dam,Model.Suffix,Workorder,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,...,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
0,0001be084fbc4aaa9d921f39e595961b,1.0,0.0,240.0,1000.0,12.5,90,70,280,90,...,50,91.8,270.0,50,85,19.8,13.0,195,1,0
1,0005bbd180064abd99e63f9ed3e1ac80,1.0,0.0,509.0,1000.0,12.5,90,70,280,90,...,119,50.0,91.8,270,50,85.0,19.8,14,256,1
2,000948934c4140d883d670adcb609584,0.0,0.0,128.0,240.0,2.5,-90,70,1030,-90,...,50,91.8,270.0,50,85,19.7,1.0,98,1,0
3,000a6bfd02874c6296dc7b2e9c5678a7,1.0,0.0,306.0,1000.0,12.5,90,70,280,90,...,119,50.0,91.8,270,50,85.0,20.0,14,0,1
4,0018e78ce91343678716e2ea27a51c95,0.0,0.0,415.0,240.0,2.5,-90,70,1030,-90,...,119,50.0,91.8,270,50,85.0,19.8,1,215,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,ffea508b59934d689b540f95eb3fa730,1.0,0.0,277.0,1000.0,12.5,90,70,280,90,...,119,50.0,91.8,270,50,85.0,19.5,14,131,1
17357,ffed8923c8a448a98afc641b770be153,1.0,0.0,439.0,1000.0,12.5,90,70,280,90,...,119,50.0,91.8,270,50,85.0,19.8,12,279,1
17358,fff1e73734da40adbe805359b3efb462,0.0,0.0,314.0,240.0,2.5,-90,70,1030,-90,...,119,50.0,91.8,270,50,85.0,20.5,4,66,1
17359,fff8e38bdd09470baf95f71e92075dec,0.0,0.0,8.0,240.0,2.5,-90,70,1030,-90,...,50,91.8,270.0,50,85,18.9,1.0,117,1,0


In [61]:
categorical_features = ['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2']

# 세 가지 조건: 열 간 값이 다르면 불량으로 분류
condition_receip = (
    (test_data['Receip No Collect Result_Dam'] != test_data['Receip No Collect Result_Fill1']) |
    (test_data['Receip No Collect Result_Dam'] != test_data['Receip No Collect Result_Fill2']) |
    (test_data['Receip No Collect Result_Fill1'] != test_data['Receip No Collect Result_Fill2'])
)

condition_production_qty = (
    (test_data['Production Qty Collect Result_Dam'] != test_data['Production Qty Collect Result_Fill1']) |
    (test_data['Production Qty Collect Result_Dam'] != test_data['Production Qty Collect Result_Fill2']) |
    (test_data['Production Qty Collect Result_Fill1'] != test_data['Production Qty Collect Result_Fill2'])
)

test_equip1 = test_data[categorical_features].eq(0).all(axis=1)
test_equip2 = test_data[categorical_features].eq(1).all(axis=1)
condition_equip_different = ~test_equip1 & ~test_equip2 

# 조건들을 묶어서 필터링
test_condition = ( condition_receip | condition_production_qty | condition_equip_different )

test_abnormal = test_data[test_condition]
# abnormal로 제외하고 남은 데이터

test_filtered = test_data[~test_condition]

# 장비가 0 또는 1의 값으로 나누기 (label encoding으로 Equipment #1: 0 , Equipment #2: 1)
test_model1 = test_filtered[test_filtered[categorical_features].eq(0).all(axis=1)]
test_model2 = test_filtered[test_filtered[categorical_features].eq(1).all(axis=1)]

test_model1_for_prediction = test_model1.drop(columns=['Set ID'])
test_model2_for_prediction = test_model2.drop(columns=['Set ID'])


test_model1_predictions = model1_voting.predict(test_model1_for_prediction)
test_model2_predictions = model2_voting.predict(test_model2_for_prediction)

# 예측을 위한 데이터 인덱스
test_model1_index = test_model1.index
test_model2_index = test_model2.index

# 검증 데이터의 전체 예측을 위한 결과 결합
final_predictions = []

for idx in test_data.index:
    if idx in test_model1_index:
        final_predictions.append(test_model1_predictions[test_model1_index.get_loc(idx)])
    elif idx in test_model2_index:
        final_predictions.append(test_model2_predictions[test_model2_index.get_loc(idx)])
    else:
        final_predictions.append('AbNormal')

In [62]:
for i in range(len(final_predictions)):
    if final_predictions[i] == 0:
        final_predictions[i] = "Normal"
    elif final_predictions[i] == 1:
        final_predictions[i] = "AbNormal"
final_predictions

['Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',


### 제출 파일 작성


In [66]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("./submission.csv")
df_sub["target"] = final_predictions

# 제출 파일 저장
df_sub.to_csv("./submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
