In [2]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
# 학습을 위해 train_data를 지정
train_data = pd.read_csv('./data/train_cleaned.csv')
train_data

Unnamed: 0,Equipment_Dam,Model.Suffix,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,...,HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2,HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2,HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,target
0,Dam dispenser #1,AJX75334505,240.0,2.5,100,1030,-90,16,14.9,8.4,...,156.0,428.0,427.9,243.7,114.612,19.9,7.0,127,1,Normal
1,Dam dispenser #2,AJX75334501,1000.0,12.5,85,280,90,16,14.7,8.5,...,694.0,1324.2,1324.2,243.5,114.612,19.8,10.0,73,1,Normal
2,Dam dispenser #1,AJX75334501,240.0,2.5,70,1030,-90,16,13.2,6.5,...,156.0,428.0,427.9,243.7,85.000,19.7,8.0,483,1,Normal
3,Dam dispenser #2,AJX75334501,1000.0,12.5,70,280,90,16,13.2,7.6,...,694.0,1324.2,1324.2,243.5,85.000,19.9,11.0,105,1,Normal
4,Dam dispenser #2,AJX75334501,1000.0,12.5,85,280,90,16,14.7,8.5,...,694.0,1324.2,1324.2,243.5,114.612,20.0,15.0,78,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,Dam dispenser #1,AJX75334501,240.0,2.5,70,1030,-90,10,9.7,4.9,...,156.0,428.0,427.9,243.7,85.000,19.6,2.0,56,1,Normal
40502,Dam dispenser #1,AJX75334501,240.0,2.5,70,1030,-90,10,9.7,4.9,...,156.0,428.0,427.9,243.7,85.000,19.3,8.0,329,1,Normal
40503,Dam dispenser #1,AJX75334501,240.0,2.5,70,1030,-90,10,17.0,5.0,...,156.0,428.0,427.9,243.7,85.000,19.2,1.0,318,1,Normal
40504,Dam dispenser #2,AJX75334501,1000.0,12.5,70,280,90,10,9.7,4.9,...,694.0,1324.2,1324.2,243.5,85.000,20.1,13.0,117,1,Normal


In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

# 범주형 열을 선택하여 인코딩 수행
categorical_features = ['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2']

# OrdinalEncoder 인스턴스 생성
encoder = OrdinalEncoder()

# 범주형 열에 대해서만 인코딩 수행
train_data[categorical_features] = encoder.fit_transform(train_data[categorical_features])

# 인코딩된 데이터프레임 확인
train_data

Unnamed: 0,Equipment_Dam,Model.Suffix,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,...,HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2,HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2,HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,target
0,0.0,AJX75334505,240.0,2.5,100,1030,-90,16,14.9,8.4,...,156.0,428.0,427.9,243.7,114.612,19.9,7.0,127,1,Normal
1,1.0,AJX75334501,1000.0,12.5,85,280,90,16,14.7,8.5,...,694.0,1324.2,1324.2,243.5,114.612,19.8,10.0,73,1,Normal
2,0.0,AJX75334501,240.0,2.5,70,1030,-90,16,13.2,6.5,...,156.0,428.0,427.9,243.7,85.000,19.7,8.0,483,1,Normal
3,1.0,AJX75334501,1000.0,12.5,70,280,90,16,13.2,7.6,...,694.0,1324.2,1324.2,243.5,85.000,19.9,11.0,105,1,Normal
4,1.0,AJX75334501,1000.0,12.5,85,280,90,16,14.7,8.5,...,694.0,1324.2,1324.2,243.5,114.612,20.0,15.0,78,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,0.0,AJX75334501,240.0,2.5,70,1030,-90,10,9.7,4.9,...,156.0,428.0,427.9,243.7,85.000,19.6,2.0,56,1,Normal
40502,0.0,AJX75334501,240.0,2.5,70,1030,-90,10,9.7,4.9,...,156.0,428.0,427.9,243.7,85.000,19.3,8.0,329,1,Normal
40503,0.0,AJX75334501,240.0,2.5,70,1030,-90,10,17.0,5.0,...,156.0,428.0,427.9,243.7,85.000,19.2,1.0,318,1,Normal
40504,1.0,AJX75334501,1000.0,12.5,70,280,90,10,9.7,4.9,...,694.0,1324.2,1324.2,243.5,85.000,20.1,13.0,117,1,Normal


In [5]:
# 각 범주에 대한 인코딩 값 확인
for feature, categories in zip(categorical_features, encoder.categories_):
    print(f"Feature: {feature}")
    for category, index in zip(categories, range(len(categories))):
        print(f"  {category}: {index}")

Feature: Equipment_Dam
  Dam dispenser #1: 0
  Dam dispenser #2: 1
Feature: Equipment_Fill1
  Fill1 dispenser #1: 0
  Fill1 dispenser #2: 1
Feature: Equipment_Fill2
  Fill2 dispenser #1: 0
  Fill2 dispenser #2: 1


In [6]:
columns_to_drop = [
    'Model.Suffix', 'Chamber Temp. Judge Value_AutoClave']

# 열 삭제
train_data = train_data.drop(columns=columns_to_drop)

In [22]:
# 세 가지 조건: 열 간 값이 다르면 불량으로 분류
condition_receip = (
    (train_data['Receip No Collect Result_Dam'] != train_data['Receip No Collect Result_Fill1']) |
    (train_data['Receip No Collect Result_Dam'] != train_data['Receip No Collect Result_Fill2']) |
    (train_data['Receip No Collect Result_Fill1'] != train_data['Receip No Collect Result_Fill2'])
)

condition_production_qty = (
    (train_data['Production Qty Collect Result_Dam'] != train_data['Production Qty Collect Result_Fill1']) |
    (train_data['Production Qty Collect Result_Dam'] != train_data['Production Qty Collect Result_Fill2']) |
    (train_data['Production Qty Collect Result_Fill1'] != train_data['Production Qty Collect Result_Fill2'])
)

condition_palletid = (
    (train_data['PalletID Collect Result_Dam'] != train_data['PalletID Collect Result_Fill1']) |
    (train_data['PalletID Collect Result_Dam'] != train_data['PalletID Collect Result_Fill2']) |
    (train_data['PalletID Collect Result_Fill1'] != train_data['PalletID Collect Result_Fill2'])
)

# 장비가 다르면 불량으로 분류
equip1 = train_data[categorical_features].eq(0).all(axis=1)
equip2 = train_data[categorical_features].eq(1).all(axis=1)
condition_equip_different = ~equip1 & ~equip2

# 네 조건 중 하나라도 만족하면 불량으로 분류
condition_abnormal = condition_receip | condition_production_qty | condition_palletid | condition_equip_different

# 불량 데이터와 정상 데이터로 분리
df_abnormal = train_data[condition_abnormal]  # 불량 데이터
df_filtered = train_data[~condition_abnormal]  # 정상 데이터

# 정상 데이터에서 장비별로 데이터 분리
df_model1 = df_filtered[df_filtered[categorical_features].eq(0).all(axis=1)]
df_model2 = df_filtered[df_filtered[categorical_features].eq(1).all(axis=1)]

# 결과 확인
print(f"불량 데이터: {len(df_abnormal)} 행")
print(f"정상 데이터: {len(df_filtered)} 행")
print(f"Equipment #1 데이터: {len(df_model1)} 행")
print(f"Equipment #2 데이터: {len(df_model2)} 행")

불량 데이터: 94 행
정상 데이터: 40412 행
Equipment #1 데이터: 24968 행
Equipment #2 데이터: 15444 행


### 오버샘플링

In [24]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# 랜덤 시드 설정
RANDOM_STATE = 42

# train/val split
df_train, df_val = train_test_split(
    train_data,
    test_size=0.3,
    stratify=train_data["target"],
    random_state=RANDOM_STATE,
)

# Features와 타겟 분리 (train 데이터)
X_train = df_train.drop(columns=["target"])
y_train = df_train["target"]

# Features와 타겟 분리 (validation 데이터)
X_val = df_val.drop(columns=["target"])
y_val = df_val["target"]

In [25]:
# SMOTE 오버샘플링 수행
smote = SMOTE(random_state=RANDOM_STATE)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# SMOTE 이후의 타겟값 분포 확인
print("Train data after SMOTE:")
print(y_train_smote.value_counts())

# Validation 데이터의 타겟값 분포 확인
print("\nValidation data:")
print(y_val.value_counts())

# SMOTE 결과를 DataFrame으로 변환
df_train_smote = pd.DataFrame(X_train_smote, columns=X_train.columns)
df_train_smote["target"] = y_train_smote

# Print statistics function
def print_stats(df: pd.DataFrame):
    num_normal = len(df[df["target"] == "Normal"])
    num_abnormal = len(df[df["target"] == "AbNormal"])
    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {num_abnormal/num_normal:.2f}")

# Print statistics
print("\nStatistics after SMOTE and splitting:")
print("Train data:")
print_stats(df_train_smote)
print("Validation data:")
print_stats(df_val)

Train data after SMOTE:
target
Normal      26709
AbNormal    26709
Name: count, dtype: int64

Validation data:
target
Normal      11447
AbNormal      705
Name: count, dtype: int64

Statistics after SMOTE and splitting:
Train data:
  Total: Normal: 26709, AbNormal: 26709 ratio: 1.00
Validation data:
  Total: Normal: 11447, AbNormal: 705 ratio: 0.06


### 모델 학습

In [26]:
# 세 가지 조건: 열 간 값이 다르면 불량으로 분류
condition_receip = (
    (df_train_smote['Receip No Collect Result_Dam'] != df_train_smote['Receip No Collect Result_Fill1']) |
    (df_train_smote['Receip No Collect Result_Dam'] != df_train_smote['Receip No Collect Result_Fill2']) |
    (df_train_smote['Receip No Collect Result_Fill1'] != df_train_smote['Receip No Collect Result_Fill2'])
)

condition_production_qty = (
    (df_train_smote['Production Qty Collect Result_Dam'] != df_train_smote['Production Qty Collect Result_Fill1']) |
    (df_train_smote['Production Qty Collect Result_Dam'] != df_train_smote['Production Qty Collect Result_Fill2']) |
    (df_train_smote['Production Qty Collect Result_Fill1'] != df_train_smote['Production Qty Collect Result_Fill2'])
)

condition_palletid = (
    (df_train_smote['PalletID Collect Result_Dam'] != df_train_smote['PalletID Collect Result_Fill1']) |
    (df_train_smote['PalletID Collect Result_Dam'] != df_train_smote['PalletID Collect Result_Fill2']) |
    (df_train_smote['PalletID Collect Result_Fill1'] != df_train_smote['PalletID Collect Result_Fill2'])
)

# 장비가 다르면 불량으로 분류
equip1 = df_train_smote[categorical_features].eq(0).all(axis=1)
equip2 = df_train_smote[categorical_features].eq(1).all(axis=1)
condition_equip_different = ~equip1 & ~equip2

# 네 조건 중 하나라도 만족하면 불량으로 분류
condition_abnormal = condition_receip | condition_production_qty | condition_palletid | condition_equip_different

# 불량 데이터와 정상 데이터로 분리
df_abnormal = df_train_smote[condition_abnormal]  # 불량 데이터
df_filtered = df_train_smote[~condition_abnormal]  # 정상 데이터

# 정상 데이터에서 장비별로 데이터 분리
df_model1 = df_filtered[df_filtered[categorical_features].eq(0).all(axis=1)]
df_model2 = df_filtered[df_filtered[categorical_features].eq(1).all(axis=1)]

# 결과 확인
print(f"불량 데이터: {len(df_abnormal)} 행")
print(f"정상 데이터: {len(df_filtered)} 행")
print(f"Equipment #1 데이터: {len(df_model1)} 행")
print(f"Equipment #2 데이터: {len(df_model2)} 행")

# 장비별로 고유값이 1개인 컬럼 제외
# 모든 행이 동일한 값을 가지는 열을 찾고, 그 값을 함께 저장
def unique_column_drop(df):
    constant_columns = [col for col in df.columns if df[col].nunique() == 1]
    # 데이터 프레임에서 해당 열 삭제
    return df.drop(columns=constant_columns), constant_columns
# 장비별 drop columns 저장
df_model1, drop_column1 = unique_column_drop(df_model1)
df_model2, drop_column2 = unique_column_drop(df_model2)

불량 데이터: 1475 행
정상 데이터: 51943 행
Equipment #1 데이터: 32492 행
Equipment #2 데이터: 19451 행


In [28]:
df_model2

Unnamed: 0,CURE SPEED Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,Dispense Volume(Stage1) Collect Result_Dam,Dispense Volume(Stage2) Collect Result_Dam,Dispense Volume(Stage3) Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam,...,CURE START POSITION X Collect Result_Fill2,CURE START POSITION Z Collect Result_Fill2,HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2,HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,target
0,70,10,17.000000,4.900000,17.0,1.190000,0.340000,1.19,162.400000,465.100000,...,1020,33,305.0,694.0,85.000,19.900000,10.000000,91,1,Normal
3,100,16,14.900000,8.600000,14.7,1.040000,0.600000,1.02,163.500000,466.200000,...,1020,33,304.8,692.8,114.612,20.200000,12.000000,259,1,Normal
4,70,16,13.200000,7.600000,13.1,0.920000,0.530000,0.91,162.700000,465.700000,...,1020,33,305.0,694.0,85.000,20.600000,15.000000,347,1,Normal
7,70,16,13.200000,8.300000,13.1,0.920000,0.580000,0.91,162.400000,465.400000,...,1020,33,305.0,694.0,85.000,19.500000,15.000000,364,1,Normal
9,70,10,21.300000,10.500000,21.3,1.490000,0.730000,1.49,162.400000,465.500000,...,1020,33,305.0,694.0,85.000,20.200000,12.000000,142,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53400,70,10,21.300000,10.538610,21.3,1.490000,0.733861,1.49,162.786098,465.808879,...,1020,33,305.0,694.0,85.000,19.838610,12.841705,54,1,AbNormal
53402,70,16,13.182656,8.282656,13.2,0.918266,0.578266,0.92,162.400000,465.400000,...,1020,33,305.0,694.0,85.000,19.995938,13.173437,37,1,AbNormal
53406,70,10,21.300000,10.527131,21.3,1.490000,0.732713,1.49,163.128693,466.082955,...,1020,33,305.0,694.0,85.000,20.218608,12.271307,58,2,AbNormal
53407,70,10,17.000000,4.900000,17.0,1.190000,0.340000,1.19,162.400000,465.116978,...,1020,33,305.0,694.0,85.000,19.216978,10.254663,219,1,AbNormal


In [11]:
df_abnormal['target'].unique()

array(['AbNormal'], dtype=object)

In [12]:
# 추가로 제거할 열들
additional_drop_column1 = [
                            'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
                            'Dispense Volume(Stage3) Collect Result_Dam',
                            'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
                            'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
                            'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
                            'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
                            'Head Clean Position Z Collect Result_Dam',
                            'Head Zero Position Z Collect Result_Dam',
                            'Stage1 Line3 Distance Speed Collect Result_Dam',
                            'Stage2 Line1 Distance Speed Collect Result_Dam',
                            'Stage2 Line2 Distance Speed Collect Result_Dam',
                            'Stage2 Line3 Distance Speed Collect Result_Dam',
                            'Stage3 Line1 Distance Speed Collect Result_Dam',
                            'Stage3 Line3 Distance Speed Collect Result_Dam',

                            'Dispense Volume(Stage1) Collect Result_Fill1',
                            'Dispense Volume(Stage2) Collect Result_Fill1',
                            'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
                            'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
                            'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
                            'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
                            'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
                            'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',

                            'CURE START POSITION X Collect Result_Fill2',

                            'Chamber Temp. Unit Time_AutoClave',

                            'Receip No Collect Result_Dam',
                            'Receip No Collect Result_Fill1',
                            'Receip No Collect Result_Fill2',
                            'Production Qty Collect Result_Dam',
                            'Production Qty Collect Result_Fill1',
                            'Production Qty Collect Result_Fill2',
                            'PalletID Collect Result_Dam',
                            'PalletID Collect Result_Fill1',
                            'PalletID Collect Result_Fill2'
                            ]

additional_drop_column2 = [
                            'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
                            'Dispense Volume(Stage3) Collect Result_Dam',
                            'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
                            'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
                            'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
                            'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
                            'Head Clean Position Z Collect Result_Dam',
                            'Head Zero Position Z Collect Result_Dam',
                            'Stage1 Line1 Distance Speed Collect Result_Dam',
                            'Stage1 Line3 Distance Speed Collect Result_Dam',
                            'Stage2 Line1 Distance Speed Collect Result_Dam',
                            'Stage2 Line2 Distance Speed Collect Result_Dam',
                            'Stage2 Line3 Distance Speed Collect Result_Dam',
                            'Stage3 Circle1 Distance Speed Collect Result_Dam',
                            'Stage3 Line1 Distance Speed Collect Result_Dam',
                            'Stage3 Line3 Distance Speed Collect Result_Dam',
                            

                            'Dispense Volume(Stage2) Collect Result_Fill1',
                            'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
                            'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
                            'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
                            'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
                            'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
                            'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',

                            'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
                            'CURE START POSITION X Collect Result_Fill2',

                            'Chamber Temp. Unit Time_AutoClave',

                            'Receip No Collect Result_Dam',
                            'Receip No Collect Result_Fill1',
                            'Receip No Collect Result_Fill2',
                            'Production Qty Collect Result_Dam',
                            'Production Qty Collect Result_Fill1',
                            'Production Qty Collect Result_Fill2',
                            'PalletID Collect Result_Dam',
                            'PalletID Collect Result_Fill1',
                            'PalletID Collect Result_Fill2'
                            ]

# drop_column에 추가적인 열들을 더함
drop_column1.extend(additional_drop_column1)
drop_column2.extend(additional_drop_column2)

# 데이터 프레임에서 추가적인 열들 제거
df_model1 = df_model1.drop(columns=additional_drop_column1)
df_model2 = df_model2.drop(columns=additional_drop_column2)

# 결과 확인
print(f"Equipment #1에서 제거된 모든 열: {drop_column1}")
print(f"Equipment #2에서 제거된 모든 열: {drop_column2}")

Equipment #1에서 제거된 모든 열: ['Equipment_Dam', 'CURE END POSITION X Collect Result_Dam', 'CURE END POSITION Z Collect Result_Dam', 'CURE START POSITION X Collect Result_Dam', 'CURE START POSITION Θ Collect Result_Dam', 'Equipment_Fill1', 'Equipment_Fill2', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2', 'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2', 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2', 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2', 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2', 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2', 'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam', 'Dispense Volume(Stage3) Collect Result_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam', 'Head Clean Position Z Collect Res

In [13]:
len(drop_column1)

46

In [14]:
len(drop_column2)

46

In [15]:
# 데이터와 타겟 분리
X_model1 = df_model1.drop(columns='target')
y_model1 = df_model1['target']
X_model2 = df_model2.drop(columns='target')
y_model2 = df_model2['target']

# 모델 훈련
model1 = RandomForestClassifier(n_estimators=300, 
                                max_depth=8, 
                                min_samples_split=5, 
                                min_samples_leaf=2, 
                                max_features='sqrt', 
                                class_weight='balanced', 
                                random_state=110)
model1.fit(X_model1, y_model1)

model2 = RandomForestClassifier(n_estimators=300, 
                                max_depth=8, 
                                min_samples_split=5, 
                                min_samples_leaf=2, 
                                max_features='sqrt', 
                                class_weight='balanced', 
                                random_state=110)
model2.fit(X_model2, y_model2)

### 모델 평가

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

val_y = df_val["target"]

#  세 가지 조건: 열 간 값이 다르면 불량으로 분류
condition_receip = (
    (df_val['Receip No Collect Result_Dam'] != df_val['Receip No Collect Result_Fill1']) |
    (df_val['Receip No Collect Result_Dam'] != df_val['Receip No Collect Result_Fill2']) |
    (df_val['Receip No Collect Result_Fill1'] != df_val['Receip No Collect Result_Fill2'])
)

condition_production_qty = (
    (df_val['Production Qty Collect Result_Dam'] != df_val['Production Qty Collect Result_Fill1']) |
    (df_val['Production Qty Collect Result_Dam'] != df_val['Production Qty Collect Result_Fill2']) |
    (df_val['Production Qty Collect Result_Fill1'] != df_val['Production Qty Collect Result_Fill2'])
)

condition_palletid = (
    (df_val['PalletID Collect Result_Dam'] != df_val['PalletID Collect Result_Fill1']) |
    (df_val['PalletID Collect Result_Dam'] != df_val['PalletID Collect Result_Fill2']) |
    (df_val['PalletID Collect Result_Fill1'] != df_val['PalletID Collect Result_Fill2'])
)

# 장비가 다르면 불량으로 분류
val_equip1 = df_val[categorical_features].eq(0).all(axis=1)
val_equip2 = df_val[categorical_features].eq(1).all(axis=1)
condition_equip_different = ~val_equip1 & ~val_equip2

# 네 조건 중 하나라도 만족하면 불량으로 분류
condition_abnormal = condition_receip | condition_production_qty | condition_palletid | condition_equip_different

# 불량 데이터와 정상 데이터로 분리
val_abnormal = df_val[condition_abnormal]  # 불량 데이터
val_filtered = df_val[~condition_abnormal]  # 정상 데이터


# 예측 함수 정의
def apply_prediction(df, categorical_features, model1, model2, constant_columns_model1, constant_columns_model2):
    predictions = []
    for _, row in df.iterrows():
        if row.name in val_abnormal.index:
            # abnormal 데이터인 경우 'AbNormal'로 예측
            predictions.append('AbNormal')

        elif all(row[feature] == 0 for feature in categorical_features):
            filtered_row = row.drop(labels=constant_columns_model1)
            filtered_row_df = pd.DataFrame([filtered_row], columns=filtered_row.index)
            predictions.append(model1.predict(filtered_row_df)[0])
            
        elif all(row[feature] == 1 for feature in categorical_features):
            filtered_row = row.drop(labels=constant_columns_model2)
            filtered_row_df = pd.DataFrame([filtered_row], columns=filtered_row.index)
            predictions.append(model2.predict(filtered_row_df)[0])
            
        else:
            predictions.append('AbNormal')
    
    return predictions

drop_column1.append('target')
drop_column2.append('target')

# 데이터에 대해 예측 수행
val_predictions = apply_prediction(df_val, categorical_features, model1, model2, drop_column1, drop_column2)

# 정확도 계산
val_accuracy = accuracy_score(val_y, val_predictions)

# F1 스코어 계산
val_f1_score = f1_score(val_y, val_predictions, pos_label="AbNormal")

# 결과 출력
print(f"검증 데이터 정확도: {val_accuracy:.4f}")
print(f"검증 데이터 F1 스코어: {val_f1_score:.4f}")

검증 데이터 정확도: 0.9061
검증 데이터 F1 스코어: 0.1641


### 테스트

In [17]:
import pandas as pd
test_data = pd.read_csv('./data/test_cleaned.csv')

In [18]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

test_data[categorical_features] = encoder.transform(test_data[categorical_features])

columns_to_drop = [
    'Model.Suffix', 'Chamber Temp. Judge Value_AutoClave']

# 열 삭제
test_data = test_data.drop(columns=columns_to_drop)

In [19]:
# 세 가지 조건: 열 간 값이 다르면 불량으로 분류
condition_receip = (
    (test_data['Receip No Collect Result_Dam'] != test_data['Receip No Collect Result_Fill1']) |
    (test_data['Receip No Collect Result_Dam'] != test_data['Receip No Collect Result_Fill2']) |
    (test_data['Receip No Collect Result_Fill1'] != test_data['Receip No Collect Result_Fill2'])
)

condition_production_qty = (
    (test_data['Production Qty Collect Result_Dam'] != test_data['Production Qty Collect Result_Fill1']) |
    (test_data['Production Qty Collect Result_Dam'] != test_data['Production Qty Collect Result_Fill2']) |
    (test_data['Production Qty Collect Result_Fill1'] != test_data['Production Qty Collect Result_Fill2'])
)

condition_palletid = (
    (test_data['PalletID Collect Result_Dam'] != test_data['PalletID Collect Result_Fill1']) |
    (test_data['PalletID Collect Result_Dam'] != test_data['PalletID Collect Result_Fill2']) |
    (test_data['PalletID Collect Result_Fill1'] != test_data['PalletID Collect Result_Fill2'])
)

# 장비가 다르면 불량으로 분류
test_equip1 = test_data[categorical_features].eq(0).all(axis=1)
test_equip2 = test_data[categorical_features].eq(1).all(axis=1)
condition_equip_different = ~equip1 & ~equip2

# 네 조건 중 하나라도 만족하면 불량으로 분류
test_condition_abnormal = condition_receip | condition_production_qty | condition_palletid | condition_equip_different

# 불량 데이터와 정상 데이터로 분리
test_abnormal = test_data[test_condition_abnormal]  # 불량 데이터
test_filtered = test_data[~test_condition_abnormal]  # 정상 데이터

# 정상 데이터에서 장비별로 데이터 분리
test_model1 = test_filtered[test_filtered[categorical_features].eq(0).all(axis=1)]
test_model2 = test_filtered[test_filtered[categorical_features].eq(1).all(axis=1)]

# drop_column1과 drop_column2에서 'target'을 제거
drop_column1 = [col for col in drop_column1 if col != 'target']
drop_column2 = [col for col in drop_column2 if col != 'target']

# `setID`를 제외한 데이터
test_data_for_prediction = test_data.drop(columns=['Set ID'])

# 데이터에 대해 예측 수행
test_predictions = apply_prediction(test_data_for_prediction, categorical_features, model1, model2, drop_column1, drop_column2)

  test_abnormal = test_data[test_condition_abnormal]  # 불량 데이터
  test_filtered = test_data[~test_condition_abnormal]  # 정상 데이터


In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("./data/submission.csv")
df_sub["target"] = test_predictions

# 제출 파일 저장
df_sub.to_csv("./data/submission.csv", index=False)