In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

# Load the cleaned training data.
train_data = pd.read_csv('./data/train_cleaned.csv')

# Equipment columns holding string labels that need a numeric encoding.
categorical_features = ['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2']

# Fit the ordinal encoder on the equipment columns only, then write the
# resulting codes back over the original labels.
encoder = OrdinalEncoder()
encoded_equipment = encoder.fit_transform(train_data[categorical_features])
train_data[categorical_features] = encoded_equipment

# Show the encoded frame.
train_data

Unnamed: 0,Equipment_Dam,Model.Suffix,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,...,HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2,HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2,HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,target
0,0.0,AJX75334505,240.0,2.5,100,1030,-90,16,14.9,8.4,...,156.0,428.0,427.9,243.7,114.612,19.9,7.0,127,1,Normal
1,1.0,AJX75334501,1000.0,12.5,85,280,90,16,14.7,8.5,...,694.0,1324.2,1324.2,243.5,114.612,19.8,10.0,73,1,Normal
2,0.0,AJX75334501,240.0,2.5,70,1030,-90,16,13.2,6.5,...,156.0,428.0,427.9,243.7,85.000,19.7,8.0,483,1,Normal
3,1.0,AJX75334501,1000.0,12.5,70,280,90,16,13.2,7.6,...,694.0,1324.2,1324.2,243.5,85.000,19.9,11.0,105,1,Normal
4,1.0,AJX75334501,1000.0,12.5,85,280,90,16,14.7,8.5,...,694.0,1324.2,1324.2,243.5,114.612,20.0,15.0,78,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,0.0,AJX75334501,240.0,2.5,70,1030,-90,10,9.7,4.9,...,156.0,428.0,427.9,243.7,85.000,19.6,2.0,56,1,Normal
40502,0.0,AJX75334501,240.0,2.5,70,1030,-90,10,9.7,4.9,...,156.0,428.0,427.9,243.7,85.000,19.3,8.0,329,1,Normal
40503,0.0,AJX75334501,240.0,2.5,70,1030,-90,10,17.0,5.0,...,156.0,428.0,427.9,243.7,85.000,19.2,1.0,318,1,Normal
40504,1.0,AJX75334501,1000.0,12.5,70,280,90,10,9.7,4.9,...,694.0,1324.2,1324.2,243.5,85.000,20.1,13.0,117,1,Normal


In [3]:
# Print, for every encoded column, which integer code each original label received.
for feature, categories in zip(categorical_features, encoder.categories_):
    print(f"Feature: {feature}")
    for index, category in enumerate(categories):
        print(f"  {category}: {index}")

Feature: Equipment_Dam
  Dam dispenser #1: 0
  Dam dispenser #2: 1
Feature: Equipment_Fill1
  Fill1 dispenser #1: 0
  Fill1 dispenser #2: 1
Feature: Equipment_Fill2
  Fill2 dispenser #1: 0
  Fill2 dispenser #2: 1


In [4]:
# Columns removed before modelling.
columns_to_drop = [
    'Model.Suffix', 'Chamber Temp. Judge Value_AutoClave']

# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')

In [5]:
# Consistency rules: a row is flagged when the Dam, Fill1 and Fill2 stages do
# not all report the same value for a field. Written as "not all equal", which
# is the De Morgan form of "any pair differs".
condition_receip = ~(
    (train_data['Receip No Collect Result_Dam'] == train_data['Receip No Collect Result_Fill1']) &
    (train_data['Receip No Collect Result_Dam'] == train_data['Receip No Collect Result_Fill2']) &
    (train_data['Receip No Collect Result_Fill1'] == train_data['Receip No Collect Result_Fill2'])
)

condition_production_qty = ~(
    (train_data['Production Qty Collect Result_Dam'] == train_data['Production Qty Collect Result_Fill1']) &
    (train_data['Production Qty Collect Result_Dam'] == train_data['Production Qty Collect Result_Fill2']) &
    (train_data['Production Qty Collect Result_Fill1'] == train_data['Production Qty Collect Result_Fill2'])
)

condition_palletid = ~(
    (train_data['PalletID Collect Result_Dam'] == train_data['PalletID Collect Result_Fill1']) &
    (train_data['PalletID Collect Result_Dam'] == train_data['PalletID Collect Result_Fill2']) &
    (train_data['PalletID Collect Result_Fill1'] == train_data['PalletID Collect Result_Fill2'])
)

# Equipment rule: flag rows whose three equipment codes are neither all 0 nor all 1.
equip1 = (train_data[categorical_features] == 0).all(axis=1)
equip2 = (train_data[categorical_features] == 1).all(axis=1)
condition_equip_different = ~(equip1 | equip2)

# A row is treated as defective as soon as any of the four rules fires.
condition_abnormal = condition_receip | condition_production_qty | condition_palletid | condition_equip_different

# Separate rule-flagged rows from the remaining rows.
df_abnormal = train_data.loc[condition_abnormal]   # rule-flagged rows
df_filtered = train_data.loc[~condition_abnormal]  # rows that passed every rule

# Split the remaining rows by equipment line.
df_model1 = df_filtered.loc[(df_filtered[categorical_features] == 0).all(axis=1)]
df_model2 = df_filtered.loc[(df_filtered[categorical_features] == 1).all(axis=1)]

# Report the size of each subset.
for label, subset in (
    ("불량 데이터", df_abnormal),
    ("정상 데이터", df_filtered),
    ("Equipment #1 데이터", df_model1),
    ("Equipment #2 데이터", df_model2),
):
    print(f"{label}: {len(subset)} 행")

불량 데이터: 94 행
정상 데이터: 40412 행
Equipment #1 데이터: 24968 행
Equipment #2 데이터: 15444 행


In [32]:
from sklearn.model_selection import train_test_split

RANDOM_STATE = 110

# Hold out 30% of the rows for validation, stratified on the target label so
# both splits keep the same class ratio.
target_labels = train_data["target"]
df_train, df_val = train_test_split(
    train_data,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=target_labels,
)

def print_stats(df: pd.DataFrame):
    """Print the Normal / AbNormal row counts of ``df`` and their ratio.

    Args:
        df: Frame with a ``target`` column containing the labels
            ``"Normal"`` and ``"AbNormal"``.

    The ratio is AbNormal / Normal. When the frame has no ``"Normal"`` rows
    the ratio is printed as ``nan`` instead of raising ZeroDivisionError.
    """
    # One pass over the column instead of building two filtered frames.
    target_counts = df["target"].value_counts()
    num_normal = int(target_counts.get("Normal", 0))
    num_abnormal = int(target_counts.get("AbNormal", 0))

    ratio = num_abnormal / num_normal if num_normal else float("nan")

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {ratio}")


# Class balance of the training split, then of the validation split.
print("  \tAbnormal\tNormal")
for split_frame in (df_train, df_val):
    print_stats(split_frame)

  	Abnormal	Normal
  Total: Normal: 26709, AbNormal: 1645 ratio: 0.06158972630948369
  Total: Normal: 11447, AbNormal: 705 ratio: 0.06158818904516467


In [33]:
# Same rule-based screening as applied to the full data earlier in the notebook,
# now restricted to the training split. Each rule is written as
# "not all three stages agree".
condition_receip = ~(
    (df_train['Receip No Collect Result_Dam'] == df_train['Receip No Collect Result_Fill1']) &
    (df_train['Receip No Collect Result_Dam'] == df_train['Receip No Collect Result_Fill2']) &
    (df_train['Receip No Collect Result_Fill1'] == df_train['Receip No Collect Result_Fill2'])
)

condition_production_qty = ~(
    (df_train['Production Qty Collect Result_Dam'] == df_train['Production Qty Collect Result_Fill1']) &
    (df_train['Production Qty Collect Result_Dam'] == df_train['Production Qty Collect Result_Fill2']) &
    (df_train['Production Qty Collect Result_Fill1'] == df_train['Production Qty Collect Result_Fill2'])
)

condition_palletid = ~(
    (df_train['PalletID Collect Result_Dam'] == df_train['PalletID Collect Result_Fill1']) &
    (df_train['PalletID Collect Result_Dam'] == df_train['PalletID Collect Result_Fill2']) &
    (df_train['PalletID Collect Result_Fill1'] == df_train['PalletID Collect Result_Fill2'])
)

# Flag rows whose three equipment codes are neither all 0 nor all 1.
equip1 = (df_train[categorical_features] == 0).all(axis=1)
equip2 = (df_train[categorical_features] == 1).all(axis=1)
condition_equip_different = ~(equip1 | equip2)

# Any single rule is enough to treat the row as defective.
condition_abnormal = condition_receip | condition_production_qty | condition_palletid | condition_equip_different

# Separate rule-flagged rows from the rest.
df_abnormal = df_train.loc[condition_abnormal]   # rule-flagged rows
df_filtered = df_train.loc[~condition_abnormal]  # rows that passed every rule

# One training frame per equipment line.
df_model1 = df_filtered.loc[(df_filtered[categorical_features] == 0).all(axis=1)]
df_model2 = df_filtered.loc[(df_filtered[categorical_features] == 1).all(axis=1)]

# Report the size of each subset.
for label, subset in (
    ("불량 데이터", df_abnormal),
    ("정상 데이터", df_filtered),
    ("Equipment #1 데이터", df_model1),
    ("Equipment #2 데이터", df_model2),
):
    print(f"{label}: {len(subset)} 행")

# Remove columns that carry no information within one equipment split.
def unique_column_drop(df):
    """Drop columns that have at most one distinct non-null value.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A ``(frame, dropped)`` tuple: a copy of ``df`` without the constant
        columns, and the list of dropped column names in their original order.

    NaN is not counted as a value, so a column that is entirely NaN has zero
    distinct values. Such columns are dropped too: they are just as
    uninformative as single-valued ones.
    """
    distinct_counts = df.nunique()  # per-column count of distinct non-null values
    constant_columns = distinct_counts[distinct_counts <= 1].index.tolist()
    return df.drop(columns=constant_columns), constant_columns

# Drop the constant columns from each equipment split and keep the dropped names per split.
df_model1, drop_column1 = unique_column_drop(df_model1)
df_model2, drop_column2 = unique_column_drop(df_model2)

불량 데이터: 61 행
정상 데이터: 28293 행
Equipment #1 데이터: 17443 행
Equipment #2 데이터: 10850 행


In [34]:
# Check which target labels occur among the rows flagged by the rule-based filter.
df_abnormal['target'].unique()

array(['AbNormal'], dtype=object)

In [35]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Columns treated as categorical here: PalletID, Production Qty and Receip No
# for each of the Dam, Fill1 and Fill2 stages. They are appended to the scaled
# features below without being scaled themselves.
categorical_columns = [
    f'{field} Collect Result_{stage}'
    for stage in ('Dam', 'Fill1', 'Fill2')
    for field in ('PalletID', 'Production Qty', 'Receip No')
]

# Every remaining column except the target is scaled as a numerical feature.
# The list is computed per equipment split because each split kept different columns.
unscaled_columns = categorical_columns + ['target']
numerical_features1 = df_model1.columns.difference(unscaled_columns).tolist()
numerical_features2 = df_model2.columns.difference(unscaled_columns).tolist()

# Equipment #1: fit the scaler on the training rows, then rebuild a frame of
# scaled numerical columns followed by the unscaled categorical columns.
scaler1 = StandardScaler()
df_model1_num = df_model1[numerical_features1]
df_model1_scaled = scaler1.fit_transform(df_model1_num)
df_model1_processed = pd.concat(
    [
        pd.DataFrame(df_model1_scaled, columns=numerical_features1),
        # reset_index so both parts share the same positional index
        df_model1[categorical_columns].reset_index(drop=True),
    ],
    axis=1,
)

# Equipment #2: same procedure with its own scaler.
scaler2 = StandardScaler()
df_model2_num = df_model2[numerical_features2]
df_model2_scaled = scaler2.fit_transform(df_model2_num)
df_model2_processed = pd.concat(
    [
        pd.DataFrame(df_model2_scaled, columns=numerical_features2),
        df_model2[categorical_columns].reset_index(drop=True),
    ],
    axis=1,
)

In [36]:
# Number of scaled (numerical) features per equipment split.
for list_name, feature_list in (
    ("numerical_features1", numerical_features1),
    ("numerical_features2", numerical_features2),
):
    print(f"{list_name}: {len(feature_list)}")

numerical_features1: 66
numerical_features2: 68


In [37]:
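# NOTE: every line of this cell is commented out (a disabled manual column-pruning experiment); nothing here runs.
# # Additional columns to drop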
# additional_drop_column1 = [
#                             'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
#                             'Dispense Volume(Stage3) Collect Result_Dam',
#                             'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
#                             'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
#                             'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
#                             'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
#                             'Head Clean Position Z Collect Result_Dam',
#                             'Stage1 Line3 Distance Speed Collect Result_Dam',
#                             'Stage2 Line1 Distance Speed Collect Result_Dam',
#                             'Stage2 Line2 Distance Speed Collect Result_Dam',
#                             'Stage2 Line3 Distance Speed Collect Result_Dam',
#                             'Stage3 Line1 Distance Speed Collect Result_Dam',
#                             'Stage3 Line3 Distance Speed Collect Result_Dam',
#                             'Head Zero Position Z Collect Result_Dam',

#                             'Dispense Volume(Stage1) Collect Result_Fill1',
#                             'Dispense Volume(Stage2) Collect Result_Fill1',
#                             'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
#                             'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
#                             'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
#                             'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
#                             'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
#                             'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',

#                             'CURE START POSITION X Collect Result_Fill2',

#                             'Chamber Temp. Unit Time_AutoClave',

#                             'Receip No Collect Result_Dam',
#                             'Receip No Collect Result_Fill1',
#                             'Receip No Collect Result_Fill2',
#                             'Production Qty Collect Result_Dam',
#                             'Production Qty Collect Result_Fill1',
#                             'Production Qty Collect Result_Fill2',
#                             'PalletID Collect Result_Dam',
#                             'PalletID Collect Result_Fill1',
#                             'PalletID Collect Result_Fill2'
#                             ]

# additional_drop_column2 = [
#                             'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
#                             'Dispense Volume(Stage3) Collect Result_Dam',
#                             'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
#                             'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
#                             'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
#                             'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
#                             'Head Clean Position Z Collect Result_Dam',
#                             'Stage1 Line1 Distance Speed Collect Result_Dam',
#                             'Stage1 Line3 Distance Speed Collect Result_Dam',
#                             'Stage2 Line1 Distance Speed Collect Result_Dam',
#                             'Stage2 Line2 Distance Speed Collect Result_Dam',
#                             'Stage2 Line3 Distance Speed Collect Result_Dam',
#                             'Stage3 Circle1 Distance Speed Collect Result_Dam',
#                             'Stage3 Line1 Distance Speed Collect Result_Dam',
#                             'Stage3 Line3 Distance Speed Collect Result_Dam',
#                             'Head Zero Position Z Collect Result_Dam',

#                             'Dispense Volume(Stage2) Collect Result_Fill1',
#                             'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
#                             'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
#                             'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
#                             'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
#                             'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
#                             'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',

#                             'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
#                             'CURE START POSITION X Collect Result_Fill2',

#                             'Chamber Temp. Unit Time_AutoClave',

#                             'Receip No Collect Result_Dam',
#                             'Receip No Collect Result_Fill1',
#                             'Receip No Collect Result_Fill2',
#                             'Production Qty Collect Result_Dam',
#                             'Production Qty Collect Result_Fill1',
#                             'Production Qty Collect Result_Fill2',
#                             'PalletID Collect Result_Dam',
#                             'PalletID Collect Result_Fill1',
#                             'PalletID Collect Result_Fill2'
#                             ]

# # drop_column에 추가적인 열들을 더함
# drop_column1.extend(additional_drop_column1)
# drop_column2.extend(additional_drop_column2)

# # 데이터 프레임에서 추가적인 열들 제거
# df_model1_processed = df_model1_processed.drop(columns=additional_drop_column1)
# df_model2_processed = df_model2_processed.drop(columns=additional_drop_column2)

# # 결과 확인
# print(f"Equipment #1에서 제거된 모든 열: {drop_column1}")
# print(f"Equipment #2에서 제거된 모든 열: {drop_column2}")

In [38]:
# Number of columns unique_column_drop removed from the Equipment #1 split.
len(drop_column1)

13

In [39]:
# Number of columns unique_column_drop removed from the Equipment #2 split.
len(drop_column2)

11

## RandomForest

In [13]:
# Labels come from the per-equipment frames that still contain 'target'.
y_model1 = df_model1['target']
y_model2 = df_model2['target']

# Features are the processed frames (scaled numerical + categorical columns, no target).
X_model1 = df_model1_processed
X_model2 = df_model2_processed

# Both equipment models use the same hyperparameters.
rf_params = dict(
    n_estimators=300,
    max_depth=8,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    class_weight='balanced',
    random_state=110,
)

# fit() returns the estimator itself, so construction and training can be chained.
model1 = RandomForestClassifier(**rf_params).fit(X_model1, y_model1)
model2 = RandomForestClassifier(**rf_params).fit(X_model2, y_model2)
model2

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Apply the same rule-based screening to the validation split. Each rule is
# written as "not all three stages agree".
condition_receip = ~(
    (df_val['Receip No Collect Result_Dam'] == df_val['Receip No Collect Result_Fill1']) &
    (df_val['Receip No Collect Result_Dam'] == df_val['Receip No Collect Result_Fill2']) &
    (df_val['Receip No Collect Result_Fill1'] == df_val['Receip No Collect Result_Fill2'])
)

condition_production_qty = ~(
    (df_val['Production Qty Collect Result_Dam'] == df_val['Production Qty Collect Result_Fill1']) &
    (df_val['Production Qty Collect Result_Dam'] == df_val['Production Qty Collect Result_Fill2']) &
    (df_val['Production Qty Collect Result_Fill1'] == df_val['Production Qty Collect Result_Fill2'])
)

condition_palletid = ~(
    (df_val['PalletID Collect Result_Dam'] == df_val['PalletID Collect Result_Fill1']) &
    (df_val['PalletID Collect Result_Dam'] == df_val['PalletID Collect Result_Fill2']) &
    (df_val['PalletID Collect Result_Fill1'] == df_val['PalletID Collect Result_Fill2'])
)

# Flag rows whose three equipment codes are neither all 0 nor all 1.
val_equip1 = (df_val[categorical_features] == 0).all(axis=1)
val_equip2 = (df_val[categorical_features] == 1).all(axis=1)
condition_equip_different = ~(val_equip1 | val_equip2)

# Any single rule is enough to treat the row as defective.
condition_abnormal = condition_receip | condition_production_qty | condition_palletid | condition_equip_different

# Separate rule-flagged rows from the rows that go to the models.
val_abnormal = df_val.loc[condition_abnormal]   # rule-flagged rows
val_filtered = df_val.loc[~condition_abnormal]  # rows that passed every rule

# One validation frame per equipment line.
val_model1 = val_filtered.loc[(val_filtered[categorical_features] == 0).all(axis=1)]
val_model2 = val_filtered.loc[(val_filtered[categorical_features] == 1).all(axis=1)]

# Equipment #1: select the numerical training features, scale them with the
# scaler fitted on the training split, then append the unscaled categorical columns.
val_model1_num = val_model1[numerical_features1]
val_model1_scaled = scaler1.transform(val_model1_num)
val_model1_processed = pd.concat(
    [
        pd.DataFrame(val_model1_scaled, columns=numerical_features1),
        # reset_index so both parts share the same positional index
        val_model1[categorical_columns].reset_index(drop=True),
    ],
    axis=1,
)

# Equipment #2: same procedure with its own feature list and scaler.
val_model2_num = val_model2[numerical_features2]
val_model2_scaled = scaler2.transform(val_model2_num)
val_model2_processed = pd.concat(
    [
        pd.DataFrame(val_model2_scaled, columns=numerical_features2),
        val_model2[categorical_columns].reset_index(drop=True),
    ],
    axis=1,
)

In [16]:
# Ground-truth labels for every validation row, rule-flagged rows included.
val_y = df_val["target"]

# Per-equipment predictions, in the row order of val_model1 / val_model2.
val_model1_predictions = model1.predict(val_model1_processed)
val_model2_predictions = model2.predict(val_model2_processed)

# Index labels of each validation subset. The rule-flagged index is taken from
# val_abnormal rather than by re-applying the shared `condition_abnormal` mask,
# which several other cells overwrite.
val_abnormal_idx = val_abnormal.index
val_model1_index = val_model1.index
val_model2_index = val_model2.index

# Combine the results by index label instead of looping row by row:
# every row starts as 'AbNormal' (covers rule-flagged rows and any row that
# belongs to neither model), then the model rows are overwritten.
val_pred = pd.Series('AbNormal', index=df_val.index, dtype=object)
val_pred.loc[val_model1_index] = val_model1_predictions
val_pred.loc[val_model2_index] = val_model2_predictions
final_predictions = val_pred.tolist()  # list in df_val row order, as before

# Accuracy and F1 (AbNormal is the positive class).
val_accuracy = accuracy_score(val_y, final_predictions)
val_f1_score = f1_score(val_y, final_predictions, pos_label="AbNormal")

# Report the scores.
print(f"검증 데이터 정확도: {val_accuracy:.4f}")
print(f"검증 데이터 F1 스코어: {val_f1_score:.4f}")

검증 데이터 정확도: 0.8171
검증 데이터 F1 스코어: 0.1949


In [17]:
# Attach the combined predictions as a 'pred' column. assign() returns a new
# frame, so the split returned by train_test_split is not modified in place
# (in-place column assignment on it can trigger pandas' SettingWithCopyWarning).
df_val = df_val.assign(pred=final_predictions)
df_val['pred'].value_counts()

pred
Normal      10097
AbNormal     2055
Name: count, dtype: int64

In [18]:
# Confusion matrix with an explicit label order, so the layout never depends on
# which labels happen to appear: rows are the true class, columns the predicted
# class, both ordered AbNormal then Normal.
cm = confusion_matrix(val_y, final_predictions, labels=["AbNormal", "Normal"])

print(f"F1 스코어: {val_f1_score}")
print(f"혼동 행렬:\n{cm}")

F1 스코어: 0.19492753623188405
혼동 행렬:
[[ 269  436]
 [1786 9661]]


## SVM

In [40]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

# Labels come from the per-equipment frames that still contain 'target'.
y_model1 = df_model1['target']
y_model2 = df_model2['target']

# Features are the processed frames (scaled numerical + categorical columns, no target).
X_model1 = df_model1_processed
X_model2 = df_model2_processed

# Both equipment models currently share one configuration; give model2 its own
# dict if the two lines should be tuned separately.
# NOTE(review): scikit-learn documents `gamma` as a coefficient for the rbf/poly/sigmoid
# kernels, so it presumably has no effect with kernel='linear' — confirm.
svm_params = dict(kernel='linear', C=0.5, gamma='auto', random_state=110)

# fit() returns the estimator itself, so construction and training can be chained.
model1 = SVC(**svm_params).fit(X_model1, y_model1)
model2 = SVC(**svm_params).fit(X_model2, y_model2)
model2

In [41]:
from sklearn.metrics import accuracy_score, f1_score

# NOTE(review): this cell repeats the validation preprocessing from the
# RandomForest section; it is kept so the SVM section can be run on its own.

# Rule-based screening of the validation split. Each rule is written as
# "not all three stages agree".
condition_receip = ~(
    (df_val['Receip No Collect Result_Dam'] == df_val['Receip No Collect Result_Fill1']) &
    (df_val['Receip No Collect Result_Dam'] == df_val['Receip No Collect Result_Fill2']) &
    (df_val['Receip No Collect Result_Fill1'] == df_val['Receip No Collect Result_Fill2'])
)

condition_production_qty = ~(
    (df_val['Production Qty Collect Result_Dam'] == df_val['Production Qty Collect Result_Fill1']) &
    (df_val['Production Qty Collect Result_Dam'] == df_val['Production Qty Collect Result_Fill2']) &
    (df_val['Production Qty Collect Result_Fill1'] == df_val['Production Qty Collect Result_Fill2'])
)

condition_palletid = ~(
    (df_val['PalletID Collect Result_Dam'] == df_val['PalletID Collect Result_Fill1']) &
    (df_val['PalletID Collect Result_Dam'] == df_val['PalletID Collect Result_Fill2']) &
    (df_val['PalletID Collect Result_Fill1'] == df_val['PalletID Collect Result_Fill2'])
)

# Flag rows whose three equipment codes are neither all 0 nor all 1.
val_equip1 = (df_val[categorical_features] == 0).all(axis=1)
val_equip2 = (df_val[categorical_features] == 1).all(axis=1)
condition_equip_different = ~(val_equip1 | val_equip2)

# Any single rule is enough to treat the row as defective.
condition_abnormal = condition_receip | condition_production_qty | condition_palletid | condition_equip_different

# Separate rule-flagged rows from the rows that go to the models.
val_abnormal = df_val.loc[condition_abnormal]   # rule-flagged rows
val_filtered = df_val.loc[~condition_abnormal]  # rows that passed every rule

# One validation frame per equipment line.
val_model1 = val_filtered.loc[(val_filtered[categorical_features] == 0).all(axis=1)]
val_model2 = val_filtered.loc[(val_filtered[categorical_features] == 1).all(axis=1)]

# Equipment #1: select the numerical training features, scale them with the
# scaler fitted on the training split, then append the unscaled categorical columns.
val_model1_num = val_model1[numerical_features1]
val_model1_scaled = scaler1.transform(val_model1_num)
val_model1_processed = pd.concat(
    [
        pd.DataFrame(val_model1_scaled, columns=numerical_features1),
        # reset_index so both parts share the same positional index
        val_model1[categorical_columns].reset_index(drop=True),
    ],
    axis=1,
)

# Equipment #2: same procedure with its own feature list and scaler.
val_model2_num = val_model2[numerical_features2]
val_model2_scaled = scaler2.transform(val_model2_num)
val_model2_processed = pd.concat(
    [
        pd.DataFrame(val_model2_scaled, columns=numerical_features2),
        val_model2[categorical_columns].reset_index(drop=True),
    ],
    axis=1,
)

In [42]:
# Ground-truth labels for every validation row, rule-flagged rows included.
val_y = df_val["target"]

# Per-equipment predictions, in the row order of val_model1 / val_model2.
val_model1_predictions = model1.predict(val_model1_processed)
val_model2_predictions = model2.predict(val_model2_processed)

# Index labels of each validation subset. The rule-flagged index is taken from
# val_abnormal rather than by re-applying the shared `condition_abnormal` mask,
# which several other cells overwrite.
val_abnormal_idx = val_abnormal.index
val_model1_index = val_model1.index
val_model2_index = val_model2.index

# Combine the results by index label instead of looping row by row:
# every row starts as 'AbNormal' (covers rule-flagged rows and any row that
# belongs to neither model), then the model rows are overwritten.
val_pred = pd.Series('AbNormal', index=df_val.index, dtype=object)
val_pred.loc[val_model1_index] = val_model1_predictions
val_pred.loc[val_model2_index] = val_model2_predictions
final_predictions = val_pred.tolist()  # list in df_val row order, as before

# Accuracy and F1 (AbNormal is the positive class).
val_accuracy = accuracy_score(val_y, final_predictions)
val_f1_score = f1_score(val_y, final_predictions, pos_label="AbNormal")

# Report the scores.
print(f"검증 데이터 정확도: {val_accuracy:.4f}")
print(f"검증 데이터 F1 스코어: {val_f1_score:.4f}")

검증 데이터 정확도: 0.9456
검증 데이터 F1 스코어: 0.1314


In [30]:
# Attach the combined predictions as a 'pred' column. assign() returns a new
# frame, so the split returned by train_test_split is not modified in place
# (in-place column assignment on it can trigger pandas' SettingWithCopyWarning).
df_val = df_val.assign(pred=final_predictions)
df_val['pred'].value_counts()

pred
Normal      12093
AbNormal       59
Name: count, dtype: int64

In [31]:
# Confusion matrix with an explicit label order, so the layout never depends on
# which labels happen to appear: rows are the true class, columns the predicted
# class, both ordered AbNormal then Normal.
cm = confusion_matrix(val_y, final_predictions, labels=["AbNormal", "Normal"])

print(f"F1 스코어: {val_f1_score}")
print(f"혼동 행렬:\n{cm}")

F1 스코어: 0.13612565445026178
혼동 행렬:
[[   52   653]
 [    7 11440]]


## 테스트

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

# Columns removed before modelling (same list as for the training data).
columns_to_drop = [
    'Model.Suffix', 'Chamber Temp. Judge Value_AutoClave']

# Load the cleaned test set and drop the unused columns in one step.
test_data = pd.read_csv('./data/test_cleaned.csv').drop(columns=columns_to_drop)

# Encode the equipment columns with the encoder fitted on the training data.
test_data[categorical_features] = encoder.transform(test_data[categorical_features])

In [None]:
# Rule-based screening of the test set. Each rule is written as
# "not all three stages agree".
condition_receip = ~(
    (test_data['Receip No Collect Result_Dam'] == test_data['Receip No Collect Result_Fill1']) &
    (test_data['Receip No Collect Result_Dam'] == test_data['Receip No Collect Result_Fill2']) &
    (test_data['Receip No Collect Result_Fill1'] == test_data['Receip No Collect Result_Fill2'])
)

condition_production_qty = ~(
    (test_data['Production Qty Collect Result_Dam'] == test_data['Production Qty Collect Result_Fill1']) &
    (test_data['Production Qty Collect Result_Dam'] == test_data['Production Qty Collect Result_Fill2']) &
    (test_data['Production Qty Collect Result_Fill1'] == test_data['Production Qty Collect Result_Fill2'])
)

condition_palletid = ~(
    (test_data['PalletID Collect Result_Dam'] == test_data['PalletID Collect Result_Fill1']) &
    (test_data['PalletID Collect Result_Dam'] == test_data['PalletID Collect Result_Fill2']) &
    (test_data['PalletID Collect Result_Fill1'] == test_data['PalletID Collect Result_Fill2'])
)

# Flag rows whose three equipment codes are neither all 0 nor all 1.
test_equip1 = (test_data[categorical_features] == 0).all(axis=1)
test_equip2 = (test_data[categorical_features] == 1).all(axis=1)
condition_equip_different = ~(test_equip1 | test_equip2)

# Any single rule is enough to treat the row as defective.
condition_abnormal = condition_receip | condition_production_qty | condition_palletid | condition_equip_different

# Separate rule-flagged rows from the rows that go to the models.
test_abnormal = test_data.loc[condition_abnormal]   # rule-flagged rows
test_filtered = test_data.loc[~condition_abnormal]  # rows that passed every rule

# One test frame per equipment line.
test_model1 = test_filtered.loc[(test_filtered[categorical_features] == 0).all(axis=1)]
test_model2 = test_filtered.loc[(test_filtered[categorical_features] == 1).all(axis=1)]

# Copies without the 'Set ID' column.
test_model1_for_prediction = test_model1.drop(columns='Set ID')
test_model2_for_prediction = test_model2.drop(columns='Set ID')

In [None]:
# 테스트 데이터 스케일링
test_model1_num = test_model1[numerical_features1]  # 범주9 + 타겟 제거
test_model2_num = test_model2[numerical_features2]

test_model1_scaled = scaler1.transform(test_model1_num)
test_model2_scaled = scaler2.transform(test_model2_num)

test_model1_processed = pd.DataFrame(test_model1_scaled, columns=numerical_features1)  # 타겟 제외 수치+범주
test_model1_processed[categorical_columns] = test_model1[categorical_columns].reset_index(drop=True)
# test_model1_processed = test_model1_processed.drop(columns=drop_column1, errors='ignore')  # drop_colunm 제거

test_model2_processed = pd.DataFrame(test_model2_scaled, columns=numerical_features2)
test_model2_processed[categorical_columns] = test_model2[categorical_columns].reset_index(drop=True)
# test_model2_processed =  test_model2_processed.drop(columns=drop_column2, errors='ignore')

In [None]:
# Per-equipment predictions, in the row order of test_model1 / test_model2.
test_model1_predictions = model1.predict(test_model1_processed)
test_model2_predictions = model2.predict(test_model2_processed)

# Index labels of the rows each model predicted.
test_model1_index = test_model1.index
test_model2_index = test_model2.index

# Combine the results by index label instead of looping row by row:
# every row starts as 'AbNormal' (the answer for rule-flagged rows), then the
# rows handled by a model are overwritten with that model's prediction.
test_pred = pd.Series('AbNormal', index=test_data.index, dtype=object)
test_pred.loc[test_model1_index] = test_model1_predictions
test_pred.loc[test_model2_index] = test_model2_predictions
final_predictions = test_pred.tolist()  # list in test_data row order, as before

In [None]:
# Load the submission template and set its 'target' column to the combined predictions.
# NOTE(review): assumes the rows of submission.csv are in the same order as test_data — TODO confirm.
df_sub = pd.read_csv("./data/submission.csv").assign(target=final_predictions)

# Writing the file back is currently commented out.
# df_sub.to_csv("./data/submission.csv", index=False)