<a href="https://colab.research.google.com/github/nxmyxxn/LG_Aimers_5th/blob/main/code_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ! pip install catboost

In [None]:
import os
import random
from typing import List, Optional
from collections import Counter

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import faiss
from scipy.stats import mode
import category_encoders as ce

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import f1_score
from catboost import Pool,CatBoostClassifier


# Switch between a train/valid split under data/processed and the full
# training file under data/.
DO_VALID = False

# All data paths are resolved against the current working directory.
ROOT_PATH=os.path.abspath('.')

if DO_VALID:
    TRAIN_PATH = os.path.join(ROOT_PATH, 'data', 'processed', 'train.csv')
    VALID_PATH = os.path.join(ROOT_PATH, 'data', 'processed', 'valid.csv')
else:
    TRAIN_PATH = os.path.join(ROOT_PATH, 'data', 'train.csv')
    VALID_PATH = None  # no validation file when DO_VALID is off
TEST_PATH = os.path.join(ROOT_PATH, 'data', 'test.csv')

# Echo the resolved paths.
print(TRAIN_PATH)
print(TEST_PATH)
print(VALID_PATH)

  from .autonotebook import tqdm as notebook_tqdm


/home/elicer/data/train.csv
/home/elicer/data/test.csv
None


# Model1: Vector Similarity

In [None]:
# Seed used by the SMOTE oversampler in PreprocessorForVectorSimilarity.
RANDOM_STATE=110

In [None]:
# Load the tables; the validation table is read only when DO_VALID is on.
train = pd.read_csv(TRAIN_PATH)
if DO_VALID:
    valid = pd.read_csv(VALID_PATH)
test = pd.read_csv(TEST_PATH)

In [None]:
def label2id(y):
    """Map 'Normal'/'AbNormal' labels to 0/1 and return them as a Series.

    Values outside the mapping become NaN (behaviour of ``Series.map``).
    """
    label_to_id = {'Normal':0, 'AbNormal':1}
    labels = pd.Series(y)
    return labels.map(label_to_id)

def id2label(y):
    """Map 0/1 ids back to 'Normal'/'AbNormal' and return them as a Series.

    Values outside the mapping become NaN (behaviour of ``Series.map``).
    """
    id_to_label = {0:'Normal', 1:'AbNormal'}
    ids = pd.Series(y)
    return ids.map(id_to_label)

## 1.1 전처리

In [None]:
class BasicPreprocessor:
    """Column-level clean-up applied to the raw tables before modelling.

    Calling an instance always normalises the column names and then runs the
    optional steps selected by the keyword flags of ``__call__``.
    """

    def __call__(self, df,
                 ok2nan=False,
                 drop_unique_cols=False,
                 drop_duplicated_cols=False):
        """Clean ``df`` and return the result.

        Args:
            df: Table to clean. Its column labels are rewritten in place.
            ok2nan: Replace the literal "OK" entries of the three
                HEAD_NORMAL_COORDINATE_X_AXIS(Stage1) columns with NaN and
                cast those columns to float.
            drop_unique_cols: Drop columns holding at most one distinct
                non-null value.
            drop_duplicated_cols: Collapse every column whose name contains
                'Model_Suffix' (and likewise 'Workorder') into one column
                named exactly that.

        Returns:
            The cleaned DataFrame.
        """
        # Replace dots (.) and spaces with underscores (_).
        # regex=False is required: with regex matching, '.' is a wildcard and
        # every character of every column name would be replaced.
        df.columns = df.columns.str.replace('.', '_', regex=False)
        df.columns = df.columns.str.replace(' ', '_', regex=False)

        # Replace the erroneous "OK" entries with np.nan.
        if ok2nan:
            df = self.replace_ok_to_nan(df)

        # Drop columns that carry no information (at most one distinct value).
        if drop_unique_cols:
            unique_cols = self.find_unique_columns(df)
            df = df.drop(columns=unique_cols)

        # Collapse the per-process copies of Model_Suffix / Workorder.
        if drop_duplicated_cols:
            df = self.drop_duplicated_features( df, 'Model_Suffix')
            df = self.drop_duplicated_features( df, 'Workorder')

        return df

    def replace_ok_to_nan(self,df):
        """Turn "OK" into NaN in the three X-axis columns and cast them to float.

        Columns that are absent from ``df`` are skipped. The distinct values of
        each converted column are printed. ``df`` is modified in place and
        returned.
        """
        cols = ["HEAD_NORMAL_COORDINATE_X_AXIS(Stage1)_Collect_Result_Dam",
                "HEAD_NORMAL_COORDINATE_X_AXIS(Stage1)_Collect_Result_Fill1",
                "HEAD_NORMAL_COORDINATE_X_AXIS(Stage1)_Collect_Result_Fill2"]
        for col in cols:
            if col in df.columns:
                df.loc[df[col] == "OK", col] = np.nan
                df[col] = df[col].astype(float)
                print('After replacement:' , df[col].unique())
        return df


    def find_unique_columns(self, df):
        """Return the columns that hold at most one distinct non-null value."""
        return [
            column
            for column in df.columns
            if len(df[column].dropna().unique()) <= 1
        ]


    def drop_duplicated_features(self, df, core_name:str):
        """Keep one column containing ``core_name`` and rename it to ``core_name``.

        The last matching column (in column order) is the one kept; all other
        matches are dropped. ``df`` is returned unchanged when nothing matches.
        """
        duplicated = [feature for feature in df.columns if core_name in feature]
        if duplicated:
            df = df.rename(columns={duplicated.pop():core_name})
            df = df.drop(duplicated, axis=1)
        return df

In [None]:
preprocessor = BasicPreprocessor()
# Each table is cleaned on its own, so the constant-column drop is decided per
# table. NOTE(review): train and test can therefore end up with different
# column sets — confirm they match before fitting.
test = preprocessor(test, ok2nan=True, drop_unique_cols=True, drop_duplicated_cols=True)
train = preprocessor(train, ok2nan = True,  drop_unique_cols=True, drop_duplicated_cols=True)
if DO_VALID:
    # drop_unique_cols is left off for the validation table.
    valid = preprocessor(valid, ok2nan=True, drop_duplicated_cols=True)

After replacement: [162.4   nan 549.5 549.  550.3 550.  548.5]
After replacement: [837.7   nan 838.4 837.9 838.2 837.5]
After replacement: [305.    nan 835.5]
After replacement: [  nan 550.3 162.4 549.  549.5 550.  548.5]
After replacement: [  nan 838.4 837.7 837.9 838.2 837.5]
After replacement: [  nan 835.5 305. ]


In [None]:

class PreprocessorForVectorSimilarity(BasicPreprocessor):
    """Turn a cleaned table into L2-normalised PCA vectors, one set per view.

    The call with ``is_train=True`` fits all state (fill values, Workorder
    statistics, label encoders, scalers, variance selectors, PCAs) and stores
    it on the instance; calls with ``is_train=False`` re-apply that state.
    """

    # PCA dimensionality of each view, in the order the views are produced.
    _N_COMPONENTS = {'All': 30, 'Dam': 20, 'Fill1': 20, 'Fill2': 20, 'AutoClave': 13}

    def __init__(self, cols_to_label_enc:Optional[List[str]]=None):
        """Create an unfitted preprocessor.

        Args:
            cols_to_label_enc: Columns to label-encode; defaults to the three
                equipment columns plus the AutoClave chamber-temperature
                judge value.
        """
        self.mode_dict= None        # column -> fill value learned on train
        self.workorder_agg = None   # per-Workorder statistics learned on train
        self.label_encoders = {}    # column -> fitted LabelEncoder
        self.scalers = {}           # view name -> fitted StandardScaler
        self.selectors = {}         # view name -> fitted VarianceThreshold
        self.pcas = {}              # view name -> fitted PCA
        self.is_train=None

        if cols_to_label_enc is not None:
            self.cols_to_label_enc = cols_to_label_enc
        else:
            self.cols_to_label_enc = ['Equipment_Dam', 'Chamber_Temp__Judge_Value_AutoClave', 'Equipment_Fill1', 'Equipment_Fill2']

    def __call__(self, df, is_train):
        """Preprocess ``df`` and return ``{view: (vectors, y)}``.

        Args:
            df: Cleaned table; must contain 'Workorder' and the columns in
                ``cols_to_label_enc``. It is modified in place by the
                fill/encode steps.
            is_train: Fit state on this table (True) or reuse fitted state.

        Returns:
            Dict keyed by 'All', 'Dam', 'Fill1', 'Fill2', 'AutoClave'. Each
            value is ``(vectors, y)``; ``y`` is the (oversampled, on train)
            'target' column, or None when ``df`` has no 'target' column.
        """
        self.is_train=is_train
        df = self._fill_na_with_mode(df)
        df = self._add_workorder_stat(df)
        df = self._label_encoding(df)

        X = df.select_dtypes(exclude='object')
        y = df['target'] if (('target' in df.columns) or is_train) else None

        return {
            name: self._preprocess_for_subprocess(X, y, name, n_components=n_components)
            for name, n_components in self._N_COMPONENTS.items()
        }

    def _fill_na_with_mode(self, df):
        """Fill missing values with each column's most frequent training value.

        On train, the fill value of every column that has missing data is
        learned and stored in ``self.mode_dict``; otherwise the stored values
        are applied. Columns with no observed value at all are left untouched.
        """
        if self.is_train:
            fill_values = {}
            for col in df.columns[df.isnull().any()]:
                counts = df[col].value_counts().sort_values(ascending=False)
                if counts.empty:
                    # Entirely missing column: there is no mode to fill with.
                    continue
                # Named fill_value (not `mode`) to avoid shadowing scipy's mode.
                fill_value = counts.index[0]
                df[col] = df[col].fillna(value=fill_value)
                fill_values[col] = fill_value

            # Remember column names and their fill values for later calls.
            self.mode_dict = fill_values
        else:
            if self.mode_dict is None:
                raise RuntimeError('call the preprocessor with is_train=True first')
            for col, fill_value in self.mode_dict.items():
                if col in df.columns:
                    df[col] = df[col].fillna(value=fill_value)
        return df

    def _add_workorder_stat(self, df):
        """Join per-Workorder std/mean of the numeric columns onto ``df``.

        New columns are named ``WO_<column>_<stat>``. Statistics are computed
        on train only; any statistic column containing NaN is discarded.
        Remaining NaN after the left join (e.g. unseen Workorders) become 0.
        """
        if self.is_train:
            numeric = df.select_dtypes(exclude='object').columns.tolist()
            workorder_agg = df.groupby('Workorder')[numeric].agg(['std', 'mean'])

            # Name each column from its own (column, stat) pair rather than
            # from MultiIndex.levels, whose order need not follow the columns.
            workorder_agg.columns = [f'WO_{var}_{stat}' for var, stat in workorder_agg.columns]
            workorder_agg = workorder_agg.dropna(axis=1).reset_index()

            self.workorder_agg = workorder_agg

        if self.workorder_agg is None:
            raise RuntimeError('call the preprocessor with is_train=True first')
        df = df.merge(self.workorder_agg, on='Workorder', how='left').fillna(value=0)
        return df

    def _label_encoding(self, df):
        """Add ``<col>_encoded`` integer columns for ``cols_to_label_enc``.

        Encoders are fitted on train and reused otherwise.
        """
        for col in self.cols_to_label_enc:
            new_col = f'{col}_encoded'
            if self.is_train:
                self.label_encoders[col] = LabelEncoder()
                df[new_col] = self.label_encoders[col].fit_transform(df[col])
            else:
                df[new_col] = self.label_encoders[col].transform(df[col])
        return df

    def _preprocess_for_subprocess(self,
                                  X,
                                  y,
                                  subprocess:str,
                                  n_components:int
                                  ):
        """Build the vectors of one view.

        Pipeline: column selection -> (train only) SMOTE -> standard scaling
        -> variance threshold -> PCA -> L2 normalisation.

        Returns:
            ``(vectors, y)`` where ``y`` is oversampled on train and passed
            through unchanged otherwise.
        """
        X = self._get_process_df(subprocess, X)
        if self.is_train:
            X, y = self._oversampling(X, y)

            scaler = self.scalers[subprocess] = StandardScaler()
            selector = self.selectors[subprocess] = VarianceThreshold(threshold=0.05)
            pca = self.pcas[subprocess] = PCA(n_components=n_components)

            vectors = scaler.fit_transform(X)
            vectors = selector.fit_transform(vectors)
            vectors = pca.fit_transform(vectors)
        else:
            vectors = self.scalers[subprocess].transform(X)
            vectors = self.selectors[subprocess].transform(vectors)
            vectors = self.pcas[subprocess].transform(vectors)

        vectors = self._preprocess_for_vector_db(vectors)

        return vectors, y

    def _get_process_df(self, subprocess, X):
        """Return all of ``X`` for 'All', else the columns whose name contains ``subprocess``."""
        if subprocess=='All':
            return X
        cols = [col for col in X.columns if subprocess in col]
        return X.loc[:, cols]

    def _oversampling(self, X, y):
        """Balance the classes with SMOTE and return ``(X, y)`` with string labels."""
        y = y.map({'Normal':0, 'AbNormal':1})
        smote = SMOTE(random_state=RANDOM_STATE)
        X, y = smote.fit_resample(X, y)

        print(f"  Total: Normal: {(y == 0).sum()}, AbNormal: {(y == 1).sum()}")
        y = y.map({0:'Normal', 1:'AbNormal'})
        return X, y


    def _preprocess_for_vector_db(self, vectors):
        """Return ``vectors`` as a contiguous float32 array with L2-normalised rows."""
        vectors = np.array(vectors, dtype=np.float32)
        vectors = np.ascontiguousarray(vectors)
        faiss.normalize_L2(vectors)  # normalises in place
        return vectors



In [None]:
vector_preprocessor = PreprocessorForVectorSimilarity()
# The is_train=True call fits the preprocessor; the later calls reuse that state.
train_dict = vector_preprocessor(train, is_train=True)
if DO_VALID:
    valid_dict = vector_preprocessor(valid, is_train=False)
test_dict = vector_preprocessor(test, is_train=False)


  Total: Normal: 38156, AbNormal: 38156
  Total: Normal: 38156, AbNormal: 38156
  Total: Normal: 38156, AbNormal: 38156
  Total: Normal: 38156, AbNormal: 38156
  Total: Normal: 38156, AbNormal: 38156


## 1.2 Model

In [None]:
class VectorSimilaritySubModel():
    """Nearest-neighbour voter backed by a FAISS ``IndexFlatIP`` index."""

    def __init__(self, train_vectors, train_y):
        """Index ``train_vectors`` and keep ``train_y`` for label look-ups."""
        self.train_y = train_y
        dim = train_vectors.shape[1]
        self.index = faiss.IndexFlatIP(dim)
        self.index.add(train_vectors)

    def predict(self, vectors, k = 7, th = 6):
        """Label each query vector from its ``k`` nearest training vectors.

        A query is 'AbNormal' when strictly more than ``th`` of its ``k``
        neighbours are 'AbNormal', otherwise 'Normal'.

        Returns:
            List of label strings, one per row of ``vectors``.
        """
        def vote(vector):
            # The index is queried with one row at a time.
            query = vector.reshape(1, -1)
            _, neighbour_ids = self.index.search(query, k)
            neighbour_labels = self.train_y.iloc[neighbour_ids.flatten()].tolist()
            abnormal_votes = neighbour_labels.count('AbNormal')
            return 'AbNormal' if abnormal_votes > th else 'Normal'

        return [vote(vector) for vector in vectors]

class VectorSimilarityModel():
    """Majority vote over five nearest-neighbour sub-models, one per view."""

    def __init__(self, train_dict):
        """Build one sub-model per view.

        Args:
            train_dict: ``{view: (vectors, y)}`` with the keys 'All', 'Dam',
                'Fill1', 'Fill2' and 'AutoClave'.
        """
        self.model_all = VectorSimilaritySubModel(*train_dict['All'])
        self.model_dam = VectorSimilaritySubModel(*train_dict['Dam'])
        self.model_fl1 = VectorSimilaritySubModel(*train_dict['Fill1'])
        self.model_fl2 = VectorSimilaritySubModel(*train_dict['Fill2'])
        self.model_ac = VectorSimilaritySubModel(*train_dict['AutoClave'])

    def predict(self, vector_dict):
        """Return the per-row majority vote of the five sub-models.

        Args:
            vector_dict: ``{view: (vectors, y)}`` with the same keys as the
                training dict; only the vectors are read.

        Returns:
            1-D array of 0 (Normal) / 1 (AbNormal), one entry per row.
        """
        # Each view uses its own neighbour count k and vote threshold th.
        y_pred = self.model_all.predict(vector_dict['All'][0])
        y_pred_dam = self.model_dam.predict(vector_dict['Dam'][0], k=5, th=4)
        y_pred_fl1 = self.model_fl1.predict(vector_dict['Fill1'][0], k=10, th=7)
        y_pred_fl2 = self.model_fl2.predict(vector_dict['Fill2'][0], k=15, th=7)
        y_pred_ac = self.model_ac.predict(vector_dict['AutoClave'][0], k=15, th=11)

        predictions = np.array([
            label2id(y_pred),
            label2id(y_pred_dam),
            label2id(y_pred_fl1),
            label2id(y_pred_fl2),
            label2id(y_pred_ac)
        ])

        # Depending on the SciPy version, mode(...)[0] is shaped (n,) or
        # (1, n); ravel gives a 1-D result either way.
        return np.ravel(mode(predictions, axis=0)[0])



In [None]:
# Build the five sub-model indexes from the training vectors.
model = VectorSimilarityModel(train_dict)

## 1.3 Validation

In [None]:
if DO_VALID:
    # Score the ensemble on the validation table; 'AbNormal' is the positive class.
    y_pred = model.predict(valid_dict)
    y_pred = id2label(y_pred)
    y_valid = valid_dict['All'][1]  # labels returned alongside the 'All' vectors
    f1 = f1_score(y_valid, y_pred, pos_label='AbNormal')
    print(f1)

In [None]:
# y_valid.to_csv('val_vector_similarity.csv')

## 1.4 Test

In [None]:
# Predict the test table and map the 0/1 votes back to label strings.
test_pred  = model.predict(test_dict)
test_pred = id2label(test_pred)

In [None]:
# Load the submission template and overwrite its target column.
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred

# Save the submission file.
df_sub.to_csv("submission_1.csv", index=False)

df_sub

# Class balance of the predictions.
df_sub['target'].value_counts()

target
Normal      16869
AbNormal      492
Name: count, dtype: int64

# Model2: RF+CatBoost and Isolation Forest

## 2.1 Read

In [None]:
RANDOM_SEED = 42
# Model 2 turns validation on and points at the files under data/processed.
DO_VALID = True
TRAIN_PATH = os.path.join(ROOT_PATH, 'data', 'processed', 'train.csv')
VALID_PATH = os.path.join(ROOT_PATH, 'data', 'processed', 'valid.csv')

In [None]:
train_path = TRAIN_PATH
if DO_VALID:
    valid_path = VALID_PATH

# Columns that can be handed to read_csv through its drop_cols argument.
drop_cols = ["HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam",
            "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1",
            "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2",
            ]

def read_csv(path, drop_cols = None):
    """Load a CSV and split it into features and the ``target`` column.

    Args:
        path: CSV file to read; it must contain a "target" column.
        drop_cols: Optional column names to remove from the features.

    Returns:
        ``(features, labels)``: ``features`` is every column except "target"
        (and except ``drop_cols``); ``labels`` is a one-column DataFrame
        holding "target".
    """
    data = pd.read_csv(path)
    data = data.sort_index().reset_index(drop=True)
    labels = data[["target"]]

    features = data.drop("target", axis=1)
    if drop_cols is not None:
        # Drop from `features`, not `data`, so "target" stays out of the features.
        features = features.drop(drop_cols, axis=1)
    return features, labels

# drop_cols is not passed (the argument is commented out), so all columns are kept.
X_train_raw, y_train_raw = read_csv(TRAIN_PATH) #, drop_cols)
if DO_VALID:
    X_valid_raw, y_valid_raw = read_csv(VALID_PATH) #, drop_cols)

In [None]:
# Show the class balance of each split.
print("Train Set:\n", y_train_raw.value_counts())
print()
if DO_VALID:
    print("Valid Set:\n", y_valid_raw.value_counts())

Train Set:
 target  
Normal      31156
AbNormal     2000
Name: count, dtype: int64

Valid Set:
 target  
Normal      7000
AbNormal     350
Name: count, dtype: int64


## 2.2 결측치 처리

In [None]:
# Total number of missing cells in the training features.
print(X_train_raw.isnull().sum().sum())

59028


In [None]:
def find_unique_columns(df):
    """Return the names of columns holding at most one distinct non-null value."""
    return [
        name
        for name in df.columns
        if len(df[name].dropna().unique()) <= 1
    ]
# Drop the columns that hold at most one distinct non-null value.

unique_cols = find_unique_columns(X_train_raw)
X_train_raw = X_train_raw.drop(columns=unique_cols)

In [None]:
def fill_na_with_mean(df):
    """Fill the missing values of every column with that column's mean.

    ``df`` is modified in place; nothing is returned. Columns without missing
    values are left untouched.
    """
    for column in df.columns:
        if df[column].isnull().any():
            mean_value = df[column].mean()
            # Assign the result back instead of calling fillna(inplace=True)
            # on the df[column] selection: under pandas Copy-on-Write that
            # selection is a temporary and the fill would never reach df.
            df[column] = df[column].fillna(mean_value)

def fill_na_with_knn(df, is_test=False, imputers=None):
    """Impute the X-axis head coordinate of each process with a KNN imputer.

    For each of 'Dam', 'Fill1' and 'Fill2', the X/Y/Z
    "HEAD NORMAL COORDINATE ... AXIS(Stage1) Collect Result_<process>"
    columns are fed to a ``KNNImputer(n_neighbors=5)`` and only the X column
    is written back to ``df`` (in place).

    Args:
        df: Table holding the nine coordinate columns.
        is_test: Reuse fitted imputers (True) or fit new ones (False).
        imputers: Dict mapping each X column name to its fitted imputer.
            Required when ``is_test`` is True; when fitting, new imputers are
            added to it (a new dict is created if omitted).

    Returns:
        The ``imputers`` dict.

    Raises:
        ValueError: ``is_test`` is True but no fitted imputers were supplied.
    """
    if is_test and not imputers:
        raise ValueError("is_test=True requires the imputers returned by a fitting call")
    if imputers is None:
        imputers = {}
    base = "HEAD NORMAL COORDINATE "
    stage = " AXIS(Stage1) Collect Result_"
    for process in ['Dam', 'Fill1', 'Fill2']:
        relevant_columns = [base + axis + stage + process for axis in ['X', 'Y', 'Z']]

        # Only the X column is replaced; Y and Z serve as neighbour features.
        target_col = base+'X'+stage+process
        df_subset = df[relevant_columns].copy()

        if is_test:
            imputer = imputers[target_col]
            imputed_values = imputer.transform(df_subset)
        else:
            imputer = KNNImputer(n_neighbors=5)
            imputed_values = imputer.fit_transform(df_subset)
            imputers[target_col] = imputer
        df[target_col] = imputed_values[:, 0]
    return imputers

# Mean imputation is disabled here; KNN imputation is used instead.
# fill_na_with_mean(X_train_raw)
# fill_na_with_mean(X_test_raw)

# Fit the imputers on train, then reuse them for the validation table.
im = fill_na_with_knn(X_train_raw)
if DO_VALID:
    _ = fill_na_with_knn(X_valid_raw, is_test=True, imputers=im)

In [None]:
# Re-count the missing cells after imputation.
print(X_train_raw.isnull().sum().sum())

0


## 2.3 전처리

In [None]:
def cb_preprocessing(x, y=None):
    """Prepare inputs for CatBoost.

    Returns:
        ``(x, y, categorical_columns)``: ``x`` unchanged, ``y`` with
        'AbNormal'/'Normal' replaced by 1/0 (or None when not given), and the
        index of the non-numeric column names of ``x``.
    """
    encoded_y = None if y is None else y.replace({'AbNormal': 1, 'Normal': 0})
    non_numeric = x.select_dtypes(exclude=[np.number]).columns
    return x, encoded_y, non_numeric

def rf_preprocessing(x, y=None, is_test=False, TargetEncoder=None):
    """Target-encode the non-numeric columns for the random-forest models.

    Args:
        x: Feature table.
        y: Labels; 'AbNormal'/'Normal' are replaced by 1/0. Required when
            fitting.
        is_test: Apply an already fitted encoder instead of fitting one.
        TargetEncoder: The fitted encoder; required when ``is_test`` is True.

    Returns:
        ``(x_encoded, y, encoder)`` where ``encoder`` is the newly fitted
        encoder, or None when ``is_test`` is True.
    """
    if y is not None:
        y = y.replace({'AbNormal': 1, 'Normal': 0})

    if is_test:
        assert TargetEncoder is not None
        return TargetEncoder.transform(x), y, None

    assert y is not None
    categorical_columns = x.select_dtypes(exclude=[np.number]).columns
    fitted_encoder = ce.TargetEncoder(cols=categorical_columns,
                                      handle_unknown='value',
                                      handle_missing='value')
    encoded = fitted_encoder.fit_transform(x, y)
    return encoded, y, fitted_encoder

# CatBoost inputs: features are passed through, labels become 0/1, and `cat`
# holds the non-numeric column names.
cb_X_train_raw, cb_y_train_raw, cat = cb_preprocessing(X_train_raw, y_train_raw)
if DO_VALID:
    cb_X_valid_raw, cb_y_valid_raw, _ = cb_preprocessing(X_valid_raw, y_valid_raw)

# Random-forest inputs: the target encoder `te` is fitted on train and reused
# for the validation table.
rf_X_train_raw, rf_y_train_raw, te = rf_preprocessing(X_train_raw, y_train_raw, is_test=False)
if DO_VALID:
    rf_X_valid_raw, rf_y_valid_raw, _ = rf_preprocessing(X_valid_raw, y_valid_raw, is_test=True, TargetEncoder=te)

In [None]:
# Preview the CatBoost features side by side with the encoded labels.
pd.concat([cb_X_train_raw,cb_y_train_raw], axis=1).head()

Unnamed: 0,Equipment_Dam,Model_Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,Dam dispenser #2,AJX75334501,4C1XH186-2,1000.0,12.5,90,85,280,90,16,...,50.0,91.8,270,50,114.612,20.0,11,125,1,0
1,Dam dispenser #2,AJX75334501,3H1X7976-1,1000.0,12.5,90,70,280,90,10,...,91.8,270.0,50,85,19.9,9.0,173,1,0,0
2,Dam dispenser #1,AJX75334501,3G1X8303-1,240.0,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,19.1,7.0,176,1,0,0
3,Dam dispenser #1,AJX75334501,3F1X9648-1,240.0,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,18.9,5.0,68,1,0,0
4,Dam dispenser #2,AJX75334501,3M1XC491-1,1000.0,12.5,90,70,280,90,16,...,50.0,91.8,270,50,85.0,19.7,14,424,1,0


## 2.4 모델 정의

In [None]:
cat_features = cat.tolist()
# Group the feature columns by process, using the text after the last underscore.
dam_columns = [col for col in cb_X_train_raw.columns if col.split('_')[-1] in ["Dam"]]
fill1_columns = [col for col in cb_X_train_raw.columns if col.split('_')[-1] in ["Fill1"]]
fill2_columns = [col for col in cb_X_train_raw.columns if col.split('_')[-1] in ["Fill2"]]
autoclave_columns = [col for col in cb_X_train_raw.columns if col.split('_')[-1] in ["AutoClave"]]

# CatBoost settings shared by the per-process models; `iterations` is supplied
# separately for each model.
params = {
#     'iterations': 3000,                # at most 3000 iterations
    'learning_rate': 0.05,             # learning rate
    'depth': 6,                        # tree depth (mid-range value to limit model complexity)
    'l2_leaf_reg': 3,                  # L2 regularisation (controls model complexity)
    'one_hot_max_size': 10,            # max category count that is one-hot encoded
    'random_seed': RANDOM_SEED,        # random seed for reproducibility
    'task_type': "CPU",                # run on CPU (can be switched to GPU)
    'loss_function': 'Logloss',        # log loss for binary classification
    'eval_metric': "F1",               # evaluation metric: F1 score
    'auto_class_weights': 'Balanced',  # automatic class weights for the imbalanced data
    'early_stopping_rounds': 500,      # patience for early stopping
    'verbose': 100                     # log every 100 iterations
}

## 2.5 학습

In [None]:
# Per-process feature views. Names without a leading underscore come from the
# target-encoded (random-forest) table; names with a leading underscore come
# from the CatBoost table.
X_Dam = rf_X_train_raw[dam_columns]
X_Fill1 = rf_X_train_raw[fill1_columns]
X_Fill2 = rf_X_train_raw[fill2_columns]
X_Auto = rf_X_train_raw[autoclave_columns]
_X_Dam = cb_X_train_raw[dam_columns]
_X_Fill1 = cb_X_train_raw[fill1_columns]
_X_Fill2 = cb_X_train_raw[fill2_columns]
_X_Auto = cb_X_train_raw[autoclave_columns]

if DO_VALID:
    X_val_Dam = rf_X_valid_raw[dam_columns]
    X_val_Fill1 = rf_X_valid_raw[fill1_columns]
    X_val_Fill2 = rf_X_valid_raw[fill2_columns]
    X_val_Auto = rf_X_valid_raw[autoclave_columns]
    _X_val_Dam = cb_X_valid_raw[dam_columns]
    _X_val_Fill1 = cb_X_valid_raw[fill1_columns]
    _X_val_Fill2 = cb_X_valid_raw[fill2_columns]
    _X_val_Auto = cb_X_valid_raw[autoclave_columns]

# Dam: one random forest and one CatBoost model; the validation view is passed
# to CatBoost as its eval_set when available.
Dam_model1 = RandomForestClassifier(random_state=RANDOM_SEED)
Dam_model1.fit(X_Dam, rf_y_train_raw)
Dam_model2 = CatBoostClassifier(**params, cat_features= list(set(cat_features) & set(dam_columns)), iterations=215)

if DO_VALID:
    Dam_model2.fit(_X_Dam, cb_y_train_raw, eval_set = [(_X_val_Dam, cb_y_valid_raw)])
else:
    Dam_model2.fit(_X_Dam, cb_y_train_raw)

# Fill1: same pair of models.
Fill1_model1 = RandomForestClassifier(random_state=RANDOM_SEED)
Fill1_model1.fit(X_Fill1, rf_y_train_raw)
Fill1_model2 = CatBoostClassifier(**params, cat_features= list(set(cat_features) & set(fill1_columns)), iterations=161)

if DO_VALID:
    Fill1_model2.fit(_X_Fill1, cb_y_train_raw, eval_set = [(_X_val_Fill1, cb_y_valid_raw)])
else:
    Fill1_model2.fit(_X_Fill1, cb_y_train_raw)

# Fill2: same pair of models.
Fill2_model1 = RandomForestClassifier(random_state=RANDOM_SEED)
Fill2_model1.fit(X_Fill2, rf_y_train_raw)
Fill2_model2 = CatBoostClassifier(**params, cat_features= list(set(cat_features) & set(fill2_columns)), iterations=14)

if DO_VALID:
    Fill2_model2.fit(_X_Fill2, cb_y_train_raw, eval_set = [(_X_val_Fill2, cb_y_valid_raw)])
else:
    Fill2_model2.fit(_X_Fill2, cb_y_train_raw)

# AutoClave: random forest only (no CatBoost model is trained on _X_Auto here).
Auto_model = RandomForestClassifier(random_state=RANDOM_SEED)
Auto_model.fit(X_Auto, rf_y_train_raw)

  Dam_model1.fit(X_Dam, rf_y_train_raw)


0:	learn: 0.6156927	test: 0.5680721	best: 0.5680721 (0)	total: 33.8ms	remaining: 7.23s
100:	learn: 0.6264656	test: 0.5674946	best: 0.5831942 (20)	total: 1.89s	remaining: 2.13s
200:	learn: 0.6484820	test: 0.5800907	best: 0.5831942 (20)	total: 3.73s	remaining: 260ms
214:	learn: 0.6494000	test: 0.5865606	best: 0.5865606 (214)	total: 3.94s	remaining: 0us

bestTest = 0.5865605756
bestIteration = 214



  Fill1_model1.fit(X_Fill1, rf_y_train_raw)


0:	learn: 0.6070551	test: 0.5752161	best: 0.5752161 (0)	total: 27.8ms	remaining: 4.44s
100:	learn: 0.6266228	test: 0.5821430	best: 0.5851321 (15)	total: 1.2s	remaining: 711ms
160:	learn: 0.6405573	test: 0.5811095	best: 0.5851321 (15)	total: 1.86s	remaining: 0us

bestTest = 0.5851321095
bestIteration = 15

Shrink model to first 16 iterations.


  Fill2_model1.fit(X_Fill2, rf_y_train_raw)


0:	learn: 0.5508904	test: 0.5253529	best: 0.5253529 (0)	total: 24.8ms	remaining: 322ms
13:	learn: 0.6073907	test: 0.5716709	best: 0.5807360 (6)	total: 207ms	remaining: 0us

bestTest = 0.5807359968
bestIteration = 6

Shrink model to first 7 iterations.


  Auto_model.fit(X_Auto, rf_y_train_raw)


In [None]:
# Class probabilities of every base model on the rows they were trained on.
dam_proba1 = Dam_model1.predict_proba(X_Dam)
dam_proba2 = Dam_model2.predict_proba(_X_Dam)
fill1_proba1 = Fill1_model1.predict_proba(X_Fill1)
fill1_proba2 = Fill1_model2.predict_proba(_X_Fill1)
fill2_proba1 = Fill2_model1.predict_proba(X_Fill2)
fill2_proba2 = Fill2_model2.predict_proba(_X_Fill2)
auto_proba = Auto_model.predict_proba(X_Auto)

# Column order: the four random forests first, then the three CatBoost models.
# The validation and test cells stack their probabilities in the same order.
probability = pd.DataFrame(np.column_stack((dam_proba1, fill1_proba1, fill2_proba1, auto_proba,
                                           dam_proba2, fill1_proba2, fill2_proba2)))

# Create the Isolation Forest model.
Main_Model = IsolationForest(contamination=0.0275, random_state=RANDOM_SEED)
# Fit it on the stacked probabilities.
Main_Model.fit(probability)

## 2.6 Validation

In [None]:
if DO_VALID:
    # Base-model probabilities on the validation views.
    dam_proba1 = Dam_model1.predict_proba(X_val_Dam)
    fill1_proba1 = Fill1_model1.predict_proba(X_val_Fill1)
    fill2_proba1 = Fill2_model1.predict_proba(X_val_Fill2)

    dam_proba2 = Dam_model2.predict_proba(_X_val_Dam)
    fill1_proba2 = Fill1_model2.predict_proba(_X_val_Fill1)
    fill2_proba2 = Fill2_model2.predict_proba(_X_val_Fill2)

    auto_proba = Auto_model.predict_proba(X_val_Auto)

    # Same column order as the frame the Isolation Forest was fitted on.
    val_probability = pd.DataFrame(np.column_stack((dam_proba1, fill1_proba1, fill2_proba1, auto_proba,
                                               dam_proba2, fill1_proba2, fill2_proba2)))

    # Isolation Forest output: 1 is mapped to Normal, -1 to AbNormal.
    test_pred = Main_Model.predict(val_probability)
    test_pred = pd.DataFrame(test_pred).replace({1:"Normal", -1:"AbNormal"})

    # Map 0/1 ids back to label strings before scoring.
    y_valid_raw = y_valid_raw.replace({0:"Normal", 1:"AbNormal"})
    f1 = f1_score(y_valid_raw, test_pred, pos_label = "AbNormal")
    print(f1)

    test_pred.value_counts()

0.22553191489361704


## 2.7. Test

In [None]:
test_path = "data/test.csv"

X_test_raw, y_test_raw = read_csv(test_path)
print(X_test_raw.shape)
# regex=False is required: with regex matching, '.' is a wildcard and every
# character of every column name would be replaced.
X_test_raw.columns = X_test_raw.columns.str.replace('.', '_', regex=False)
# Keep only the columns that are also present in the training features.
drop_columns = set(X_test_raw.columns) - set(X_train_raw.columns)
X_test_raw = X_test_raw.drop(columns=drop_columns)

# 1. Replace the "OK" entries with NaN and cast the columns to float.
cols = ["HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam",
        "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1",
        "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2"]

for col in cols:
    X_test_raw.loc[X_test_raw[col] == "OK", col] = np.nan
    X_test_raw[col] = X_test_raw[col].astype(float)
print(X_test_raw.shape)
# Reuse the imputers and the target encoder fitted on the training table.
_ = fill_na_with_knn(X_test_raw, is_test=True, imputers=im)
cb_X_test_raw, _, _ = cb_preprocessing(X_test_raw)
rf_X_test_raw, _, _ = rf_preprocessing(X_test_raw, is_test=True, TargetEncoder=te)

(17361, 464)
(17361, 145)


In [None]:
# Per-process test views (underscore-prefixed names feed the CatBoost models).
X_test_Dam = rf_X_test_raw[dam_columns]
X_test_Fill1 = rf_X_test_raw[fill1_columns]
X_test_Fill2 = rf_X_test_raw[fill2_columns]
_X_test_Dam = cb_X_test_raw[dam_columns]
_X_test_Fill1 = cb_X_test_raw[fill1_columns]
_X_test_Fill2 = cb_X_test_raw[fill2_columns]
X_test_Auto = rf_X_test_raw[autoclave_columns]

dam_proba1 = Dam_model1.predict_proba(X_test_Dam)
fill1_proba1 = Fill1_model1.predict_proba(X_test_Fill1)
fill2_proba1 = Fill2_model1.predict_proba(X_test_Fill2)

dam_proba2 = Dam_model2.predict_proba(_X_test_Dam)
fill1_proba2 = Fill1_model2.predict_proba(_X_test_Fill1)
fill2_proba2 = Fill2_model2.predict_proba(_X_test_Fill2)

auto_proba = Auto_model.predict_proba(X_test_Auto)

# Same column order as the frame the Isolation Forest was fitted on.
test_probability = pd.DataFrame(np.column_stack((dam_proba1, fill1_proba1, fill2_proba1, auto_proba,
                                           dam_proba2, fill1_proba2, fill2_proba2)))

# Isolation Forest output: 1 is mapped to Normal, -1 to AbNormal.
test_pred = Main_Model.predict(test_probability)
test_pred = pd.DataFrame(test_pred).replace({1:"Normal", -1:"AbNormal"})

In [None]:
# Class balance of the test predictions.
test_pred.value_counts()

Normal      17044
AbNormal      317
Name: count, dtype: int64

In [None]:
# Load the submission template and overwrite its target column.
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred

# Save the submission file.
df_sub.to_csv("submission_2.csv", index=False)

# Model3: Random Forest and Isolation Forest

In [None]:
ROOT_DIR = "/data"
RANDOM_STATE = 110
DO_VALID = False
# NOTE(review): TRAIN_PATH is not reassigned in this cell, so it keeps whatever
# an earlier cell set — confirm which training file is intended here.
train_data = pd.read_csv(TRAIN_PATH)

## 3.1 Preprocessing

In [None]:
# Separate the target label from the features.
y_train = train_data['target']
# Every column except the last one. NOTE(review): assumes 'target' is the last
# column of the file — confirm.
X_train = train_data.iloc[:,:-1]

# Equipment 조합 Column 추가

def feature_extraction(df):
    """Add the boolean column 'Equipment_com' to ``df`` and return ``df``.

    'Equipment_com' is True when the Dam, Fill1 and Fill2 equipment columns
    all name dispenser #1, or all name dispenser #2, and False otherwise.
    ``df`` is modified in place.
    """
    # Column-wise comparisons replace the original row-by-row apply; they give
    # the same result and also work on an empty frame.
    all_dispenser_1 = (
        (df['Equipment_Dam'] == 'Dam dispenser #1')
        & (df['Equipment_Fill1'] == 'Fill1 dispenser #1')
        & (df['Equipment_Fill2'] == 'Fill2 dispenser #1')
    )
    all_dispenser_2 = (
        (df['Equipment_Dam'] == 'Dam dispenser #2')
        & (df['Equipment_Fill1'] == 'Fill1 dispenser #2')
        & (df['Equipment_Fill2'] == 'Fill2 dispenser #2')
    )

    # True when either of the two conditions holds.
    df['Equipment_com'] = all_dispenser_1 | all_dispenser_2
    return df

# Add the Equipment_com flag to the training features.
X_train = feature_extraction(X_train)

In [None]:
# 결측치 평균값으로 보정

def fill_na_with_mean(df):
    """Fill the missing values of every column with that column's mean.

    ``df`` is modified in place; nothing is returned. Columns without missing
    values are left untouched.
    """
    for column in df.columns:
        if df[column].isnull().any():  # only columns that have missing values
            mean_value = df[column].mean()  # column mean
            # Assign the result back instead of calling fillna(inplace=True)
            # on the df[column] selection: under pandas Copy-on-Write that
            # selection is a temporary and the fill would never reach df.
            df[column] = df[column].fillna(mean_value)
# Fill the missing values with the column mean.
fill_na_with_mean(X_train)

# Convert AbNormal / Normal to numbers so they can be used for target encoding.

y_train_encoded = y_train.replace({'AbNormal': 1, 'Normal': 0})
object_columns = X_train.select_dtypes(include=['object'])

import category_encoders as ce

# Fit the target encoder on the object-typed columns; it is reused for the
# validation and test tables.
target_encoder = ce.TargetEncoder(cols=object_columns.columns, handle_unknown='value', handle_missing='value')
X_train = target_encoder.fit_transform(X_train, y_train_encoded)

## 3.2 Train Set을 각 공정으로 Split 한 후 모델 학습 (RF -> IF)

In [None]:
# Per-process views: columns whose name contains the given substring.
X_Dam = X_train.filter(like='Dam')
X_Fill1 = X_train.filter(like='Fill1')
X_Fill2 = X_train.filter(like="Fill2")
X_Auto = X_train.filter(like='Auto')

# One random forest per process, fitted on the un-encoded y_train labels.
Dam_model = RandomForestClassifier(random_state=RANDOM_STATE)
Dam_model.fit(X_Dam, y_train)

Fill1_model = RandomForestClassifier(random_state=RANDOM_STATE)
Fill1_model.fit(X_Fill1, y_train)

Fill2_model = RandomForestClassifier(random_state=RANDOM_STATE)
Fill2_model.fit(X_Fill2, y_train)

Auto_model = RandomForestClassifier(random_state=RANDOM_STATE)
Auto_model.fit(X_Auto, y_train)


# Class probabilities of each forest on its own training rows, stacked column-wise.
dam_proba = Dam_model.predict_proba(X_Dam)
fill1_proba = Fill1_model.predict_proba(X_Fill1)
fill2_proba = Fill2_model.predict_proba(X_Fill2)
auto_proba = Auto_model.predict_proba(X_Auto)
probability = pd.DataFrame(np.column_stack((dam_proba, fill1_proba, fill2_proba, auto_proba)))

from sklearn.ensemble import IsolationForest
# Create the Isolation Forest model.
Main_Model = IsolationForest(contamination=0.0275, random_state=RANDOM_STATE)
# Fit it on the stacked probabilities.
Main_Model.fit(probability)

## 3.3 Validation

In [None]:
if DO_VALID:
    test_data = pd.read_csv(VALID_PATH)

    # Separate the target label from the features (all columns but the last).
    y_val = test_data['target']
    X_val = test_data.iloc[:,:-1]

    # Preprocessing: same steps as for train, reusing the fitted target encoder.
    y_val_encoded = y_val.replace({'AbNormal': 1, 'Normal': 0})
    fill_na_with_mean(X_val)
    X_val = feature_extraction(X_val)
    X_val = target_encoder.transform(X_val)

In [None]:
if DO_VALID:
    # Compute the class probabilities with the models fitted above.
    X_val_Dam = X_val.filter(like='Dam')
    X_val_Fill1 = X_val.filter(like='Fill1')
    X_val_Fill2 = X_val.filter(like="Fill2")
    X_val_Auto = X_val.filter(like='Auto')

    dam_proba = Dam_model.predict_proba(X_val_Dam)
    fill1_proba = Fill1_model.predict_proba(X_val_Fill1)
    fill2_proba = Fill2_model.predict_proba(X_val_Fill2)
    auto_proba = Auto_model.predict_proba(X_val_Auto)
    # Same column order as the frame the Isolation Forest was fitted on.
    val_probability = pd.DataFrame(np.hstack((dam_proba, fill1_proba, fill2_proba, auto_proba)))

In [None]:
if DO_VALID:
    # Isolation Forest output: 1 is mapped to Normal, -1 to AbNormal.
    test_pred = Main_Model.predict(val_probability)
    test_pred = pd.DataFrame(test_pred).replace({1:"Normal", -1 : "AbNormal"})

    # Post-processing: rows whose Equipment_com flag is False are forced to AbNormal.
    test_pred[X_val['Equipment_com'] == False] = 'AbNormal'

In [None]:
if DO_VALID:
    from sklearn.metrics import f1_score

    # F1 score with 'AbNormal' as the positive class.

    f1 = f1_score(y_val, test_pred, pos_label = "AbNormal")
    print(f1)

## 3.4 Test

In [None]:
table = pd.read_csv("data/test.csv")
# regex=False is required: with regex matching, '.' is a wildcard and every
# character of every column name would be replaced.
table.columns = table.columns.str.replace('.', '_', regex=False)

####### Preprocessing by Uijin ################

# 1. Replace the "OK" entries with NaN and cast the columns to float.
cols = ["HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam",
        "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1",
        "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2"]

for col in cols:
    table.loc[table[col] == "OK", col] = np.nan
    table[col] = table[col].astype(float)

# 2. Drop columns that carry no information
def find_unique_columns(df):
    """Return the names of columns with at most one distinct non-null value.

    Columns that are entirely NaN are included too, because they have zero
    distinct values once missing entries are dropped.
    """
    return [
        name
        for name in df.columns
        if len(df[name].dropna().unique()) <= 1
    ]

# NOTE(review): the dropped columns are determined from the test table itself;
# confirm they match the columns removed from the training data.
unique_columns = find_unique_columns(table)
removed_table = table.drop(columns=unique_columns)

####################################


# Preprocess

# Skip the first column (presumably an ID column — confirm) and take an
# explicit copy, because the steps below add a column and fill NaNs in place.
X_val = removed_table.iloc[:, 1:].copy()
X_val = feature_extraction(X_val)
fill_na_with_mean(X_val)
X_val = target_encoder.transform(X_val)

# Split the features by process stage

X_val_Dam = X_val.filter(like='Dam')
X_val_Fill1 = X_val.filter(like='Fill1')
X_val_Fill2 = X_val.filter(like="Fill2")
X_val_Auto = X_val.filter(like='Auto')

# Predict

dam_proba = Dam_model.predict_proba(X_val_Dam)
fill1_proba = Fill1_model.predict_proba(X_val_Fill1)
fill2_proba = Fill2_model.predict_proba(X_val_Fill2)
auto_proba = Auto_model.predict_proba(X_val_Auto)
val_probability = pd.DataFrame(np.hstack((dam_proba, fill1_proba, fill2_proba, auto_proba)))

# The detector's output is mapped as 1 -> "Normal", -1 -> "AbNormal"; rows with
# a False equipment-combination flag are then forced to 'AbNormal'.
test_pred = Main_Model.predict(val_probability)
test_pred = pd.DataFrame(test_pred).replace({1:"Normal", -1 : "AbNormal"})
test_pred[X_val['Equipment_com'] == False] = 'AbNormal'

In [None]:
# Load the submission template and overwrite its 'target' column with this
# model's predictions.
df_sub = pd.read_csv("submission.csv").assign(target=test_pred)

# Save this model's submission file.
df_sub.to_csv("submission_3.csv", index=False)

# Model4: RF and AutoEncoder

In [None]:
# Seed passed to the estimators in this section.
RANDOM_STATE = 110
# Training table; the label is in its 'target' column.
train_data = pd.read_csv(TRAIN_PATH)

In [None]:
# Separate the label from the features ('target' is assumed to be the last
# column). The features are copied so the in-place column insertion and NaN
# filling below do not operate on a slice of `train_data`.
y_train = train_data['target']
X_train = train_data.iloc[:, :-1].copy()

# Feature derived from the equipment combination
def feature_extraction(df):
    """Add a boolean ``Equipment_com`` column flagging matched dispenser sets.

    A row is True when its Dam, Fill1 and Fill2 equipment are all dispenser #1,
    or all dispenser #2. Any other combination, including missing values, is
    False. The comparison is vectorized, so it also works on an empty frame.

    Args:
        df: DataFrame with ``Equipment_Dam``, ``Equipment_Fill1`` and
            ``Equipment_Fill2`` columns. It is modified in place.

    Returns:
        The same DataFrame object, with ``Equipment_com`` added.

    Raises:
        KeyError: if any of the three equipment columns is missing.
    """
    all_dispenser_1 = (
        (df['Equipment_Dam'] == 'Dam dispenser #1')
        & (df['Equipment_Fill1'] == 'Fill1 dispenser #1')
        & (df['Equipment_Fill2'] == 'Fill2 dispenser #1')
    )
    all_dispenser_2 = (
        (df['Equipment_Dam'] == 'Dam dispenser #2')
        & (df['Equipment_Fill1'] == 'Fill1 dispenser #2')
        & (df['Equipment_Fill2'] == 'Fill2 dispenser #2')
    )

    df['Equipment_com'] = all_dispenser_1 | all_dispenser_2
    return df
# Add the Equipment_com flag to the training features.
X_train = feature_extraction(X_train)

In [None]:
# Impute missing values with the column mean

def fill_na_with_mean(df):
    """Replace NaNs in every numeric column of *df* with that column's mean.

    The frame is modified in place and nothing is returned. Each column is
    assigned back explicitly instead of calling ``fillna(inplace=True)`` on
    the selected column, because that chained form does not update *df* under
    pandas Copy-on-Write. Non-numeric columns are skipped, since a mean is
    undefined for them; their missing values are left for later steps.

    Args:
        df: DataFrame to impute in place.
    """
    for column in df.columns:
        values = df[column]
        if not pd.api.types.is_numeric_dtype(values):
            continue
        if values.isnull().any():
            df[column] = values.fillna(values.mean())
# Impute the training features in place.
fill_na_with_mean(X_train)

# Encode the label as 0/1 because the target encoder needs a numeric target.
y_train_encoded = y_train.replace({'AbNormal': 1, 'Normal': 0})
# Only object-dtype columns are target-encoded.
object_columns = X_train.select_dtypes(include=['object'])

import category_encoders as ce

# Fit the encoder on the training data; the same fitted encoder is reused via
# .transform() on the validation and test features.
target_encoder = ce.TargetEncoder(cols=object_columns.columns, handle_unknown='value', handle_missing='value')
X_train = target_encoder.fit_transform(X_train, y_train_encoded)

In [None]:
# One random forest per process stage, each trained on the columns whose name
# contains that stage's tag.

X_Dam = X_train.filter(like='Dam')
X_Fill1 = X_train.filter(like='Fill1')
X_Fill2 = X_train.filter(like="Fill2")
X_Auto = X_train.filter(like='Auto')

Dam_model = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_Dam, y_train)
Fill1_model = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_Fill1, y_train)
Fill2_model = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_Fill2, y_train)
Auto_model = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_Auto, y_train)


# Class probabilities of each forest on its own training rows.
# NOTE(review): these are in-sample predictions, so they are likely more
# confident than what the forests produce on unseen data — confirm intended.
dam_proba = Dam_model.predict_proba(X_Dam)
fill1_proba = Fill1_model.predict_proba(X_Fill1)
fill2_proba = Fill2_model.predict_proba(X_Fill2)
auto_proba = Auto_model.predict_proba(X_Auto)

# Stack the probabilities column-wise and standardize them; the result is the
# autoencoder's training input.
probability = pd.DataFrame(np.hstack((dam_proba, fill1_proba, fill2_proba, auto_proba)))
probability = StandardScaler().fit_transform(probability)

In [None]:
## Auto Encoder

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
def build_autoencoder(input_dim, encoding_dim,
                      hidden_activation='relu', output_activation='sigmoid'):
    """Build and compile a single-hidden-layer dense autoencoder.

    Args:
        input_dim: Number of input features, which is also the output width.
        encoding_dim: Width of the hidden (encoding) layer.
        hidden_activation: Activation of the encoding layer. Defaults to
            'relu', the value that was previously hard-coded.
        output_activation: Activation of the reconstruction layer. Defaults to
            'sigmoid', the value that was previously hard-coded. A sigmoid
            only emits values in (0, 1), so it cannot reproduce standardized
            inputs that are negative or above 1; pass 'linear' for such data.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer and MSE loss.
    """
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation=hidden_activation)(input_layer)
    decoded = Dense(input_dim, activation=output_activation)(encoded)

    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder

def train_autoencoder(train_x, encoding_dim=14, epochs=50, batch_size=256):
    """Fit an autoencoder to reconstruct *train_x* and expose its encoder half.

    Args:
        train_x: 2-D training matrix; its second dimension sets the input width.
        encoding_dim: Width of the encoding layer.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        ``(autoencoder, encoder)``: the trained full model, and a model mapping
        the same input to the output of the encoding layer.
    """
    n_features = train_x.shape[1]
    autoencoder = build_autoencoder(n_features, encoding_dim)

    # Input and target are the same matrix; 20% of the rows are held out by
    # Keras for the validation loss.
    fit_options = dict(
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        validation_split=0.2,
    )
    autoencoder.fit(train_x, train_x, **fit_options)

    # layers[1] is the encoding Dense layer (layers[0] is the input layer).
    bottleneck = autoencoder.layers[1].output
    encoder = Model(inputs=autoencoder.input, outputs=bottleneck)
    return autoencoder, encoder

def calculate_reconstruction_error(autoencoder, data):
    """Return the per-row mean squared error between *data* and its reconstruction.

    Args:
        autoencoder: Object whose ``predict(data)`` returns the reconstruction.
        data: 2-D matrix of samples (rows) by features (columns).

    Returns:
        One error value per row of *data*.
    """
    squared_residual = np.square(data - autoencoder.predict(data))
    return np.mean(squared_residual, axis=1)

def classify_errors(errors, threshold):
    """Label each error "AbNormal" if it is strictly above *threshold*, else "Normal"."""
    exceeds_threshold = errors > threshold
    return np.where(exceeds_threshold, "AbNormal", "Normal")

2024-08-30 08:41:32.971404: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-30 08:41:33.010110: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Train the autoencoder on the standardized per-stage probabilities.
# NOTE(review): the input appears to have 8 columns (4 forests x 2 classes), so
# encoding_dim=14 is wider than the input — confirm this is intended.
autoencoder, encoder = train_autoencoder(probability, encoding_dim=14, epochs=100, batch_size=256)
# Alternative configuration kept for reference:
# autoencoder, encoder = train_autoencoder(probability, encoding_dim=32, epochs=75, batch_size=256)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [None]:
if DO_VALID:
    test_data = pd.read_csv(VALID_PATH)
    y_val = test_data['target']
    # Explicit copy: the steps below fill NaNs and add a column in place, which
    # must not happen on a slice of `test_data`.
    X_val = test_data.iloc[:, :-1].copy()
    y_val_encoded = y_val.replace({'AbNormal': 1, 'Normal': 0})
    fill_na_with_mean(X_val)
    X_val = feature_extraction(X_val)
    X_val = target_encoder.transform(X_val)

In [None]:
if DO_VALID:
    # Split the features by process stage

    X_val_Dam = X_val.filter(like='Dam')
    X_val_Fill1 = X_val.filter(like='Fill1')
    X_val_Fill2 = X_val.filter(like="Fill2")
    X_val_Auto = X_val.filter(like='Auto')

    dam_proba = Dam_model.predict_proba(X_val_Dam)
    fill1_proba = Fill1_model.predict_proba(X_val_Fill1)
    fill2_proba = Fill2_model.predict_proba(X_val_Fill2)
    auto_proba = Auto_model.predict_proba(X_val_Auto)
    val_probability = np.hstack((dam_proba, fill1_proba, fill2_proba, auto_proba))

    # Standardize with the TRAINING statistics. The autoencoder was trained on
    # probabilities scaled by a scaler fitted on the training rows; fitting a
    # fresh scaler on the validation rows would feed it differently scaled
    # inputs. The training-time scaler was not kept, so it is rebuilt here from
    # the same forests and training features.
    train_probability = np.column_stack((
        Dam_model.predict_proba(X_Dam),
        Fill1_model.predict_proba(X_Fill1),
        Fill2_model.predict_proba(X_Fill2),
        Auto_model.predict_proba(X_Auto),
    ))
    val_probability = StandardScaler().fit(train_probability).transform(val_probability)

In [None]:
if DO_VALID:
    # Reconstruction error of every validation row.
    test_reconstruction_error = calculate_reconstruction_error(autoencoder, val_probability)

    # Anomaly cut-off: rows whose reconstruction error is above the 96th
    # percentile are treated as outliers.
    threshold = np.percentile(test_reconstruction_error, 96)
    test_pred = pd.DataFrame(classify_errors(test_reconstruction_error, threshold))

    # Rows with a False equipment-combination flag are always 'AbNormal'.
    test_pred[X_val['Equipment_com'] == False] = 'AbNormal'

    test_pred.value_counts()

In [None]:
if DO_VALID:

    # Run after the data split has produced y_val and the model has produced
    # test_pred; "AbNormal" is the positive class for the F1 score.
    f1 = f1_score(y_val, test_pred, pos_label = "AbNormal")
    print(f1)

## Test

In [None]:
table = pd.read_csv(TEST_PATH)
# regex=False makes the '.' a literal character regardless of pandas version.
table.columns = table.columns.str.replace('.', '_', regex=False)

# 1. "OK" -> NaN
cols = ["HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam",
        "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1",
        "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2"]

for col in cols:
    # These columns mix the string "OK" with numeric readings; blank out the
    # "OK" entries so the column can be cast to float.
    table[col] = table[col].mask(table[col] == "OK").astype(float)

# 2. Drop columns that carry no information
def find_unique_columns(df):
    """List the columns of *df* that have zero or one distinct non-null value."""
    constant_columns = []
    for label in df.columns:
        # nunique() ignores NaN by default, so an all-NaN column counts as 0.
        if df[label].nunique() <= 1:
            constant_columns.append(label)
    return constant_columns

unique_columns = find_unique_columns(table)
removed_table = table.drop(columns=unique_columns)

# Skip the first column (presumably an ID column — confirm) and take an
# explicit copy, because the steps below add a column and fill NaNs in place.
X_val = removed_table.iloc[:, 1:].copy()
X_val = feature_extraction(X_val)
fill_na_with_mean(X_val)
X_val = target_encoder.transform(X_val)

X_val_Dam = X_val.filter(like='Dam')
X_val_Fill1 = X_val.filter(like='Fill1')
X_val_Fill2 = X_val.filter(like="Fill2")
X_val_Auto = X_val.filter(like='Auto')

dam_proba = Dam_model.predict_proba(X_val_Dam)
fill1_proba = Fill1_model.predict_proba(X_val_Fill1)
fill2_proba = Fill2_model.predict_proba(X_val_Fill2)
auto_proba = Auto_model.predict_proba(X_val_Auto)
val_probability = np.hstack((dam_proba, fill1_proba, fill2_proba, auto_proba))

# Standardize with the TRAINING statistics. The autoencoder was trained on
# probabilities scaled by a scaler fitted on the training rows; fitting a fresh
# scaler on the test rows would feed it differently scaled inputs. The
# training-time scaler was not kept, so it is rebuilt here from the same
# forests and training features.
train_probability = np.column_stack((
    Dam_model.predict_proba(X_Dam),
    Fill1_model.predict_proba(X_Fill1),
    Fill2_model.predict_proba(X_Fill2),
    Auto_model.predict_proba(X_Auto),
))
val_probability = StandardScaler().fit(train_probability).transform(val_probability)

test_reconstruction_error = calculate_reconstruction_error(autoencoder, val_probability)

# Anomaly cut-off: rows whose reconstruction error is above the 95th percentile
# are treated as outliers.
threshold = np.percentile(test_reconstruction_error, 95)
test_pred = classify_errors(test_reconstruction_error, threshold)
test_pred = pd.DataFrame(test_pred)
# Rows with a False equipment-combination flag are always 'AbNormal'.
test_pred[X_val['Equipment_com'] == False] = 'AbNormal'
test_pred.value_counts()



Normal      16488
AbNormal      873
Name: count, dtype: int64

In [None]:
# Load the submission template and overwrite its 'target' column with this
# model's predictions.
df_sub = pd.read_csv("submission.csv").assign(target=test_pred)

# Save this model's submission file.
df_sub.to_csv("submission_4.csv", index=False)

# Model5: One Class SVM

In [None]:
# Seed passed to the estimators in this section.
RANDOM_STATE = 110

# Training table; the label is in its 'target' column.
train_data = pd.read_csv(TRAIN_PATH)

In [None]:
# Separate the label from the features ('target' is assumed to be the last
# column). The features are copied so the in-place NaN filling below does not
# operate on a slice of `train_data`.
y_train = train_data['target']
X_train = train_data.iloc[:, :-1].copy()

In [None]:
def fill_na_with_mean(df):
    """Fill missing values of each numeric column with the column mean, in place.

    Each imputed column is written back with an explicit assignment. Calling
    ``fillna(inplace=True)`` on ``df[column]`` is chained assignment and does
    not update *df* under pandas Copy-on-Write. Columns that are not numeric
    are skipped, because a mean cannot be computed for them.

    Args:
        df: DataFrame to impute; it is mutated and nothing is returned.
    """
    for column in df.columns:
        series = df[column]
        has_missing = series.isnull().any()  # only touch columns with gaps
        if has_missing and pd.api.types.is_numeric_dtype(series):
            df[column] = series.fillna(series.mean())
# Replace missing values with the column mean. (The previous comment said
# "mode", but fill_na_with_mean imputes the mean.)
fill_na_with_mean(X_train)
# Encode the label as 0/1 because the target encoder needs a numeric target.
y_train_encoded = y_train.replace({'AbNormal': 1, 'Normal': 0})
# Only object-dtype columns are target-encoded.
object_columns = X_train.select_dtypes(include=['object'])

import category_encoders as ce

# Fit the encoder on the training data; it is reused via .transform() later.
target_encoder = ce.TargetEncoder(cols=object_columns.columns, handle_unknown='value', handle_missing='value')
X_train = target_encoder.fit_transform(X_train, y_train_encoded)

In [None]:
if DO_VALID:
    test_data = pd.read_csv(VALID_PATH)
    y_val = test_data['target']
    # Explicit copy: NaNs are filled in place below, which must not happen on a
    # slice of `test_data`.
    X_val = test_data.iloc[:, :-1].copy()
    # Labels use the -1 (AbNormal) / 1 (Normal) convention here.
    y_val_encoded = y_val.replace({'AbNormal': -1, 'Normal': 1})
    fill_na_with_mean(X_val)
    X_val = target_encoder.transform(X_val)
    X_val

In [None]:
from sklearn.svm import OneClassSVM

# Feature subsets used by this model: columns whose name contains 'Auto',
# 'Result' or 'Fill1'. (Dam, Fill2 and Stage subsets were also tried and are
# disabled.)
X_Auto = X_train.filter(like='Auto')
X_Result = X_train.filter(like='Result')
X_Fill1 = X_train.filter(like='Fill1')

# One random forest per subset. Auto_model and Fill1_model replace the
# same-named models from the previous section.
Auto_model = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_Auto, y_train)
Result_model = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_Result, y_train)
Fill1_model = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_Fill1, y_train)

# Class probabilities of each forest on its own training rows.
auto_proba = Auto_model.predict_proba(X_Auto)
result_proba = Result_model.predict_proba(X_Result)
fill1_proba = Fill1_model.predict_proba(X_Fill1)

# Column-wise stack of the probabilities; this is the One-Class SVM's input.
probability = pd.DataFrame(np.hstack((auto_proba, result_proba, fill1_proba)))

# One-Class SVM on the stacked probabilities. (The original inline note
# mentions gamma 0.01, presumably an alternative that was tried.)
ocsvm_model = OneClassSVM(kernel='rbf', gamma="auto", nu=0.165)
ocsvm_model.fit(probability)

In [None]:
# NOTE(review): unlike the other validation cells, this one is not guarded by
# `if DO_VALID:`. When DO_VALID is False, X_val is whatever an earlier cell
# left behind — confirm this is intended.
X_val_Auto = X_val.filter(like='Auto')
auto_proba = Auto_model.predict_proba(X_val_Auto)

X_val_Result = X_val.filter(like='Result')
result_proba = Result_model.predict_proba(X_val_Result)

X_val_Fill1 = X_val.filter(like='Fill1')
fill1_proba = Fill1_model.predict_proba(X_val_Fill1)

# Column-wise stack in the same order as at training time: Auto, Result, Fill1.
val_probability = pd.DataFrame(
    np.concatenate((auto_proba, result_proba, fill1_proba), axis=1)
)

In [None]:
# Run the prediction (1: normal, -1: abnormal)
ocsvm_predictions = ocsvm_model.predict(val_probability)

# Inspect the result (normal: 1, abnormal: -1)
print(ocsvm_predictions)
# NOTE(review): X_val_scaled is computed here but not referenced anywhere in
# the rest of the notebook that is visible — likely leftover code.
scaler = StandardScaler()
X_val_scaled = scaler.fit_transform(val_probability)

[1 1 1 ... 1 1 1]


In [None]:
# Map the detector output to the competition labels (1 -> Normal, -1 -> AbNormal).
ocsvm_predictions = pd.DataFrame(ocsvm_predictions).replace({1:"Normal", -1 : "AbNormal"})

In [None]:
# Count how many rows were assigned each label.
ocsvm_predictions.value_counts()

Normal      16757
AbNormal      604
Name: count, dtype: int64

In [None]:
table = pd.read_csv(TEST_PATH)
# regex=False makes the '.' a literal character regardless of pandas version.
table.columns = table.columns.str.replace('.', '_', regex=False)

# 1. "OK" -> NaN
cols = ["HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam",
        "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1",
        "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2"]

for col in cols:
    # These columns mix the string "OK" with numeric readings; blank out the
    # "OK" entries so the column can be cast to float.
    table[col] = table[col].mask(table[col] == "OK").astype(float)

# 2. Drop columns that carry no information
def find_unique_columns(df):
    """Return, in column order, the columns with at most one distinct non-null value."""
    # Per-column count of distinct values; NaN is not counted.
    distinct_counts = df.nunique()
    return distinct_counts[distinct_counts <= 1].index.tolist()

unique_columns = find_unique_columns(table)
removed_table = table.drop(columns=unique_columns)

# Skip the first column (presumably an ID column — confirm) and take an
# explicit copy, because NaNs are filled in place below.
X_val = removed_table.iloc[:, 1:].copy()
fill_na_with_mean(X_val)
X_val = target_encoder.transform(X_val)

X_val_Auto = X_val.filter(like='Auto')
X_val_Result = X_val.filter(like='Result')
X_val_Fill1 = X_val.filter(like='Fill1')

auto_proba = Auto_model.predict_proba(X_val_Auto)
result_proba = Result_model.predict_proba(X_val_Result)
fill1_proba = Fill1_model.predict_proba(X_val_Fill1)
# Column-wise stack in the same order as at training time: Auto, Result, Fill1.
val_probability = pd.DataFrame(np.hstack((auto_proba, result_proba, fill1_proba)))

# Map the detector output to the competition labels (1 -> Normal, -1 -> AbNormal).
test_pred = ocsvm_model.predict(val_probability)
test_pred = pd.DataFrame(test_pred).replace({1:"Normal", -1 : "AbNormal"})

In [None]:
# Load the submission template and overwrite its 'target' column with this
# model's predictions.
df_sub = pd.read_csv("submission.csv").assign(target=test_pred)

# Save this model's submission file.
df_sub.to_csv("submission_5.csv", index=False)

In [None]:
# Label distribution of this model's submission.
df_sub['target'].value_counts()

target
Normal      16757
AbNormal      604
Name: count, dtype: int64

# Hard Voting

## Test

In [None]:

# Load the five per-model submission files written by the sections above.
DF1, DF2, DF3, DF4, DF5 = [
    pd.read_csv(path)
    for path in (
        "submission_1.csv",
        "submission_2.csv",
        "submission_3.csv",
        "submission_4.csv",
        "submission_5.csv",
    )
]

In [None]:
# Hard voting over the five models' labels. A row becomes "AbNormal" as soon as
# at least MIN_ABNORMAL_VOTES models flag it. With 5 voters a threshold of 2 is
# below a strict majority, so the ensemble leans towards flagging anomalies.
MIN_ABNORMAL_VOTES = 2

predictions = pd.concat([DF1['target'], DF2['target'], DF3['target'], DF4['target'], DF5['target']], axis =1)
# Count the "AbNormal" votes directly on the string labels. This avoids the
# round trip through 0/1 codes, which depended on DataFrame.replace downcasting
# string columns to integers.
abnormal_votes = predictions.eq("AbNormal").sum(axis=1)
predictions['cc'] = np.where(abnormal_votes >= MIN_ABNORMAL_VOTES, "AbNormal", "Normal")
predictions

Unnamed: 0,target,target.1,target.2,target.3,target.4,cc
0,Normal,Normal,Normal,Normal,Normal,Normal
1,Normal,Normal,Normal,Normal,Normal,Normal
2,Normal,Normal,Normal,Normal,Normal,Normal
3,Normal,Normal,Normal,Normal,Normal,Normal
4,Normal,Normal,Normal,Normal,Normal,Normal
...,...,...,...,...,...,...
17356,Normal,Normal,Normal,Normal,Normal,Normal
17357,Normal,Normal,Normal,Normal,Normal,Normal
17358,Normal,Normal,Normal,Normal,Normal,Normal
17359,Normal,Normal,Normal,Normal,Normal,Normal


In [None]:
# Load the submission template and overwrite its 'target' column with the
# ensemble vote.
df_sub = pd.read_csv("submission.csv").assign(target=predictions['cc'])

# Save the final submission. This writes back to the same file that was read
# as the template above.
df_sub.to_csv("submission.csv", index=False)

In [None]:
# Label distribution of the final ensemble submission.
df_sub.value_counts('target')

target
Normal      16562
AbNormal      799
Name: count, dtype: int64