# [DACON] 건설기계 오일 상태 분류 AI 경진대회
### 배경
- 건설기계 분야의 데이터를 분석, 활용하는 방안 제시
- 건설 장비 산업의 지능화에 대한 연구활동 홍보 및 우수 인재 발굴

### 주제
- 건설장비에서 작동 오일의 상태를 실시간으로 모니터링하기 위한 **오일 상태 판단 모델 개발**
- 적절한 교체 주기를 파악하고자 함

### 평가방식
- Macro F1 Score

### 지식 증류 학습?

### Library

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
import random

import matplotlib.pyplot as plt
import seaborn as sns

import pycaret.classification as pycaret_clf

### Data Load

In [2]:
main_path = "./data"

train_df = pd.read_csv(os.path.join(main_path, "train.csv"))
test_df = pd.read_csv(os.path.join(main_path, "test.csv"))

train_df.shape, test_df.shape

((14095, 54), (6041, 19))

In [3]:
# 실제 진단 환경(Inference)에 사용되는 컬럼
test_stage_features = ['COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR' , 'ANONYMOUS_2', 'AG', 'CO', 'CR', 'CU', 'FE', 'H2O', 'MN', 'MO', 'NI', 'PQINDEX', 'TI', 'V', 'V40', 'ZN']
test_stage_features.append("Y_LABEL")
len(test_stage_features)

19

In [4]:
train_df = train_df[test_stage_features]
train_df.shape

(14095, 19)

In [5]:
set(train_df.columns.tolist()) - set(test_df.columns.tolist()), set(test_df.columns.tolist()) - set(train_df.columns.tolist())

({'Y_LABEL'}, {'ID'})

In [6]:
# Test 데이터에서 필요없는 부분 제거
test_df = test_df.drop(columns="ID", axis=1)
test_df.shape

(6041, 18)

In [7]:
display(train_df.head())
display(test_df.head())

Unnamed: 0,COMPONENT_ARBITRARY,ANONYMOUS_1,YEAR,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,NI,PQINDEX,TI,V,V40,ZN,Y_LABEL
0,COMPONENT3,1486,2011,200,0,0,13,78,888,0.0,16,1,6,8504,5,0,154.0,75,0
1,COMPONENT2,1350,2021,375,0,0,0,31,2,0.0,0,0,0,19,0,0,44.0,652,0
2,COMPONENT2,2415,2015,200,0,0,1,2,4,0.0,0,0,0,17,0,0,72.6,412,1
3,COMPONENT3,7389,2010,200,0,0,0,1,37,0.0,1,0,0,44,0,0,133.3,7,0
4,COMPONENT3,3954,2015,200,0,0,0,0,71,0.0,0,0,0,217,0,0,133.1,128,0


Unnamed: 0,COMPONENT_ARBITRARY,ANONYMOUS_1,YEAR,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,NI,PQINDEX,TI,V,V40,ZN
0,COMPONENT1,2192,2016,200,0,0,0,1,12,0.0,0,0,0,10,0,0,91.3,1091
1,COMPONENT3,2794,2011,200,0,0,2,1,278,0.0,3,0,0,2732,1,0,126.9,12
2,COMPONENT2,1982,2010,200,0,0,0,16,5,0.0,0,0,0,11,0,0,44.3,714
3,COMPONENT3,1404,2009,200,0,0,3,4,163,0.0,4,3,0,8007,0,0,142.8,94
4,COMPONENT2,8225,2013,200,0,0,0,6,13,0.0,0,0,0,16,0,0,63.4,469


### PyCaret

In [8]:
setup_clf = pycaret_clf.setup(session_id=42, data=train_df, target="Y_LABEL", normalize=True, transformation=True, use_gpu=True)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,Y_LABEL
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(14095, 19)"
5,Missing Values,False
6,Numeric Features,13
7,Categorical Features,5
8,Ordinal Features,False
9,High Cardinality Features,False


In [9]:
clf = pycaret_clf.models()

In [44]:
top5 = pycaret_clf.compare_models(sort="Accuracy", n_select=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.914,0.6496,0.0209,0.7217,0.0404,0.0361,0.1106,0.763
lightgbm,Light Gradient Boosting Machine,0.9136,0.6723,0.036,0.5605,0.0673,0.0579,0.1245,1.092
gbc,Gradient Boosting Classifier,0.9134,0.6909,0.0232,0.7,0.0446,0.0385,0.1115,1.695
lr,Logistic Regression,0.9127,0.6769,0.0,0.0,0.0,0.0,0.0,0.173
svm,SVM - Linear Kernel,0.9127,0.0,0.0,0.0,0.0,0.0,0.0,0.061
ridge,Ridge Classifier,0.9127,0.0,0.0,0.0,0.0,0.0,0.0,0.019
dummy,Dummy Classifier,0.9127,0.5,0.0,0.0,0.0,0.0,0.0,0.011
et,Extra Trees Classifier,0.9124,0.64,0.0418,0.45,0.0759,0.0631,0.1157,0.883
ada,Ada Boost Classifier,0.9117,0.6678,0.0012,0.05,0.0023,-0.0001,-0.0014,0.479
lda,Linear Discriminant Analysis,0.9116,0.6769,0.0058,0.3667,0.0114,0.0073,0.0331,0.105


In [46]:
tuned5 = [pycaret_clf.tune_model(model, n_iter=15) for model in top5]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9129,0.0,0.0,0.0,0.0,0.0,0.0
1,0.9129,0.0,0.0,0.0,0.0,0.0,0.0
2,0.9129,0.0,0.0,0.0,0.0,0.0,0.0
3,0.9129,0.0,0.0,0.0,0.0,0.0,0.0
4,0.9129,0.0,0.0,0.0,0.0,0.0,0.0
5,0.9119,0.0,0.0,0.0,0.0,0.0,0.0
6,0.9128,0.0,0.0,0.0,0.0,0.0,0.0
7,0.9128,0.0,0.0,0.0,0.0,0.0,0.0
8,0.9128,0.0,0.0,0.0,0.0,0.0,0.0
9,0.9128,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
blender = pycaret_clf.blend_models(estimator_list=tuned5)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9129,0.0,0.0,0.0,0.0,0.0,0.0
1,0.9129,0.0,0.0,0.0,0.0,0.0,0.0
2,0.9129,0.0,0.0,0.0,0.0,0.0,0.0
3,0.9129,0.0,0.0,0.0,0.0,0.0,0.0
4,0.9129,0.0,0.0,0.0,0.0,0.0,0.0
5,0.9119,0.0,0.0,0.0,0.0,0.0,0.0
6,0.9128,0.0,0.0,0.0,0.0,0.0,0.0
7,0.9128,0.0,0.0,0.0,0.0,0.0,0.0
8,0.9128,0.0,0.0,0.0,0.0,0.0,0.0
9,0.9128,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
final_model = pycaret_clf.finalize_model(blender)



In [49]:
pred = pycaret_clf.predict_model(final_model, data=test_df)

In [50]:
sub = pd.read_csv("./data/sample_submission.csv")
sub["Y_LABEL"] = pred["Label"]

In [51]:
sub.to_csv("./pycaret_clf.csv", index=False)

In [54]:
sub["Y_LABEL"].value_counts()

0    6041
Name: Y_LABEL, dtype: int64