In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

### 1. 데이터 불러오기

In [2]:
RANDOM_STATE = 42

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 464 entries, Wip Line_Dam to target
dtypes: float64(350), int64(77), object(37)
memory usage: 143.4+ MB


In [4]:
train['target'].value_counts()

target
Normal      38156
AbNormal     2350
Name: count, dtype: int64

### 2. 데이터 전처리

#### 2-1. 데이터가 전부 결측치인 경우 column 제거

In [9]:
for i in train.columns:
    if train[i].isnull().sum() == len(train):
        train = train.drop(i,axis=1)
        test = test.drop(i,axis=1)

In [10]:
test = test.drop('Set ID',axis=1)

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 186 entries, Wip Line_Dam to target
dtypes: float64(72), int64(77), object(37)
memory usage: 57.5+ MB


#### 2-2. test셋의 고유값이 train셋의 고유값 안에 전부 포함되는 column만 추출

In [12]:
lst = []
for i in test.columns:
    x = 0
    lens = len(test[i].value_counts().keys())
    for value in test[i].value_counts().keys():
        if value in train[i].value_counts().keys():
            x += 1
    if x == lens:
        lst += [i]

In [13]:
# 파생변수('기포량')을 위해 '2nd Pressure Collect Result_AutoClave'를 포함함

train = train[['2nd Pressure Collect Result_AutoClave'] + lst] 
test = test[['2nd Pressure Collect Result_AutoClave'] + lst]

In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 168 entries, 2nd Pressure Collect Result_AutoClave to target
dtypes: float64(65), int64(70), object(33)
memory usage: 51.9+ MB


#### 2-3. 1.4 : 1(정상 : 비정상)의 비율로 데이터셋 구축

In [15]:
df_normal = train[train["target"] == "Normal"]
df_abnormal = train[train["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(
    n=3290, replace=False, random_state=RANDOM_STATE
)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
Normal      3290
AbNormal    2350
Name: count, dtype: int64

In [16]:
df = df_concat
df_test = test

#### 2-4. 데이터 고유값 내 'OK'가 존재할 경우 column 제거

In [17]:
# 파생변수('cure diff')를 위해 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'을 남기고 제거
for i in df.columns:
    if 'OK' in list(df[i].value_counts().keys()) and i != 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam':
        print(i)
        df = df.drop(i,axis=1)

Insp Judge Code_Dam
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam
Insp Judge Code_AutoClave
1st Pressure Judge Value_AutoClave
2nd Pressure Judge Value_AutoClave
3rd Pressure Judge Value_AutoClave
Chamber Temp. Judge Value_AutoClave
GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave
GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave
Insp Judge Code_Fill1
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1
Insp Judge Code_Fill2
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2
HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2


In [18]:
df_test = df_test[[i for i in df.columns if i!='target']]

In [19]:
df['target'] = np.where(df['target'] == 'Normal',0,1)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5640 entries, 0 to 5639
Columns: 153 entries, 2nd Pressure Collect Result_AutoClave to target
dtypes: float64(65), int32(1), int64(70), object(17)
memory usage: 6.6+ MB


In [21]:
# 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam' 전처리
col = 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'
df[col] = np.where(df[col] == 'OK',0,df[col])
df_test[col] = np.where(df_test[col] == 'OK',0,df_test[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = np.where(df_test[col] == 'OK',0,df_test[col])


In [22]:
df[col] = df[col].fillna(0)
df_test[col] = df_test[col].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test[col].fillna(0)


In [23]:
df[col] = df[col].astype('float64')
df_test[col] = df_test[col].astype('float64')

df[col].value_counts().keys()

df[col] = np.where(df[col] == 0,df[col].mean(),df[col])
df_test[col] = np.where(df_test[col] == 0,df_test[col].mean(),df_test[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test[col].astype('float64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = np.where(df_test[col] == 0,df_test[col].mean(),df_test[col])


In [24]:
df.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5640 entries, 0 to 5639
Columns: 153 entries, 2nd Pressure Collect Result_AutoClave to target
dtypes: float64(66), int32(1), int64(70), object(16)
memory usage: 6.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Columns: 152 entries, 2nd Pressure Collect Result_AutoClave to WorkMode Collect Result_Fill2
dtypes: float64(66), int64(70), object(16)
memory usage: 20.1+ MB


In [25]:
print(df.isnull().sum().sum())
print(df_test.isnull().sum().sum())

0
0


In [26]:
# 고유값이 한 개인 컬럼 제거
for i in df.columns:
    if len(df[i].value_counts().keys()) == 1:
        print(i)
        df = df.drop(i,axis=1)
        df_test = df_test.drop(i,axis=1)

Wip Line_Dam
Process Desc._Dam
Insp. Seq No._Dam
CURE STANDBY POSITION X Collect Result_Dam
CURE STANDBY POSITION Z Collect Result_Dam
CURE STANDBY POSITION Θ Collect Result_Dam
CURE START POSITION Z Collect Result_Dam
Wip Line_AutoClave
Process Desc._AutoClave
Equipment_AutoClave
Insp. Seq No._AutoClave
Wip Line_Fill1
Process Desc._Fill1
Insp. Seq No._Fill1
Wip Line_Fill2
Process Desc._Fill2
Insp. Seq No._Fill2
CURE END POSITION Θ Collect Result_Fill2
CURE STANDBY POSITION X Collect Result_Fill2
CURE STANDBY POSITION Θ Collect Result_Fill2
CURE START POSITION Θ Collect Result_Fill2
DISCHARGED SPEED OF RESIN Collect Result_Fill2
DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill2
DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill2
DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill2
Dispense Volume(Stage1) Collect Result_Fill2
Dispense Volume(Stage2) Collect Result_Fill2
Dispense Volume(Stage3) Collect Result_Fill2


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5640 entries, 0 to 5639
Columns: 125 entries, 2nd Pressure Collect Result_AutoClave to target
dtypes: float64(62), int32(1), int64(55), object(7)
memory usage: 5.4+ MB


In [18]:
# type이 object인 항목 중 상관관계가 1인 변수 쌍이 있기에 이를 제거
df = df.drop(['Model.Suffix_AutoClave','Model.Suffix_Fill1','Model.Suffix_Fill2'],axis=1)
df_test = df_test.drop(['Model.Suffix_AutoClave','Model.Suffix_Fill1','Model.Suffix_Fill2'],axis=1)

In [19]:
df.select_dtypes('object').columns

Index(['Equipment_Dam', 'Model.Suffix_Dam', 'Equipment_Fill1',
       'Equipment_Fill2'],
      dtype='object')

In [20]:
press = sorted([i for i in df.columns if 'AutoClave' in i and 'Collect Result' in i][:3])
press

['1st Pressure Collect Result_AutoClave',
 '2nd Pressure Collect Result_AutoClave',
 '3rd Pressure Collect Result_AutoClave']

#### 2-5. 파생변수 생성 

In [21]:
# 초기압력 0.1, 초기 부피 100으로 가정
# 몰의 법칙을 응용해서 -> 초기 압력 * 초기 부피 / 최종압력 = 최종 부피

df['기체부피1'] = (100 * 0.1) / df[press[0]]
df['기체부피2'] = (df['기체부피1'] * df[press[0]]) / df[press[1]]
df['기체부피3'] = (df['기체부피2'] * df[press[1]]) / df[press[2]]
df['기포량'] = 100 - df['기체부피3']

df_test['기체부피1'] = (100 * 0.1) / df_test[press[0]]
df_test['기체부피2'] = (df_test['기체부피1'] * df_test[press[0]]) / df_test[press[1]]
df_test['기체부피3'] = (df_test['기체부피2'] * df_test[press[1]]) / df_test[press[2]]
df_test['기포량'] = 100 - df_test['기체부피3']

df = df.drop([i for i in df.columns if '기체부피' in i],axis=1)
df_test = df_test.drop([i for i in df_test.columns if '기체부피' in i],axis=1)

In [22]:
c = pd.read_csv('주요칼럼.csv')
# train셋을 2725개씩 14등분하여 feature_importance와 permutation_importance를 교집합하여 상위 60개의 컬럼을 추출함

lst = []

for i in c.columns:
    lst += [c[i].tolist()]
    
ai = np.unique(lst,return_counts=True)
best_col = []

for i in range(len(ai[0])):
    if ai[1][i] >= 8:
        best_col += [ai[0][i]]

In [23]:
np.median(ai[1])

8.0

In [24]:
coordinate = [i for i in best_col if 'HEAD NORMAL COORDINATE' in i and 'Stage3' in i and ('Dam' in i or 'Fill2' in i)]
coordinate

['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2',
 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2']

In [25]:
# Dam, Fill2의 HEAD NORMAL COORDINATE 좌표 차이
df['corr_diff'] = np.sqrt(((df[coordinate[0]] - df[coordinate[1]])**2) + 
                        ((df[coordinate[2]] - df[coordinate[3]])**2) + 
                        ((df[coordinate[4]] - df[coordinate[5]])**2))

df_test['corr_diff'] = np.sqrt(((df_test[coordinate[0]] - df_test[coordinate[1]])**2) + 
                        ((df_test[coordinate[2]] - df_test[coordinate[3]])**2) + 
                        ((df_test[coordinate[4]] - df_test[coordinate[5]])**2))

In [26]:
res = [i for i in best_col if i in train.columns]

In [27]:
for i in df.select_dtypes('object').columns:
    df[i] = df[i].str[-1:].astype('int32')*10
    df_test[i] = df_test[i].str[-1:].astype('int32')*10
    print(i)

Equipment_Dam
Model.Suffix_Dam
Equipment_Fill1
Equipment_Fill2


In [28]:
df.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5640 entries, 0 to 5639
Columns: 124 entries, 2nd Pressure Collect Result_AutoClave to corr_diff
dtypes: float64(64), int32(5), int64(55)
memory usage: 5.2 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Columns: 123 entries, 2nd Pressure Collect Result_AutoClave to corr_diff
dtypes: float64(64), int32(4), int64(55)
memory usage: 16.0 MB


In [29]:
df['workmodes'] = df[[i for i in res if 'WorkMode' in i][:2]].mean(axis='columns')
df_test['workmodes'] = df_test[[i for i in res if 'WorkMode' in i][:2]].mean(axis='columns')

df['cure_diff'] = df['CURE START POSITION X Collect Result_Dam'] - df[[i for i in df.columns if 'HEAD NORMAL COORDINATE X' in i and 'Dam' in i]].min(axis='columns')
df_test['cure_diff'] = df_test['CURE START POSITION X Collect Result_Dam'] - df_test[[i for i in df_test.columns if 'HEAD NORMAL COORDINATE X' in i and 'Dam' in i]].min(axis='columns')

In [30]:
df['Equipment'] = df[[i for i in df.columns if 'Equipment' in i]].sum(axis='columns')
df_test['Equipment'] = df_test[[i for i in df_test.columns if 'Equipment' in i]].sum(axis='columns')

In [31]:
target0 = list(df.groupby('target').var().reset_index().iloc[0])[1:]
target1 = list(df.groupby('target').var().reset_index().iloc[1])[1:]

In [32]:
# target을 기준으로 그룹화했을 때 분산의 차이가 전체 합의 10% 이상인 column만 추출해서 평균을 냄

pers = []

for i in range(len(target0)):
    per = round(abs(target0[i] - target1[i]) / (target0[i] + target1[i]),1)
    if per > 0:
        pers += [df.groupby('target').var().reset_index().columns[1:][i]]

In [33]:
df['means'] = df[pers].mean(axis='columns')
df_test['means'] = df_test[pers].mean(axis='columns')

In [34]:
# target과 상관관계가 너무 높아서 int로 치환
df['means'] = df['means'].astype('int64')
df_test['means'] = df_test['means'].astype('int64')

In [35]:
# 고유값의 개수가 3개 이상인 경우 각 고유값별로 target이 1일 확률이 0.5 이상이면 1, 아니면 0으로 치환
l = []

for col in df.columns:
    if len(df[col].value_counts().keys()) >= 3:
        per0 = []
        per1 = []
        per = []

        for i in list(df[col].value_counts().keys()):
            a = len(df[df[col] == i])
            b = len(df[(df[col] == i) & (df['target'] == 1)])
            per += [round(b/a,1)]

            if (round(b/a,1)) >= 0.5:
                per0 += [i]
            else:
                per1 += [i]

        if len([i for i in np.unique(per) if i >= 0.5]) >= 1:
            df[col] = np.where(df[col].isin(per0),1,0)
            df_test[col] = np.where(df_test[col].isin(per0),1,0)
            l += [col]
        else:
            df = df.drop(col,axis=1)
            df_test = df_test.drop(col,axis=1)

In [36]:
for i in df.columns:
    if len(df[i].value_counts().keys()) == 1:
        df = df.drop(i,axis=1)
        df_test = df_test.drop(i,axis=1)

In [37]:
corr = pd.DataFrame(df.corrwith(df['target'])).reset_index()
corr = corr.fillna(0)
corr.sort_values(0,ascending=False).head(20)

Unnamed: 0,index,0
120,target,1.0
122,corr_diff,0.226973
16,HEAD NORMAL COORDINATE X AXIS(Stage2) Collect ...,0.209264
126,means,0.198401
18,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect ...,0.189296
19,HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect ...,0.186735
17,HEAD NORMAL COORDINATE X AXIS(Stage3) Collect ...,0.184402
35,Stage1 Circle1 Distance Speed Collect Result_Dam,0.171857
124,cure_diff,0.168463
20,HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect ...,0.166341


In [38]:
df = df.drop([i for i in df.columns if 'Equipment_' in i],axis=1)
df_test = df_test.drop([i for i in df_test.columns if 'Equipment_' in i],axis=1)

In [39]:
l = []
for i in df.columns:
    if list(df[i].value_counts().keys()) == [0,1]:
        l += [i]

In [40]:
l = [i for i in l if i != 'target']

In [41]:
df['01'] = df[l].sum(axis='columns')
df_test['01'] = df_test[l].sum(axis='columns')

  df['01'] = df[l].sum(axis='columns')
  df_test['01'] = df_test[l].sum(axis='columns')


In [42]:
# 고유값이 [0,1]인 column만 추출해서 평균을 구해서 1,0으로 치환
col = '01'
per0 = []
per1 = []
per = []

for i in list(df[col].value_counts().keys()):
    a = len(df[df[col] == i])
    b = len(df[(df[col] == i) & (df['target'] == 1)])
    per += [round(b/a,1)]

    if (round(b/a,1)) >= 0.6:
        per0 += [i]
    else:
        per1 += [i]

print(np.unique(per,return_counts=True))
    
df[col] = np.where(df[col].isin(per0),1,0)
df_test[col] = np.where(df_test[col].isin(per0),1,0)

(array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), array([ 2,  1,  4, 10, 14, 23, 14,  3,  3,  1,  3], dtype=int64))


In [43]:
df.head()

Unnamed: 0,2nd Pressure Collect Result_AutoClave,Model.Suffix_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,...,Head Purge Position Z Collect Result_Fill2,WorkMode Collect Result_Fill2,target,기포량,corr_diff,workmodes,cure_diff,Equipment,means,01
0,0,0,1000.0,12.5,90,280,90,0,0,0,...,50,0,0,0,0,0,0,0,0,0
1,0,0,240.0,2.5,-90,1030,-90,1,0,0,...,50,0,0,0,0,0,0,0,0,0
2,0,0,240.0,2.5,-90,1030,-90,0,0,0,...,50,0,0,0,0,0,0,0,0,0
3,0,0,240.0,2.5,-90,1030,-90,1,1,1,...,85,1,0,0,1,1,1,0,1,1
4,1,0,1000.0,12.5,90,280,90,1,0,0,...,50,0,0,0,0,0,0,0,0,0


In [44]:
df.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5640 entries, 0 to 5639
Columns: 125 entries, 2nd Pressure Collect Result_AutoClave to 01
dtypes: float64(13), int32(95), int64(17)
memory usage: 3.3 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17361 entries, 0 to 17360
Columns: 124 entries, 2nd Pressure Collect Result_AutoClave to 01
dtypes: float64(13), int32(94), int64(17)
memory usage: 10.2 MB


In [45]:
# target이 한 쪽으로 치우칠것을 방지하여 데이터를 섞음
df = df.sample(frac=1,random_state=RANDOM_STATE).reset_index(drop=True)

In [46]:
xtrain = df.drop(['target'],axis=1)
ytrain = df['target']
xtest = df_test

In [47]:
xtrain.shape

(5640, 124)

In [48]:
# pca를 통해 pca.explained_variance_ratio_가 1인 지점을 찾아 변환
from sklearn.decomposition import PCA

component = 0

for i in range(1,125):
    pca = PCA(n_components=i, random_state=RANDOM_STATE) # 주성분을 몇개로 할지 결정
    a = pca.fit_transform(xtrain)
    if (sum(pca.explained_variance_ratio_)) == 1:
        print(i)
        component = i
        break

67


In [49]:
print(component)

67


In [50]:
pca = PCA(n_components=component) # 주성분을 몇개로 할지 결정
xtrain = pca.fit_transform(xtrain)
xtest = pca.transform(xtest)

In [51]:
from sklearn.preprocessing import StandardScaler

# svm 학습을 위한 정규화
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

In [52]:
from sklearn import svm
clf = svm.SVC(kernel='rbf',random_state=RANDOM_STATE)
clf.fit(xtrain,ytrain)
print(clf.score(xtrain,ytrain))

0.6852836879432624


In [53]:
from sklearn.model_selection import cross_val_score

for metric in ['accuracy','precision_macro', 'recall_macro', 'f1_macro']:
    scores = cross_val_score(clf, xtrain,ytrain, cv=5, scoring=metric)

    print(f'{metric}: {scores}')
    print(f'평균 {metric} 스코어: {scores.mean()}')

accuracy: [0.65248227 0.66312057 0.63829787 0.66046099 0.6462766 ]
평균 accuracy 스코어: 0.6521276595744682
precision_macro: [0.64242424 0.65835782 0.62557582 0.65274119 0.6351178 ]
평균 precision_macro 스코어: 0.6428433746200094
recall_macro: [0.6212766  0.62826748 0.60577508 0.62841945 0.61443769]
평균 recall_macro 스코어: 0.6196352583586625
f1_macro: [0.62038462 0.62631684 0.60353816 0.62761656 0.61294883]
평균 f1_macro 스코어: 0.6181610013755157


In [54]:
pred = clf.predict(xtest)

In [55]:
df_sub = pd.read_csv("submission원본.csv")
df_sub["target"] = pred


df_sub['target'] = np.where(df_sub['target'] == 0,'Normal','AbNormal')

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

In [56]:
re = pd.read_csv('submission.csv')
re['target'].value_counts()

target
Normal      13511
AbNormal     3850
Name: count, dtype: int64