In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV

# Load training and validation data
train_data = pd.read_excel('Train.xlsx')
test_data = pd.read_excel('Test.xlsx')

train_data.columns = train_data.iloc[0, :]
train_data = train_data.iloc[1:, 1:].reset_index(drop = True)

test_data.columns = test_data.iloc[0, :]
test_data = test_data.iloc[1:, 2:].reset_index(drop = True)

# Fit hyperparameters with GridSearchCV
params = {'n_estimators': [50, 100],
          'max_samples': [10, 20],
          'contamination': [0.01, 0.05]}

model = GridSearchCV(IsolationForest(random_state=42), params, cv=5)
model.fit(train_data)

# Fit final model with best hyperparameters
best_params = model.best_params_
print('Best hyperparameters:', best_params)

model = IsolationForest(random_state=42, **best_params)
model.fit(train_data)

# Predict on validation data
pred = model.predict(test_data)
pred = np.where(pred == 1, 0, 1)  # Convert predictions to 0 for normal, 1 for abnormal

# Add predictions to Test.xlsx
test_data['Process variables'] = pred
test_data.to_excel('Test_predictions.xlsx', index=False)

TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator IsolationForest(random_state=42) does not.

## 1. 사용할 패키지 불러오기

In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV

## 2. 데이터 불러오기

#### (1) Pandas의 read_excel 함수를 사용하면 xlsx 형식의 파일을 불러올 수 있다.

In [15]:
train_data = pd.read_excel('Train.xlsx', header=None)
test_data = pd.read_excel('Test.xlsx', header=None)

#### (2) 데이터의 정보가 2번 Index부터 시작하므로 전처리를 해줘야 한다.

In [16]:
train_data.columns = train_data.iloc[1, :]
train_data = train_data.iloc[2:, 1:].reset_index(drop = True)

test_data.columns = test_data.iloc[1, :]
test_data = test_data.iloc[2:, 2:].reset_index(drop = True)

## 3. Hyper-parameter Setting

Scikit-learn 패키지의 GridSearchCV 함수를 사용하면, Hyper-parameter tuning을 수행할 수 있다.  
params (Dictionary)에 파라미터 목록을 추가하면 된다.

In [18]:
# Fit hyperparameters with GridSearchCV
params = {'n_estimators': [50, 100],
          'max_samples': [10, 20],
          'contamination': [0.01, 0.05]}

model = GridSearchCV(IsolationForest(random_state=42), params, cv=5, scoring="accuracy")
model.fit(train_data, np.array([1] * train_data.shape[0]))

GridSearchCV(cv=5, estimator=IsolationForest(random_state=42),
             param_grid={'contamination': [0.01, 0.05], 'max_samples': [10, 20],
                         'n_estimators': [50, 100]},
             scoring='accuracy')

In [19]:
# Fit final model with best hyperparameters
best_params = model.best_params_
print('Best hyperparameters:', best_params)

Best hyperparameters: {'contamination': 0.01, 'max_samples': 20, 'n_estimators': 50}


## 4. 예측

In [20]:
# Predict on validation data
pred = model.predict(test_data)
pred = np.where(pred == 1, 0, 1)  # Convert predictions to 0 for normal, 1 for abnormal

# Add predictions to Test.xlsx
test_data['Process variables'] = pred
test_data.to_excel('Test_predictions.xlsx', index=False)