In [None]:
#Load Package

import multiprocessing # distributes work across several workers (CPUs)
import os
from functools import partial # fixes some of a function's arguments to derive a new function
from multiprocessing import Pool

import joblib # saves and loads models
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from data_loader_v2 import data_loader_v2 # in-house data loader version 2.0 (see the "[Dacon 15th competition] data description and data loading" video)

In [None]:
#Set Path
# The data root can be overridden with the DACON_DATA_DIR environment variable;
# the default keeps the original local Windows layout.
data_root = os.environ.get('DACON_DATA_DIR', 'D:\\taeyong\\dacon')

# The folder paths keep a trailing separator, as the original literals did
# (presumably the loader concatenates folder + file name -- confirm in data_loader_v2).
train_folder = os.path.join(data_root, 'train', '')
test_folder = os.path.join(data_root, 'test', '')
train_label_path = os.path.join(data_root, 'train_label.csv')

In [None]:
#Load Files
# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the loaded data (and therefore the later seeded split) reproducible.
train_list = sorted(os.listdir(train_folder))
test_list = sorted(os.listdir(test_folder))

# The first csv column is used as the index of the label table.
train_label = pd.read_csv(train_label_path, index_col=0)

In [None]:
# Assumes every csv file switches to state_B at the same point in time.
# In reality, the switch time may differ between individual csv files.

def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):
    """Load every entry of ``files`` with ``func`` and concatenate the results.

    Parameters
    ----------
    func : callable
        Loader called as ``func(file, folder=..., train_label=..., event_time=..., nrows=...)``;
        each call must return an object accepted by ``pd.concat``.
    files : iterable
        Items passed one by one to ``func`` as its first argument.
    folder, train_label, event_time, nrows
        Forwarded unchanged to ``func`` as keyword arguments.

    Returns
    -------
    The ``pd.concat`` of all per-file results, in the order of ``files``.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``files`` is empty.
    """
    func_fixed = partial(func, folder=folder, train_label=train_label, event_time=event_time, nrows=nrows)
    if __name__ == '__main__':
        # One worker per CPU. The context manager tears the pool down even when a
        # worker raises; imap is fully consumed by list() before the pool exits
        # and yields results in input order.
        with Pool(processes=multiprocessing.cpu_count()) as pool:
            df_list = list(pool.imap(func_fixed, files))
    else:
        # Not running as the entry module: load sequentially so that df_list is
        # always defined (previously this path failed with UnboundLocalError).
        df_list = [func_fixed(file) for file in files]
    combined_df = pd.concat(df_list)
    return combined_df

In [None]:
#Train
# Run the loader over every training file and stack the results into one frame.
# event_time=10 / nrows=60 are forwarded to data_loader_v2 (their exact meaning
# is defined by that loader -- see its documentation).
train = data_loader_all_v2(
    func=data_loader_v2,
    files=train_list,
    folder=train_folder,
    train_label=train_label,
    event_time=10,
    nrows=60,
)

In [None]:
#Preprocessing
# Separate the target column from the feature columns.
y_train = train['label']
X_train = train.drop(columns=['label'])

In [None]:
#Parameters
# Candidate hyper-parameter values for the random-forest grid search.
# NOTE: the full grid has 10 * 2 * 11 * 3 * 3 * 2 = 3960 combinations.

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# 'max_features' was used in the grid below without being defined (NameError).
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

parameters = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
#GridSearch
# Exhaustive 5-fold cross-validated search over the `parameters` grid.

model = RandomForestClassifier(random_state = 42)

grid = GridSearchCV(model, param_grid = parameters, cv = 5)

# The labels live in `y_train`; the previously used `Y_train` is never defined.
grid.fit(X_train, y_train)

# One row per evaluated parameter combination.
score_df = pd.DataFrame(grid.cv_results_)

score_df

In [None]:
#Best
# Report the best parameter combination and its mean cross-validated score.

print('GridSearchCV 최적 parameters : ', grid.best_params_)
# Fixed the 'GridSeafchCV' typo in the printed label.
print('GridSearchCV 최고 accuracy : ', grid.best_score_)

### The values in grid.best_params_ must be applied to the model!!!

In [None]:
#train_test_split
# Hold out 25% of the training data for validation. The split is written to new
# names so X_train / y_train keep the full data and re-running this cell does
# not split an already-split set.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=40)

# NOTE(review): this model uses default hyper-parameters; grid.best_params_ is
# not applied here.
model = RandomForestClassifier(random_state=42, verbose=1, n_jobs=-1)
model.fit(X_tr, y_tr)

In [None]:
#Validation
# Predict class labels for the held-out validation rows.
pred = model.predict(X=X_val)

In [None]:
#Accuracy
# Validation accuracy, rounded to 4 decimal places.
accuracy = np.round(accuracy_score(y_val, pred), 4)

# End the cell with the value so the notebook displays it; previously the
# metric was computed but never shown.
accuracy

In [None]:
#Test
# As with the training data, state_B is assumed to start at 10 seconds for
# every file in the test set.
test = data_loader_all_v2(
    func=data_loader_v2,
    files=test_list,
    folder=test_folder,
    train_label=None,
    event_time=10,
    nrows=60,
)

In [None]:
#Prediction
# Per-class probabilities for every loaded test row.
pred = model.predict_proba(X=test)

In [None]:
#Make submission
# Wrap the probabilities in a frame that shares the test index, named 'id'.
submission = pd.DataFrame(pred, index=test.index)
submission.index.name = 'id'

# Order by id, then average all rows that share an id.
submission = submission.sort_index().groupby(level='id').mean()

submission.to_csv('dacon.csv', index=True)