In [1]:
#Load Package

import os
import pandas as pd
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader_v2 import data_loader_v2 # 자체적으로 만든 data loader version 2.0 ([데이콘 15회 대회] 데이터 설명 및 데이터 불러오기 영상 참조)

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import joblib # 모델을 저장하고 불러오는 역할

In [2]:
#Set Path
# Root directory of the competition data. Defaults to the original local
# location; set the DACON_DATA_DIR environment variable to run elsewhere
# without editing the notebook.
DATA_DIR = os.environ.get('DACON_DATA_DIR', 'E:\\DACON')

# Joining with a final '' makes os.path.join append a trailing separator.
# NOTE(review): presumably the loader builds file paths as folder + file name,
# which is why the trailing separator is kept -- confirm against data_loader_v2.
train_folder = os.path.join(DATA_DIR, 'train', '')
test_folder = os.path.join(DATA_DIR, 'test', '')
train_label_path = os.path.join(DATA_DIR, 'train_label.csv')

In [3]:
#Load Files
# os.listdir returns entries in arbitrary order. Sort the listings so that the
# row order of the loaded frames (and therefore the CV folds and the later
# train/validation split) is the same on every run and every machine.
train_list = sorted(os.listdir(train_folder))
test_list = sorted(os.listdir(test_folder))

# The first CSV column is used as the index of the label table.
train_label = pd.read_csv(train_label_path, index_col=0)

In [4]:
# Assumption: every csv file switches to state_B at the same time (event_time).
# In reality the switch time may differ from one csv file to another.

def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60):
    """Load every entry of ``files`` with ``func`` and concatenate the results.

    Parameters
    ----------
    func : callable
        Loader invoked as ``func(file, folder=..., train_label=...,
        event_time=..., nrows=...)``; each call must return an object that
        ``pd.concat`` accepts (e.g. a DataFrame).
    files : iterable
        Items passed one by one as the first positional argument of ``func``.
    folder, train_label, event_time, nrows
        Forwarded unchanged to every ``func`` call as keyword arguments.

    Returns
    -------
    pandas.DataFrame
        ``pd.concat`` of the per-file results, in the order of ``files``.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``files`` is empty.
    """
    func_fixed = partial(func, folder=folder, train_label=train_label,
                         event_time=event_time, nrows=nrows)

    if __name__ == '__main__':
        # Fan the files out over one worker per CPU. The context manager tears
        # the workers down even when a loader raises; all results are already
        # collected by the time the block exits.
        with Pool(processes=multiprocessing.cpu_count()) as pool:
            df_list = list(pool.imap(func_fixed, files))
    else:
        # Not running as the main module: do not spawn processes, load the
        # files sequentially instead. Previously this path left df_list
        # unbound and the function crashed with UnboundLocalError.
        df_list = [func_fixed(file) for file in files]

    return pd.concat(df_list)

In [5]:
#Train
# Load every training file and attach its label. event_time=10 applies the
# same assumed state_B start to every file.
train = data_loader_all_v2(
    data_loader_v2,
    train_list,
    folder=train_folder,
    train_label=train_label,
    event_time=10,
    nrows=60,
)

In [6]:
# Summarise the loaded training frame: row/column counts, dtypes and memory use.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41400 entries, 0 to 99
Columns: 5122 entries, V0000 to label
dtypes: float64(5121), int64(1)
memory usage: 1.6 GB


In [7]:
#Preprocessing
# Separate the target column from the feature columns; `train` itself is
# left untouched.
y_train = train['label']
X_train = train.drop(columns=['label'])

In [10]:
#Parameters
# Candidate values for the grid search. Only the split/leaf size constraints
# are varied; n_estimators is pinned to a single value, and max_features,
# max_depth and bootstrap are not part of the grid (estimator defaults apply).
n_estimators = [1000]
min_samples_split = [2, 5, 10]
min_samples_leaf = [2, 3]

parameters = dict(
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [11]:
#GridSearch
# Base estimator whose hyper-parameters are tuned by the search.
model = RandomForestClassifier(n_jobs=12, random_state=0, verbose=1)

# Try every combination in `parameters` with 4-fold cross-validation.
grid = GridSearchCV(estimator=model, param_grid=parameters, cv=4)
grid.fit(X_train, y_train)

# Tabulate the cross-validation results, one row per parameter combination.
score_df = pd.DataFrame(grid.cv_results_)
score_df

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:   23.3s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:  1.6min
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:  3.6min
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:  6.3min
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:  8.1min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.4s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    4.4s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    5.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.4s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:  

[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    8.2s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   15.0s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:   19.2s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:   16.9s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:  1.5min
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:  3.5min
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:  6.4min
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:  8.2min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    1.2s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.9s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    5.4s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    6.9s finished
[P

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.4s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    8.2s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   15.0s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:   19.2s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:   16.9s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:  1.5min
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:  3.5min
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:  6.3min
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:  8.0min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:  

[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.7s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    5.0s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    6.4s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    3.4s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    8.2s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   15.1s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:   19.4s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:   16.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:  1.4min
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:  3.4min
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:  6.2min
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:  7.9min finished
[P

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,...,split3_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,mean_train_score,std_train_score
0,489.197828,2.506609,6.008653,0.082614,2,2,1000,"{'min_samples_leaf': 2, 'min_samples_split': 2...",0.775379,0.822203,...,0.819517,0.810531,0.020493,3,1.0,0.999968,1.0,1.0,0.999992,1.4e-05
1,487.151089,6.021424,6.73571,0.34047,2,5,1000,"{'min_samples_leaf': 2, 'min_samples_split': 5...",0.775571,0.823546,...,0.82049,0.811329,0.020833,2,1.0,0.999935,0.999968,1.0,0.999976,2.7e-05
2,482.978037,6.862116,6.711779,0.191757,2,10,1000,"{'min_samples_leaf': 2, 'min_samples_split': 1...",0.776147,0.822395,...,0.821463,0.811667,0.020709,1,0.999935,0.999709,0.999679,0.999679,0.999751,0.000107
3,479.325297,5.228537,6.648206,0.07028,3,2,1000,"{'min_samples_leaf': 3, 'min_samples_split': 2...",0.774419,0.820284,...,0.820393,0.809638,0.020476,5,1.0,0.999774,0.999743,0.999679,0.999799,0.000121
4,479.139316,6.252636,6.738452,0.041099,3,5,1000,"{'min_samples_leaf': 3, 'min_samples_split': 5...",0.774419,0.820284,...,0.820393,0.809638,0.020476,5,1.0,0.999774,0.999743,0.999679,0.999799,0.000121
5,476.938178,5.403797,6.721759,0.121283,3,10,1000,"{'min_samples_leaf': 3, 'min_samples_split': 1...",0.775091,0.820956,...,0.817377,0.809903,0.020452,4,0.999903,0.999387,0.999261,0.999293,0.999461,0.000259


In [12]:
#Best
# Report the winning parameter combination and its mean cross-validated score.
# (Label typo 'GridSeafchCV' corrected to 'GridSearchCV'.)
print('GridSearchCV 최적 parameters : ', grid.best_params_)
print('GridSearchCV 최고 accuracy : ', grid.best_score_)

GridSearchCV 최적 parameters :  {'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 1000}
GridSeafchCV 최고 accuracy :  0.8116666666666666


### grid.best_params_의 결과를 모델에 세팅해야 합니다!!!

In [13]:
#train_test_split
# Split from the full `train` frame instead of from X_train/y_train themselves:
# the cell rebinds X_train/y_train, so splitting them in place would shrink the
# training set again on every re-run. With a fixed random_state this yields the
# same 75/25 split each time the cell is executed.
# NOTE(review): the split is row-wise. If rows sharing an index value come from
# the same source file, related rows can land on both sides and inflate the
# validation score -- confirm whether a grouped split is needed.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=['label']), train['label'], test_size=0.25, random_state=0)

# Hyper-parameters are the grid-search winners (grid.best_params_).
model = RandomForestClassifier(n_estimators=1000, min_samples_split=10, min_samples_leaf=2,
                               random_state=42, verbose=1, n_jobs=12)
model.fit(X_train, y_train)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:   18.5s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:  1.6min
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:  3.9min
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:  7.0min
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:  9.0min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=12,
            oob_score=False, random_state=42, verbose=1, warm_start=False)

In [14]:
#Validation
# Predict a class label for every held-out validation row.

pred = model.predict(X_val)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    1.2s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    3.0s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    5.6s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:    7.2s finished


In [15]:
#Accuracy
# Fraction of validation rows whose predicted label matches the true label,
# rounded to four decimals.
accuracy = np.around(accuracy_score(y_true=y_val, y_pred=pred), decimals=4)
accuracy

0.8939

In [16]:
#Test
# As for the training data, assume state_B starts at 10 seconds in every
# test file. No labels are passed for the test set.
test = data_loader_all_v2(
    data_loader_v2,
    test_list,
    folder=test_folder,
    train_label=None,
    event_time=10,
    nrows=60,
)

In [17]:
#Prediction
# Per-class probabilities for every test row. Note that this rebinds `pred`,
# which held the validation labels until now.

pred = model.predict_proba(test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    6.1s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   15.4s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:   28.7s
[Parallel(n_jobs=12)]: Done 1000 out of 1000 | elapsed:   37.1s finished


In [18]:
#Make submission
# Give the frame its own 'id'-named copy of the test index. Assigning
# test.index directly and then setting its name would rename test's index as
# well, because both frames would share the same Index object.
# Columns are labelled from model.classes_, the class order predict_proba
# uses, rather than by position, so a label gap cannot shift the headers.
submission = (
    pd.DataFrame(pred, index=test.index.rename('id'), columns=model.classes_)
    .sort_index()
    .groupby('id')
    .mean()  # average the probabilities of all rows that share an id
)
submission.to_csv('dacon.csv', index=True)