In [None]:
# For 10-fold cross-validation
# Divide the data into 10 parts and create 10 data sets with train-test=9:1
# Perform a grid search for each data set

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np

In [None]:
# Load the tab-separated dataset into a DataFrame
df = pd.read_csv('dataset.txt', sep='\t')
# Preview the first 5 rows to confirm the import looks right
df.head(5)

In [None]:
# Separate the label column ('species') from the remaining feature columns
data_y = df['species']
data_x = df.drop(columns=['species'])

In [None]:
# Divide the data into 10 stratified parts and create 10 data sets with train:test = 9:1
# Export each pair as CSV files (train_01.csv, test_01.csv, ..., train_10.csv, test_10.csv)
from sklearn.model_selection import StratifiedKFold

# Single source of truth for the number of folds; change it here only.
n_folds = 10
kfold = StratifiedKFold(n_splits=n_folds, random_state=123, shuffle=True)
# Fold numbers 1..n_folds; kept as a module-level name because later cells loop over `ranges`.
ranges = range(1, n_folds + 1)

# enumerate() numbers the folds straight from the splitter, so the number of exported
# files always follows n_splits instead of being cut short by a separately hard-coded range.
for i, (train_idx, test_idx) in enumerate(kfold.split(data_x, data_y), start=1):
    X_train = data_x.iloc[train_idx, :]
    X_test = data_x.iloc[test_idx, :]
    y_train = data_y.iloc[train_idx]
    y_test = data_y.iloc[test_idx]
    # Label column first, followed by the feature columns
    train_concat = pd.concat([y_train, X_train], axis=1)
    test_concat = pd.concat([y_test, X_test], axis=1)
    # %02d zero-pads the integer fold number (01, 02, ..., 10)
    train_concat.to_csv("train_%02d.csv" % i, index=False)
    test_concat.to_csv("test_%02d.csv" % i, index=False)

In [None]:
# Sanity check: reload the first exported test set and preview its first 5 rows
xx = pd.read_csv("test_01.csv")
xx.head(5)

In [None]:
# Model construction for grid search
# Base random-forest estimator passed to GridSearchCV in the cells below; it is created
# with no arguments here, and the values to try are supplied through search_params.
clf_cv = RandomForestClassifier()

In [None]:
# Search grid, 1st round
# Edit the candidate values in each list to change what is searched
search_params = dict(
    n_estimators=[10, 100, 200, 300, 500, 700, 1000, 1500],
    max_features=["sqrt", 20, 30, 40, 50, 60, 70],
    random_state=[123],
)

In [None]:
# Configure the 1st-round grid search
gs = GridSearchCV(
    estimator=clf_cv,           # model to tune
    param_grid=search_params,   # candidate parameter values
    cv=5,                       # cross validation
    verbose=True,               # display log
    n_jobs=-1,                  # number of parallel jobs; -1 uses all processors
)

In [None]:
# Perform the 1st-round grid search on each of the 10 training sets
ranges = range(1, 11)  # fold numbers used in the train_XX.csv file names
for i in ranges:
    train = pd.read_csv("train_%02d.csv" % i)
    # 'species' is the label; 'waves' is likewise excluded from the features
    train_x = train.drop(['species', 'waves'], axis=1)
    train_y = train['species']
    gs.fit(train_x, train_y)
    # Report best_params_ instead of the estimator repr: recent scikit-learn versions
    # print only parameters that differ from their defaults, so a winning value such as
    # max_features='sqrt' may not show up in print(gs.best_estimator_).
    # The fold number ties each line to its data set; best_score_ is the mean CV score.
    print("fold %02d: best_params=%s, best_score=%.4f" % (i, gs.best_params_, gs.best_score_))

In [None]:
# Search grid, 2nd round
# If the 1st round was inconclusive, narrow down or change the candidate values and search again
search_params = dict(
    n_estimators=[200, 300, 400, 500, 600],
    max_features=["sqrt"],
    random_state=[123],
    n_jobs=[1],
)

In [None]:
# Configure the 2nd-round grid search (same settings, narrowed search_params)
gs = GridSearchCV(
    estimator=clf_cv,           # model to tune
    param_grid=search_params,   # candidate parameter values
    cv=5,                       # cross validation
    verbose=True,               # display log
    n_jobs=-1,                  # number of parallel jobs; -1 uses all processors
)

In [None]:
# Perform the 2nd-round grid search on each training set
# (`ranges` holds the fold numbers and is defined in the 1st-round search cell)
for i in ranges:
    train = pd.read_csv("train_%02d.csv" % i)
    # 'species' is the label; 'waves' is likewise excluded from the features
    train_x = train.drop(['species', 'waves'], axis=1)
    train_y = train['species']
    gs.fit(train_x, train_y)
    # Report best_params_ instead of the estimator repr: recent scikit-learn versions
    # print only parameters that differ from their defaults, so a winning value such as
    # max_features='sqrt' may not show up in print(gs.best_estimator_).
    # The fold number ties each line to its data set; best_score_ is the mean CV score.
    print("fold %02d: best_params=%s, best_score=%.4f" % (i, gs.best_params_, gs.best_score_))