In [None]:
# Create models using the parameters selected by grid search
# Export feature importance, confusion matrix array and class probability for each train-test set
# Calculate the score for each set
# Visualize confusion matrix
# OPTION: OOB error rate

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np

In [None]:
# Create the model
# The hyperparameters below are the values selected by grid search.
# random_state is fixed so the classifier is seeded identically on every run.
clf = RandomForestClassifier(
    n_estimators=200,
    max_features='sqrt',
    random_state=123,
)

In [None]:
# For each train/test set (01 ... 10):
#   - fit the classifier on the training set
#   - export the feature importances as a CSV file
#   - export the confusion matrix as a NumPy array (.npy)
#   - export the class probabilities of the test samples as a CSV file
#   - record the accuracy score on the test set
ranges = range(1,11)
# Index 0 holds an empty-string placeholder so that score_all[i] is the score of set i.
score_all = [''] * 11
# Row/column order of every exported confusion matrix.
class_order = ['Escherichia', 'Bacillus', 'Thermus', 'Thermococcus', 'Sulfolobus', 'Nitrososphaera']
for i in ranges:
    # Zero-padded set number shared by all file names of this set ("01", ..., "10").
    tag = "%02.f" % (i)

    # 'species' is the target; 'waves' is excluded from the feature columns.
    train = pd.read_csv("train_" + tag + ".csv")
    train_y = train['species']
    train_x = train.drop(['species', 'waves'], axis=1)
    test = pd.read_csv("test_" + tag + ".csv")
    test_w = test['waves']
    test_y = test['species']
    test_x = test.drop(['species', 'waves'], axis=1)

    clf.fit(train_x, train_y)

    # Feature importances, one row per feature column of the training set.
    impFeat = pd.DataFrame(
        {"Importance": clf.feature_importances_, "Feature_Name": list(train_x.columns)},
        columns=["Importance", "Feature_Name"],
    )
    impFeat.to_csv("IF_" + tag + ".csv")

    # Confusion matrix with the classes sorted in the order given by class_order.
    y_pred = clf.predict(test_x)
    conf_mat = confusion_matrix(test_y, y_pred, labels=class_order)
    np.save("model_array_" + tag, conf_mat)

    # Class probabilities, prefixed with the 'waves' and 'species' columns of the test set.
    proba = pd.DataFrame(clf.predict_proba(test_x), columns=clf.classes_)
    classproba = pd.concat([test_w, test_y, proba], axis=1)
    classproba.to_csv("predict_proba_" + tag + ".csv")

    # Accuracy on the test set.
    score_all[i] = clf.score(test_x, test_y)

In [None]:
# Display the score for each set
# score_all is created with an empty-string placeholder at index 0.
# Remove it only while it is still there, so that re-running this cell
# cannot discard the real score of set 01.
if score_all and isinstance(score_all[0], str):
    score_all.pop(0)
print(score_all)

In [None]:
# Display the mean of scores
# The empty-string placeholder (if it is still in the list) is skipped, so the
# result is the same whether or not the placeholder has already been removed.
print(np.mean([s for s in score_all if not isinstance(s, str)]))

In [None]:
# Display the standard deviation of scores
# np.std is called with its default settings, as before.
# NOTE(review): the NumPy default is the population standard deviation (ddof=0) - confirm this is intended.
# The empty-string placeholder (if it is still in the list) is skipped, so the
# result is the same whether or not the placeholder has already been removed.
print(np.std([s for s in score_all if not isinstance(s, str)]))

In [None]:
# Check the array of the first set
# Rows and columns follow the class order passed as 'labels' when the array was created
first_set_matrix = np.load('model_array_01.npy')
print(first_set_matrix)

In [None]:
# Import the arrays
# Load the confusion matrices of sets 01 ... 10 in order and bind each one
# to its own model_array_NN name.
(
    model_array_01,
    model_array_02,
    model_array_03,
    model_array_04,
    model_array_05,
    model_array_06,
    model_array_07,
    model_array_08,
    model_array_09,
    model_array_10,
) = [np.load('model_array_%02d.npy' % set_no) for set_no in range(1, 11)]

In [None]:
# Addition
# Element-wise sum of the ten confusion matrices, accumulated from set 01 to set 10.
model_array_sum = sum(
    (
        model_array_02,
        model_array_03,
        model_array_04,
        model_array_05,
        model_array_06,
        model_array_07,
        model_array_08,
        model_array_09,
        model_array_10,
    ),
    model_array_01,
)
model_array_sum

In [None]:
# Setting for visualization of the confusion matrix
# Display the number of samples: normalize=False
# For percentages: normalize=True
# Original script: https://scikit-learn.org/0.18/auto_examples/model_selection/plot_confusion_matrix.html
import matplotlib.pyplot as plt
%matplotlib inline
import itertools
import numpy as np
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix. Rows are drawn along the 'True label' axis and
        columns along the 'Predicted label' axis.
    classes : sequence
        Tick labels for both axes, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        If True, every row is divided by its row sum and the cells are shown
        with two decimals. If False, the values are shown as they are using
        the integer format 'd'.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row whose sum is 0 would produce 0/0 = NaN; leave such rows as
        # zeros instead. Rows with a non-zero sum are divided as usual.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum value get white text, the others black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called after the axis labels are set so that they are part of the layout.
    plt.tight_layout()

In [None]:
# Visualize the confusion matrix
# !!CAUTION!! The class names in 'species' must be listed in the same order as the
# 'labels' argument that was passed to confusion_matrix when the arrays were created.
species = [
    'Escherichia',
    'Bacillus',
    'Thermus',
    'Thermococcus',
    'Sulfolobus',
    'Nitrososphaera',
]
plot_confusion_matrix(cm=model_array_sum, classes=species)

In [None]:
# Visualize OOB error rate
# !!CAUTION!! This cell takes a long time to run;
#  looping over all 10 sets in a single run is not recommended.

# Author: Kian Ho <hui.kian.ho@gmail.com>
#         Gilles Louppe <g.louppe@gmail.com>
#         Andreas Mueller <amueller@ais.uni-bonn.de>
#
# License: BSD 3 Clause

import matplotlib.pyplot as plt

from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

RANDOM_STATE = 123

# The synthetic make_classification dataset of the original scikit-learn example
# is replaced by the first training set.
# Because of the computation time required, only one set is processed per run;
# change the file name and re-run the cell to process another set.
train = pd.read_csv("train_01.csv")
X = train.drop(['species','waves'], axis=1)
y = train['species']

# NOTE: Setting the `warm_start` construction parameter to `True` disables
# support for parallelized ensembles but is necessary for tracking the OOB
# error trajectory during training.

ensemble_clfs = [
    (
        "RandomForestClassifier, max_features='sqrt'",
        RandomForestClassifier(
            warm_start=True,
            oob_score=True,
            max_features="sqrt",
            random_state=RANDOM_STATE,
        ),
    ),
    (
        "RandomForestClassifier, max_features='log2'",
        RandomForestClassifier(
            warm_start=True,
            max_features="log2",
            oob_score=True,
            random_state=RANDOM_STATE,
        ),
    ),
    (
        "RandomForestClassifier, max_features=None",
        RandomForestClassifier(
            warm_start=True,
            max_features=None,
            oob_score=True,
            random_state=RANDOM_STATE,
        ),
    ),
]

# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# Range of `n_estimators` values to explore.
min_estimators = 1
max_estimators = 500

# The loop variable is deliberately not named `clf`: that name holds the
# grid-search model created earlier in the notebook, and rebinding it here
# would make a later re-run of the evaluation loop use the wrong classifier.
for label, oob_clf in ensemble_clfs:
    # n_estimators takes the values 1, 6, 11, ... (step of 5) up to max_estimators.
    for n_trees in range(min_estimators, max_estimators + 1, 5):
        oob_clf.set_params(n_estimators=n_trees)
        oob_clf.fit(X, y)

        # Record the OOB error for each `n_estimators` setting.
        oob_error = 1 - oob_clf.oob_score_
        error_rate[label].append((n_trees, oob_error))

# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()

In [None]:
# Extract the oob error rate as numerical data
# !!CAUTION!! This cell takes a long time to run;
#  looping over all 10 sets in a single run is not recommended.

import matplotlib.pyplot as plt

from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

RANDOM_STATE = 123

# Only one training set is processed per run; change the file name to process another set.
train = pd.read_csv("train_01.csv")
X = train.drop(['species','waves'], axis=1)
y = train['species']

ensemble_clfs = [
    (
        "RandomForestClassifier, max_features=sqrt",
        RandomForestClassifier(
            warm_start=True,
            oob_score=True,
            max_features='sqrt',
            random_state=RANDOM_STATE,
        ),
    ),
]

# Range of `n_estimators` values to explore.
min_estimators = 1
max_estimators = 500

# The tree counts are the same for every classifier, so they are recorded once
# here instead of being appended inside the classifier loop; the 'No of Tree'
# column therefore always has one entry per row of the error columns.
tree_counts = list(range(min_estimators, max_estimators + 1))

# Map a classifier name to the list of OOB error rates, one per tree count.
oob_errors = OrderedDict((label, []) for label, _ in ensemble_clfs)

for label, clfs in ensemble_clfs:
    for i in tree_counts:
        clfs.set_params(n_estimators=i)
        clfs.fit(X, y)

        # Record the OOB error for each `n_estimators=i` setting.
        oob_errors[label].append(1 - clfs.oob_score_)

# First column: number of trees; then one error-rate column per classifier.
error_rate = pd.concat(
    [pd.DataFrame({'No of Tree': tree_counts}), pd.DataFrame(oob_errors)],
    axis=1,
)
error_rate

In [None]:
# Export the OOB error-rate table of set 01 as a CSV file
oob_csv_name = "error_rate_01.csv"
error_rate.to_csv(oob_csv_name)