In [None]:
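# Random forest species classification: grid search, test-set evaluation, feature importances, confusion matrix, and OOB error curves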

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np

In [None]:
# Load the tab-separated sample dataset into a DataFrame.
df = pd.read_csv('sampledataset_rep1.txt', sep='\t')

In [None]:
# Separate the predictors from the target: 'species' is the label, and the
# 'waves' column is excluded from the feature matrix.
features = df.drop(columns=['species', 'waves'])
labels = df['species']

# Hold out 20% of the rows as a test set, stratified on the label; the fixed
# seed makes the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=123,
)

In [None]:
# Create a model for grid search
# The estimator is left at its defaults here; the hyperparameters being
# searched (including random_state) are supplied through search_params.
clf_cv = RandomForestClassifier()

In [None]:
# Hyperparameter grid for the first search round.
search_params = dict(
    n_estimators=[10, 100, 200, 300, 500, 700, 1000, 1500],  # number of trees
    max_features=["sqrt", 20, 30, 40, 50, 60, 70],           # features considered per split
    random_state=[123],                                       # single fixed seed
)

In [None]:
# Configure and run the grid search on the training split.
gs = GridSearchCV(
    estimator=clf_cv,          # model to tune
    param_grid=search_params,  # candidate hyperparameter values
    cv=5,                      # 5-fold cross validation
    verbose=True,              # display log
    n_jobs=-1,                 # number of parallel jobs; -1 uses all processors
)
gs.fit(train_x, train_y)

# Show the best estimator found by the search.
print(gs.best_estimator_)

In [None]:
# Build and fit the final model on the training split.
# NOTE(review): the hyperparameters below are hard-coded rather than read from
# the grid search object; confirm they match the best estimator printed above.
clf = RandomForestClassifier(
    n_estimators=500,
    max_features='sqrt',
    oob_score=True,
    random_state=123,
)
clf.fit(train_x, train_y)

In [None]:
# Calculate the score of the fitted model on the held-out test split
score = clf.score(X=test_x, y=test_y)
print(score)

In [None]:
# Export feature importances: one row per feature, pairing each training
# column name with the importance reported by the fitted forest.
FN = [column for column in train_x.columns]
IF = [(importance, name) for importance, name in zip(clf.feature_importances_, FN)]
impFeat = pd.DataFrame(IF, columns=["Importance", "Feature_Name"])
impFeat.to_csv("IF.csv")

In [None]:
# Export per-class predicted probabilities for the test split.
# One probability column per class, named after clf.classes_.
proba_frame = pd.DataFrame(clf.predict_proba(test_x), columns=clf.classes_)
# Drop the original row index so the true labels line up positionally with
# the probability rows.
true_labels = test_y.reset_index(drop=True)
classproba = pd.concat([true_labels, proba_frame], axis=1)
classproba.to_csv("predict_proba.csv")

In [None]:
# Predict the test split and build the confusion matrix.
y_pred = clf.predict(test_x)

# Rows/columns of the matrix follow this explicit class order.
class_order = [
    'Escherichia',
    'Bacillus',
    'Thermus',
    'Thermococcus',
    'Sulfolobus',
    'Nitrososphaera',
]
conf_mat = confusion_matrix(test_y, y_pred, labels=class_order)

In [None]:
# Setting for visualization of the confusion matrix
## Display the number of samples: normalize=False
## For percentages: normalize=True
# Original script: https://scikit-learn.org/0.18/auto_examples/model_selection/plot_confusion_matrix.html

import matplotlib.pyplot as plt
%matplotlib inline
import itertools
import numpy as np
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix. Rows are drawn along the "True label" axis and
        columns along the "Predicted label" axis.
    classes : sequence
        Tick labels for both axes, in the same order as the rows/columns
        of ``cm``.
    normalize : bool, default False
        If True, each row is divided by its row total before printing and
        plotting, and cells are annotated with two decimals.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
        The figure is drawn on the current pyplot axes.
    """
    # Accept nested lists as well as ndarrays.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples has a zero row total. Dividing by 1
        # keeps that row all-zero instead of producing NaN cells (which
        # would also turn the colour threshold below into NaN).
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # The integer format 'd' only works for integer counts; fall back to two
    # decimals for normalized or otherwise non-integer matrices.
    use_decimals = normalize or not np.issubdtype(cm.dtype, np.integer)
    fmt = '.2f' if use_decimals else 'd'

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout after the axis labels exist so they are included in it.
    plt.tight_layout()

In [None]:
# Visualize the confusion matrix
# !!CAUTION!! 'species' must list the classes in the same order as the
# 'labels' argument given to confusion_matrix() when conf_mat was computed.
species = [
    'Escherichia',
    'Bacillus',
    'Thermus',
    'Thermococcus',
    'Sulfolobus',
    'Nitrososphaera',
]
plot_confusion_matrix(conf_mat, classes=species)

In [None]:
# Visualize OOB error rate

# Author: Kian Ho <hui.kian.ho@gmail.com>
#         Gilles Louppe <g.louppe@gmail.com>
#         Andreas Mueller <amueller@ais.uni-bonn.de>
#
# License: BSD 3 Clause

# Original script: https://scikit-learn.org/stable/auto_examples/ensemble/plot_ensemble_oob.html
    
import matplotlib.pyplot as plt

from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

RANDOM_STATE = 123

# The original scikit-learn example generates synthetic data with
# make_classification; here the project dataset is used instead.
# Note that the full dataset (not only the training split) is used.
X = df.drop(['species', 'waves'], axis=1)
y = df['species']

# NOTE: Setting the `warm_start` construction parameter to `True` disables
# support for parallelized ensembles but is necessary for tracking the OOB
# error trajectory during training.

ensemble_clfs = [
    (
        "RandomForestClassifier, max_features='sqrt'",
        RandomForestClassifier(
            warm_start=True,
            oob_score=True,
            max_features="sqrt",
            random_state=RANDOM_STATE,
        ),
    ),
    (
        "RandomForestClassifier, max_features='log2'",
        RandomForestClassifier(
            warm_start=True,
            max_features="log2",
            oob_score=True,
            random_state=RANDOM_STATE,
        ),
    ),
    (
        "RandomForestClassifier, max_features=None",
        RandomForestClassifier(
            warm_start=True,
            max_features=None,
            oob_score=True,
            random_state=RANDOM_STATE,
        ),
    ),
]

# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# Range of `n_estimators` values to explore.
min_estimators = 1
max_estimators = 500

# The loop variable is deliberately NOT called `clf`: that name is bound to
# the fitted final model in an earlier cell, and rebinding it here would make
# any later re-run of the scoring/export cells silently use the wrong model.
for label, forest in ensemble_clfs:
    # Grow the forest in steps of 5 trees; warm_start keeps the trees that
    # were already fitted and only adds the new ones.
    for i in range(min_estimators, max_estimators + 1, 5):
        forest.set_params(n_estimators=i)
        forest.fit(X, y)

        # Record the OOB error for each `n_estimators=i` setting.
        oob_error = 1 - forest.oob_score_
        error_rate[label].append((i, oob_error))

# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()

In [None]:
# Extract the oob error rate as numerical data
import matplotlib.pyplot as plt

from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

RANDOM_STATE = 123

# Features and labels from the full dataset (not only the training split).
X = df.drop(['species', 'waves'], axis=1)
y = df['species']

# warm_start=True lets each fit() call add trees to the existing forest, which
# is what allows the OOB score to be read after every increment.
ensemble_clfs = [
    (
        "RandomForestClassifier, max_features=sqrt",
        RandomForestClassifier(
            warm_start=True,
            oob_score=True,
            max_features='sqrt',
            random_state=RANDOM_STATE,
        ),
    ),
]

# Range of `n_estimators` values to explore.
min_estimators = 1
max_estimators = 500

# The tree counts are the same for every classifier, so they are built once
# outside the loops. Collecting them inside the per-classifier loop would make
# this column longer than the error columns as soon as a second classifier is
# added to `ensemble_clfs`.
tree_counts = list(range(min_estimators, max_estimators + 1))

# Map a classifier name to its list of OOB error rates (one per tree count).
oob_errors = OrderedDict((label, []) for label, _ in ensemble_clfs)

for label, forest in ensemble_clfs:
    for n_trees in tree_counts:
        forest.set_params(n_estimators=n_trees)
        forest.fit(X, y)

        # Record the OOB error for this `n_estimators` setting.
        oob_errors[label].append(1 - forest.oob_score_)

# Assemble the table in one step: tree count first, then one error column per
# classifier label.
table_columns = OrderedDict([('No of Tree', tree_counts)])
table_columns.update(oob_errors)
error_rate = pd.DataFrame(table_columns)
error_rate

In [None]:
# Write the OOB error-rate table to a CSV file in the working directory.
error_rate.to_csv(path_or_buf="error_rate.csv")