## This script tests whether water column microbiomes (16S amplicon data) can be used to predict levels of a eukaryotic phytoplankton, in this case members of the family Noelaerhabdaceae (coccolithophores). It uses a Random Forest ML approach to assess this correlation.

In [1]:
import pandas as pd
import numpy as np
import glob as glob
import os as os
from sklearn.ensemble import RandomForestClassifier as sklRF

In [None]:
# equivalent RAPIDS libraries
import cudf # RAPIDS equivalent of pandas
import cupy as cp # RAPIDS equivalent of numpy
from cuml import RandomForestclassifier as cuRF # RAPIDS Random Forest

In [2]:
os.chdir("/Users/nastassia.patin/GitHub/MarineDNA/Data")

### Import microbiome data set (relative abundances)
#### The raw data will have a column with sample names and all other columns will be ASV relative abundances.

In [5]:
df = cudf.read_csv("RREAS2018_16S_ASVs_with_Noelaerhab.csv")
df = cudf.rename(columns={"Unnamed: 0": "Sample"})
df.head()

Unnamed: 0,Sample,a404bf4d32831f2133794e5fe5d0a73e,f35c0a576bb1ab3577191cb12352c249,588253f8c914de60cbb9577946854a39,d883e3b29658c861ac524bbfc2cef34d,3468430277ce2317dec6f413dcfaa03e,625cabfbd66e4f32de52ec86cb441744,af4ec853b8f3af897984b19fd4b71d91,d42ff5ae074d8ae7b783eac352a53d9b,6b9dd64d080d53d989a8407d9806429e,...,80c4735759e2048a7cc2f56f19a1f304,1349330a18891c59f576209fe532626b,ebe4de7f14399b72623a976ee09b8e77,78567cbcdf70b7f2c368a1870633f111,5dc6c2efdb748cb3508b4e8592239b83,6f477eb9c90036177a1a870f0d4d5cc8,fd5567754d768cc191c36f6aaa99c51d,89713a21b54d7e0c55542ee840df5edf,2e289caa87e3a7af21c26b12f60b0ac9,Noelaerhabdaceae
0,Lasker18Sc122_17,0.006743,0.002309,0.219094,0.0,0.006297,0.001286,0.002335,0.0,0.015258,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010784
1,Lasker18Sc122_19,0.006891,0.002272,0.219868,0.0,0.005595,0.000884,0.002302,0.0,0.014071,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014332
2,Lasker18Sc122_3,0.051873,0.002164,0.024401,0.000583,0.000897,0.032282,0.000601,0.024567,0.00357,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031148
3,Lasker18Sc122_5,0.051078,0.001516,0.024989,0.0,0.001002,0.040638,0.000538,0.026383,0.004132,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023434
4,Lasker18Sc123_17,0.00039,0.17513,0.043393,0.003064,0.010398,0.000817,0.013554,0.0,0.008058,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041543


In [6]:
df = cudf.drop(columns=['Sample'])
df.head()

Unnamed: 0,a404bf4d32831f2133794e5fe5d0a73e,f35c0a576bb1ab3577191cb12352c249,588253f8c914de60cbb9577946854a39,d883e3b29658c861ac524bbfc2cef34d,3468430277ce2317dec6f413dcfaa03e,625cabfbd66e4f32de52ec86cb441744,af4ec853b8f3af897984b19fd4b71d91,d42ff5ae074d8ae7b783eac352a53d9b,6b9dd64d080d53d989a8407d9806429e,76dfe03cf1770d4ae5c703515096f2e9,...,80c4735759e2048a7cc2f56f19a1f304,1349330a18891c59f576209fe532626b,ebe4de7f14399b72623a976ee09b8e77,78567cbcdf70b7f2c368a1870633f111,5dc6c2efdb748cb3508b4e8592239b83,6f477eb9c90036177a1a870f0d4d5cc8,fd5567754d768cc191c36f6aaa99c51d,89713a21b54d7e0c55542ee840df5edf,2e289caa87e3a7af21c26b12f60b0ac9,Noelaerhabdaceae
0,0.006743,0.002309,0.219094,0.0,0.006297,0.001286,0.002335,0.0,0.015258,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010784
1,0.006891,0.002272,0.219868,0.0,0.005595,0.000884,0.002302,0.0,0.014071,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014332
2,0.051873,0.002164,0.024401,0.000583,0.000897,0.032282,0.000601,0.024567,0.00357,0.024891,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031148
3,0.051078,0.001516,0.024989,0.0,0.001002,0.040638,0.000538,0.026383,0.004132,0.023718,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023434
4,0.00039,0.17513,0.043393,0.003064,0.010398,0.000817,0.013554,0.0,0.008058,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041543


In [7]:
# Randomly assign some rows (samples) to be used for training set
# In this case we will use 80% of the samples as training
df['is_train'] = cp.random.uniform(0, 1, len(df)) <= .80

In [8]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test = df[df['is_train']==True], df[df['is_train']==False]

In [9]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 52
Number of observations in the test data: 17


### Pre Process Data

In [10]:
# Create a list of the feature column's names
features = df.columns[1:]

# View features
features

Index(['f35c0a576bb1ab3577191cb12352c249', '588253f8c914de60cbb9577946854a39',
       'd883e3b29658c861ac524bbfc2cef34d', '3468430277ce2317dec6f413dcfaa03e',
       '625cabfbd66e4f32de52ec86cb441744', 'af4ec853b8f3af897984b19fd4b71d91',
       'd42ff5ae074d8ae7b783eac352a53d9b', '6b9dd64d080d53d989a8407d9806429e',
       '76dfe03cf1770d4ae5c703515096f2e9', '71a33c961f2ca92f6ce97184fcb81f01',
       ...
       '1349330a18891c59f576209fe532626b', 'ebe4de7f14399b72623a976ee09b8e77',
       '78567cbcdf70b7f2c368a1870633f111', '5dc6c2efdb748cb3508b4e8592239b83',
       '6f477eb9c90036177a1a870f0d4d5cc8', 'fd5567754d768cc191c36f6aaa99c51d',
       '89713a21b54d7e0c55542ee840df5edf', '2e289caa87e3a7af21c26b12f60b0ac9',
       'Noelaerhabdaceae', 'is_train'],
      dtype='object', length=28743)

#### train['Noelaerhabdaceae'] contains the actual sample classifications. Before we can use it, we need to convert each category into a digit

In [11]:
y = cudf.factorize(train['Noelaerhabdaceae'])[0]
y

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51])

In [12]:
z = cudf.factorize(train['Noelaerhabdaceae'])[1]
z

Float64Index([   0.0311481666324164,    0.0234340001544759,
                 0.0415430267062314,    0.0267222499711837,
                 0.0282736010311878,    0.0265794316090779,
                 0.0172533148729964,    0.0063763284017503,
                 0.0330862163646348,    0.0047985265130707,
                 0.0421524089833002,    0.0253739997680621,
                 0.0108585094977595,    0.0135778611395298,
                 0.0513186526242492,    0.0158427045759954,
                 0.0114168145877037,    0.0789184901170093,
                 0.0038611301986191,    0.0039390504006302,
                 0.0145813734713076,    0.0107960557591743,
                  0.008884226751156,     0.005098411192791,
                  0.012693601098849,    0.0017858939531139,
                 0.0022062113334464,    0.0031138298264362,
                  0.002458686614314,    0.0061369625645283,
                   0.00404168554599,    0.0038812366085093,
                 0.0040940000925197, 8.2

### Train the Random Forest Classifier

In [13]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = cuRF(n_estimators=10, max_depth=10, n_bins=10, 
           n_streams=10, random_state=0, bootstrap=True) ## what are appropriate parameters here?

# Train the Classifier to take the training features and learn how they relate to the training y (Noelaerhabdaceae level)
clf.fit(train[features], y)

RandomForestClassifier(n_jobs=2, random_state=0)

### Apply Classifier to test data

In [14]:
# Apply the Classifier we trained to the test data (which, remember, it has never seen before)
clf.predict(test[features])

array([14, 14,  6,  7, 13, 10, 11, 24, 13, 28, 29, 30, 33, 49,  1, 28, 41])

In [15]:
# View the predicted probabilities of the first 10 observations
clf.predict_proba(test[features])[0:10]

array([[0.  , 0.01, 0.05, 0.04, 0.02, 0.  , 0.01, 0.  , 0.06, 0.  , 0.07,
        0.07, 0.  , 0.01, 0.08, 0.  , 0.  , 0.03, 0.04, 0.03, 0.  , 0.02,
        0.02, 0.01, 0.04, 0.  , 0.01, 0.02, 0.  , 0.01, 0.03, 0.01, 0.  ,
        0.03, 0.01, 0.01, 0.02, 0.  , 0.03, 0.03, 0.  , 0.06, 0.02, 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.04, 0.02, 0.03, 0.01],
       [0.  , 0.01, 0.04, 0.03, 0.01, 0.  , 0.01, 0.  , 0.05, 0.  , 0.07,
        0.07, 0.  , 0.01, 0.08, 0.  , 0.  , 0.03, 0.04, 0.03, 0.01, 0.01,
        0.02, 0.01, 0.07, 0.  , 0.01, 0.01, 0.  , 0.01, 0.02, 0.02, 0.  ,
        0.02, 0.01, 0.02, 0.02, 0.  , 0.03, 0.04, 0.  , 0.07, 0.02, 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.03, 0.03, 0.03, 0.01],
       [0.  , 0.01, 0.01, 0.01, 0.  , 0.  , 0.24, 0.  , 0.06, 0.  , 0.06,
        0.06, 0.  , 0.01, 0.08, 0.  , 0.  , 0.04, 0.01, 0.  , 0.02, 0.02,
        0.01, 0.  , 0.03, 0.  , 0.02, 0.02, 0.  , 0.02, 0.01, 0.01, 0.01,
        0.01, 0.01, 0.02, 0.01, 0.  , 0.02, 0.03, 0.  , 0.05, 0.  , 0.  

### Evaluate classifier

In [16]:
# Revert predicted state to original classification state
preds = z[clf.predict(test[features])]

In [17]:
# View the PREDICTED categories for each sample
preds

Float64Index([   0.0513186526242492,    0.0513186526242492,
                 0.0172533148729964,    0.0063763284017503,
                 0.0135778611395298,    0.0421524089833002,
                 0.0253739997680621,     0.012693601098849,
                 0.0135778611395298,     0.002458686614314,
                 0.0061369625645283,      0.00404168554599,
              8.243275547971742e-06,     0.006407444840672,
                 0.0234340001544759,     0.002458686614314,
                 0.0027978650137741],
             dtype='float64')

In [18]:
# View the ACTUAL categories for each sample
test['Noelaerhabdaceae']

0     0.010784
1     0.014332
9     0.016998
11    0.013429
13    0.017008
20    0.048208
24    0.066700
32    0.011881
33    0.007392
37    0.003301
39    0.011675
42    0.004921
45    0.000016
50    0.011581
51    0.002274
56    0.000188
57    0.000753
Name: Noelaerhabdaceae, dtype: float64

### Create a confusion matrix

In [19]:
cudf.crosstab(test['Noelaerhabdaceae'], preds, rownames=['Actual Value'], colnames=['Predicted Value'])

Predicted Category,0.000008,0.002459,0.002798,0.004042,0.006137,0.006376,0.006407,0.012694,0.013578,0.017253,0.023434,0.025374,0.042152,0.051319
Actual Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1.6e-05,1,0,0,0,0,0,0,0,0,0,0,0,0,0
0.000188,0,1,0,0,0,0,0,0,0,0,0,0,0,0
0.000753,0,0,1,0,0,0,0,0,0,0,0,0,0,0
0.002274,0,0,0,0,0,0,0,0,0,0,1,0,0,0
0.003301,0,1,0,0,0,0,0,0,0,0,0,0,0,0
0.004921,0,0,0,1,0,0,0,0,0,0,0,0,0,0
0.007392,0,0,0,0,0,0,0,0,1,0,0,0,0,0
0.010784,0,0,0,0,0,0,0,0,0,0,0,0,0,1
0.011581,0,0,0,0,0,0,1,0,0,0,0,0,0,0
0.011675,0,0,0,0,1,0,0,0,0,0,0,0,0,0


## Identify the most important features driving the correlation and re-train the RF using only those important features

### View feature importance

In [20]:
# View a list of the features and their importance scores
fimp_list = list(zip(train[features], clf.feature_importances_))

#### Remove features with an importance above some value

In [21]:
# Convert to dataframe
feature_imps = cudf.DataFrame(fimp_list)
feature_nonzero_imps = feature_imps.loc[feature_imps[1] > 0.001]
feature_imps.head()

Unnamed: 0,0,1
0,f35c0a576bb1ab3577191cb12352c249,0.0002
1,588253f8c914de60cbb9577946854a39,0.000599
2,d883e3b29658c861ac524bbfc2cef34d,0.000199
3,3468430277ce2317dec6f413dcfaa03e,0.000477
4,625cabfbd66e4f32de52ec86cb441744,0.000398


In [22]:
feature_imps.shape

(28743, 2)

In [23]:
feature_nonzero_imps.shape

(25, 2)

In [24]:
# Make a list of features with nonzero importance
asvs_nonzero = feature_nonzero_imps[0].tolist()

In [25]:
# Subsample dataframe for features with nonzero importance
df2 = df[asvs_nonzero]
df2.head()

Unnamed: 0,9bbdb9accefe3cf4f38fa0a237f15d43,3c732a7edae66c0e41e06067d0ac801a,bd1acc8433c9d5a93f22dcba76db225d,7aa69e82df6183bab4b3dbf888d0ab3d,45d54055f9567ba9c90f470183cee9ca,1d6abc1035990ff3ad16f9f5e6c4a1af,79e3ac10a2d72159ce7374eda836582d,54832dfaf034c0e58236174e9a2791e9,03740853bd28777bef922a85a3c74091,8c08ef523cae456ab86f0cabee081741,...,dd83277028fa431a26f2faae741979f0,16fd22fde282842cae5bbc310468919b,0d0242ceaa3e22f204cc618fa79fdd29,c205ba2a38ed29be2b5f0ed31d2a7fbc,46c078b215e6afc9d0f8e883b2149f0d,92e15795dcfe996a5a4d7cf4dcfc28ad,69dedcbaeab47307bcc3f510479e2714,40ffc8bf74be7250c774b5af7b3f0a73,fffbaed071efc8ea737375aa1d9d3954,22f3cda1f73f4e9be22d740a281f6fcf
0,0.006966,0.002414,0.0,0.000236,0.0,0.0,0.006389,0.0,0.000603,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.011099,0.003095,0.00061,0.0,0.0,0.0,0.004589,0.0,0.000518,6.1e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.002127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.4e-05,0.000166,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.001492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000367,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.028056,0.002915,0.0,0.001801,0.006666,0.0,0.0,0.0,0.002804,0.0,...,5.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
df.shape

(69, 28744)

In [27]:
df2.shape

(69, 25)

In [28]:
df2['Noelaerhabdaceae'] = df.loc[:]['Noelaerhabdaceae']
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Noelaerhabdaceae'] = df.loc[:]['Noelaerhabdaceae']


Unnamed: 0,9bbdb9accefe3cf4f38fa0a237f15d43,3c732a7edae66c0e41e06067d0ac801a,bd1acc8433c9d5a93f22dcba76db225d,7aa69e82df6183bab4b3dbf888d0ab3d,45d54055f9567ba9c90f470183cee9ca,1d6abc1035990ff3ad16f9f5e6c4a1af,79e3ac10a2d72159ce7374eda836582d,54832dfaf034c0e58236174e9a2791e9,03740853bd28777bef922a85a3c74091,8c08ef523cae456ab86f0cabee081741,...,16fd22fde282842cae5bbc310468919b,0d0242ceaa3e22f204cc618fa79fdd29,c205ba2a38ed29be2b5f0ed31d2a7fbc,46c078b215e6afc9d0f8e883b2149f0d,92e15795dcfe996a5a4d7cf4dcfc28ad,69dedcbaeab47307bcc3f510479e2714,40ffc8bf74be7250c774b5af7b3f0a73,fffbaed071efc8ea737375aa1d9d3954,22f3cda1f73f4e9be22d740a281f6fcf,Noelaerhabdaceae
0,0.006966,0.002414,0.0,0.000236,0.0,0.0,0.006389,0.0,0.000603,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010784
1,0.011099,0.003095,0.00061,0.0,0.0,0.0,0.004589,0.0,0.000518,6.1e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014332
2,0.002127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.4e-05,0.000166,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031148
3,0.001492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000367,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023434
4,0.028056,0.002915,0.0,0.001801,0.006666,0.0,0.0,0.0,0.002804,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041543


In [29]:
df2.shape

(69, 26)

In [30]:
df.shape

(69, 28744)

### Repeat model training and running

In [31]:
# Randomly assign some rows (samples) to be used for training set
df2['is_train'] = cp.random.uniform(0, 1, len(df2)) <= .8
# Create two new dataframes, one with the training rows, one with the test rows
train, test = df2[df2['is_train']==True], df2[df2['is_train']==False]
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 51
Number of observations in the test data: 18


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['is_train'] = np.random.uniform(0, 1, len(df2)) <= .8


In [32]:
train.head()

Unnamed: 0,9bbdb9accefe3cf4f38fa0a237f15d43,3c732a7edae66c0e41e06067d0ac801a,bd1acc8433c9d5a93f22dcba76db225d,7aa69e82df6183bab4b3dbf888d0ab3d,45d54055f9567ba9c90f470183cee9ca,1d6abc1035990ff3ad16f9f5e6c4a1af,79e3ac10a2d72159ce7374eda836582d,54832dfaf034c0e58236174e9a2791e9,03740853bd28777bef922a85a3c74091,8c08ef523cae456ab86f0cabee081741,...,0d0242ceaa3e22f204cc618fa79fdd29,c205ba2a38ed29be2b5f0ed31d2a7fbc,46c078b215e6afc9d0f8e883b2149f0d,92e15795dcfe996a5a4d7cf4dcfc28ad,69dedcbaeab47307bcc3f510479e2714,40ffc8bf74be7250c774b5af7b3f0a73,fffbaed071efc8ea737375aa1d9d3954,22f3cda1f73f4e9be22d740a281f6fcf,Noelaerhabdaceae,is_train
0,0.006966,0.002414,0.0,0.000236,0.0,0.0,0.006389,0.0,0.000603,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010784,True
1,0.011099,0.003095,0.00061,0.0,0.0,0.0,0.004589,0.0,0.000518,6.1e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014332,True
2,0.002127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.4e-05,0.000166,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031148,True
3,0.001492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000367,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023434,True
4,0.028056,0.002915,0.0,0.001801,0.006666,0.0,0.0,0.0,0.002804,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041543,True


In [33]:
# Create a list of the feature column's names
# Final column value needs to be 2 LESS THAN # columns in df2
features = df2.columns[:6]
# View features
features

Index(['9bbdb9accefe3cf4f38fa0a237f15d43', '3c732a7edae66c0e41e06067d0ac801a',
       'bd1acc8433c9d5a93f22dcba76db225d', '7aa69e82df6183bab4b3dbf888d0ab3d',
       '45d54055f9567ba9c90f470183cee9ca', '1d6abc1035990ff3ad16f9f5e6c4a1af'],
      dtype='object')

In [35]:
# train['Noelaerhabdaceae'] contains the actual sample classifications. Before we can use it,
# we need to convert each category into a digit.
y = cudf.factorize(train['Noelaerhabdaceae'])[0]
z = cudf.factorize(train['Noelaerhabdaceae'])[1]

In [36]:
y

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])

In [37]:
z

Float64Index([    0.0107836842725926,     0.0143320870648929,
                  0.0311481666324164,     0.0234340001544759,
                  0.0415430267062314,     0.0267222499711837,
                  0.0282736010311878,        0.0169984063994,
                  0.0063763284017503,     0.0134294914600305,
                  0.0170080373347161,     0.0047985265130707,
                  0.0253739997680621,     0.0108585094977595,
                  0.0135778611395298,     0.0482077922077922,
                  0.0789184901170093,     0.0666998094291159,
                  0.0039390504006302,     0.0145813734713076,
                   0.008884226751156,      0.012693601098849,
                  0.0118812532912058,     0.0073922254974207,
                  0.0022062113334464,     0.0031138298264362,
                   0.002458686614314,      0.011675168444311,
                  0.0061369625645283,       0.00404168554599,
                  0.0049212787635287,     0.0038812366085093,
        

In [38]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = cuRF(n_jobs=2, random_state=0, bootstrap=True)

# Train the Classifier to take the training features and learn how they relate to the training y (E. hux level)
clf.fit(train[features], y)

RandomForestClassifier(n_jobs=2, random_state=0)

In [39]:
# Apply the Classifier we trained to the test data (which, remember, it has never seen before)
clf.predict(test[features])
# View the predicted probabilities of the first 10 observations
clf.predict_proba(test[features])[0:10]

array([[0.  , 0.02, 0.  , 0.1 , 0.  , 0.  , 0.29, 0.  , 0.02, 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.33, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.06, 0.  , 0.01, 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.03, 0.01, 0.  , 0.  , 0.01, 0.  , 0.  , 0.06, 0.  , 0.  ,
        0.  , 0.01, 0.01, 0.03, 0.  , 0.01, 0.  ],
       [0.  , 0.06, 0.02, 0.  , 0.  , 0.  , 0.11, 0.56, 0.  , 0.1 , 0.  ,
        0.01, 0.01, 0.  , 0.  , 0.02, 0.01, 0.02, 0.  , 0.  , 0.  , 0.01,
        0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.01, 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  ,
        0.  , 0.  , 0.02, 0.  , 0.  , 0.02, 0.  ],
       [0.02, 0.02, 0.  , 0.  , 0.08, 0.01, 0.  , 0.01, 0.  , 0.  , 0.  ,
        0.  , 0.12, 0.02, 0.  , 0.1 , 0.07, 0.2 , 0.02, 0.02, 0.  , 0.01,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  ,
        0.01, 0.  , 0.  , 0.05, 0.01, 0.  , 0.14, 0.03, 0.  , 0.  , 0.03,
        0.

In [40]:
# Revert predicted state to original classification state
preds = z[clf.predict(test[features])]

In [41]:
pd.crosstab(test['Noelaerhabdaceae'], preds, rownames=['Actual Category'], colnames=['Predicted Category'])

Predicted Category,0.000016,0.000188,0.000213,0.003939,0.006211,0.007392,0.012213,0.012411,0.013429,0.013578,0.014581,0.016998,0.048208,0.066700,0.078918
Actual Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
8e-06,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0.000493,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
0.001786,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
0.003301,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
0.003861,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
0.005098,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
0.006638,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
0.010007,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
0.010796,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
