## This script tests whether water column microbiomes (16S amplicon data) can be used to predict levels of a eukaryotic phytoplankton, in this case members of the family Noelaerhabdaceae (coccolithophores). It uses a Random Forest ML approach to assess this correlation.

In [1]:
import pandas as pd
import numpy as np
import glob as glob
import os as os
from sklearn.ensemble import RandomForestClassifier

In [2]:
os.chdir("/Users/nastassia.patin/GitHub/MarineDNA/Data")

### Import microbiome data set (relative abundances)
#### The raw data will have a column with sample names, and a column with the corresponding sample property (eg, 'High' or 'Low' phytoplankton levels). All other columns will be ASV relative abundances.

In [3]:
df = pd.read_csv("Flyer2018_16S_table_relfreq_forEhuxRF.tsv", sep='\t')
df.head()

Unnamed: 0,Sample,Noelaerhabdaceae,495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86851fb16bfafb87f3326,c8e360969108fa2125a3d56eb4dad24f,72143fd9e63fe40c1258948d2f0d79c3,7b6b178fad5599c0e9a734e4fb09fd64,4bbec3bb723375416616a87d785ac74a,0c35cfa523aa27921ef8544a16d1cd36,7ec69f2c62aad60e060e588ef687bdd0,...,995cc65bcfa53a868c42615004e99ad3,46b90aab075ecd8e4db549da708550d8,c4e1933274329209b7cf24daf18dfe0d,aa9e141a5e2781d280406c513bf34d45,d7682f536589fc5f920533513dd0002b,674933a0d44342a0647f7a5b4591f26e,bebe1b9a7e9aaa78172c1208111f4570,0128431733f67d02efad766d717fe6fd,41102a7dd1f4647ba5477c947daabc0e,51440f89c391fb32f9ee895db22bf8f8
0,CN18Fc12_8_eDNA,0.04473,0.139324,0.053004,0.036598,0.032812,0.039374,0.012367,0.0,0.022463,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CN18Fc21_6_eDNA,0.037524,0.13875,0.044532,0.040123,0.039901,0.027928,0.012481,0.010023,0.018428,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CN18Fc22_6_eDNA,0.033915,0.163262,0.039394,0.041777,0.043619,0.037151,0.010214,0.00707,0.01502,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CN18Fc24_6_eDNA,0.036907,0.139116,0.032631,0.045754,0.044055,0.035384,0.01406,0.006015,0.017341,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CN18Fc25_5_eDNA,0.037609,0.126853,0.050161,0.024458,0.034026,0.025193,0.011315,0.011631,0.017871,...,0.0,0.0,0.0,0.0,0.0,1e-05,0.0,0.0,0.0,0.0


In [4]:
df = df.drop(columns=['Sample'])
df.head()

Unnamed: 0,Noelaerhabdaceae,495c1bd1608a1dad54d3e2824ce899ef,a900b6678ce86851fb16bfafb87f3326,c8e360969108fa2125a3d56eb4dad24f,72143fd9e63fe40c1258948d2f0d79c3,7b6b178fad5599c0e9a734e4fb09fd64,4bbec3bb723375416616a87d785ac74a,0c35cfa523aa27921ef8544a16d1cd36,7ec69f2c62aad60e060e588ef687bdd0,61e9a50f4346bb3a5b16179b8eca71fa,...,995cc65bcfa53a868c42615004e99ad3,46b90aab075ecd8e4db549da708550d8,c4e1933274329209b7cf24daf18dfe0d,aa9e141a5e2781d280406c513bf34d45,d7682f536589fc5f920533513dd0002b,674933a0d44342a0647f7a5b4591f26e,bebe1b9a7e9aaa78172c1208111f4570,0128431733f67d02efad766d717fe6fd,41102a7dd1f4647ba5477c947daabc0e,51440f89c391fb32f9ee895db22bf8f8
0,0.04473,0.139324,0.053004,0.036598,0.032812,0.039374,0.012367,0.0,0.022463,0.047956,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.037524,0.13875,0.044532,0.040123,0.039901,0.027928,0.012481,0.010023,0.018428,0.024502,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.033915,0.163262,0.039394,0.041777,0.043619,0.037151,0.010214,0.00707,0.01502,0.019787,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.036907,0.139116,0.032631,0.045754,0.044055,0.035384,0.01406,0.006015,0.017341,0.023023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.037609,0.126853,0.050161,0.024458,0.034026,0.025193,0.011315,0.011631,0.017871,0.024233,...,0.0,0.0,0.0,0.0,0.0,1e-05,0.0,0.0,0.0,0.0


In [5]:
# Randomly assign some rows (samples) to be used for training set
# In this case we will use 80% of the samples as training
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .80

In [6]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test = df[df['is_train']==True], df[df['is_train']==False]

In [7]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 45
Number of observations in the test data: 13


### Pre Process Data

In [8]:
# Create a list of the feature column's names
features = df.columns[1:]

# View features
features

Index(['495c1bd1608a1dad54d3e2824ce899ef', 'a900b6678ce86851fb16bfafb87f3326',
       'c8e360969108fa2125a3d56eb4dad24f', '72143fd9e63fe40c1258948d2f0d79c3',
       '7b6b178fad5599c0e9a734e4fb09fd64', '4bbec3bb723375416616a87d785ac74a',
       '0c35cfa523aa27921ef8544a16d1cd36', '7ec69f2c62aad60e060e588ef687bdd0',
       '61e9a50f4346bb3a5b16179b8eca71fa', 'a140195871278e8fcf9447e42bad8786',
       ...
       '46b90aab075ecd8e4db549da708550d8', 'c4e1933274329209b7cf24daf18dfe0d',
       'aa9e141a5e2781d280406c513bf34d45', 'd7682f536589fc5f920533513dd0002b',
       '674933a0d44342a0647f7a5b4591f26e', 'bebe1b9a7e9aaa78172c1208111f4570',
       '0128431733f67d02efad766d717fe6fd', '41102a7dd1f4647ba5477c947daabc0e',
       '51440f89c391fb32f9ee895db22bf8f8', 'is_train'],
      dtype='object', length=2753)

#### train['Noelaerhabdaceae'] contains the actual sample classifications. Before we can use it, we need to convert each category into a digit

In [9]:
y = pd.factorize(train['Noelaerhabdaceae'])[0]
y

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44])

In [10]:
z = pd.factorize(train['Noelaerhabdaceae'])[1]
z

Float64Index([0.037524328, 0.033915496,  0.03690718, 0.037608784, 0.043454822,
              0.008177717, 0.045000921, 0.038670189, 0.036937206, 0.022995027,
              0.039027223, 0.034245783, 0.064104903, 0.042513212, 0.069419505,
              0.085474811, 0.108641793, 0.075952145, 0.042380573, 0.063989472,
              0.065889828, 0.077185089, 0.066713215, 0.079557059, 0.075348378,
              0.071195981, 0.045407582, 0.050082606, 0.046224714, 0.062956229,
              0.014917035, 0.015871449, 0.097743175, 0.015270643, 0.122240527,
              0.017018272, 0.080645533, 0.001219692, 0.004067484,  0.06496855,
              0.000774962, 0.102916855, 0.111039231,  0.00947137, 0.014435067],
             dtype='float64')

### Train the Random Forest Classifier

In [11]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=2, random_state=0, bootstrap=True)

# Train the Classifier to take the training features and learn how they relate to the training y (E. hux level)
clf.fit(train[features], y)

RandomForestClassifier(n_jobs=2, random_state=0)

### Apply Classifier to test data

In [12]:
# Apply the Classifier we trained to the test data (which, remember, it has never seen before)
clf.predict(test[features])

array([42, 42, 17, 16,  0, 25, 25, 28, 31, 34, 39, 31, 40])

In [13]:
# View the predicted probabilities of the first 10 observations
clf.predict_proba(test[features])[0:10]

array([[0.01, 0.04, 0.05, 0.02, 0.02, 0.04, 0.06, 0.03, 0.04, 0.02, 0.  ,
        0.02, 0.  , 0.  , 0.02, 0.05, 0.  , 0.03, 0.01, 0.04, 0.02, 0.06,
        0.02, 0.  , 0.01, 0.  , 0.02, 0.03, 0.01, 0.05, 0.  , 0.  , 0.04,
        0.01, 0.04, 0.  , 0.01, 0.  , 0.  , 0.03, 0.01, 0.02, 0.08, 0.03,
        0.01],
       [0.01, 0.02, 0.  , 0.  , 0.01, 0.  , 0.06, 0.01, 0.02, 0.01, 0.04,
        0.01, 0.01, 0.01, 0.  , 0.03, 0.01, 0.03, 0.  , 0.01, 0.04, 0.07,
        0.  , 0.01, 0.  , 0.01, 0.02, 0.05, 0.01, 0.02, 0.01, 0.  , 0.02,
        0.02, 0.06, 0.  , 0.01, 0.01, 0.  , 0.05, 0.03, 0.06, 0.16, 0.03,
        0.02],
       [0.02, 0.04, 0.02, 0.  , 0.02, 0.01, 0.01, 0.02, 0.07, 0.01, 0.03,
        0.05, 0.02, 0.03, 0.02, 0.03, 0.04, 0.08, 0.01, 0.08, 0.04, 0.04,
        0.04, 0.01, 0.03, 0.04, 0.02, 0.04, 0.02, 0.01, 0.  , 0.  , 0.01,
        0.01, 0.02, 0.01, 0.01, 0.  , 0.  , 0.  , 0.01, 0.  , 0.02, 0.01,
        0.  ],
       [0.02, 0.02, 0.04, 0.04, 0.  , 0.02, 0.01, 0.05, 0.01, 0.06,

### Evaluate classifier

In [14]:
# Revert predicted state to original classification state ("High" and "Low")
preds = z[clf.predict(test[features])]

In [15]:
# View the PREDICTED categories for each sample
preds

Float64Index([0.111039231, 0.111039231, 0.075952145, 0.108641793, 0.037524328,
              0.071195981, 0.071195981, 0.046224714, 0.015871449, 0.122240527,
               0.06496855, 0.015871449, 0.000774962],
             dtype='float64')

In [16]:
# View the ACTUAL categories for each sample
test['Noelaerhabdaceae']

0     0.044730
5     0.082510
19    0.052337
25    0.033234
29    0.054843
30    0.079624
33    0.058926
35    0.056334
40    0.012016
47    0.131767
48    0.079187
53    0.016251
56    0.013119
Name: Noelaerhabdaceae, dtype: float64

### Create a confusion matrix

In [17]:
pd.crosstab(test['Noelaerhabdaceae'], preds, rownames=['Actual Category'], colnames=['Predicted Category'])

Predicted Category,0.000775,0.015871,0.037524,0.046225,0.064969,0.071196,0.075952,0.108642,0.111039,0.122241
Actual Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.012016,0,1,0,0,0,0,0,0,0,0
0.013119,1,0,0,0,0,0,0,0,0,0
0.016251,0,1,0,0,0,0,0,0,0,0
0.033234,0,0,0,0,0,0,0,1,0,0
0.04473,0,0,0,0,0,0,0,0,1,0
0.052337,0,0,0,0,0,0,1,0,0,0
0.054843,0,0,1,0,0,0,0,0,0,0
0.056334,0,0,0,1,0,0,0,0,0,0
0.058926,0,0,0,0,0,1,0,0,0,0
0.079187,0,0,0,0,1,0,0,0,0,0


## Identify the most important features driving the correlation and re-train the RF using only those important features

### View feature importance

In [18]:
# View a list of the features and their importance scores
fimp_list = list(zip(train[features], clf.feature_importances_))

#### Remove features with an importance above some value

In [19]:
# Convert to dataframe
feature_imps = pd.DataFrame(fimp_list)
feature_nonzero_imps = feature_imps.loc[feature_imps[1] > 0.001]
feature_imps.head()

Unnamed: 0,0,1
0,495c1bd1608a1dad54d3e2824ce899ef,0.001826
1,a900b6678ce86851fb16bfafb87f3326,0.0
2,c8e360969108fa2125a3d56eb4dad24f,0.000232
3,72143fd9e63fe40c1258948d2f0d79c3,0.0
4,7b6b178fad5599c0e9a734e4fb09fd64,0.002042


In [20]:
feature_imps.shape

(2753, 2)

In [21]:
feature_nonzero_imps.shape

(302, 2)

In [22]:
# Make a list of features with nonzero importance
asvs_nonzero = feature_nonzero_imps[0].tolist()

In [23]:
# Subsample dataframe for features with nonzero importance
df2 = df[asvs_nonzero]
df2.head()

Unnamed: 0,495c1bd1608a1dad54d3e2824ce899ef,7b6b178fad5599c0e9a734e4fb09fd64,33e3d7409266a86935f571199ff9cc58,5ac48f1cd4727a11d81457fc680d0aef,20f6b9bc7e52e18f6472c4a34e27aafa,aac3fed77594bf011ae74840d1c7d22c,9ee8cfa3e2c4643f5de1f0f3151a9ed6,4583fd18d2f1cd6a78f626db363e5d24,ae1c8acafd0f5311b02ff603be1d9aa0,1c0640727449285edabbda36ba0d8d8e,...,e2d27f7244a5221d9349074a6a2f9511,1ecbf534671efdd85b3be045d387cc4b,70b827f508e5ca3fd05934a999d6fbe9,15be7172c5442dad250930877f752cc4,45960dbc970fd248a87c25ebbeea8ab5,f0a5d1aa51d06f04bc3e730043fe5708,36590fcc25275105fa297cf6a6e231e8,aa419ed5de4d84a23d61aa83b4212141,36be56aa5d88c7e36b006fa1f30613c7,b4a63a64583e650413fe6795b21d229c
0,0.139324,0.039374,0.020949,0.014639,0.010096,0.013125,0.014134,0.006815,0.008582,0.004038,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.13875,0.027928,0.019522,0.010514,0.004409,0.009008,0.010832,0.005915,0.004187,0.002855,...,0.0,0.0,0.0,3.2e-05,0.0,0.0,0.0,0.0,0.0,0.0
2,0.163262,0.037151,0.020268,0.010835,0.003104,0.009353,0.009653,0.00719,0.003905,0.005027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.139116,0.035384,0.01861,0.009666,0.003613,0.010408,0.009061,0.00578,0.0058,0.005858,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.126853,0.025193,0.017095,0.009375,0.006056,0.006148,0.012469,0.006117,0.004902,0.002206,...,0.0,0.0,0.0,0.0,2e-05,0.0,0.0,0.0,0.0,0.0


In [24]:
df.shape

(58, 2754)

In [25]:
df2.shape

(58, 302)

In [26]:
df2['Noelaerhabdaceae'] = df.loc[:]['Noelaerhabdaceae']
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Noelaerhabdaceae'] = df.loc[:]['Noelaerhabdaceae']


Unnamed: 0,495c1bd1608a1dad54d3e2824ce899ef,7b6b178fad5599c0e9a734e4fb09fd64,33e3d7409266a86935f571199ff9cc58,5ac48f1cd4727a11d81457fc680d0aef,20f6b9bc7e52e18f6472c4a34e27aafa,aac3fed77594bf011ae74840d1c7d22c,9ee8cfa3e2c4643f5de1f0f3151a9ed6,4583fd18d2f1cd6a78f626db363e5d24,ae1c8acafd0f5311b02ff603be1d9aa0,1c0640727449285edabbda36ba0d8d8e,...,1ecbf534671efdd85b3be045d387cc4b,70b827f508e5ca3fd05934a999d6fbe9,15be7172c5442dad250930877f752cc4,45960dbc970fd248a87c25ebbeea8ab5,f0a5d1aa51d06f04bc3e730043fe5708,36590fcc25275105fa297cf6a6e231e8,aa419ed5de4d84a23d61aa83b4212141,36be56aa5d88c7e36b006fa1f30613c7,b4a63a64583e650413fe6795b21d229c,Noelaerhabdaceae
0,0.139324,0.039374,0.020949,0.014639,0.010096,0.013125,0.014134,0.006815,0.008582,0.004038,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04473
1,0.13875,0.027928,0.019522,0.010514,0.004409,0.009008,0.010832,0.005915,0.004187,0.002855,...,0.0,0.0,3.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.037524
2,0.163262,0.037151,0.020268,0.010835,0.003104,0.009353,0.009653,0.00719,0.003905,0.005027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033915
3,0.139116,0.035384,0.01861,0.009666,0.003613,0.010408,0.009061,0.00578,0.0058,0.005858,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036907
4,0.126853,0.025193,0.017095,0.009375,0.006056,0.006148,0.012469,0.006117,0.004902,0.002206,...,0.0,0.0,0.0,2e-05,0.0,0.0,0.0,0.0,0.0,0.037609


In [27]:
df2.shape

(58, 303)

In [28]:
df.shape

(58, 2754)

### Repeat model training and running

In [29]:
# Randomly assign some rows (samples) to be used for training set
df2['is_train'] = np.random.uniform(0, 1, len(df2)) <= .8
# Create two new dataframes, one with the training rows, one with the test rows
train, test = df2[df2['is_train']==True], df2[df2['is_train']==False]
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 55
Number of observations in the test data: 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['is_train'] = np.random.uniform(0, 1, len(df2)) <= .8


In [30]:
train.head()

Unnamed: 0,495c1bd1608a1dad54d3e2824ce899ef,7b6b178fad5599c0e9a734e4fb09fd64,33e3d7409266a86935f571199ff9cc58,5ac48f1cd4727a11d81457fc680d0aef,20f6b9bc7e52e18f6472c4a34e27aafa,aac3fed77594bf011ae74840d1c7d22c,9ee8cfa3e2c4643f5de1f0f3151a9ed6,4583fd18d2f1cd6a78f626db363e5d24,ae1c8acafd0f5311b02ff603be1d9aa0,1c0640727449285edabbda36ba0d8d8e,...,70b827f508e5ca3fd05934a999d6fbe9,15be7172c5442dad250930877f752cc4,45960dbc970fd248a87c25ebbeea8ab5,f0a5d1aa51d06f04bc3e730043fe5708,36590fcc25275105fa297cf6a6e231e8,aa419ed5de4d84a23d61aa83b4212141,36be56aa5d88c7e36b006fa1f30613c7,b4a63a64583e650413fe6795b21d229c,Noelaerhabdaceae,is_train
0,0.139324,0.039374,0.020949,0.014639,0.010096,0.013125,0.014134,0.006815,0.008582,0.004038,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04473,True
1,0.13875,0.027928,0.019522,0.010514,0.004409,0.009008,0.010832,0.005915,0.004187,0.002855,...,0.0,3.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.037524,True
2,0.163262,0.037151,0.020268,0.010835,0.003104,0.009353,0.009653,0.00719,0.003905,0.005027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033915,True
3,0.139116,0.035384,0.01861,0.009666,0.003613,0.010408,0.009061,0.00578,0.0058,0.005858,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036907,True
4,0.126853,0.025193,0.017095,0.009375,0.006056,0.006148,0.012469,0.006117,0.004902,0.002206,...,0.0,0.0,2e-05,0.0,0.0,0.0,0.0,0.0,0.037609,True


In [31]:
# Create a list of the feature column's names
# Final column value needs to be 2 LESS THAN # columns in df2
features = df2.columns[:6]
# View features
features

Index(['495c1bd1608a1dad54d3e2824ce899ef', '7b6b178fad5599c0e9a734e4fb09fd64',
       '33e3d7409266a86935f571199ff9cc58', '5ac48f1cd4727a11d81457fc680d0aef',
       '20f6b9bc7e52e18f6472c4a34e27aafa', 'aac3fed77594bf011ae74840d1c7d22c'],
      dtype='object')

In [32]:
# train['Noelaerhabdaceae'] contains the actual sample classifications. Before we can use it,
# we need to convert each category into a digit. In this case there are two categories, "High" and "Low".
y = pd.factorize(train['Noelaerhabdaceae'])[0]
z = pd.factorize(train['Noelaerhabdaceae'])[1]

In [33]:
y

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54])

In [34]:
z

Float64Index([  0.0447304, 0.037524328, 0.033915496,  0.03690718, 0.037608784,
              0.082510466, 0.043454822, 0.008177717, 0.045000921, 0.038670189,
              0.036937206, 0.022995027, 0.039027223, 0.034245783, 0.064104903,
              0.042513212, 0.069419505, 0.085474811, 0.108641793, 0.052337133,
              0.075952145, 0.042380573, 0.063989472, 0.065889828, 0.077185089,
              0.033233934, 0.066713215, 0.079557059, 0.075348378, 0.054842731,
              0.079623818, 0.071195981, 0.045407582, 0.058926195, 0.050082606,
              0.056334249, 0.046224714, 0.062956229, 0.014917035, 0.015871449,
              0.097743175, 0.015270643, 0.122240527, 0.017018272, 0.080645533,
              0.131767065, 0.079187147, 0.004067484, 0.000774962, 0.102916855,
              0.016251256, 0.111039231,  0.00947137, 0.013118779, 0.014435067],
             dtype='float64')

In [35]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
# This model assigned weight 1 to 'High' class and weight 2 to 'Low' class
clf = RandomForestClassifier(n_jobs=2, random_state=0, bootstrap=True, class_weight={0:1,1:2})

# Train the Classifier to take the training features and learn how they relate to the training y (E. hux level)
clf.fit(train[features], y)

RandomForestClassifier(class_weight={0: 1, 1: 2}, n_jobs=2, random_state=0)

In [36]:
# Apply the Classifier we trained to the test data (which, remember, it has never seen before)
clf.predict(test[features])
# View the predicted probabilities of the first 10 observations
clf.predict_proba(test[features])[0:10]

array([[0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.14, 0.16, 0.14, 0.06, 0.  , 0.17,
        0.  , 0.  , 0.  , 0.11, 0.04, 0.  , 0.  , 0.  , 0.06, 0.08, 0.04],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.03, 0.  , 0.  , 0.  , 0.04,
        0.03, 0.01, 0.15, 0.18, 0.28, 0.  , 0.  , 0.01, 0.02, 0.1 , 0.15],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  

In [37]:
# Revert predicted state to original classification state
preds = z[clf.predict(test[features])]

In [38]:
pd.crosstab(test['Noelaerhabdaceae'], preds, rownames=['Actual Category'], colnames=['Predicted Category'])

Predicted Category,0.000775,0.017018,0.102917
Actual Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.00122,1,0,0
0.012016,0,1,0
0.064969,0,0,1


### No better than the model with all features!