In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import MESS
import numpy as np
import pandas as pd

from IPython.display import display

# Raise pandas' display limits so the wide summary-statistic tables shown
# later in the notebook are rendered in full rather than truncated.
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)


## Use the Reunion spider data for testing
First, quickly munge the spider abundance and pi files into the single two-column (`abundance`, `pi`) table we need, and save it as `spider.dat`.

In [2]:
# Directory holding the Reunion spider empirical inputs; the merged table is
# written back into this same directory.
sp_dir = "/home/isaac/SGD_empirical/empirical_reunion_spiders/"

# Abundances: the file has a header row and its first column is the row index.
sp_abunds = pd.read_csv(
    sp_dir + "spider_abunds_df.txt",
    header=0,
    index_col=0,
)

# pi values: space-separated with no header; the second column becomes the index.
sp_pis = pd.read_csv(
    sp_dir + "spider.pis",
    header=None,
    index_col=1,
    sep=" ",
)

# Keep only the second "/"-separated field of each index entry, cut at its first ".".
# NOTE(review): presumably this turns a file path into a sample name that
# matches the index of sp_abunds -- confirm against the input files.
sp_pis.index = [x.split("/")[1].split(".")[0] for x in sp_pis.index]

# Join the two tables side by side on their index, without sorting the result.
sp_df = pd.concat(
    [sp_abunds, sp_pis],
    axis=1,
    sort=False,
)
# Columns are renamed by position: abundance first, pi second.
sp_df.columns = ["abundance", "pi"]

# Persist the merged table, then preview its first five rows.
sp_df.to_csv(sp_dir + "spider.dat", header=True)
sp_df.head(5)

Unnamed: 0,abundance,pi
GL-01a,150,0.011126
GL-01b,107,0.023247
GL-01c,32,0.000252
GL-02,2,0.0
GL-03,200,0.028499


# Select the best model

In [194]:
# Simulation output that the classifier is built from.
simfile = "/home/isaac/Continuosity/MESS/analysis/full-parameter-estimation/SIMOUT.txt"

# Build a classifier from the empirical spider table and the simulations.
# NOTE(review): algorithm="rf" presumably selects a random forest -- confirm
# against the MESS documentation.
cla = MESS.inference.Classifier(
    empirical_df=sp_df,
    simfile=simfile,
    algorithm="rf",
)

# predict() returns two objects: the model estimate and the per-model
# probabilities.
est, proba = cla.predict(
    select_features=True,
    param_search=True,
    quick=True,
    verbose=False,
)

# Show the estimate, then the probabilities, then the feature importances.
display(est)
display(proba)
display(cla.feature_importances())

Unnamed: 0,community_assembly_model
estimate,neutral


Unnamed: 0,filtering,neutral
community_assembly_model,0.29316,0.70684


Unnamed: 0,S,abund_h1,abund_h2,abund_h3,abund_h4,pi_h1,pi_h2,pi_h3,pi_h4,mean_pi,std_pi,skewness_pi,kurtosis_pi,iqr_pi,SGD_0,SGD_1,SGD_2,SGD_3
Feature importance,0.118627,0.208496,0.072976,0.061479,0.063611,0.056024,0.044238,0.035198,0.031519,0.046397,0.053783,0.028997,0.023547,0.027168,0.083716,0.012203,0.009674,0.022347


# Estimate parameters

In [196]:
# Simulation output that the regressor is built from.
simfile = "/home/isaac/Continuosity/MESS/analysis/full-parameter-estimation/SIMOUT.txt"

# Build a regressor from the empirical spider table and the simulations.
# NOTE(review): algorithm="rfq" presumably selects a quantile random forest
# (the recorded output includes 0.025/0.975 bounds) -- confirm against MESS docs.
rgr = MESS.inference.Regressor(
    empirical_df=sp_df,
    simfile=simfile,
    algorithm="rfq",
)

# Restrict estimation to these two target parameters.
rgr.set_targets(target_list=["alpha", "_lambda"])

# verbose=True makes predict() report the features selected for each target.
est = rgr.predict(
    select_features=True,
    param_search=False,
    quick=True,
    verbose=True,
)

# Compute the importances before displaying anything, then show both tables.
rgr_importances = rgr.feature_importances()
display(est, rgr_importances)

Selecting features:
  alpha	['abund_h1', 'mean_pi', 'iqr_pi', 'abundance_pi_cor', 'SGD_0']
  _lambda	['abund_h1', 'abund_h2', 'abund_h3', 'abund_h4', 'median_pi', 'SGD_0']
All selected features: abund_h1 abund_h2 abund_h3 abund_h4 mean_pi median_pi iqr_pi abundance_pi_cor SGD_0
Calculating prediction interval(s)


Unnamed: 0,alpha,_lambda
estimate,4301.3,0.92206
lower 0.025,2801.0,0.837
upper 0.975,4803.0,0.955


Unnamed: 0,SGD_0,abund_h1,abund_h2,abund_h3,abund_h4,abundance_pi_cor,iqr_pi,mean_pi,median_pi
alpha,0.131318,0.200917,,,,0.1669,0.323787,0.177077,
_lambda,0.038867,0.07253,0.033238,0.045122,0.802928,,,,0.007314


Unnamed: 0,abund_h1,abund_h2,abund_h3,abund_h4,abundance_pi_cor,iqr_pi,mean_pi,skewness_pi
alpha,,,,0.185144,0.169929,0.276145,0.216505,0.152277
_lambda,0.080334,0.051802,0.046576,0.821288,,,,


## Save regressor/model to file

In [150]:
# Display the regressor's feature-importance table (one row per target).
# NOTE(review): this cell does not write anything to disk, although its section
# heading talks about saving -- the save step still needs to be added.
rgr.feature_importances()
# Disabled: inspect the fitted model object(s).
# NOTE(review): presumably keyed by target name -- confirm.
#rgr.model_by_target

Unnamed: 0,S,abund_h1,abund_h2,abund_h3,abund_h4,pi_h1,pi_h2,pi_h3,pi_h4,mean_pi,std_pi,skewness_pi,kurtosis_pi,median_pi,iqr_pi,trees,abundance_pi_cor,SGD_0,SGD_1,SGD_2,SGD_3,SGD_4,SGD_5,SGD_6,SGD_7,SGD_8,SGD_9
alpha,0.015795,0.066961,0.018036,0.012658,0.025727,0.027068,0.009332,0.00248,0.002773,0.275834,0.008189,0.022957,0.010469,0.004996,0.369852,0.0,0.059932,0.050365,0.001897,0.001806,0.002297,0.002108,0.004731,0.002443,0.00057,0.0,0.000721
_lambda,0.00608,0.018718,0.006266,0.040721,0.859859,0.003194,0.001113,0.004015,0.001348,0.009954,0.010465,0.003409,0.005999,0.00155,0.002724,0.0,0.006535,0.013376,0.001096,0.0003,0.000758,3.5e-05,0.000916,0.000294,0.000857,0.000169,0.000251


# Trash below here

In [56]:
# Scratch cell: rebuild the classifier with verbose=True and inspect some of its
# attributes. In the recorded output, verbose=True made the constructor print
# the empirical summary statistics.
simfile = "/home/isaac/Continuosity/MESS/analysis/full-parameter-estimation/SIMOUT.txt"

cla = MESS.inference.Classifier(empirical_df=sp_df, simfile=simfile, algorithm="rf", verbose=True)
# NOTE(review): bare expression that is not the cell's last line, so nothing is
# displayed for it; it also reaches into a private attribute. Likely leftover
# debugging -- confirm before removing.
cla._base_model
# Show the first two entries of the classifier's `y` attribute.
cla.y[:2]
# Disabled regressor experiment (single target, quick=False).
#rgr.set_targets(target_list="_lambda")
#rgr.predict(select_features=True, quick=False, verbose=True)

Got empirical summary statistics:     S   abund_h1   abund_h2   abund_h3  abund_h4      pi_h1      pi_h2      pi_h3      pi_h4   mean_pi    std_pi  skewness_pi  kurtosis_pi  median_pi   iqr_pi  trees  abundance_pi_cor  SGD_0  SGD_1  SGD_2  SGD_3  SGD_4  SGD_5  SGD_6  SGD_7  SGD_8  SGD_9
0  57  22.334174  14.092503  10.622744  8.877024  27.956422  21.660417  19.178423  17.853145  0.008283  0.010581      1.41016     0.756963   0.003258  0.01004      0          0.413119     32      7      5      0      2      4      1      3      1      2


Unnamed: 0,community_assembly_model
0,neutral
1,neutral


In [51]:
# Show the names of the features (summary statistics) held by the classifier.
cla.features

Index([u'S', u'abund_h1', u'abund_h2', u'abund_h3', u'abund_h4', u'pi_h1', u'pi_h2', u'pi_h3', u'pi_h4', u'mean_pi', u'std_pi', u'skewness_pi', u'kurtosis_pi', u'median_pi', u'iqr_pi', u'trees', u'abundance_pi_cor', u'SGD_0', u'SGD_1', u'SGD_2', u'SGD_3', u'SGD_4', u'SGD_5', u'SGD_6', u'SGD_7', u'SGD_8', u'SGD_9'], dtype='object')