In [112]:
import pandas as pd
import numpy as np


import rfr_remote
import rfr_ipynb

In [113]:
def rmse_calc(traindf, testdf, output_type='none'):
    """Compute train/test RMSE for one pair of prediction frames.

    Parameters
    ----------
    traindf : pandas.DataFrame
        Needs the columns 'dft_train' (passed to rfr_remote.rmse as y_train)
        and 'mean_train' (passed as trainpred). Also needs 'Type' or 'Site'
        when output_type is 'type' or 'site'.
    testdf : pandas.DataFrame
        Needs the columns 'dft_test' (y_test) and 'mean_test' (testpred).
    output_type : {'none', 'type', 'site'}, default 'none'
        'none' returns only the two RMSE columns; 'type' / 'site' prepend a
        'Type' / 'Site' column holding the label of the first row of traindf
        (the caller is expected to pass frames that contain a single group).

    Returns
    -------
    pandas.DataFrame
        A single-row frame with 'train_rmse' and 'test_rmse', optionally
        preceded by 'Type' or 'Site'.

    Raises
    ------
    ValueError
        If output_type is not one of the supported values.
    """
    # Validate first: previously an unknown value fell through every branch
    # and failed at `return` with an UnboundLocalError after the RMSE work.
    if output_type not in ('none', 'type', 'site'):
        raise ValueError(
            "output_type must be 'none', 'type' or 'site', got %r" % (output_type,))

    train_rmse, test_rmse = rfr_remote.rmse(y_train=traindf['dft_train'],
                                            y_test=testdf['dft_test'],
                                            trainpred=traindf['mean_train'],
                                            testpred=testdf['mean_test'])

    # Insertion order fixes the column order: label (if any), train, test.
    data = {}
    if output_type == 'type':
        data['Type'] = [traindf['Type'].iloc[0]]
    elif output_type == 'site':
        data['Site'] = [traindf['Site'].iloc[0]]
    data['train_rmse'] = [train_rmse]
    data['test_rmse'] = [test_rmse]

    return pd.DataFrame(data=data)

In [114]:
def df_type(output_df):
    """Split a frame into sub-frames keyed by the values of its 'Type' column.

    Returns a dict mapping each distinct 'Type' value to the rows of
    output_df that carry it, in the order produced by groupby('Type').
    """
    return {label: group for label, group in output_df.groupby('Type')}

In [115]:
def pp_rmse_cal(traindf, testdf, output_type='none'):
    """Compute RMSE from prediction frames, overall or broken down by Type.

    Parameters
    ----------
    traindf, testdf : pandas.DataFrame
        Prediction frames with the columns rmse_calc reads ('dft_train' /
        'mean_train' and 'dft_test' / 'mean_test'); a 'Type' column is also
        required when output_type is 'type'.
    output_type : {'none', 'type'}, default 'none'
        'none' returns one row for the whole frame; 'type' returns one row
        per semiconductor type, in the order II-VI, III-V, IV-IV.

    Returns
    -------
    pandas.DataFrame
        The rmse_calc result ('none'), or the per-type rmse_calc rows
        concatenated ('type'). Each per-type row keeps its own index 0.

    Raises
    ------
    ValueError
        If output_type is not 'none' or 'type' (previously such calls
        silently returned None).
    KeyError
        If one of the expected types is absent from traindf or testdf.
    """
    if output_type == 'none':
        return rmse_calc(traindf, testdf, output_type='none')

    if output_type == 'type':
        train_type_frames = df_type(traindf)
        test_type_frames = df_type(testdf)

        type_order = ['II-VI', 'III-V', 'IV-IV']
        # Fail with a descriptive message rather than a bare KeyError.
        missing = [typ for typ in type_order
                   if typ not in train_type_frames or typ not in test_type_frames]
        if missing:
            raise KeyError("Type(s) missing from train/test frames: %s" % ', '.join(missing))

        type_list = [rmse_calc(train_type_frames[typ], test_type_frames[typ],
                               output_type='type')
                     for typ in type_order]
        return pd.concat(type_list)

    raise ValueError("output_type must be 'none' or 'type', got %r" % (output_type,))

In [116]:
# Load the dHA table from the 'balanced_type' folder; iloc[:, 1:] drops the
# first CSV column (presumably a saved index column — confirm).
ha_bal_type = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_type/Lasso_HA_type.csv')
ha_bal_type=ha_bal_type.iloc[:, 1:]

In [117]:
# Load the dHB table from the 'balanced_type' folder; iloc[:, 1:] drops the
# first CSV column (presumably a saved index column — confirm).
hb_bal_type = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_type/Lasso_HB_type.csv')
hb_bal_type=hb_bal_type.iloc[:, 1:]

In [118]:
# Show the total entry count and the per-Type counts/percentages for the dHA table.
rfr_ipynb.counter(ha_bal_type, 'Type')

Total entries:  645


Unnamed: 0,Type,percent
III-V,218,33.8
II-VI,218,33.8
IV-IV,209,32.4


In [119]:
# Show the total entry count and the per-Type counts/percentages for the dHB table.
rfr_ipynb.counter(hb_bal_type, 'Type')

Total entries:  645


Unnamed: 0,Type,percent
III-V,218,33.8
II-VI,218,33.8
IV-IV,209,32.4


In [120]:
# Load the (+3,+2) table from the 'balanced_type' folder and drop its first
# CSV column (presumably a saved index column — confirm).
p32_bal_type = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_type/Lasso_(+3,+2)_type.csv')
p32_bal_type = p32_bal_type.iloc[:, 1:]

In [121]:
# Load the (+2,+1) table from the 'balanced_type' folder and drop its first
# CSV column (presumably a saved index column — confirm).
p21_bal_type = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_type/Lasso_(+2,+1)_type.csv')
p21_bal_type = p21_bal_type.iloc[:, 1:]

In [122]:
# Load the (+1,0) table from the 'balanced_type' folder and drop its first
# CSV column (presumably a saved index column — confirm).
p0p1_bal_type = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_type/Lasso_(+1,0)_type.csv')
p0p1_bal_type = p0p1_bal_type.iloc[:, 1:]

In [123]:
# Load the (0,-1) table from the 'balanced_type' folder and drop its first
# CSV column (presumably a saved index column — confirm).
m0m1_bal_type = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_type/Lasso_(0,-1)_type.csv')
m0m1_bal_type = m0m1_bal_type.iloc[:, 1:]

In [124]:
# Load the (-1,-2) table from the 'balanced_type' folder and drop its first
# CSV column (presumably a saved index column — confirm).
m12_bal_type = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_type/Lasso_(-1,-2)_type.csv')
m12_bal_type = m12_bal_type.iloc[:, 1:]

In [125]:
# Load the (-2,-3) table from the 'balanced_type' folder and drop its first
# CSV column (presumably a saved index column — confirm).
m23_bal_type = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_type/Lasso_(-2,-3)_type.csv')
m23_bal_type = m23_bal_type.iloc[:, 1:]

In [126]:
# Show the total entry count and the per-Type counts/percentages for the (+3,+2) table.
rfr_ipynb.counter(p32_bal_type, 'Type')

Total entries:  690


Unnamed: 0,Type,percent
II-VI,230,33.33
III-V,230,33.33
IV-IV,230,33.33


In [127]:
# Show the total entry count and the per-Type counts/percentages for the (-2,-3) table.
rfr_ipynb.counter(m23_bal_type, 'Type')

Total entries:  690


Unnamed: 0,Type,percent
II-VI,230,33.33
III-V,230,33.33
IV-IV,230,33.33


# Type

# dHA

In [14]:
# rfr_remote.rfr_predictor run (folds=5) on the type-balanced dHA table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
# NOTE(review): if max_feat is forwarded to scikit-learn's max_features, 'auto'
# was removed in scikit-learn 1.3 — confirm against rfr_remote.
# NOTE(review): no random seed is passed here; confirm rfr_remote fixes one.
ha_type_traindic, ha_type_testdic= \
    rfr_remote.rfr_predictor(ha_bal_type, d_start=5, max_depth=10, max_feat='auto', 
                             min_samp_leaf=2, min_samples_split=15,
                             num_trees=600, folds=5)

In [15]:
# output_type='none': overall train/test RMSE table
dHA_type_rmse = rfr_remote.rmse_calculator(ha_type_traindic, ha_type_testdic,
                                      output_type='none')

# output_type='type': three RMSE tables, unpacked here as II-VI (26), III-V (35), IV-IV (44)
dHA_type_rmse_26, dHA_type_rmse_35, dHA_type_rmse_44 = \
    rfr_remote.rmse_calculator(ha_type_traindic, ha_type_testdic,
                                      output_type='type')

# output_type='site': two RMSE tables, unpacked as the `sub` and `int` site groups
dHA_type_rmse_sub, dHA_type_rmse_int = \
    rfr_remote.rmse_calculator(ha_type_traindic, ha_type_testdic,
                                      output_type='site')

In [16]:
# Per-type dHA summary: one row per type with train/test RMSE shown as 'value +/- spread'.
rfr_ipynb.type_site_df('type', [dHA_type_rmse_26, dHA_type_rmse_35, dHA_type_rmse_44], folds=5)

Unnamed: 0,output,train rmse,test rmse
0,II-VI,0.65 +/- 0.023,1.11 +/- 0.237
1,III-V,0.83 +/- 0.036,1.38 +/- 0.231
2,IV-IV,1.07 +/- 0.053,1.75 +/- 0.27


In [17]:
# Display the overall dHA RMSE table: one row per fold plus a final summary row.
dHA_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.872041,1.4807
1,0.869669,1.5269
2,0.870291,1.24523
3,0.838889,1.52562
4,0.870606,1.45673
5,0.86 +/- 0.014,1.45 +/- 0.117


In [18]:
# rfr_remote.rfr_pp_predictor run with the same arguments as the dHA rfr_predictor
# call; returns the train/test frames that pp_rmse_cal consumes.
# NOTE(review): if max_feat is forwarded to scikit-learn's max_features, 'auto'
# was removed in scikit-learn 1.3 — confirm against rfr_remote.
dHA_traindf, dHA_testdf = \
    rfr_remote.rfr_pp_predictor(ha_bal_type, d_start=5, max_depth=10, max_feat='auto', 
                                min_samp_leaf=2, min_samples_split=15, num_trees=600,
                                folds=5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [110]:
# Overall train/test RMSE (single row) from the dHA prediction frames.
# NOTE(review): the generic name `rmse_df` is reassigned by later cells.
rmse_df = pp_rmse_cal(dHA_traindf, dHA_testdf, output_type='none')
rmse_df

Unnamed: 0,train_rmse,test_rmse
0,0.847214,1.473305


In [111]:
# Per-Type train/test RMSE (II-VI, III-V, IV-IV) from the dHA prediction frames.
# NOTE(review): overwrites the overall `rmse_df` computed in the previous cell.
rmse_df = pp_rmse_cal(dHA_traindf, dHA_testdf, output_type='type')
rmse_df

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.640779,1.154302
0,III-V,0.808338,1.392128
0,IV-IV,1.051335,1.813164


# dHB

In [161]:
# rfr_remote.rfr_predictor run (folds=5) on the type-balanced dHB table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
dHB_type_traindic, dHB_type_testdic= \
    rfr_remote.rfr_predictor(hb_bal_type, d_start=5,  max_depth=10, max_feat='auto', 
                             min_samp_leaf=3, min_samples_split=15,
                             num_trees=1000, folds=5)

In [19]:
# output_type='none': overall train/test RMSE table
dHB_type_rmse = rfr_remote.rmse_calculator(dHB_type_traindic, dHB_type_testdic,
                                      output_type='none')

# output_type='type': three RMSE tables, unpacked here as II-VI (26), III-V (35), IV-IV (44)
dHB_type_rmse_26, dHB_type_rmse_35, dHB_type_rmse_44 = \
    rfr_remote.rmse_calculator(dHB_type_traindic, dHB_type_testdic,
                                      output_type='type')

# output_type='site': two RMSE tables, unpacked as the `sub` and `int` site groups
dHB_type_rmse_sub, dHB_type_rmse_int = \
    rfr_remote.rmse_calculator(dHB_type_traindic, dHB_type_testdic,
                                      output_type='site')

In [20]:
# Per-type dHB summary: one row per type with train/test RMSE shown as 'value +/- spread'.
rfr_ipynb.type_site_df('type', [dHB_type_rmse_26, dHB_type_rmse_35, dHB_type_rmse_44], folds=5)

Unnamed: 0,output,train rmse,test rmse
0,II-VI,0.91 +/- 0.021,1.52 +/- 0.105
1,III-V,0.9 +/- 0.022,1.56 +/- 0.175
2,IV-IV,1.04 +/- 0.03,1.69 +/- 0.21


In [21]:
# Display the overall dHB RMSE table: one row per fold plus a final summary row.
dHB_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.947842,1.50542
1,0.973825,1.68822
2,0.953942,1.52399
3,0.950028,1.65329
4,0.933123,1.61149
5,0.95 +/- 0.015,1.6 +/- 0.08


In [128]:
# rfr_remote.rfr_pp_predictor run with the same arguments as the dHB rfr_predictor
# call; returns the train/test frames that pp_rmse_cal consumes.
dHB_traindf, dHB_testdf = \
    rfr_remote.rfr_pp_predictor(hb_bal_type, d_start=5,  max_depth=10, max_feat='auto', 
                                min_samp_leaf=3, min_samples_split=15,
                                num_trees=1000, folds=5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [135]:
# Overall train/test RMSE (single row) from the dHB prediction frames.
dHB_type25_rmse = pp_rmse_cal(dHB_traindf, dHB_testdf, output_type='none')
dHB_type25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.934856,1.597869


In [131]:
# Per-Type train/test RMSE (II-VI, III-V, IV-IV) from the dHB prediction frames.
dHB_type25type_rmse = pp_rmse_cal(dHB_traindf, dHB_testdf, output_type='type')
dHB_type25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.89295,1.540014
0,III-V,0.891473,1.5381
0,IV-IV,1.018084,1.714076


# (+3, +2)

In [22]:
# rfr_remote.rfr_predictor run (folds=5) on the type-balanced (+3,+2) table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
p32_type_traindic, p32_type_testdic= \
    rfr_remote.rfr_predictor(p32_bal_type, d_start=5, max_depth=7, max_feat='auto', 
                             min_samp_leaf=3, min_samples_split=3,
                             num_trees=450, folds=5)

In [23]:
# output_type='none' returns the overall train and test RMSE
p32_type_rmse = rfr_remote.rmse_calculator(p32_type_traindic, p32_type_testdic,
                                      output_type='none')

# output_type='type' returns the train and test RMSE separated by type
# (unpacked here as II-VI = 26, III-V = 35, IV-IV = 44)
p32_type_rmse_26, p32_type_rmse_35, p32_type_rmse_44 = \
    rfr_remote.rmse_calculator(p32_type_traindic, p32_type_testdic,
                                      output_type='type')

# output_type='site' returns the train and test RMSE separated by site
# (unpacked here as the `sub` and `int` groups)
p32_type_rmse_sub, p32_type_rmse_int = \
    rfr_remote.rmse_calculator(p32_type_traindic, p32_type_testdic,
                                      output_type='site')

In [24]:
# Per-type (+3,+2) summary: one row per type with train/test RMSE shown as 'value +/- spread'.
rfr_ipynb.type_site_df('type', [p32_type_rmse_26, p32_type_rmse_35, p32_type_rmse_44], folds=5)

Unnamed: 0,output,train rmse,test rmse
0,II-VI,0.22 +/- 0.015,0.38 +/- 0.108
1,III-V,0.25 +/- 0.016,0.4 +/- 0.093
2,IV-IV,0.19 +/- 0.006,0.31 +/- 0.042


In [25]:
# Display the overall (+3,+2) RMSE table: one row per fold plus a final summary row.
p32_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.226419,0.336944
1,0.213719,0.441454
2,0.219403,0.376722
3,0.224143,0.405148
4,0.237869,0.28158
5,0.22 +/- 0.009,0.37 +/- 0.062


In [132]:
# rfr_remote.rfr_pp_predictor run with the same arguments as the (+3,+2) rfr_predictor
# call; returns the train/test frames that pp_rmse_cal consumes.
p32_traindf, p32_testdf = \
    rfr_remote.rfr_pp_predictor(p32_bal_type, d_start=5, max_depth=7, max_feat='auto', 
                                min_samp_leaf=3, min_samples_split=3,
                                num_trees=450, folds=5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [133]:
# Overall train/test RMSE (single row) from the (+3,+2) prediction frames.
p32_type25_rmse = pp_rmse_cal(p32_traindf, p32_testdf, output_type='none')
p32_type25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.220743,0.375574


In [134]:
# Per-Type train/test RMSE (II-VI, III-V, IV-IV) from the (+3,+2) prediction frames.
p32_type25type_rmse = pp_rmse_cal(p32_traindf, p32_testdf, output_type='type')
p32_type25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.223227,0.390967
0,III-V,0.249051,0.411015
0,IV-IV,0.185274,0.318401


# (+2, +1)

In [26]:
# rfr_remote.rfr_predictor run (folds=5) on the type-balanced (+2,+1) table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
p21_type_traindic, p21_type_testdic= \
    rfr_remote.rfr_predictor(p21_bal_type, d_start=5, max_depth=9, max_feat='auto', 
                             min_samp_leaf=7, min_samples_split=2,
                             num_trees=600, folds=5)

In [27]:
# output_type='none' returns the overall train and test RMSE
p21_type_rmse = rfr_remote.rmse_calculator(p21_type_traindic, p21_type_testdic,
                                      output_type='none')

# output_type='type' returns the train and test RMSE separated by type
# (unpacked here as II-VI = 26, III-V = 35, IV-IV = 44)
p21_type_rmse_26, p21_type_rmse_35, p21_type_rmse_44 = \
    rfr_remote.rmse_calculator(p21_type_traindic, p21_type_testdic,
                                      output_type='type')

# output_type='site' returns the train and test RMSE separated by site
# (unpacked here as the `sub` and `int` groups)
p21_type_rmse_sub, p21_type_rmse_int = \
    rfr_remote.rmse_calculator(p21_type_traindic, p21_type_testdic,
                                      output_type='site')

In [28]:
# Per-type (+2,+1) summary: one row per type with train/test RMSE shown as 'value +/- spread'.
rfr_ipynb.type_site_df('type', [p21_type_rmse_26, p21_type_rmse_35, p21_type_rmse_44], folds=5)

Unnamed: 0,output,train rmse,test rmse
0,II-VI,0.3 +/- 0.008,0.45 +/- 0.065
1,III-V,0.29 +/- 0.006,0.43 +/- 0.041
2,IV-IV,0.27 +/- 0.012,0.39 +/- 0.075


In [29]:
# Display the overall (+2,+1) RMSE table: one row per fold plus a final summary row.
p21_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.278347,0.440129
1,0.2911,0.468522
2,0.287208,0.427426
3,0.285281,0.436584
4,0.2966,0.349371
5,0.29 +/- 0.007,0.42 +/- 0.045


In [136]:
# rfr_remote.rfr_pp_predictor run with the same arguments as the (+2,+1) rfr_predictor
# call; returns the train/test frames that pp_rmse_cal consumes.
p21_traindf, p21_testdf = \
    rfr_remote.rfr_pp_predictor(p21_bal_type, d_start=5, max_depth=9, max_feat='auto', 
                                min_samp_leaf=7, min_samples_split=2,
                                num_trees=600, folds=5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [140]:
# Overall train/test RMSE (single row) from the (+2,+1) prediction frames.
p21_type25_rmse = pp_rmse_cal(p21_traindf, p21_testdf, output_type='none')
p21_type25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.283364,0.425789


In [141]:
# Per-Type train/test RMSE (II-VI, III-V, IV-IV) from the (+2,+1) prediction frames.
p21_type25type_rmse = pp_rmse_cal(p21_traindf, p21_testdf, output_type='type')
p21_type25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.299259,0.456919
0,III-V,0.283528,0.424996
0,IV-IV,0.266349,0.393055


# (+1, 0)

In [30]:
# rfr_remote.rfr_predictor run (folds=5) on the type-balanced (+1,0) table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
p01_type_traindic, p01_type_testdic= \
    rfr_remote.rfr_predictor(p0p1_bal_type, d_start=5, max_depth=10, max_feat='auto', 
                             min_samp_leaf=8, min_samples_split=12,
                             num_trees=150, folds=5)

In [31]:
# output_type='none' returns the overall train and test RMSE
p01_type_rmse = rfr_remote.rmse_calculator(p01_type_traindic, p01_type_testdic,
                                      output_type='none')

# output_type='type' returns the train and test RMSE separated by type
# (unpacked here as II-VI = 26, III-V = 35, IV-IV = 44)
p01_type_rmse_26, p01_type_rmse_35, p01_type_rmse_44 = \
    rfr_remote.rmse_calculator(p01_type_traindic, p01_type_testdic,
                                      output_type='type')

# output_type='site' returns the train and test RMSE separated by site
# (unpacked here as the `sub` and `int` groups)
p01_type_rmse_sub, p01_type_rmse_int = \
    rfr_remote.rmse_calculator(p01_type_traindic, p01_type_testdic,
                                      output_type='site')

In [32]:
# Per-type (+1,0) summary: one row per type with train/test RMSE shown as 'value +/- spread'.
rfr_ipynb.type_site_df('type', [p01_type_rmse_26, p01_type_rmse_35, p01_type_rmse_44], folds=5)

Unnamed: 0,output,train rmse,test rmse
0,II-VI,0.31 +/- 0.012,0.46 +/- 0.062
1,III-V,0.26 +/- 0.004,0.39 +/- 0.039
2,IV-IV,0.27 +/- 0.012,0.38 +/- 0.073


In [33]:
# Display the overall (+1,0) RMSE table: one row per fold plus a final summary row.
p01_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.274933,0.462349
1,0.27827,0.429531
2,0.282051,0.421277
3,0.288867,0.391544
4,0.289407,0.351929
5,0.28 +/- 0.006,0.41 +/- 0.042


In [142]:
# rfr_remote.rfr_pp_predictor run with the same arguments as the (+1,0) rfr_predictor
# call; returns the train/test frames that pp_rmse_cal consumes.
p01_traindf, p01_testdf = \
    rfr_remote.rfr_pp_predictor(p0p1_bal_type, d_start=5, max_depth=10, max_feat='auto', 
                                min_samp_leaf=8, min_samples_split=12,
                                num_trees=150, folds=5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [143]:
# Overall train/test RMSE (single row) from the (+1,0) prediction frames.
p01_type25_rmse = pp_rmse_cal(p01_traindf, p01_testdf, output_type='none')
p01_type25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.27645,0.411788


In [144]:
# Per-Type train/test RMSE (II-VI, III-V, IV-IV) from the (+1,0) prediction frames.
p01_type25type_rmse = pp_rmse_cal(p01_traindf, p01_testdf, output_type='type')
p01_type25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.304622,0.453458
0,III-V,0.254079,0.380669
0,IV-IV,0.268185,0.397711


# (0, -1)

In [34]:
# rfr_remote.rfr_predictor run (folds=5) on the type-balanced (0,-1) table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
m01_type_traindic, m01_type_testdic= \
    rfr_remote.rfr_predictor(m0m1_bal_type, d_start=5, max_depth=8, max_feat='auto', 
                             min_samp_leaf=5, min_samples_split=3,
                             num_trees=150, folds=5)

In [35]:
# output_type='none' returns the overall train and test RMSE
m01_type_rmse = rfr_remote.rmse_calculator(m01_type_traindic, m01_type_testdic,
                                      output_type='none')

# output_type='type' returns the train and test RMSE separated by type
# (unpacked here as II-VI = 26, III-V = 35, IV-IV = 44)
m01_type_rmse_26, m01_type_rmse_35, m01_type_rmse_44 = \
    rfr_remote.rmse_calculator(m01_type_traindic, m01_type_testdic,
                                      output_type='type')

# output_type='site' returns the train and test RMSE separated by site
# (unpacked here as the `sub` and `int` groups)
m01_type_rmse_sub, m01_type_rmse_int = \
    rfr_remote.rmse_calculator(m01_type_traindic, m01_type_testdic,
                                      output_type='site')

In [36]:
# Per-type (0,-1) summary: one row per type with train/test RMSE shown as 'value +/- spread'.
rfr_ipynb.type_site_df('type', [m01_type_rmse_26, m01_type_rmse_35, m01_type_rmse_44], folds=5)

Unnamed: 0,output,train rmse,test rmse
0,II-VI,0.26 +/- 0.011,0.42 +/- 0.07
1,III-V,0.23 +/- 0.009,0.39 +/- 0.063
2,IV-IV,0.2 +/- 0.01,0.34 +/- 0.055


In [37]:
# Display the overall (0,-1) RMSE table: one row per fold plus a final summary row.
m01_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.240736,0.328408
1,0.235962,0.387724
2,0.228474,0.448719
3,0.233114,0.405759
4,0.234982,0.365878
5,0.23 +/- 0.004,0.39 +/- 0.045


In [145]:
# rfr_remote.rfr_pp_predictor run with the same arguments as the (0,-1) rfr_predictor
# call; returns the train/test frames that pp_rmse_cal consumes.
m01_traindf, m01_testdf = \
    rfr_remote.rfr_pp_predictor(m0m1_bal_type, d_start=5, max_depth=8, max_feat='auto', 
                                min_samp_leaf=5, min_samples_split=3,
                                num_trees=150, folds=5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [146]:
# Overall train/test RMSE (single row) from the (0,-1) prediction frames.
m01_type25_rmse = pp_rmse_cal(m01_traindf, m01_testdf, output_type='none')
m01_type25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.228891,0.386261


In [147]:
# Per-Type train/test RMSE (II-VI, III-V, IV-IV) from the (0,-1) prediction frames.
m01_type25type_rmse = pp_rmse_cal(m01_traindf, m01_testdf, output_type='type')
m01_type25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.255318,0.428762
0,III-V,0.228607,0.389178
0,IV-IV,0.199309,0.335108


# (-1, -2)

In [38]:
# rfr_remote.rfr_predictor run (folds=5) on the type-balanced (-1,-2) table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
m12_type_traindic, m12_type_testdic= \
    rfr_remote.rfr_predictor(m12_bal_type, d_start=5, max_depth=10, max_feat='auto', 
                             min_samp_leaf=8, min_samples_split=12,
                             num_trees=750, folds=5)

In [39]:
# output_type='none' returns the overall train and test RMSE
m12_type_rmse = rfr_remote.rmse_calculator(m12_type_traindic, m12_type_testdic,
                                      output_type='none')

# output_type='type' returns the train and test RMSE separated by type
# (unpacked here as II-VI = 26, III-V = 35, IV-IV = 44)
m12_type_rmse_26, m12_type_rmse_35, m12_type_rmse_44 = \
    rfr_remote.rmse_calculator(m12_type_traindic, m12_type_testdic,
                                      output_type='type')

# output_type='site' returns the train and test RMSE separated by site
# (unpacked here as the `sub` and `int` groups)
m12_type_rmse_sub, m12_type_rmse_int = \
    rfr_remote.rmse_calculator(m12_type_traindic, m12_type_testdic,
                                      output_type='site')

In [40]:
# Per-type (-1,-2) summary: one row per type with train/test RMSE shown as 'value +/- spread'.
rfr_ipynb.type_site_df('type', [m12_type_rmse_26, m12_type_rmse_35, m12_type_rmse_44], folds=5)

Unnamed: 0,output,train rmse,test rmse
0,II-VI,0.27 +/- 0.009,0.36 +/- 0.066
1,III-V,0.27 +/- 0.007,0.37 +/- 0.044
2,IV-IV,0.22 +/- 0.013,0.31 +/- 0.074


In [41]:
# Display the overall (-1,-2) RMSE table: one row per fold plus a final summary row.
m12_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.263474,0.307459
1,0.263554,0.314294
2,0.248667,0.414343
3,0.249422,0.403763
4,0.259191,0.32226
5,0.26 +/- 0.007,0.35 +/- 0.052


In [148]:
# rfr_remote.rfr_pp_predictor run with the same arguments as the (-1,-2) rfr_predictor
# call; returns the train/test frames that pp_rmse_cal consumes.
m12_traindf, m12_testdf = \
    rfr_remote.rfr_pp_predictor(m12_bal_type, d_start=5, max_depth=10, max_feat='auto', 
                                min_samp_leaf=8, min_samples_split=12,
                                num_trees=750, folds=5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [149]:
# Overall train/test RMSE (single row) from the (-1,-2) prediction frames.
m12_type25_rmse = pp_rmse_cal(m12_traindf, m12_testdf, output_type='none')
m12_type25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.253829,0.354199


In [150]:
# Per-Type train/test RMSE (II-VI, III-V, IV-IV) from the (-1,-2) prediction frames.
m12_type25type_rmse = pp_rmse_cal(m12_traindf, m12_testdf, output_type='type')
m12_type25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.267775,0.37289
0,III-V,0.269111,0.374219
0,IV-IV,0.221728,0.311904


# (-2, -3)

In [42]:
# rfr_remote.rfr_predictor run (folds=5) on the type-balanced (-2,-3) table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
# Unlike the other properties in this notebook, this one uses max_feat='sqrt'.
m23_type_traindic, m23_type_testdic= \
    rfr_remote.rfr_predictor(m23_bal_type, d_start=5, max_depth=10, max_feat='sqrt', 
                             min_samp_leaf=2, min_samples_split=14,
                             num_trees=450, folds=5)

In [43]:
# output_type='none' returns the overall train and test RMSE
m23_type_rmse = rfr_remote.rmse_calculator(m23_type_traindic, m23_type_testdic,
                                      output_type='none')

# output_type='type' returns the train and test RMSE separated by type
# (unpacked here as II-VI = 26, III-V = 35, IV-IV = 44)
m23_type_rmse_26, m23_type_rmse_35, m23_type_rmse_44 = \
    rfr_remote.rmse_calculator(m23_type_traindic, m23_type_testdic,
                                      output_type='type')

# output_type='site' returns the train and test RMSE separated by site
# (unpacked here as the `sub` and `int` groups)
m23_type_rmse_sub, m23_type_rmse_int = \
    rfr_remote.rmse_calculator(m23_type_traindic, m23_type_testdic,
                                      output_type='site')

In [44]:
# Per-type (-2,-3) summary: one row per type with train/test RMSE shown as 'value +/- spread'.
rfr_ipynb.type_site_df('type', [m23_type_rmse_26, m23_type_rmse_35, m23_type_rmse_44], folds=5)

Unnamed: 0,output,train rmse,test rmse
0,II-VI,0.23 +/- 0.008,0.3 +/- 0.057
1,III-V,0.19 +/- 0.016,0.25 +/- 0.069
2,IV-IV,0.17 +/- 0.007,0.23 +/- 0.04


In [45]:
# Display the overall (-2,-3) RMSE table: one row per fold plus a final summary row.
m23_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.202723,0.214469
1,0.198835,0.277896
2,0.181501,0.325089
3,0.198734,0.259607
4,0.195333,0.253196
5,0.2 +/- 0.008,0.27 +/- 0.04


In [151]:
# rfr_remote.rfr_pp_predictor run with the same arguments as the (-2,-3) rfr_predictor
# call (including max_feat='sqrt'); returns the train/test frames that pp_rmse_cal consumes.
m23_traindf, m23_testdf = \
    rfr_remote.rfr_pp_predictor(m23_bal_type, d_start=5, max_depth=10, max_feat='sqrt', 
                             min_samp_leaf=2, min_samples_split=14,
                             num_trees=450, folds=5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [152]:
# Overall train/test RMSE (single row) from the (-2,-3) prediction frames.
m23_type25_rmse = pp_rmse_cal(m23_traindf, m23_testdf, output_type='none')
m23_type25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.192926,0.264484


In [153]:
# Per-Type train/test RMSE (II-VI, III-V, IV-IV) from the (-2,-3) prediction frames.
m23_type25type_rmse = pp_rmse_cal(m23_traindf, m23_testdf, output_type='type')
m23_type25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.224106,0.305995
0,III-V,0.186078,0.25371
0,IV-IV,0.163746,0.227715


# Site

In [155]:
# Load the dHA table from the 'balanced_site' folder; iloc[:, 1:] drops the
# first CSV column (presumably a saved index column — confirm).
ha_bal_site = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_site/Lasso_HA_site.csv')
ha_bal_site=ha_bal_site.iloc[:, 1:]

In [156]:
# Load the dHB table from the 'balanced_site' folder; iloc[:, 1:] drops the
# first CSV column (presumably a saved index column — confirm).
hb_bal_site = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_site/Lasso_HB_site.csv')
hb_bal_site=hb_bal_site.iloc[:, 1:]

In [157]:
# Show the total entry count and the per-Site counts/percentages for the dHA table.
rfr_ipynb.counter(ha_bal_site, 'Site')

Total entries:  1104


Unnamed: 0,Site,percent
M_A,293,26.54
M_B,259,23.46
M_i_A,192,17.39
M_i_neut,183,16.58
M_i_B,177,16.03


In [158]:
# Show the total entry count and the per-Site counts/percentages for the dHB table.
rfr_ipynb.counter(hb_bal_site, 'Site')

Total entries:  1104


Unnamed: 0,Site,percent
M_A,293,26.54
M_B,259,23.46
M_i_A,192,17.39
M_i_neut,183,16.58
M_i_B,177,16.03


In [159]:
# Load the (+3,+2) table from the 'balanced_site' folder and drop its first
# CSV column (presumably a saved index column — confirm).
p32_bal_site = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_site/Lasso_(+3,+2)_site.csv')
p32_bal_site = p32_bal_site.iloc[:, 1:]

In [160]:
# Load the (+2,+1) table from the 'balanced_site' folder and drop its first
# CSV column (presumably a saved index column — confirm).
p21_bal_site = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_site/Lasso_(+2,+1)_site.csv')
p21_bal_site = p21_bal_site.iloc[:, 1:]

In [161]:
# Load the (+1,0) table from the 'balanced_site' folder and drop its first
# CSV column (presumably a saved index column — confirm).
p0p1_bal_site = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_site/Lasso_(+1,0)_site.csv')
p0p1_bal_site = p0p1_bal_site.iloc[:, 1:]

In [162]:
# Load the (0,-1) table from the 'balanced_site' folder and drop its first
# CSV column (presumably a saved index column — confirm).
m0m1_bal_site = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_site/Lasso_(0,-1)_site.csv')
m0m1_bal_site = m0m1_bal_site.iloc[:, 1:]

In [163]:
# Load the (-1,-2) table from the 'balanced_site' folder and drop its first
# CSV column (presumably a saved index column — confirm).
m12_bal_site = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_site/Lasso_(-1,-2)_site.csv')
m12_bal_site = m12_bal_site.iloc[:, 1:]

In [164]:
# Load the (-2,-3) table from the 'balanced_site' folder and drop its first
# CSV column (presumably a saved index column — confirm).
m23_bal_site = pd.read_csv('./xiaofeng_lasso/210928_balanceddata/balanced_site/Lasso_(-2,-3)_site.csv')
m23_bal_site = m23_bal_site.iloc[:, 1:]

In [165]:
# Show the total entry count and the per-Site counts/percentages for the (+3,+2) table.
rfr_ipynb.counter(p32_bal_site, 'Site')

Total entries:  706


Unnamed: 0,Site,percent
M_A,188,26.63
M_B,165,23.37
M_i_A,123,17.42
M_i_B,118,16.71
M_i_neut,112,15.86


In [166]:
# Show the total entry count and the per-Site counts/percentages for the (-2,-3) table.
rfr_ipynb.counter(m23_bal_site, 'Site')

Total entries:  706


Unnamed: 0,Site,percent
M_A,188,26.63
M_B,165,23.37
M_i_A,123,17.42
M_i_B,118,16.71
M_i_neut,112,15.86


# dHA

In [100]:
# rfr_remote.rfr_predictor run (folds=5) on the site-balanced dHA table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
# NOTE(review): these names use 'dHa' while every other dHA variable uses 'dHA'.
# NOTE(review): if max_feat is forwarded to scikit-learn's max_features, 'auto'
# was removed in scikit-learn 1.3 — confirm against rfr_remote.
dHa_site_traindic, dHa_site_testdic= \
    rfr_remote.rfr_predictor(ha_bal_site, d_start=5, max_depth=10, max_feat='auto', 
                             min_samp_leaf=2, min_samples_split=15,
                             num_trees=600, folds=5)

In [101]:
# output_type='none': overall train/test RMSE table
dHA_site_rmse = rfr_remote.rmse_calculator(dHa_site_traindic, dHa_site_testdic,
                                      output_type='none')

# output_type='type': three RMSE tables, unpacked here as II-VI (26), III-V (35), IV-IV (44)
dHA_site_rmse_26, dHA_site_rmse_35, dHA_site_rmse_44 = \
    rfr_remote.rmse_calculator(dHa_site_traindic, dHa_site_testdic,
                                      output_type='type')

# output_type='site': two RMSE tables, unpacked as the `sub` and `int` site groups
dHA_site_rmse_sub, dHA_site_rmse_int = \
    rfr_remote.rmse_calculator(dHa_site_traindic, dHa_site_testdic,
                                      output_type='site')

In [103]:
# Per-site-group dHA summary (rows `sub` and `int`) with train/test RMSE shown as 'value +/- spread'.
rfr_ipynb.type_site_df('site', [dHA_site_rmse_sub, dHA_site_rmse_int], 5)

Unnamed: 0,output,train rmse,test rmse
0,sub,0.7 +/- 0.022,1.22 +/- 0.158
1,int,0.63 +/- 0.013,1.05 +/- 0.137


In [104]:
# Display the overall dHA RMSE table (site-balanced data): one row per fold plus a summary row.
dHA_site_rmse

Unnamed: 0,train rmse,test rmse
0,0.68597,0.933506
1,0.67344,1.1203
2,0.650397,1.22153
3,0.652957,1.27422
4,0.680286,1.15097
5,0.67 +/- 0.016,1.14 +/- 0.13


In [167]:
# rfr_remote.rfr_pp_predictor run on the site-balanced dHA table with the same
# arguments as the rfr_predictor call; returns the frames that pp_rmse_cal consumes.
# NOTE(review): dHA_traindf / dHA_testdf are also assigned in the Type section
# from ha_bal_type; this cell overwrites those results.
dHA_traindf, dHA_testdf = \
    rfr_remote.rfr_pp_predictor(ha_bal_site, d_start=5, max_depth=10, max_feat='auto', 
                                min_samp_leaf=2, min_samples_split=15, num_trees=600,
                                folds=5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [168]:
# Overall train/test RMSE (single row) from the site-balanced dHA prediction frames.
# NOTE(review): `rmse_df` is also used for the type-balanced dHA results earlier.
rmse_df = pp_rmse_cal(dHA_traindf, dHA_testdf, output_type='none')
rmse_df

Unnamed: 0,train_rmse,test_rmse
0,0.655491,1.13783


In [169]:
# Per-Type train/test RMSE (II-VI, III-V, IV-IV) computed on the site-balanced dHA frames.
# NOTE(review): overwrites the overall `rmse_df` computed in the previous cell.
rmse_df = pp_rmse_cal(dHA_traindf, dHA_testdf, output_type='type')
rmse_df

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.443271,0.712512
0,III-V,0.898978,1.620797
0,IV-IV,1.129222,2.0243


# dHB

In [117]:
# rfr_remote.rfr_predictor run (folds=5) on the site-balanced dHB table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
dHB_site_traindic, dHB_site_testdic= \
    rfr_remote.rfr_predictor(hb_bal_site, d_start=5,  max_depth=10, max_feat='auto', 
                             min_samp_leaf=3, min_samples_split=15,
                             num_trees=1000, folds=5)

In [118]:
# output_type='none': overall train/test RMSE table
dHB_site_rmse = rfr_remote.rmse_calculator(dHB_site_traindic, dHB_site_testdic,
                                      output_type='none')

# output_type='type': three RMSE tables, unpacked here as II-VI (26), III-V (35), IV-IV (44)
dHB_site_rmse_26, dHB_site_rmse_35, dHB_site_rmse_44 = \
    rfr_remote.rmse_calculator(dHB_site_traindic, dHB_site_testdic,
                                      output_type='type')

# output_type='site': two RMSE tables, unpacked as the `sub` and `int` site groups
dHB_site_rmse_sub, dHB_site_rmse_int = \
    rfr_remote.rmse_calculator(dHB_site_traindic, dHB_site_testdic,
                                      output_type='site')

In [119]:
# Per-site-group dHB summary (rows `sub` and `int`) with train/test RMSE shown as 'value +/- spread'.
rfr_ipynb.type_site_df('site', [dHB_site_rmse_sub, dHB_site_rmse_int], 5)

Unnamed: 0,output,train rmse,test rmse
0,sub,0.77 +/- 0.019,1.34 +/- 0.193
1,int,0.71 +/- 0.011,1.15 +/- 0.139


In [120]:
# Display the overall dHB RMSE table (site-balanced data): one row per fold plus a summary row.
dHB_site_rmse

Unnamed: 0,train rmse,test rmse
0,0.748585,1.03517
1,0.745533,1.35248
2,0.731314,1.33357
3,0.722017,1.40583
4,0.750797,1.10511
5,0.74 +/- 0.012,1.25 +/- 0.165


In [170]:
# rfr_remote.rfr_pp_predictor run on the site-balanced dHB table with the same
# arguments as the rfr_predictor call; returns the frames that pp_rmse_cal consumes.
# NOTE(review): dHB_traindf / dHB_testdf are also assigned in the Type section
# from hb_bal_type; this cell overwrites those results.
dHB_traindf, dHB_testdf = \
    rfr_remote.rfr_pp_predictor(hb_bal_site, d_start=5,  max_depth=10, max_feat='auto', 
                                min_samp_leaf=3, min_samples_split=15,
                                num_trees=1000, folds=5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [171]:
# Overall train/test RMSE (single row) from the site-balanced dHB prediction frames.
dHB_site25_rmse = pp_rmse_cal(dHB_traindf, dHB_testdf, output_type='none')
dHB_site25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.725188,1.233472


In [172]:
# Per-Type train/test RMSE (II-VI, III-V, IV-IV) computed on the site-balanced dHB frames.
dHB_site25type_rmse = pp_rmse_cal(dHB_traindf, dHB_testdf, output_type='type')
dHB_site25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.550353,0.911406
0,III-V,0.97314,1.709847
0,IV-IV,1.131491,1.941288


# (+3, +2)

In [121]:
# rfr_remote.rfr_predictor run (folds=5) on the site-balanced (+3,+2) table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
p32_site_traindic, p32_site_testdic= \
    rfr_remote.rfr_predictor(p32_bal_site, d_start=5, max_depth=7, max_feat='auto', 
                             min_samp_leaf=3, min_samples_split=3,
                             num_trees=450, folds=5)

In [122]:
# output_type='none' returns the overall train and test RMSE
p32_site_rmse = rfr_remote.rmse_calculator(p32_site_traindic, p32_site_testdic,
                                      output_type='none')

# output_type='type' returns the train and test RMSE separated by type
# (unpacked here as II-VI = 26, III-V = 35, IV-IV = 44)
p32_site_rmse_26, p32_site_rmse_35, p32_site_rmse_44 = \
    rfr_remote.rmse_calculator(p32_site_traindic, p32_site_testdic,
                                      output_type='type')

# output_type='site' returns the train and test RMSE separated by site
# (unpacked here as the `sub` and `int` groups)
p32_site_rmse_sub, p32_site_rmse_int = \
    rfr_remote.rmse_calculator(p32_site_traindic, p32_site_testdic,
                                      output_type='site')

In [123]:
# Per-site-group (+3,+2) summary (rows `sub` and `int`) with train/test RMSE shown as 'value +/- spread'.
rfr_ipynb.type_site_df('site', [p32_site_rmse_sub, p32_site_rmse_int], 5)

Unnamed: 0,output,train rmse,test rmse
0,sub,0.19 +/- 0.009,0.34 +/- 0.042
1,int,0.21 +/- 0.008,0.35 +/- 0.026


In [124]:
# Display the overall (+3,+2) RMSE table (site-balanced data): one row per fold plus a summary row.
p32_site_rmse

Unnamed: 0,train rmse,test rmse
0,0.207204,0.329996
1,0.195545,0.362601
2,0.197319,0.37699
3,0.203156,0.334964
4,0.206276,0.318563
5,0.2 +/- 0.005,0.34 +/- 0.024


In [173]:
# rfr_remote.rfr_pp_predictor run on the site-balanced (+3,+2) table with the same
# arguments as the rfr_predictor call; returns the frames that pp_rmse_cal consumes.
# NOTE(review): p32_traindf / p32_testdf are also assigned in the Type section
# from p32_bal_type; this cell overwrites those results.
p32_traindf, p32_testdf = \
    rfr_remote.rfr_pp_predictor(p32_bal_site, d_start=5, max_depth=7, max_feat='auto', 
                                min_samp_leaf=3, min_samples_split=3,
                                num_trees=450, folds=5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [174]:
# Overall train/test RMSE (single row) from the site-balanced (+3,+2) prediction frames.
p32_site25_rmse = pp_rmse_cal(p32_traindf, p32_testdf, output_type='none')
p32_site25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.197885,0.337541


In [175]:
# Per-Type train/test RMSE (II-VI, III-V, IV-IV) computed on the site-balanced (+3,+2) frames.
p32_site25type_rmse = pp_rmse_cal(p32_traindf, p32_testdf, output_type='type')
p32_site25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.186732,0.312872
0,III-V,0.219253,0.380436
0,IV-IV,0.200484,0.34666


# (+2, +1)

In [125]:
# rfr_remote.rfr_predictor run (folds=5) on the site-balanced (+2,+1) table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
p21_site_traindic, p21_site_testdic= \
    rfr_remote.rfr_predictor(p21_bal_site, d_start=5, max_depth=9, max_feat='auto', 
                             min_samp_leaf=7, min_samples_split=2,
                             num_trees=600, folds=5)

In [126]:
# output_type='none' returns the overall train and test RMSE
p21_site_rmse = rfr_remote.rmse_calculator(p21_site_traindic, p21_site_testdic,
                                      output_type='none')

# output_type='type' returns the train and test RMSE separated by type
# (unpacked here as II-VI = 26, III-V = 35, IV-IV = 44)
p21_site_rmse_26, p21_site_rmse_35, p21_site_rmse_44 = \
    rfr_remote.rmse_calculator(p21_site_traindic, p21_site_testdic,
                                      output_type='type')

# output_type='site' returns the train and test RMSE separated by site
# (unpacked here as the `sub` and `int` groups)
p21_site_rmse_sub, p21_site_rmse_int = \
    rfr_remote.rmse_calculator(p21_site_traindic, p21_site_testdic,
                                      output_type='site')

In [128]:
# Per-site-group (+2,+1) summary (rows `sub` and `int`) with train/test RMSE shown as 'value +/- spread'.
rfr_ipynb.type_site_df('site', [p21_site_rmse_sub, p21_site_rmse_int], 5)

Unnamed: 0,output,train rmse,test rmse
0,sub,0.28 +/- 0.004,0.43 +/- 0.013
1,int,0.26 +/- 0.01,0.37 +/- 0.056


In [127]:
# Display the overall (+2,+1) RMSE table (site-balanced data): one row per fold plus a summary row.
p21_site_rmse

Unnamed: 0,train rmse,test rmse
0,0.271401,0.403144
1,0.26428,0.446676
2,0.272144,0.397924
3,0.27322,0.381705
4,0.274663,0.394668
5,0.27 +/- 0.004,0.4 +/- 0.025


In [176]:
# rfr_remote.rfr_pp_predictor run on the site-balanced (+2,+1) table with the same
# arguments as the rfr_predictor call; returns the frames that pp_rmse_cal consumes.
# NOTE(review): p21_traindf / p21_testdf are also assigned in the Type section
# from p21_bal_type; this cell overwrites those results.
p21_traindf, p21_testdf = \
    rfr_remote.rfr_pp_predictor(p21_bal_site, d_start=5, max_depth=9, max_feat='auto', 
                                min_samp_leaf=7, min_samples_split=2,
                                num_trees=600, folds=5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [177]:
# Overall train/test RMSE (single row) from the site-balanced (+2,+1) prediction frames.
p21_site25_rmse = pp_rmse_cal(p21_traindf, p21_testdf, output_type='none')
p21_site25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.265638,0.406083


In [178]:
# Per-Type train/test RMSE (II-VI, III-V, IV-IV) computed on the site-balanced (+2,+1) frames.
p21_site25type_rmse = pp_rmse_cal(p21_traindf, p21_testdf, output_type='type')
p21_site25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.244884,0.372875
0,III-V,0.28877,0.44808
0,IV-IV,0.285785,0.433337


# (+1, 0)

In [179]:
# rfr_remote.rfr_predictor run (folds=5) on the site-balanced (+1,0) table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
p01_site_traindic, p01_site_testdic= \
    rfr_remote.rfr_predictor(p0p1_bal_site, d_start=5, max_depth=10, max_feat='auto', 
                             min_samp_leaf=8, min_samples_split=12,
                             num_trees=150, folds=5)

In [180]:
# output_type='none' returns the overall train and test RMSE
p01_site_rmse = rfr_remote.rmse_calculator(p01_site_traindic, p01_site_testdic,
                                      output_type='none')

# output_type='type' returns the train and test RMSE separated by type
# (unpacked here as II-VI = 26, III-V = 35, IV-IV = 44)
p01_site_rmse_26, p01_site_rmse_35, p01_site_rmse_44 = \
    rfr_remote.rmse_calculator(p01_site_traindic, p01_site_testdic,
                                      output_type='type')

# output_type='site' returns the train and test RMSE separated by site
# (unpacked here as the `sub` and `int` groups)
p01_site_rmse_sub, p01_site_rmse_int = \
    rfr_remote.rmse_calculator(p01_site_traindic, p01_site_testdic,
                                      output_type='site')

In [181]:
# Per-site-group (+1,0) summary (rows `sub` and `int`) with train/test RMSE shown as 'value +/- spread'.
rfr_ipynb.type_site_df('site', [p01_site_rmse_sub, p01_site_rmse_int], 5)

Unnamed: 0,output,train rmse,test rmse
0,sub,0.33 +/- 0.004,0.49 +/- 0.04
1,int,0.22 +/- 0.008,0.31 +/- 0.045


In [182]:
# Display the overall (+1,0) RMSE table (site-balanced data): one row per fold plus a summary row.
p01_site_rmse

Unnamed: 0,train rmse,test rmse
0,0.27962,0.417043
1,0.276755,0.412451
2,0.279728,0.428631
3,0.284275,0.376426
4,0.280035,0.407548
5,0.28 +/- 0.003,0.41 +/- 0.02


In [183]:
# rfr_remote.rfr_pp_predictor run on the site-balanced (+1,0) table with the same
# arguments as the rfr_predictor call; returns the frames that pp_rmse_cal consumes.
# NOTE(review): p01_traindf / p01_testdf are also assigned in the Type section
# from p0p1_bal_type; this cell overwrites those results.
p01_traindf, p01_testdf = \
    rfr_remote.rfr_pp_predictor(p0p1_bal_site, d_start=5, max_depth=10, max_feat='auto', 
                                min_samp_leaf=8, min_samples_split=12,
                                num_trees=150, folds=5)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [184]:
# Overall train/test RMSE (single row) from the site-balanced (+1,0) prediction frames.
p01_site25_rmse = pp_rmse_cal(p01_traindf, p01_testdf, output_type='none')
p01_site25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.275257,0.409936


In [185]:
# Per-Type train/test RMSE (II-VI, III-V, IV-IV) computed on the site-balanced (+1,0) frames.
p01_site25type_rmse = pp_rmse_cal(p01_traindf, p01_testdf, output_type='type')
p01_site25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.27249,0.40179
0,III-V,0.275609,0.419125
0,IV-IV,0.280858,0.418734


# (0, -1)

In [133]:
# rfr_remote.rfr_predictor run (folds=5) on the site-balanced (0,-1) table; the two
# returned objects are passed to rfr_remote.rmse_calculator.
m01_site_traindic, m01_site_testdic= \
    rfr_remote.rfr_predictor(m0m1_bal_site, d_start=5, max_depth=8, max_feat='auto', 
                             min_samp_leaf=5, min_samples_split=3,
                             num_trees=150, folds=5)

In [134]:
# With output_type='none' the call returns the overall RMSE of train and test.
m01_site_rmse = rfr_remote.rmse_calculator(
    m01_site_traindic, m01_site_testdic, output_type='none'
)

# With output_type='type' the RMSE of train and test is split by type; the
# three results are unpacked into the _26, _35 and _44 names.
(m01_site_rmse_26,
 m01_site_rmse_35,
 m01_site_rmse_44) = rfr_remote.rmse_calculator(
    m01_site_traindic, m01_site_testdic, output_type='type'
)

# With output_type='site' the RMSE of train and test is split by site; the
# two results are unpacked into the _sub and _int names.
m01_site_rmse_sub, m01_site_rmse_int = rfr_remote.rmse_calculator(
    m01_site_traindic, m01_site_testdic, output_type='site'
)

In [135]:
# Build the per-site RMSE summary table from the sub and int results.
# NOTE(review): the trailing 5 is presumably the fold count - confirm against
# rfr_ipynb.type_site_df.
rfr_ipynb.type_site_df(
    'site',
    [m01_site_rmse_sub, m01_site_rmse_int],
    5,
)

Unnamed: 0,output,train rmse,test rmse
0,sub,0.28 +/- 0.01,0.47 +/- 0.045
1,int,0.17 +/- 0.007,0.27 +/- 0.049


In [136]:
# Show the overall (output_type='none') RMSE table. In the rendered output the
# last row appears to be a "value +/- spread" summary of the rows above it.
m01_site_rmse

Unnamed: 0,train rmse,test rmse
0,0.22691,0.404546
1,0.236385,0.369586
2,0.226044,0.400023
3,0.224632,0.391906
4,0.238724,0.354007
5,0.23 +/- 0.007,0.38 +/- 0.021


In [186]:
# Run rfr_remote.rfr_pp_predictor on the m0m1 balanced-site data with 5 folds;
# the two returned objects are scored with pp_rmse_cal in later cells.
# NOTE(review): if max_feat is forwarded to scikit-learn's max_features,
# 'auto' was deprecated in 1.1 and removed in 1.3 - confirm before upgrading.
m01_traindf, m01_testdf = rfr_remote.rfr_pp_predictor(
    m0m1_bal_site,
    d_start=5,
    max_depth=8,
    max_feat='auto',
    min_samp_leaf=5,
    min_samples_split=3,
    num_trees=150,
    folds=5,
)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [187]:
# Overall train/test RMSE for the m01 predictions: with output_type='none'
# pp_rmse_cal returns a single-row frame (train_rmse, test_rmse).
m01_site25_rmse = pp_rmse_cal(
    m01_traindf, m01_testdf, output_type='none'
)
m01_site25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.226249,0.384558


In [188]:
# RMSE broken down by Type: pp_rmse_cal concatenates one single-row frame each
# for II-VI, III-V and IV-IV, so every row keeps index label 0.
m01_site25type_rmse = pp_rmse_cal(
    m01_traindf, m01_testdf, output_type='type'
)
m01_site25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.223153,0.378418
0,III-V,0.246057,0.420274
0,IV-IV,0.213334,0.362397


# (-1, -2)

In [137]:
# Run rfr_remote.rfr_predictor on the m12 balanced-site data with 5 folds;
# the returned train/test objects feed rmse_calculator in the next cell.
# NOTE(review): if max_feat is forwarded to scikit-learn's max_features,
# 'auto' was deprecated in 1.1 and removed in 1.3 - confirm before upgrading.
m12_site_traindic, m12_site_testdic = rfr_remote.rfr_predictor(
    m12_bal_site,
    d_start=5,
    max_depth=10,
    max_feat='auto',
    min_samp_leaf=8,
    min_samples_split=12,
    num_trees=750,
    folds=5,
)

In [138]:
# With output_type='none' the call returns the overall RMSE of train and test.
m12_site_rmse = rfr_remote.rmse_calculator(
    m12_site_traindic, m12_site_testdic, output_type='none'
)

# With output_type='type' the RMSE of train and test is split by type; the
# three results are unpacked into the _26, _35 and _44 names.
(m12_site_rmse_26,
 m12_site_rmse_35,
 m12_site_rmse_44) = rfr_remote.rmse_calculator(
    m12_site_traindic, m12_site_testdic, output_type='type'
)

# With output_type='site' the RMSE of train and test is split by site; the
# two results are unpacked into the _sub and _int names.
m12_site_rmse_sub, m12_site_rmse_int = rfr_remote.rmse_calculator(
    m12_site_traindic, m12_site_testdic, output_type='site'
)

In [139]:
# Build the per-site RMSE summary table from the sub and int results.
# NOTE(review): the trailing 5 is presumably the fold count - confirm against
# rfr_ipynb.type_site_df.
rfr_ipynb.type_site_df(
    'site',
    [m12_site_rmse_sub, m12_site_rmse_int],
    5,
)

Unnamed: 0,output,train rmse,test rmse
0,sub,0.28 +/- 0.008,0.39 +/- 0.057
1,int,0.22 +/- 0.006,0.31 +/- 0.051


In [140]:
# Show the overall (output_type='none') RMSE table. In the rendered output the
# last row appears to be a "value +/- spread" summary of the rows above it.
m12_site_rmse

Unnamed: 0,train rmse,test rmse
0,0.242909,0.387552
1,0.245115,0.393961
2,0.24933,0.347367
3,0.25437,0.319132
4,0.255056,0.317612
5,0.25 +/- 0.005,0.35 +/- 0.036


In [189]:
# Run rfr_remote.rfr_pp_predictor on the m12 balanced-site data with 5 folds;
# the two returned objects are scored with pp_rmse_cal in later cells.
# NOTE(review): if max_feat is forwarded to scikit-learn's max_features,
# 'auto' was deprecated in 1.1 and removed in 1.3 - confirm before upgrading.
m12_traindf, m12_testdf = rfr_remote.rfr_pp_predictor(
    m12_bal_site,
    d_start=5,
    max_depth=10,
    max_feat='auto',
    min_samp_leaf=8,
    min_samples_split=12,
    num_trees=750,
    folds=5,
)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [190]:
# Overall train/test RMSE for the m12 predictions: with output_type='none'
# pp_rmse_cal returns a single-row frame (train_rmse, test_rmse).
m12_site25_rmse = pp_rmse_cal(
    m12_traindf, m12_testdf, output_type='none'
)
m12_site25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.246445,0.347018


In [191]:
# RMSE broken down by Type: pp_rmse_cal concatenates one single-row frame each
# for II-VI, III-V and IV-IV, so every row keeps index label 0.
m12_site25type_rmse = pp_rmse_cal(
    m12_traindf, m12_testdf, output_type='type'
)
m12_site25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.241004,0.337175
0,III-V,0.273508,0.396339
0,IV-IV,0.231111,0.317744


# (-2, -3)

In [199]:
# Run rfr_remote.rfr_predictor on the m23 balanced-site data with 5 folds;
# the returned train/test objects feed rmse_calculator in the next cell.
m23_site_traindic, m23_site_testdic = rfr_remote.rfr_predictor(
    m23_bal_site,
    d_start=5,
    max_depth=10,
    max_feat='sqrt',
    min_samp_leaf=2,
    min_samples_split=14,
    num_trees=450,
    folds=5,
)

In [200]:
# With output_type='none' the call returns the overall RMSE of train and test.
m23_site_rmse = rfr_remote.rmse_calculator(
    m23_site_traindic, m23_site_testdic, output_type='none'
)

# With output_type='type' the RMSE of train and test is split by type; the
# three results are unpacked into the _26, _35 and _44 names.
(m23_site_rmse_26,
 m23_site_rmse_35,
 m23_site_rmse_44) = rfr_remote.rmse_calculator(
    m23_site_traindic, m23_site_testdic, output_type='type'
)

# With output_type='site' the RMSE of train and test is split by site; the
# two results are unpacked into the _sub and _int names.
m23_site_rmse_sub, m23_site_rmse_int = rfr_remote.rmse_calculator(
    m23_site_traindic, m23_site_testdic, output_type='site'
)

In [201]:
# Build the per-site RMSE summary table from the sub and int results,
# passing the fold count by keyword.
rfr_ipynb.type_site_df(
    'site',
    [m23_site_rmse_sub, m23_site_rmse_int],
    folds=5,
)

Unnamed: 0,output,train rmse,test rmse
0,sub,0.19 +/- 0.005,0.26 +/- 0.038
1,int,0.19 +/- 0.008,0.25 +/- 0.039


In [202]:
# Show the overall (output_type='none') RMSE table. In the rendered output the
# last row appears to be a "value +/- spread" summary of the rows above it.
m23_site_rmse

Unnamed: 0,train rmse,test rmse
0,0.195068,0.246945
1,0.199802,0.2749
2,0.188098,0.288274
3,0.188892,0.251256
4,0.189819,0.244336
5,0.19 +/- 0.005,0.26 +/- 0.019


In [203]:
# Run rfr_remote.rfr_pp_predictor on the m23 balanced-site data with 5 folds;
# the two returned objects are scored with pp_rmse_cal in later cells.
m23_traindf, m23_testdf = rfr_remote.rfr_pp_predictor(
    m23_bal_site,
    d_start=5,
    max_depth=10,
    max_feat='sqrt',
    min_samp_leaf=2,
    min_samples_split=14,
    num_trees=450,
    folds=5,
)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [204]:
# Overall train/test RMSE for the m23 predictions: with output_type='none'
# pp_rmse_cal returns a single-row frame (train_rmse, test_rmse).
m23_site25_rmse = pp_rmse_cal(
    m23_traindf, m23_testdf, output_type='none'
)
m23_site25_rmse

Unnamed: 0,train_rmse,test_rmse
0,0.188989,0.263424


In [205]:
# RMSE broken down by Type: pp_rmse_cal concatenates one single-row frame each
# for II-VI, III-V and IV-IV, so every row keeps index label 0.
m23_site25type_rmse = pp_rmse_cal(
    m23_traindf, m23_testdf, output_type='type'
)
m23_site25type_rmse

Unnamed: 0,Type,train_rmse,test_rmse
0,II-VI,0.19493,0.26362
0,III-V,0.194593,0.293796
0,IV-IV,0.169579,0.231137
