In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import rfr_remote

%matplotlib inline

In [2]:
from statistics import mean, stdev
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict

from sklearn.inspection import permutation_importance

from tqdm.notebook import tqdm

In [3]:
def stratify_df(df, label_type, label_site):
    '''
    This function modifies the dataframe so that during cross validation
    the data can be split into test/train datasets that are equally stratified
    in "type" and "site" as the original dataframe.

    Inputs
        - df: pandas df. A ML training dataset that contains targets and
        features.
        - label_type: int. column index of "Type" column. Default: 1.
        - label_site int. column index of "Site" column. Dafault: 4.

    Outputs
        - b: pandas series. A newly encoded column that uniquely identifies
        the 15 possible combinations (3 sc types x 5 impurity sites) that a
        datapoint in the set could fal into.
    '''
    labels = df[df.columns[[label_type, label_site]]]

    # encode sc type and site columns, then combine them into a new string col
    # i.e. sctype 1 and site 3 becomes new column of 13 (dtype: string)
    enc = OrdinalEncoder(dtype=np.int)
    a = enc.fit_transform(labels)
    a = pd.DataFrame(a, columns=["SC_Type", "Site"])
    a = a.applymap(str)
    a = a[["SC_Type", "Site"]].apply(lambda x: ''.join(x), axis=1)

    # encode the new string col to 0-14 (15 total classes -
    # 3 sctypes x 5 defsites)
    b = np.array(a).reshape(-1, 1)
    b = enc.fit_transform(b)

    return b

In [4]:
def descriptors_outputs(df, d_start, o):
    '''
    This function splits to dataframe up into separate dataframes of
    descriptors and outputs by column.

    Inputs
        - df: pandas df. A ML training dataset that contains targets and
        features.
        - d_start: int. column index to that the descriptors columns start at.
        In the input df, the descriptors must start at some column at index
        df_start to the last column in the dataframe. Default: 3.
        - o: int. column index of the output. Deafult: 0.
    Outputs
        - X: pandas df. Dataframe with descriptors.
        - y: pandas df. Dataframe with output.
    '''
    X = df[df.columns[d_start:]]
    y = df[df.columns[o]]

    return X, y

In [5]:
def traintest(X, y, train_idx, test_idx):
    '''
    This function splits the descriptors (X) and output (y) points into train
    and test sets. The size of test set depends on the number of folds of CV.

    Inputs
        - X: pandas df. Dataframe with descriptors.
        - y: pandas df. Dataframe with output.
        - train_idx: np array. Indexes of training points.
        - test_idx: np array. Indexes of testing points.

    Outputs
        - X_train: np array. descriptor values of training data set.
        - X_test: np array. descriptor values of test data set.
        - y_train: np array. output values of training data set.
        - y_test: np array. output values of test data set.
    '''
    # train_idx and test_idx come from skf.split
    X_train, X_test = X.iloc[list(train_idx)], X.iloc[list(test_idx)]
    y_train, y_test = y.iloc[list(train_idx)], y.iloc[list(test_idx)]

    return X_train, X_test, y_train, y_test

In [6]:
def fit_predict(X_train, y_train, X_test, clf):
    '''
    This function fits the training X/y data using the RFR model. Then makes a
    train and test prediction of the target value for each point, using the
    descriptors of training and testing. For each fold of the cross validation,
    the training and testing sets will change.

    Inputs
        - X_train: np array. descriptor values of training data set.
        - y_train: np array. output values of training data set.
        - X_test: np array. descriptor values of test data set.
        - clf: RandomForestRegressor from sklearn

    Outputs
        - trainpred: np array. predicted output value for every point in the
        train data set.
        - testpred: np array. predicted output value for every point in the
        test data set.
    '''
    # fit all the training data
    clf.fit(X_train, y_train)

    # predict on training data and testing data based on fit model
    trainpred = clf.predict(X_train)
    testpred = clf.predict(X_test)

    return trainpred, testpred,

# Hyperparameter Tuning
To understand if Bayesian Optimization was the best method to find hyperparameters, I compared against finding hyperparameters via Grid Search. Grid Search was broken down into 2 parts: first a “random search” using the sklearn library `RandomizedSearchCV` to investigate a large hyperparameter space, followed by `GridSearchCV` to more finely tune hyperparameters based on best results from the random search. 

Running the model using the best hyperparameters from step 1 and step 2 found that for the formation enthalpies the test RMSE slightly improved at the expense of large model overfitting. I assume this is because it is hard to mitigate overfitting using the GridSearch functionality, unlike BO where loss could be adjusted to steer away from overfitting.

For transition energy level predictions, the model was worse at predicting and overfitting became more extreme. Presumably due to reasons listed above.

### Random hyperparameter grid
Create a grid of the parameter space we want to explore. For RFR the most important parameters are:
- n_estimators: number of trees in the forest 
- max_depth: number of leaves in each decision tree
- min_sammples_split: minimum number of datapoints in a node before the node splits
- min_samples_leaf: minimum number of data points allowed in a leaf node

In [7]:
def RSCV_grid():

    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 19)]

    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(3, 10, num = 8)]
    max_depth.append(None)

    # Minimum number of samples required to split a node
    min_samples_split = [int(x) for x in np.linspace(2, 25, num = 24)]

    # Minimum number of samples required at each leaf node
    min_samples_leaf = [int(x) for x in np.linspace(1, 25, num = 25)]
    
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt', 'log2']

    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'max_features': max_features}
    
    return random_grid

In [8]:
RSCV_grid()

{'n_estimators': [100,
  150,
  200,
  250,
  300,
  350,
  400,
  450,
  500,
  550,
  600,
  650,
  700,
  750,
  800,
  850,
  900,
  950,
  1000],
 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, None],
 'min_samples_split': [2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25],
 'min_samples_leaf': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25],
 'max_features': ['auto', 'sqrt', 'log2']}

### Random Search Training
``RandomizedSearchCV``: randomized search on hyper parameters. Important arguements to set are n_iter and cv.
- n_iter: number of different combinations to try
- cv: number of folds to use for cross validation
In this example the model is going to ty 100 different combinations from the random grid, wih a 5-fold cross validation (ie 500 combinations).

Similar to other scikit-learn models:
- initiate the model (random search) with ``RandomizedSearchCV``
- fit the model

In [9]:
#RESULTS = {}
from sklearn.model_selection import train_test_split

def rfr_randomhyper(df, o=0, d_start=5, label_type=1, label_site=4):
    
    # make the dataframe stratifiable by type and site
    b = stratify_df(df, label_type, label_site)

    # identify the descriptor columns and output column
    X, y = descriptors_outputs(df, d_start, o)

        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=130)
    
    clf = RandomForestRegressor()
    random_grid = RSCV_grid()
        
    clf_random = RandomizedSearchCV(estimator=clf, param_distributions=random_grid,
                               n_iter=100, cv=5, verbose=2, scoring='neg_root_mean_squared_error',
                                random_state=130, n_jobs=-1)
    
    #clf_random.fit(X_train, y_train)
    
    #print('Best params:', clf_random.best_params_)
    params_dict = pd.DataFrame(clf_random.fit(X_train, y_train).cv_results_)
    pd.set_option('display.max_colwidth', None)
    params_dict = params_dict[['params', 'rank_test_score']]
    params_dict.sort_values(by=['rank_test_score'], inplace=True)
    
    return params_dict.head(10)

### Hyperparameter Tuning based on random search

``GridSearchCV``: Exhaustive search over specified parameter values for an estimator. Important arguments are param_grid and scoring.

- param_grid: this comes from the tops returns from `RandomSearchCV`, and was hand picked
- scoring: to keep consistent with error metrics RMSE was selected as the scoring.

In [10]:
from sklearn.model_selection import GridSearchCV
def rfr_gridsearch(df, grid, o=0, d_start=5, label_type=1, label_site=4):
    
    # make the dataframe stratifiable by type and site
    b = stratify_df(df, label_type, label_site)

    # identify the descriptor columns and output column
    X, y = descriptors_outputs(df, d_start, o)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=130)
    
    clf = RandomForestRegressor()
    random_grid = RSCV_grid()
        
    clf_grid = GridSearchCV(estimator=clf, param_grid=grid, cv=5, verbose=2, 
                              scoring='neg_root_mean_squared_error', n_jobs=-1)
    
    #print('Best params:', clf_random.best_params_)
    bestparams_dict = pd.DataFrame(clf_grid.fit(X_train, y_train).cv_results_)
    pd.set_option('display.max_colwidth', None)
    bestparams_dict = bestparams_dict[['params', 'rank_test_score']]
    bestparams_dict.sort_values(by=['rank_test_score'], inplace=True)
    
    return bestparams_dict.head(10)

# dHA

In [11]:
lasso_a = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_HA_7.7.csv')

In [13]:
lassoa_randomrank = rfr_randomhyper(lasso_a, o=0, d_start=5, label_type=1, label_site=4)
lassoa_randomrank

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 11.5min finished


Unnamed: 0,params,rank_test_score
30,"{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}",1
31,"{'n_estimators': 350, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",2
32,"{'n_estimators': 900, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10}",3
36,"{'n_estimators': 650, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",4
68,"{'n_estimators': 150, 'min_samples_split': 13, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}",5
2,"{'n_estimators': 350, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10}",6
16,"{'n_estimators': 950, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10}",7
56,"{'n_estimators': 450, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10}",8
24,"{'n_estimators': 250, 'min_samples_split': 20, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 10}",9
76,"{'n_estimators': 900, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 7}",10


In [14]:
ha_type_traindic, ha_type_testdic= \
    rfr_remote.rfr_predictor(lasso_a, d_start=5, max_depth=None, max_feat='sqrt', 
                             min_samp_leaf=1, min_samples_split=5,
                             num_trees=500, folds=5)

In [15]:
dHA_type_rmse = rfr_remote.rmse_calculator(ha_type_traindic, ha_type_testdic,
                                      output_type='none')
dHA_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.452182,1.07308
1,0.450778,0.998611
2,0.448383,0.985356
3,0.444303,1.05966
4,0.448934,0.992908
5,0.45 +/- 0.003,1.02 +/- 0.041


---

In [16]:
n_estimators = [150, 350, 500, 650]
min_samples_split = [5,10,14]
min_samples_leaf = [1,4]
max_features = ['sqrt', 'auto']
max_depth = [7, 10, None]

grid = {'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features}

In [17]:
lassoa_gridrank = rfr_gridsearch(lasso_a, grid=grid, o=0, d_start=5, label_type=1, label_site=4)
lassoa_gridrank

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 23.4min finished


Unnamed: 0,params,rank_test_score
122,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}",1
121,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 350}",2
123,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 650}",3
120,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}",4
98,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}",5
99,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 650}",6
96,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}",7
75,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 650}",8
97,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 350}",9
73,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 350}",10


In [18]:
ha_type_traindic, ha_type_testdic= \
    rfr_remote.rfr_predictor(lasso_a, d_start=5, max_depth=None, max_feat='auto', 
                             min_samp_leaf=1, min_samples_split=5,
                             num_trees=650, folds=5)

In [19]:
dHA_type_rmse = rfr_remote.rmse_calculator(ha_type_traindic, ha_type_testdic,
                                      output_type='none')
dHA_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.425547,1.06144
1,0.4185,1.03103
2,0.417252,1.01372
3,0.417933,1.00917
4,0.425124,1.00847
5,0.42 +/- 0.004,1.02 +/- 0.022


# dHB

In [20]:
lasso_b = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_HB_7.7.csv')

In [21]:
lassob_randomrank = rfr_randomhyper(lasso_b, o=0, d_start=5, label_type=1, label_site=4)
lassob_randomrank

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 14.9min finished


Unnamed: 0,params,rank_test_score
30,"{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}",1
31,"{'n_estimators': 350, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",2
2,"{'n_estimators': 350, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10}",3
36,"{'n_estimators': 650, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",4
32,"{'n_estimators': 900, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10}",5
56,"{'n_estimators': 450, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10}",6
68,"{'n_estimators': 150, 'min_samples_split': 13, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}",7
16,"{'n_estimators': 950, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10}",8
59,"{'n_estimators': 700, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 9}",9
19,"{'n_estimators': 250, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10}",10


In [22]:
dHB_type_traindic, dHB_type_testdic= \
    rfr_remote.rfr_predictor(lasso_b, d_start=5,  max_depth=None, max_feat='sqrt', 
                             min_samp_leaf=1, min_samples_split=5,
                             num_trees=500, folds=5)

same random grid and grid search

In [23]:
dHB_type_rmse = rfr_remote.rmse_calculator(dHB_type_traindic, dHB_type_testdic,
                                      output_type='none')
dHB_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.492276,1.22625
1,0.501899,1.14736
2,0.498763,1.1078
3,0.496127,1.07631
4,0.494244,1.10246
5,0.5 +/- 0.004,1.13 +/- 0.058


---

In [24]:
n_estimators = [350, 500, 650]
min_samples_split = [5,10,9]
min_samples_leaf = [1,4]
max_features = ['sqrt', 'auto']
max_depth = [10, None]

grid_b = {'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features}

In [25]:
lassob_gridrank = rfr_gridsearch(lasso_b, grid=grid_b, o=0, d_start=5, label_type=1, label_site=4)
lassob_gridrank

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 17.2min finished


Unnamed: 0,params,rank_test_score
36,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 350}",1
37,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}",2
38,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 650}",3
55,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}",4
56,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 650}",5
54,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 350}",6
44,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 650}",7
43,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 500}",8
61,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 500}",9
0,"{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 350}",10


# (+3, +2)

In [26]:
lasso_p32 = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_(+3,+2)_7.7.csv')

In [27]:
lassop32_randomrank = rfr_randomhyper(lasso_p32, o=0, d_start=5, label_type=1, label_site=4)
lassop32_randomrank

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.5min finished


Unnamed: 0,params,rank_test_score
32,"{'n_estimators': 900, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10}",1
31,"{'n_estimators': 350, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",2
30,"{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}",3
76,"{'n_estimators': 900, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 7}",4
36,"{'n_estimators': 650, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",5
16,"{'n_estimators': 950, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10}",6
68,"{'n_estimators': 150, 'min_samples_split': 13, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}",7
2,"{'n_estimators': 350, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10}",8
59,"{'n_estimators': 700, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 9}",9
24,"{'n_estimators': 250, 'min_samples_split': 20, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 10}",10


In [28]:
p32_type_traindic, p32_type_testdic= \
    rfr_remote.rfr_predictor(lasso_p32, d_start=5, max_depth=10, max_feat='auto', 
                             min_samp_leaf=1, min_samples_split=10,
                             num_trees=350, folds=5)

In [29]:
p32_type_rmse = rfr_remote.rmse_calculator(p32_type_traindic, p32_type_testdic,
                                      output_type='none')
p32_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.19833,0.330443
1,0.187563,0.420678
2,0.193376,0.370342
3,0.206897,0.319355
4,0.193603,0.306783
5,0.2 +/- 0.007,0.35 +/- 0.046


---

In [30]:
n_estimators = [350, 650, 900]
min_samples_split = [3,10,14]
min_samples_leaf = [1,3,4]
max_features = ['auto']
max_depth = [7,10, None]

grid_p32 = {'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features}

In [32]:
lassop32_gridrank = rfr_gridsearch(lasso_p32, grid=grid_p32, o=0, d_start=5, label_type=1, label_site=4)
lassop32_gridrank

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   59.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 15.2min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 18.0min finished


Unnamed: 0,params,rank_test_score
55,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 650}",1
56,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 900}",2
64,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 650}",3
65,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 900}",4
54,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 350}",5
28,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 650}",6
37,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 650}",7
38,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 900}",8
63,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 350}",9
29,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 900}",10


In [33]:
p32_type_traindic, p32_type_testdic= \
    rfr_remote.rfr_predictor(lasso_p32, d_start=5, max_depth=None, max_feat='auto', 
                             min_samp_leaf=3, min_samples_split=3,
                             num_trees=650, folds=5)

In [34]:
p32_type_rmse = rfr_remote.rmse_calculator(p32_type_traindic, p32_type_testdic,
                                      output_type='none')
p32_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.187279,0.328132
1,0.1755,0.41998
2,0.178084,0.36501
3,0.187487,0.317696
4,0.184851,0.30182
5,0.18 +/- 0.006,0.35 +/- 0.047


# (+2, +1)

In [35]:
lasso_p21 = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_(+2,+1)_7.7.csv')

In [36]:
lassop21_randomrank = rfr_randomhyper(lasso_p21, o=0, d_start=5, label_type=1, label_site=4)
lassop21_randomrank

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.6min finished


Unnamed: 0,params,rank_test_score
36,"{'n_estimators': 650, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",1
76,"{'n_estimators': 900, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 7}",2
31,"{'n_estimators': 350, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",3
32,"{'n_estimators': 900, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10}",4
30,"{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}",5
16,"{'n_estimators': 950, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10}",6
24,"{'n_estimators': 250, 'min_samples_split': 20, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 10}",7
2,"{'n_estimators': 350, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10}",8
99,"{'n_estimators': 450, 'min_samples_split': 12, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 6}",9
66,"{'n_estimators': 400, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 8}",10


In [37]:
p21_type_traindic, p21_type_testdic= \
    rfr_remote.rfr_predictor(lasso_p21, d_start=5, max_depth=7, max_feat='auto', 
                             min_samp_leaf=3, min_samples_split=3,
                             num_trees=900, folds=5)

In [38]:
p21_type_rmse = rfr_remote.rmse_calculator(p21_type_traindic, p21_type_testdic,
                                      output_type='none')
p21_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.23519,0.375721
1,0.233175,0.431633
2,0.240335,0.386684
3,0.238358,0.371033
4,0.241886,0.32407
5,0.24 +/- 0.004,0.38 +/- 0.038


---

In [39]:
n_estimators = [350, 500, 900]
min_samples_split = [3,10,14]
min_samples_leaf = [1,3]
max_features = ['auto', 'sqrt']
max_depth = [7,10]

grid_p21 = {'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features}

In [40]:
lassop21_gridrank = rfr_gridsearch(lasso_p21, grid=grid_p21, o=0, d_start=5, label_type=1, label_site=4)
lassop21_gridrank

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  5.3min finished


Unnamed: 0,params,rank_test_score
38,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 900}",1
46,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 500}",2
45,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 350}",3
37,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 500}",4
36,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 350}",5
47,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 900}",6
39,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 350}",7
50,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 900}",8
40,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}",9
41,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 900}",10


In [41]:
p21_type_traindic, p21_type_testdic= \
    rfr_remote.rfr_predictor(lasso_p21, d_start=5, max_depth=10, max_feat='auto', 
                             min_samp_leaf=1, min_samples_split=3,
                             num_trees=900, folds=5)

In [42]:
p21_type_rmse = rfr_remote.rmse_calculator(p21_type_traindic, p21_type_testdic,
                                      output_type='none')
p21_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.166029,0.368382
1,0.163294,0.424737
2,0.169615,0.378734
3,0.168901,0.379908
4,0.170911,0.319882
5,0.17 +/- 0.003,0.37 +/- 0.037


# (+1, 0)

In [43]:
lasso_0p1 = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_(+1,0)_7.7.csv')

In [44]:
lasso0p1_randomrank = rfr_randomhyper(lasso_0p1, o=0, d_start=5, label_type=1, label_site=4)
lasso0p1_randomrank

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.6min finished


Unnamed: 0,params,rank_test_score
31,"{'n_estimators': 350, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",1
30,"{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}",2
32,"{'n_estimators': 900, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10}",3
36,"{'n_estimators': 650, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",4
68,"{'n_estimators': 150, 'min_samples_split': 13, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}",5
76,"{'n_estimators': 900, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 7}",6
16,"{'n_estimators': 950, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10}",7
2,"{'n_estimators': 350, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10}",8
56,"{'n_estimators': 450, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10}",9
24,"{'n_estimators': 250, 'min_samples_split': 20, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 10}",10


In [45]:
p01_type_traindic, p01_type_testdic= \
    rfr_remote.rfr_predictor(lasso_0p1, d_start=5, max_depth=10, max_feat='auto', 
                             min_samp_leaf=1, min_samples_split=10,
                             num_trees=350, folds=5)

In [46]:
p01_type_rmse = rfr_remote.rmse_calculator(p01_type_traindic, p01_type_testdic,
                                      output_type='none')
p01_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.18677,0.350243
1,0.183162,0.406868
2,0.189723,0.355849
3,0.188986,0.359015
4,0.194368,0.336093
5,0.19 +/- 0.004,0.36 +/- 0.027


---

In [47]:
n_estimators = [350, 650, 900]
min_samples_split = [5,10,14]
min_samples_leaf = [1,4]
max_features = ['auto', 'sqrt']
max_depth = [None,10]

grid_0p1 = {'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features}

In [48]:
lasso0p1_gridrank = rfr_gridsearch(lasso_0p1, grid=grid_0p1, o=0, d_start=5, label_type=1, label_site=4)
lasso0p1_gridrank

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   47.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  6.2min finished


Unnamed: 0,params,rank_test_score
2,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 900}",1
1,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 650}",2
0,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 350}",3
37,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 650}",4
36,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 350}",5
38,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 900}",6
5,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 900}",7
4,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 650}",8
41,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 900}",9
39,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 350}",10


In [49]:
p01_type_traindic, p01_type_testdic= \
    rfr_remote.rfr_predictor(lasso_0p1, d_start=5, max_depth=None, max_feat='auto', 
                             min_samp_leaf=1, min_samples_split=5,
                             num_trees=900, folds=5)

In [50]:
p01_type_rmse = rfr_remote.rmse_calculator(p01_type_traindic, p01_type_testdic,
                                      output_type='none')
p01_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.148703,0.348811
1,0.146807,0.40024
2,0.15014,0.351428
3,0.15212,0.349461
4,0.155102,0.33095
5,0.15 +/- 0.003,0.36 +/- 0.026


# (0, -1)

In [51]:
lasso_0m1 = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_(0,-1)_7.7.csv')

In [52]:
lasso0m1_randomrank = rfr_randomhyper(lasso_0m1, o=0, d_start=5, label_type=1, label_site=4)
lasso0m1_randomrank

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.1min finished


Unnamed: 0,params,rank_test_score
32,"{'n_estimators': 900, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10}",1
31,"{'n_estimators': 350, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",2
30,"{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}",3
36,"{'n_estimators': 650, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",4
68,"{'n_estimators': 150, 'min_samples_split': 13, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}",5
16,"{'n_estimators': 950, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10}",6
76,"{'n_estimators': 900, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 7}",7
24,"{'n_estimators': 250, 'min_samples_split': 20, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 10}",8
2,"{'n_estimators': 350, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10}",9
59,"{'n_estimators': 700, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 9}",10


In [53]:
m01_type_traindic, m01_type_testdic= \
    rfr_remote.rfr_predictor(lasso_0m1, d_start=5, max_depth=10, max_feat='auto', 
                             min_samp_leaf=4, min_samples_split=10,
                             num_trees=900, folds=5)

In [54]:
m01_type_rmse = rfr_remote.rmse_calculator(m01_type_traindic, m01_type_testdic,
                                      output_type='none')
m01_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.202341,0.368771
1,0.201265,0.365401
2,0.20787,0.309858
3,0.203798,0.373263
4,0.208461,0.366667
5,0.2 +/- 0.003,0.36 +/- 0.026


---

In [55]:
n_estimators = [350, 500, 900]
min_samples_split = [5,10,15]
min_samples_leaf = [1,4]
max_features = ['auto', 'sqrt']
max_depth = [None,10]

grid_0m1 = {'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features}

In [56]:
lasso0m1_gridrank = rfr_gridsearch(lasso_0m1, grid=grid_0p1, o=0, d_start=5, label_type=1, label_site=4)
lasso0m1_gridrank

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 10.6min finished


Unnamed: 0,params,rank_test_score
2,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 900}",1
0,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 350}",2
9,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 350}",3
4,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 650}",4
38,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 900}",5
1,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 650}",6
5,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 900}",7
10,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 650}",8
12,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 350}",9
11,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 900}",10


In [57]:
m01_type_traindic, m01_type_testdic= \
    rfr_remote.rfr_predictor(lasso_0m1, d_start=5, max_depth=None, max_feat='auto', 
                             min_samp_leaf=1, min_samples_split=5,
                             num_trees=900, folds=5)

In [58]:
m01_type_rmse = rfr_remote.rmse_calculator(m01_type_traindic, m01_type_testdic,
                                      output_type='none')
m01_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.143642,0.364635
1,0.142215,0.360388
2,0.146973,0.309343
3,0.144715,0.364025
4,0.147309,0.358307
5,0.14 +/- 0.002,0.35 +/- 0.024


# (-1, -2)

In [59]:
lasso_m12 = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_(-1,-2)_7.7.csv')

In [60]:
lassom12_randomrank = rfr_randomhyper(lasso_m12, o=0, d_start=5, label_type=1, label_site=4)
lassom12_randomrank 

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.7min finished


Unnamed: 0,params,rank_test_score
30,"{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}",1
32,"{'n_estimators': 900, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 10}",2
31,"{'n_estimators': 350, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",3
2,"{'n_estimators': 350, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10}",4
68,"{'n_estimators': 150, 'min_samples_split': 13, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': None}",5
36,"{'n_estimators': 650, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",6
59,"{'n_estimators': 700, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 9}",7
76,"{'n_estimators': 900, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 7}",8
66,"{'n_estimators': 400, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 8}",9
56,"{'n_estimators': 450, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10}",10


In [61]:
m12_type_traindic, m12_type_testdic= \
    rfr_remote.rfr_predictor(lasso_m12, d_start=5, max_depth=None, max_feat='sqrt', 
                             min_samp_leaf=1, min_samples_split=5,
                             num_trees=500, folds=5)

In [62]:
m12_type_rmse = rfr_remote.rmse_calculator(m12_type_traindic, m12_type_testdic,
                                      output_type='none')
m12_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.146108,0.319421
1,0.152371,0.292077
2,0.152494,0.304645
3,0.143103,0.355936
4,0.14805,0.313048
5,0.15 +/- 0.004,0.32 +/- 0.024


---

In [63]:
n_estimators = [350, 500, 900]
min_samples_split = [5,10,9]
min_samples_leaf = [1,5]
max_features = ['auto', 'sqrt']
max_depth = [None,10,8]

grid_m12 = {'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features}

In [64]:
lassom12_gridrank = rfr_gridsearch(lasso_m12, grid=grid_m12, o=0, d_start=5, label_type=1, label_site=4)
lassom12_gridrank

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 12.7min finished


Unnamed: 0,params,rank_test_score
2,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 900}",1
1,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}",2
20,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 900}",3
38,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 900}",4
0,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 350}",5
19,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}",6
7,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 500}",7
8,"{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 900}",8
18,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 350}",9
25,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 500}",10


In [65]:
m12_type_traindic, m12_type_testdic= \
    rfr_remote.rfr_predictor(lasso_m12, d_start=5, max_depth=None, max_feat='sqrt', 
                             min_samp_leaf=1, min_samples_split=5,
                             num_trees=900, folds=5)

In [66]:
m12_type_rmse = rfr_remote.rmse_calculator(m12_type_traindic, m12_type_testdic,
                                      output_type='none')
m12_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.14566,0.32072
1,0.152401,0.292181
2,0.152302,0.304749
3,0.143061,0.355196
4,0.148561,0.313355
5,0.15 +/- 0.004,0.32 +/- 0.024


# (-2, -3)

In [67]:
lasso_m23 = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_(-2,-3)_7.7.csv')

In [68]:
lassom23_randomrank = rfr_randomhyper(lasso_m23, o=0, d_start=5, label_type=1, label_site=4)
lassom23_randomrank

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.2min finished


Unnamed: 0,params,rank_test_score
30,"{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}",1
2,"{'n_estimators': 350, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10}",2
56,"{'n_estimators': 450, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10}",3
45,"{'n_estimators': 700, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 8}",4
66,"{'n_estimators': 400, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 8}",5
31,"{'n_estimators': 350, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",6
19,"{'n_estimators': 250, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10}",7
76,"{'n_estimators': 900, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 7}",8
36,"{'n_estimators': 650, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}",9
59,"{'n_estimators': 700, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 9}",10


In [69]:
m23_type_traindic, m23_type_testdic= \
    rfr_remote.rfr_predictor(lasso_m23, d_start=5, max_depth=None, max_feat='sqrt', 
                             min_samp_leaf=1, min_samples_split=5,
                             num_trees=500, folds=5)

In [70]:
m23_type_rmse = rfr_remote.rmse_calculator(m23_type_traindic, m23_type_testdic,
                                      output_type='none')
m23_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.130597,0.239404
1,0.130013,0.256229
2,0.129186,0.251533
3,0.134935,0.215489
4,0.117075,0.283725
5,0.13 +/- 0.007,0.25 +/- 0.025


---

In [71]:
n_estimators = [350, 500, 450]
min_samples_split = [5,9,2]
min_samples_leaf = [1,2]
max_features = ['log2', 'sqrt']
max_depth = [None,10,8]

grid_m23 = {'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features}

In [72]:
lassom23_gridrank = rfr_gridsearch(lasso_m23, grid=grid_m23, o=0, d_start=5, label_type=1, label_site=4)
lassom23_gridrank

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   42.6s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  2.5min finished


Unnamed: 0,params,rank_test_score
28,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 500}",1
29,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 450}",2
24,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 350}",3
18,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 350}",4
35,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 450}",5
19,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}",6
16,"{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}",7
34,"{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}",8
71,"{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 450}",9
54,"{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 350}",10


In [73]:
m23_type_traindic, m23_type_testdic= \
    rfr_remote.rfr_predictor(lasso_m23, d_start=5, max_depth=None, max_feat='sqrt', 
                             min_samp_leaf=2, min_samples_split=2,
                             num_trees=500, folds=5)

In [74]:
m23_type_rmse = rfr_remote.rmse_calculator(m23_type_traindic, m23_type_testdic,
                                      output_type='none')
m23_type_rmse

Unnamed: 0,train rmse,test rmse
0,0.146151,0.239121
1,0.146319,0.255993
2,0.144247,0.251319
3,0.149294,0.218752
4,0.136702,0.278424
5,0.14 +/- 0.005,0.25 +/- 0.022
