In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from statistics import mean, stdev
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict

from sklearn.inspection import permutation_importance

In [2]:
def stratify_df(df, label_type, label_site):
    '''
    This function modifies the dataframe so that during cross validation
    the data can be split into test/train datasets that are equally stratified
    in "type" and "site" as the original dataframe.

    Inputs
        - df: pandas df. A ML training dataset that contains targets and
        features.
        - label_type: int. column index of "Type" column. Default: 1.
        - label_site int. column index of "Site" column. Dafault: 4.

    Outputs
        - b: pandas series. A newly encoded column that uniquely identifies
        the 15 possible combinations (3 sc types x 5 impurity sites) that a
        datapoint in the set could fal into.
    '''
    labels = df[df.columns[[label_type, label_site]]]

    # encode sc type and site columns, then combine them into a new string col
    # i.e. sctype 1 and site 3 becomes new column of 13 (dtype: string)
    enc = OrdinalEncoder(dtype=np.int)
    a = enc.fit_transform(labels)
    a = pd.DataFrame(a, columns=["SC_Type", "Site"])
    a = a.applymap(str)
    a = a[["SC_Type", "Site"]].apply(lambda x: ''.join(x), axis=1)

    # encode the new string col to 0-14 (15 total classes -
    # 3 sctypes x 5 defsites)
    b = np.array(a).reshape(-1, 1)
    b = enc.fit_transform(b)

    return b

In [3]:
def descriptors_outputs(df, d_start, o):
    '''
    This function splits to dataframe up into separate dataframes of
    descriptors and outputs by column.

    Inputs
        - df: pandas df. A ML training dataset that contains targets and
        features.
        - d_start: int. column index to that the descriptors columns start at.
        In the input df, the descriptors must start at some column at index
        df_start to the last column in the dataframe. Default: 3.
        - o: int. column index of the output. Deafult: 0.
    Outputs
        - X: pandas df. Dataframe with descriptors.
        - y: pandas df. Dataframe with output.
    '''
    X = df[df.columns[d_start:]]
    y = df[df.columns[o]]

    return X, y

In [4]:
#X, y = descriptors_outputs(lasso_a, 5, 0)
#[X.columns]

In [5]:
def traintest(X, y, train_idx, test_idx):
    '''
    This function splits the descriptors (X) and output (y) points into train
    and test sets. The size of test set depends on the number of folds of CV.

    Inputs
        - X: pandas df. Dataframe with descriptors.
        - y: pandas df. Dataframe with output.
        - train_idx: np array. Indexes of training points.
        - test_idx: np array. Indexes of testing points.

    Outputs
        - X_train: np array. descriptor values of training data set.
        - X_test: np array. descriptor values of test data set.
        - y_train: np array. output values of training data set.
        - y_test: np array. output values of test data set.
    '''
    # train_idx and test_idx come from skf.split
    X_train, X_test = X.iloc[list(train_idx)], X.iloc[list(test_idx)]
    y_train, y_test = y.iloc[list(train_idx)], y.iloc[list(test_idx)]

    return X_train, X_test, y_train, y_test

In [6]:
def fit_predict(X_train, y_train, X_test, clf):
    '''
    This function fits the training X/y data using the RFR model. Then makes a
    train and test prediction of the target value for each point, using the
    descriptors of training and testing. For each fold of the cross validation,
    the training and testing sets will change.

    Inputs
        - X_train: np array. descriptor values of training data set.
        - y_train: np array. output values of training data set.
        - X_test: np array. descriptor values of test data set.
        - clf: RandomForestRegressor from sklearn

    Outputs
        - trainpred: np array. predicted output value for every point in the
        train data set.
        - testpred: np array. predicted output value for every point in the
        test data set.
    '''
    # fit all the training data
    clf.fit(X_train, y_train)

    # predict on training data and testing data based on fit model
    trainpred = clf.predict(X_train)
    testpred = clf.predict(X_test)

    return trainpred, testpred,

In [7]:
def rfr_permute(df, o=0, d_start=5, num_trees=100, max_feat='auto',
                  max_depth=5, min_samp_leaf=2, min_samples_split=5,
                  folds=5, label_type=1, label_site=4):
    '''
    This is a wrapper func that performs RFR with cross validation on a set of
    data with observed values and descriptors. For each  fold of CV, points
    are predicted for the train and test data. The function returns train and
    test dictionaries that contain the fold of CV as the key, and the a
    dataframe of type, ab, impurity, site, dft values, predicted values for
    every point in the train or test set in that fold.

    Inputs
        - df: pandas df. A ML training dataset that contains targets and
        features.
        - o: int. column index of the output. Deafult: 0.
        - d_start: int. column index to that the descriptors columns start at.
        In the input df, the descriptors must start at some column at index
        df_start to the last column in the dataframe. Default: 5.
        - num_trees: int. Number of estimators (trees) to by used by the
        RFR in the random forest. Default:100.
        - max_feat: str. The number of features to consider when looking for
        the best split. Default: 'auto'
        - max_depth: int. The maximum depth of each tree in the forest.
        Keep this value low to mitigate overfitting. Default:5.
        - min_samp_leaf: int. The minimum number of samples required to be at
         a leaf node. Deafult: 2.
        - min_samples_split: int. The minimum number of samples required to
         split an internal node. Default: 5.
        - folds: int. Number of folds to to split the data in cross validation.
        Default: 5.
        - label_type: int. column index of "Type" column in DFT training
        dataframe. Default: 1.
        - label_site int. column index of "Site" column in DFT training
        dataframe. Dafault: 4.

    Outputs
        - folds_dict_train: pandas df. key is CV fold, value is dataframe of
         type, ab, impurity, site, dft values, predicted values for every
         point in the train set in that fold.
        - folds_dict_test: pandas df.  key is CV fold, value is dataframe of
         type, ab, impurity, site, dft values, predicted values for every
         point in the test set in that fold.
    '''
    # make the dataframe stratifiable by type and site
    b = stratify_df(df, label_type, label_site)

    # identify the descriptor columns and output column
    X, y = descriptors_outputs(df, d_start, o)

    # establish the stratified k-fold cross validation, folds in an input
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=130)

    # establish the RFR classifier with certain parameters, which are inputs
    clf = RandomForestRegressor(n_estimators=num_trees, max_features=max_feat,
                                max_depth=max_depth,
                                min_samples_leaf=min_samp_leaf,
                                min_samples_split=min_samples_split,
                                n_jobs=2, random_state=130)

    i=1
    permute_list = []
    for train_idx, test_idx in skf.split(df, b):

        X_train, X_test, y_train, y_test = traintest(X, y, train_idx, test_idx)

        clf.fit(X_train, y_train)

        result = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=130, n_jobs=-1)
        
        feature_names = X.columns.tolist()
        
        forest_importances = pd.DataFrame(data={'predictor': feature_names, 
                                                'fold {}'.format(i):result.importances_mean})
        
        permute_list.append(forest_importances)
        #print(permute_list)
        
        i+=1
        
    result = pd.concat(permute_list, axis=1)
    result.drop(['predictor'], axis=1, inplace=True)
    result.insert(0, 'predictor', feature_names)
    result.set_index('predictor', inplace=True)
    result['mean'] = result.mean(axis=1)
    result['std dev'] = result.std(axis=1)
    

    return result

# Feature Importance
I am ultimately reporting feature importance based on feature permutation: using the sklearn library `inspection.permutation_importance` (sklearn.inspection.permutation_importance —scikit-learn 1.0 documentation) because it could be computed on the left out test set which I found useful. To score importance I used the estimator’s default scorer -but I can change this to rmse, though I think feature importances will be ranked relatively the same.

I also checked feature importance based on mean decrease in impurity (a property of the RFR model) and found feature importances were ranked approximately the same between the two methods.

The feature importance for each output are listed by most to least importance. Importance was calculated on “held out set” over 5 rounds of CV. The mean and std deviation of the five rounds is reported in .csv files (importance ranked by mean).

# dHA

In [8]:
lasso_a = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_HA_7.7.csv')

In [9]:
lassoa_featimport = rfr_permute(lasso_a, d_start=5, max_depth=10, max_feat='auto', 
                             min_samp_leaf=2, min_samples_split=15,
                             num_trees=600, folds=5)

In [10]:
lassoa_featimport.sort_values(by=['mean'], ascending=False)

Unnamed: 0_level_0,fold 1,fold 2,fold 3,fold 4,fold 5,mean,std dev
predictor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
abs(Group.3-Heat_vap.3),0.155717,0.173314,0.165763,0.081492,0.187524,1.527619e-01,0.037116
MP.2*BP.2,0.035523,0.065456,0.019582,0.026693,0.049339,3.931885e-02,0.016413
abs(Electronegativity.3-Electronegativity.2),0.043864,0.028261,0.039317,0.039755,0.036549,3.754900e-02,0.005199
abs(Valence.3-Valence.2),0.036058,0.032654,0.052738,0.023740,0.020527,3.314345e-02,0.011316
PBE_latt_const^3,0.049815,0.017055,0.049451,0.012741,0.000839,2.598029e-02,0.020030
...,...,...,...,...,...,...,...
abs(Thermal_expn.2-CM1),-0.000409,0.000301,0.000333,0.000353,0.000503,2.163573e-04,0.000321
MP^3,0.000107,-0.000088,-0.000001,-0.000047,0.000229,3.996014e-05,0.000115
Elec_cond/Ion_Energy.1,0.000056,0.000140,-0.000052,-0.000019,0.000018,2.868566e-05,0.000066
Elec_cond/Ion_pot_1.1,-0.000032,0.000252,-0.000129,0.000258,-0.000210,2.796656e-05,0.000194


In [11]:
#lassoa_featimport.to_csv('./feature_importance/lassoa_featureimportance.csv')

In [12]:
#lassoa_featimport_diff = rfr_descrip(lasso_a, d_start=5, max_depth=10, max_feat='auto', 
                             #min_samp_leaf=2, min_samples_split=15,
                             #num_trees=600, folds=5)

# dHB

In [13]:
lasso_b = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_HB_7.7.csv')

In [14]:
lassob_featimport = rfr_permute(lasso_b, d_start=5, max_depth=10, max_feat='auto', 
                             min_samp_leaf=3, min_samples_split=15,
                             num_trees=1000, folds=5)

In [15]:
lassob_featimport.sort_values(by=['mean'], ascending=False)

Unnamed: 0_level_0,fold 1,fold 2,fold 3,fold 4,fold 5,mean,std dev
predictor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
abs(Thermal_expn.3-Thermal_expn.2),0.258081,0.227807,0.234781,0.178244,0.202504,0.220284,0.027491
Ion_pot_2.2+At_rad_2.1,0.060041,0.047441,0.091738,0.073460,0.081095,0.070755,0.015575
BP.2-CM6,0.023712,0.022488,0.032893,0.048547,0.062407,0.038009,0.015348
Valence.3-Valence.2,0.008627,0.012849,0.023510,0.029714,0.022915,0.019523,0.007675
BP.3-At_rad_2.1,0.014839,0.034221,0.013459,0.009406,0.011751,0.016735,0.008929
...,...,...,...,...,...,...,...
abs(Electronegativity.3-Ion_Energy.3),0.000212,0.000919,0.000696,-0.000211,-0.000828,0.000157,0.000629
abs(Elec_cond.2-CM1),-0.000052,-0.000050,0.000630,0.000220,-0.000055,0.000139,0.000267
abs(At_rad_2.2-MP),-0.000226,0.000197,0.000572,0.000031,-0.000126,0.000089,0.000281
At_rad_2^3,0.000072,-0.000001,0.000072,0.000053,0.000064,0.000052,0.000027


In [16]:
#lassob_featimport.to_csv('./feature_importance/lassob_featureimportance.csv')

# +3/+2

In [17]:
lasso_p32 = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_(+3,+2)_7.7.csv')

In [18]:
lassop32_featimport = rfr_permute(lasso_p32, d_start=5, max_depth=7, max_feat='auto', 
                             min_samp_leaf=3, min_samples_split=3,
                             num_trees=450, folds=5)

In [19]:
lassop32_featimport.sort_values(by=['mean'], ascending=False)

Unnamed: 0_level_0,fold 1,fold 2,fold 3,fold 4,fold 5,mean,std dev
predictor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
abs(Valence.3-Valence.2),0.335369,0.254084,0.296486,0.333148,0.164186,0.276655,0.063546
At_rad_1.2+At_rad_1,0.105441,0.071145,0.023896,0.064488,0.077609,0.068516,0.026307
abs(ICSD_vol.3-Ion_rad.3),0.031544,0.020338,0.048561,0.092805,0.009980,0.040646,0.029048
MP/Group.3,0.030958,0.031272,0.024700,0.048298,0.016866,0.030419,0.010364
abs(Group.3-Mend_num.2),0.029880,0.013596,0.004846,0.014678,0.032115,0.019023,0.010379
...,...,...,...,...,...,...,...
Density.3/Thermal_expn.2,-0.000027,0.000490,-0.001230,-0.000717,0.000142,-0.000269,0.000621
abs(At_num.3-At_wt.3),0.000127,-0.000833,-0.000024,0.000171,-0.001146,-0.000341,0.000543
abs(Elec_cond.3-At_vol.3),-0.000685,-0.000302,-0.000153,-0.000091,-0.000921,-0.000430,0.000321
Ox_state.3/Density.2,-0.000980,-0.000228,-0.000514,0.001304,-0.002127,-0.000509,0.001114


In [20]:
#lassop32_featimport.to_csv('./feature_importance/lassop32_featureimportance.csv')

# +2/+1

In [21]:
lasso_p21 = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_(+2,+1)_7.7.csv')

In [22]:
lassop21_featimport = rfr_permute(lasso_p21, d_start=5, max_depth=9, max_feat='auto', 
                             min_samp_leaf=7, min_samples_split=2,
                             num_trees=600, folds=5)

In [23]:
lassop21_featimport.sort_values(by=['mean'], ascending=False)

Unnamed: 0_level_0,fold 1,fold 2,fold 3,fold 4,fold 5,mean,std dev
predictor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
abs(Ox_state.3-Ox_state.2),0.072242,0.075549,0.057141,0.110778,0.135101,0.090162,0.028529
abs(Valence.3-Valence.2),0.096934,0.136553,0.093118,0.049718,0.067147,0.088694,0.029534
Electronegativity.3-At_rad_1.3,0.081482,0.086853,0.080452,0.074388,0.049045,0.074444,0.013302
abs(Ion_pot_2.3-Therm_cond.1),0.051982,0.043507,0.126002,0.052928,0.062403,0.067364,0.029924
Ion_pot_2.3-ICSD_vol.2,0.055802,0.038494,0.054196,0.045945,0.086779,0.056243,0.01648
Electronegativity.3-Sp_heat_cap.1,0.016017,0.021308,0.028657,0.018137,0.011595,0.019143,0.005709
Elec_Aff.3+At_rad_2.1,0.01389,0.013987,0.029868,0.006927,0.014691,0.015873,0.007547
Therm_cond.1/Ion_pot_2.3,0.021379,0.007492,0.007479,0.018771,0.012508,0.013526,0.005713
abs(Group.3-Mend_num.2),0.009725,0.021576,0.008694,0.004274,0.009177,0.010689,0.005777
Thermal_expn.2-At_rad_2.1,0.005687,0.013909,0.01062,0.002626,0.012707,0.00911,0.00429


In [24]:
#lassop21_featimport.to_csv('./feature_importance/lassop21_featureimportance.csv')

# +1/0

In [25]:
lasso_0p1 = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_(+1,0)_7.7.csv')

In [26]:
lassop01_featimport = rfr_permute(lasso_0p1, d_start=5, max_depth=10, max_feat='auto', 
                             min_samp_leaf=8, min_samples_split=12,
                             num_trees=150, folds=5)

In [27]:
lassop01_featimport.sort_values(by=['mean'], ascending=False)

Unnamed: 0_level_0,fold 1,fold 2,fold 3,fold 4,fold 5,mean,std dev
predictor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
abs(Electronegativity.3-Electronegativity.2),0.036910,0.061396,0.068873,0.074501,0.059184,0.060173,0.012842
Ion_pot_2.3-Ion_pot_2.1,0.059486,0.072057,0.020226,0.051408,0.039523,0.048540,0.017687
Ion_Energy.3-Ion_pot_2.1,0.072955,0.046676,0.039647,0.021194,0.045698,0.045234,0.016613
Electronegativity.3+At_rad_2.1,0.017568,0.030802,0.049498,0.027543,0.020074,0.029097,0.011277
Ion_Energy.3-Ion_Energy.1,0.009471,0.022609,0.013342,0.049524,0.017417,0.022472,0.014211
...,...,...,...,...,...,...,...
abs(CM1-At_rad_2.1),0.000552,0.000671,-0.000421,0.000007,0.001108,0.000383,0.000534
Elec_cond.2-Ox_state.1,0.000153,-0.000218,0.000018,0.000106,0.001535,0.000319,0.000621
abs(Electronegativity.3-BP.1),0.000868,0.000280,0.000037,0.000297,0.000079,0.000312,0.000297
Ox_state.1*Thermal_expn,0.000260,0.000205,0.000442,0.000059,0.000524,0.000298,0.000167


In [28]:
#lassop01_featimport.to_csv('./feature_importance/lassop01_featureimportance.csv')

# 0/-1

In [29]:
lasso_0m1 = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_(0,-1)_7.7.csv')

In [30]:
lassom01_featimport = rfr_permute(lasso_0m1, d_start=5, max_depth=8, max_feat='auto', 
                             min_samp_leaf=5, min_samples_split=3,
                             num_trees=150, folds=5)

In [31]:
lassom01_featimport.sort_values(by=['mean'], ascending=False)

Unnamed: 0_level_0,fold 1,fold 2,fold 3,fold 4,fold 5,mean,std dev
predictor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Thermal_expn.2+Therm_cond.1,0.213922,0.340325,0.288275,0.238698,0.178076,0.251859,0.056937
Sp_heat_cap/PBE_delta_H,0.013314,0.082861,0.028426,0.039695,0.026430,0.038145,0.023875
Electronegativity.3+At_rad_2.1,0.008754,0.023794,0.020481,0.017841,0.004848,0.015144,0.007175
Thermal_expn.2+At_rad_2.1,0.015379,0.027101,0.007560,0.010749,0.009653,0.014088,0.006992
abs(Electronegativity.3-Electronegativity.2),0.018433,0.018259,0.008685,0.010689,0.011335,0.013480,0.004068
...,...,...,...,...,...,...,...
CM6-At_rad_1,-0.000412,-0.000276,-0.000153,0.000230,0.000075,-0.000107,0.000233
CM6-Cov_rad,-0.000268,-0.000239,0.000022,-0.000097,-0.000074,-0.000131,0.000108
CM6+Sp_heat_cap,-0.000116,-0.000155,-0.000327,0.000095,-0.000808,-0.000262,0.000304
At_rad_1.3+Ion_Energy.1,-0.000038,0.000621,-0.001116,0.000471,-0.001328,-0.000278,0.000804


In [32]:
#lassom01_featimport.to_csv('./feature_importance/lassom01_featureimportance.csv')

# -1/-2

In [33]:
lasso_m12 = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_(-1,-2)_7.7.csv')

In [34]:
lassom12_featimport = rfr_permute(lasso_m12, d_start=5, max_depth=10, max_feat='auto', 
                             min_samp_leaf=8, min_samples_split=12,
                             num_trees=750, folds=5)

In [35]:
lassom12_featimport.sort_values(by=['mean'], ascending=False)

Unnamed: 0_level_0,fold 1,fold 2,fold 3,fold 4,fold 5,mean,std dev
predictor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
At_rad_2.1-Elec_cond,0.262056,0.141322,0.055133,0.287830,0.369431,2.231543e-01,0.111374
Cov_rad/At_rad_2.1,0.095711,0.105303,0.116056,0.136120,0.044520,9.954165e-02,0.030614
Valence.1+Ion_pot_2.1,0.011750,0.065056,0.037650,0.017036,0.011594,2.861711e-02,0.020580
Ion_Energy.1+Ion_pot_1,0.012992,0.030484,0.018872,0.011614,0.025940,1.998039e-02,0.007292
Ion_pot_1.1+Ion_pot_1,0.007232,0.029443,0.021588,0.009384,0.022008,1.793089e-02,0.008367
...,...,...,...,...,...,...,...
Period.2-Ion_Energy.2,0.000040,-0.000027,0.000054,-0.000312,0.000257,2.165596e-06,0.000184
At_rad_1.3+CM4,0.000107,0.000320,-0.000467,0.000042,-0.000003,4.652299e-08,0.000258
Heat_vap.1*Elec_cond,-0.000047,0.000229,-0.000145,0.000230,-0.000302,-7.254098e-06,0.000210
Sp_heat_cap.3+Mend_num.2,-0.000772,0.000786,0.000868,0.000326,-0.001472,-5.266295e-05,0.000920


In [36]:
#lassom12_featimport.to_csv('./feature_importance/lassom12_featureimportance.csv')

# -2/-3

In [37]:
lasso_m23 = pd.read_csv('./xiaofeng_lasso/dataset_7p7/Lasso_(-2,-3)_7.7.csv')

In [38]:
lassom23_featimport = rfr_permute(lasso_m23, d_start=5, max_depth=10, max_feat='sqrt', 
                             min_samp_leaf=2, min_samples_split=14,
                             num_trees=450, folds=5)

In [39]:
lassom23_featimport.sort_values(by=['mean'], ascending=False)

Unnamed: 0_level_0,fold 1,fold 2,fold 3,fold 4,fold 5,mean,std dev
predictor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cov_rad.1+Mend_num,0.024988,0.024988,0.021421,0.031488,0.013080,0.023193,0.006011
ICSD_vol.1+Mend_num,0.015782,0.014356,0.009448,0.018849,0.005800,0.012847,0.004650
Valence.1+Ion_pot_2.1,0.009520,0.011917,0.008750,0.015914,0.006493,0.010519,0.003206
Ox_state.1+Ion_pot_1.1,0.009551,0.010923,0.008578,0.013564,0.005743,0.009672,0.002583
Ox_state.1+Ion_pot_2.1,0.009646,0.010780,0.007118,0.015041,0.002675,0.009052,0.004089
...,...,...,...,...,...,...,...
Ox_state.1+Eps_elec,0.000155,-0.000183,0.000273,0.000502,-0.000454,0.000058,0.000338
Eps_ion/Cov_rad,0.000130,0.000235,0.000102,0.000062,-0.000342,0.000038,0.000198
Ion_rad.2+Ion_pot_2,0.000055,0.000312,0.000125,-0.000156,-0.000242,0.000019,0.000199
Eps_ion/At_num,0.000101,0.000239,-0.000157,0.000152,-0.000450,-0.000023,0.000251


In [40]:
#lassom23_featimport.to_csv('./feature_importance/lassom23_featureimportance.csv')

### Unused funcs

In [41]:
def rfr_descrip(df, o=0, d_start=5, num_trees=100, max_feat='auto',
                  max_depth=5, min_samp_leaf=2, min_samples_split=5,
                  folds=5, label_type=1, label_site=4):
    
    # make the dataframe stratifiable by type and site
    b = stratify_df(df, label_type, label_site)

    # identify the descriptor columns and output column
    X, y = descriptors_outputs(df, d_start, o)

    # establish the stratified k-fold cross validation, folds in an input
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=130)

    # establish the RFR classifier with certain parameters, which are inputs
    clf = RandomForestRegressor(n_estimators=num_trees, max_features=max_feat,
                                max_depth=max_depth,
                                min_samples_leaf=min_samp_leaf,
                                min_samples_split=min_samples_split,
                                n_jobs=2, random_state=130)

    i=1
    permute_list = []
    for train_idx, test_idx in skf.split(df, b):

        X_train, X_test, y_train, y_test = traintest(X, y, train_idx, test_idx)

        clf.fit(X_train, y_train)

        result = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=130, n_jobs=-1)
        
        feature_names = X.columns.tolist()
        
        descrip_importance = list(zip(feature_names, clf.feature_importances_))
        #print(descrip_importance2)
        descrip_importance.sort(key=lambda x:x[1])
        descr_df = pd.DataFrame(descrip_importance, columns =['predictor', i])
        print(descr_df)
        
        i+=1
        
    #result = pd.concat(permute_list, axis=1)
    #result.drop(['predictor'], axis=1, inplace=True)
    #result.insert(0, 'predictor', feature_names)
    #result.set_index('predictor', inplace=True)
    #result['mean'] = result.mean(axis=1)
    #result['std dev'] = result.std(axis=1)
    

    #return result