In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.feature_selection import VarianceThreshold
%matplotlib inline

# Iterative feature elimination for a random-forest model of log10(K_off).
#
# For every candidate feature count (30..45) the loop repeatedly:
#   1. takes the top-N remaining columns, ranked by variance after min-max scaling,
#   2. fits a RandomForestRegressor on an 80/20 split and reports train/test/CV metrics,
#   3. computes permutation importances and marks every feature whose mean
#      importance is below PERMUTATION_THRESHOLD as "dead",
# until an iteration marks no new dead feature.

DATA_PATH = "My_Data_Set_Clean_1_Excel.csv"
TARGET_COLUMN = 'K_off'
ALWAYS_EXCLUDED = ['NAME']      # column(s) never used as model input
PERMUTATION_THRESHOLD = 0.0     # Can be defined differently

#Defining Dictionaries
# Both dictionaries are keyed by (feat_num, iter_counter, r2_test, r2_train, n_score).
List_features = {}
List_feature_importances = {}
List_dead_features = list(ALWAYS_EXCLUDED)


def _rank_by_scaled_variance(frame):
    """Return the column labels of ``frame`` ordered by descending variance.

    Each column is min-max scaled to [0, 1] first and the population variance
    (divide by the number of rows) of the scaled values is used as sort key.
    Columns with equal variance keep their original relative order.
    Constant columns are assigned a variance of 0 instead of dividing by zero.
    """
    values = frame.to_numpy(dtype=float)
    if values.shape[0] == 0:
        return list(frame.columns)
    lowest = values.min(axis=0)
    spread = values.max(axis=0) - lowest
    # A constant column has spread 0; dividing by 1 keeps its scaled values at 0.
    scaled = (values - lowest) / np.where(spread > 0, spread, 1.0)
    variances = scaled.var(axis=0)
    # Negate + stable sort == descending order with ties in original column order.
    order = np.argsort(-variances, kind='stable')
    return [frame.columns[position] for position in order]


#Reading and processing of Features
# The file, the dummy encoding and the variance ranking do not depend on which
# features were eliminated, so they are computed once instead of per iteration.
raw_data = pd.read_csv(DATA_PATH, sep=';', low_memory=False)
encoded = pd.get_dummies(raw_data.drop(columns=ALWAYS_EXCLUDED, errors='ignore'))
labels = np.log10(encoded[TARGET_COLUMN].to_numpy(dtype=float))
candidates = encoded.drop(columns=TARGET_COLUMN)

#Sorting Features by descending Variance
ranked_columns = _rank_by_scaled_variance(candidates)

for feat_num in range(30, 46):

    List_dead_features = list(ALWAYS_EXCLUDED)
    iter_counter = 0
    dead_feature_counter = 1  # any value > 0 so the first pass always runs

    while dead_feature_counter > 0:

        print(f'Number of features: {feat_num}, Iteration: {iter_counter}')

        #Top feat_num features among those not yet eliminated
        # Filtering is done on the encoded column names, so dummy-encoded
        # columns can be eliminated as well.
        dead = set(List_dead_features)
        feature_list = [name for name in ranked_columns if name not in dead][:feat_num]
        if not feature_list:
            print('No features left to evaluate; stopping this run.')
            break
        features = candidates[feature_list].to_numpy(dtype=float)

        #Splitting of Data
        train_features, test_features, train_labels, test_labels = train_test_split(
            features, labels, test_size=0.20, random_state=7)

        #Random Forest Regressor
        rf = RandomForestRegressor(
            n_estimators=150, random_state=7, max_depth=10, max_features='log2')
        rf.fit(train_features, train_labels)
        predictions_test = rf.predict(test_features)
        predictions_train = rf.predict(train_features)

        #Calculation of Metrics
        mae_train = metrics.mean_absolute_error(train_labels, predictions_train)
        mse_train = metrics.mean_squared_error(train_labels, predictions_train)
        rmse_train = mse_train**0.5
        r2_train = r2_score(train_labels, predictions_train)

        mae_test = metrics.mean_absolute_error(test_labels, predictions_test)
        mse_test = metrics.mean_squared_error(test_labels, predictions_test)
        rmse_test = mse_test**0.5
        r2_test = r2_score(test_labels, predictions_test)

        print('MAE (Training Set): ', mae_train)
        print('RMSE (Training Set): ', rmse_train)
        print('r2 score (Training Set): ', r2_train)
        print('')
        print('MAE (Test Set): ', mae_test)
        print('RMSE (Test Set): ', rmse_test)
        print('r2 score (Test Set): ', r2_test)

        #Calculation of Cross Validation Scores
        n_scores = cross_val_score(rf, train_features, train_labels, cv=7, scoring='r2')
        n_score = n_scores.mean()

        print('Cross Validation Values: ', n_scores)
        print('Cross Validation Score: ', n_score)

        #Calculation of Feature Importances
        # NOTE(review): importances are measured on the same test split whose r2
        # is reported above, so the feature elimination has seen the test data and
        # r2_test is optimistically biased. Consider a separate validation split.
        result = permutation_importance(
            rf, test_features, test_labels, n_repeats=30, random_state=0)
        forest_importances = pd.Series(result.importances_mean, index=feature_list)

        #Deleting Features with negative (or below PERMUTATION_THRESHOLD) importances
        # Selection is positional (boolean mask on the values), not label based.
        below_threshold = forest_importances.to_numpy() < PERMUTATION_THRESHOLD
        newly_dead = [name for name, is_dead in zip(feature_list, below_threshold) if is_dead]
        List_dead_features.extend(newly_dead)
        dead_feature_counter = len(newly_dead)

        #Add features to dictionary
        model_key = (feat_num, iter_counter, r2_test, r2_train, n_score)
        List_features[model_key] = feature_list
        List_feature_importances[model_key] = forest_importances

        print('')
        print('Used features:', feature_list)
        print('Amount of dropped features:', dead_feature_counter)
        print('')
        print('****************************************************************')
        print('')

        iter_counter = iter_counter + 1

print('All models sorted by descending Cross Validation Scores:')
# Last expression of the cell: displays the model keys, best CV score (index 4) first.
sorted(List_features, key=lambda model_key: model_key[4], reverse=True)

Number of features: 30, Iteration: 0


  exec(code_obj, self.user_global_ns, self.user_ns)


MAE (Training Set):  0.2630589952892112
RMSE (Training Set):  0.34034403422361564
r2 score (Training Set):  0.9023429759458864

MAE (Test Set):  0.4688076733228873
RMSE (Test Set):  0.5784509986515596
r2 score (Test Set):  0.5798193665725745
Cross Validation Values:  [ 0.29466184  0.57359118  0.48596413 -0.02460979  0.22998006  0.63559001
  0.46975499]
Cross Validation Score:  0.3807046319810867

Used features: ['NaasN', 'n5ARing', 'nFARing', 'BCUTi-1h', 'n5AHRing', 'nFHRing', 'NsOH', 'n5HRing', 'n9FARing', 'nFAHRing', 'n9FAHRing', 'SaasN', 'SsOH', 'NssNH', 'n10FRing', 'n10FHRing', 'n10FaRing', 'n10FaHRing', 'nARing', 'n6HRing', 'BCUTc-1h', 'BalabanJ', 'nAHRing', 'TopoShapeInd', 'nS', 'nBase', 'SssNH', 'PetitjeanInde', 'BCUTp-1h', 'n6ARing']
Amount of dropped features: 15

****************************************************************

Number of features: 30, Iteration: 1


  exec(code_obj, self.user_global_ns, self.user_ns)


MAE (Training Set):  0.18575429158630277
RMSE (Training Set):  0.27324719988727897
r2 score (Training Set):  0.937052477567112

MAE (Test Set):  0.3342418145472692
RMSE (Test Set):  0.43442948043791185
r2 score (Test Set):  0.7630037584196037
Cross Validation Values:  [0.52754323 0.48357696 0.80381615 0.0774289  0.56325705 0.75748868
 0.79093497]
Cross Validation Score:  0.5720065611546195

Used features: ['NaasN', 'NsOH', 'n5HRing', 'SsOH', 'NssNH', 'nARing', 'BCUTc-1h', 'BalabanJ', 'TopoShapeInd', 'nS', 'nBase', 'SssNH', 'PetitjeanInde', 'BCUTp-1h', 'n6ARing', 'BCUTd-1h', 'GATS5c', 'BCUTi-1l', 'Radius', 'PEOE_VSA2', 'nX', 'MID_X', 'NsssN', 'ECIndex', 'SpDiam_Dzare', 'SpDiam_Dzi', 'SpDiam_D', 'SpDiam_Dzpe', 'SpAbs_D', 'SpAD_D']
Amount of dropped features: 9

****************************************************************

Number of features: 30, Iteration: 2


  exec(code_obj, self.user_global_ns, self.user_ns)


MAE (Training Set):  0.1817111779917126
RMSE (Training Set):  0.26462329173266264
r2 score (Training Set):  0.9409631284107167

MAE (Test Set):  0.3398737815544304
RMSE (Test Set):  0.45306175607202215
r2 score (Test Set):  0.7422387144697902
Cross Validation Values:  [0.61659245 0.4510404  0.84320923 0.11637468 0.54450631 0.74055094
 0.76239829]
Cross Validation Score:  0.5820960433277541


KeyboardInterrupt: 

In [4]:
# Rank the stored model keys by their cross-validation score (tuple position 4), best first.
sorted(List_features, key=lambda model_key: model_key[4], reverse=True)

[(30, 1, 0.7630037584196037, 0.937052477567112, 0.5720065611546195),
 (30, 0, 0.5798193665725745, 0.9023429759458864, 0.3807046319810867)]

In [3]:
# Rank the stored model keys by their test-set r2 (tuple position 2), best first.
sorted(List_features, key=lambda model_key: model_key[2], reverse=True)

[(37, 0.0, 0.9296268530934619, 0.9617336780631687, 0.714501244643792),
 (37, 0.0, 0.9117554010525288, 0.9638616146572688, 0.7005522266333125),
 (37, 0.0, 0.9102396872803954, 0.9628175600610592, 0.690299542236428),
 (45, 0.0, 0.9075610307662249, 0.9607858931807067, 0.6617089809851997),
 (45, 0.0, 0.906098878527159, 0.9619534380147622, 0.6652788256092432),
 (37, 0.0, 0.9036790738058601, 0.9627910550722636, 0.7059910866947383),
 (43, 0.0, 0.9021866842006628, 0.9629321290522606, 0.6923548133006311),
 (44, 0.0, 0.9013051798732266, 0.9591464284812319, 0.6820019625537777),
 (45, 0.0, 0.9006844609680049, 0.9609496806806243, 0.6677357606274794),
 (37, 0.0, 0.8984325470798372, 0.9617752157465099, 0.7032868962985964),
 (43, 0.0, 0.8963101106406248, 0.9613076197173671, 0.6614509519188212),
 (37, 0.0, 0.8957403924242991, 0.9609507384256312, 0.7066816809049833),
 (37, 0.0, 0.8955072136005757, 0.9618001419255292, 0.6978332494053585),
 (42, 0.0, 0.89462789442621, 0.9610820072433892, 0.7170483526218536

In [4]:
# Dump the full results dictionary as plain text. As populated by the training
# cell, each key is (feat_num, iter_counter, r2_test, r2_train, n_score) and
# each value is the list of feature names used for that model.
print(List_features)

{(30, 0.0, 0.5798193665725745, 0.9023429759458864, 0.3807046319810867): ['NaasN', 'n5ARing', 'nFARing', 'BCUTi-1h', 'n5AHRing', 'nFHRing', 'NsOH', 'n5HRing', 'n9FARing', 'nFAHRing', 'n9FAHRing', 'SaasN', 'SsOH', 'NssNH', 'n10FRing', 'n10FHRing', 'n10FaRing', 'n10FaHRing', 'nARing', 'n6HRing', 'BCUTc-1h', 'BalabanJ', 'nAHRing', 'TopoShapeInd', 'nS', 'nBase', 'SssNH', 'PetitjeanInde', 'BCUTp-1h', 'n6ARing'], (30, 0.0, 0.7630037584196037, 0.937052477567112, 0.5720065611546195): ['NaasN', 'NsOH', 'n5HRing', 'SsOH', 'NssNH', 'nARing', 'BCUTc-1h', 'BalabanJ', 'TopoShapeInd', 'nS', 'nBase', 'SssNH', 'PetitjeanInde', 'BCUTp-1h', 'n6ARing', 'BCUTd-1h', 'GATS5c', 'BCUTi-1l', 'Radius', 'PEOE_VSA2', 'nX', 'MID_X', 'NsssN', 'ECIndex', 'SpDiam_Dzare', 'SpDiam_Dzi', 'SpDiam_D', 'SpDiam_Dzpe', 'SpAbs_D', 'SpAD_D'], (30, 0.0, 0.7422387144697902, 0.9409631284107167, 0.5820960433277541): ['NsOH', 'n5HRing', 'SsOH', 'NssNH', 'nARing', 'BalabanJ', 'SssNH', 'BCUTp-1h', 'BCUTd-1h', 'GATS5c', 'Radius', 'PEOE_