In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.feature_selection import VarianceThreshold
%matplotlib inline

# Iterative feature selection for a random-forest regression of log10(K_off).
#
# For every candidate feature count (30..45) the script repeatedly
#   1. ranks the remaining features by the variance of their min-max scaled
#      values and keeps the `feat_num` highest-ranked ones,
#   2. fits a RandomForestRegressor on an 80/20 train/test split,
#   3. reports MAE / RMSE / R2 and a cross-validation R2 score,
#   4. marks every feature whose permutation importance on the test set is
#      below `permutation_threshold` as dead,
# until a pass marks no further feature as dead.

# Result containers. Both are keyed by the tuple
# (feat_num, iter_counter, r2_test, r2_train, n_score) of the fitted model.
List_features = {}               # key -> names of the features the model used
List_feature_importances = {}    # key -> permutation importances (pd.Series)
List_dead_features = ['NAME']    # columns that must never be used as features

# Features with a mean permutation importance below this value are dropped.
permutation_threshold = 0.0  # Can be defined differently

# Read and one-hot encode the table once: only the set of dropped columns
# changes between iterations, not the file. `sep` must be passed by keyword,
# recent pandas versions reject it as a positional argument.
data = pd.read_csv(
    "Mordred_Features.csv", sep=';', low_memory=False,
    usecols=lambda column: column != 'NAME')
encoded = pd.get_dummies(data)
labels = np.array(np.log10(encoded['K_off']))
encoded = encoded.drop('K_off', axis=1)

for feat_num in range(30, 46, 1):

    # Every feature count starts again from the full feature set.
    List_dead_features = ['NAME']
    iter_counter = 0

    while True:

        print(f'Number of features: {feat_num}, Iteration: {iter_counter}')

        # Remove dead features from the *encoded* frame. Dead features are
        # recorded under their encoded column names, so filtering the raw CSV
        # columns would never remove a one-hot column; with all seeds fixed
        # the same feature would then be flagged on every pass and the loop
        # would not terminate.
        remaining = encoded.drop(columns=List_dead_features, errors='ignore')
        feature_list = list(remaining.columns)
        # A single float dtype keeps the column-wise arithmetic below well
        # defined even when the frame mixes numeric and boolean columns.
        features = remaining.to_numpy(dtype=float)

        #**************************************************************************************************

        # Sorting features by descending variance of their values scaled
        # to the range [0, 1].
        col_min = features.min(axis=0)
        col_range = features.max(axis=0) - col_min
        # A constant column has range 0; dividing by 1 instead gives it a
        # scaled variance of 0 rather than NaN (NaN keys make the ranking
        # unreliable).
        safe_range = np.where(col_range > 0, col_range, 1.0)
        variances = ((features - col_min) / safe_range).var(axis=0)
        # Stable sort: features with equal variance keep their column order.
        order = np.argsort(-variances, kind='stable')

        #***************************************************************************************************

        # Top feat_num features
        order = order[:feat_num]
        features = features[:, order]
        feature_list = [feature_list[column] for column in order]

        #***************************************************************************************************

        # Splitting of data (fixed seed, so every pass uses the same split)
        train_features, test_features, train_labels, test_labels = train_test_split(
            features, labels, test_size=0.20, random_state=7)

        # Random Forest Regressor
        rf = RandomForestRegressor(
            n_estimators=150, random_state=7, max_depth=10, max_features='log2')
        rf.fit(train_features, train_labels)
        predictions_test = rf.predict(test_features)
        predictions_train = rf.predict(train_features)

        # Calculation of metrics
        mae_train = metrics.mean_absolute_error(train_labels, predictions_train)
        mse_train = metrics.mean_squared_error(train_labels, predictions_train)
        rmse_train = mse_train**0.5
        r2_train = r2_score(train_labels, predictions_train)

        mae_test = metrics.mean_absolute_error(test_labels, predictions_test)
        mse_test = metrics.mean_squared_error(test_labels, predictions_test)
        rmse_test = mse_test**0.5
        r2_test = r2_score(test_labels, predictions_test)

        print('MAE (Training Set): ', mae_train)
        print('RMSE (Training Set): ', rmse_train)
        print('r2 score (Training Set): ', r2_train)
        print('')
        print('MAE (Test Set): ', mae_test)
        print('RMSE (Test Set): ', rmse_test)
        print('r2 score (Test Set): ', r2_test)

        # Calculation of cross-validation scores (cv=7, R2) on the training set
        n_scores = cross_val_score(rf, train_features, train_labels, cv=7, scoring='r2')
        n_score = n_scores.mean()

        print('Cross Validation Values: ', n_scores)
        print('Cross Validation Score: ', n_score)

        # Calculation of feature importances on the held-out test set
        result = permutation_importance(rf, test_features, test_labels, n_repeats=30, random_state=0)
        forest_importances = pd.Series(result.importances_mean, index=feature_list)

        # Marking features with negative (or below permutation_threshold)
        # importances as dead. The raw importance array is iterated next to
        # the names, which avoids integer look-ups on the name-indexed Series.
        newly_dead = [
            name
            for name, importance in zip(feature_list, result.importances_mean)
            if importance < permutation_threshold
        ]
        List_dead_features.extend(newly_dead)
        dead_feature_counter = len(newly_dead)

        # Add features to dictionary
        List_features[feat_num, iter_counter, r2_test, r2_train, n_score] = feature_list
        List_feature_importances[feat_num, iter_counter, r2_test, r2_train, n_score] = forest_importances

        print('')
        print('Used features:', feature_list)
        print('Amount of dropped features:', dead_feature_counter)
        print('')
        print('****************************************************************')
        print('')

        iter_counter = iter_counter + 1

        # Converged for this feature count: the last pass dropped nothing.
        if dead_feature_counter == 0:
            break

# Rank the stored model keys by their mean cross-validation R2 (element 4 of
# each key tuple), best first. The ranking is left as a bare expression on the
# last line, so it is shown only where the environment echoes the value of a
# cell's final expression (e.g. a notebook); it is not assigned to any name.
print('All models sorted by descending Cross Validation Scores:')
sorted(List_features.keys(), key=lambda model_key: model_key[4], reverse=True)