In [None]:
##Correlation-Variance Thresholding


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.feature_selection import VarianceThreshold
%matplotlib inline



# --- Result containers ------------------------------------------------------
# List_features is filled by the threshold sweep: one entry per
# (variance_threshold, correlation_threshold) combination.
List_features = dict()
# The remaining names are not referenced in the cells visible here;
# presumably later cells of the notebook use them -- TODO confirm.
List_feature_importances = dict()
List_good_features = dict()
List_dead_features = list(('NAME',))

# --- Tunables ---------------------------------------------------------------
dead_feature_counter = 6
permutation_threshold = 0.0


def _minmax_scaled_variances(matrix):
    """Return the population variance of every column after min-max scaling.

    Each column is rescaled to the range [0, 1] before the variance (ddof=0)
    is taken, so that variances of differently scaled features are comparable.

    Parameters
    ----------
    matrix : 2-D float array, one column per feature.

    Returns
    -------
    1-D float array with one variance per column. Columns that are constant
    (zero range) or contain NaN yield NaN, which never passes a
    ``variance > threshold`` test and is therefore filtered out downstream.
    """
    col_min = matrix.min(axis=0)
    col_range = matrix.max(axis=0) - col_min
    # A constant column gives 0/0 here; silence the warning and let the NaN
    # propagate so that the column is rejected by the threshold comparison.
    with np.errstate(divide='ignore', invalid='ignore'):
        scaled = (matrix - col_min) / col_range
    return scaled.var(axis=0)


def _correlated_with_earlier(corr_matrix, threshold):
    """Flag columns that are too strongly correlated with an earlier column.

    Parameters
    ----------
    corr_matrix : square correlation matrix (as returned by ``np.corrcoef``).
    threshold : columns are flagged when ``abs(r) > threshold``.

    Returns
    -------
    1-D boolean array; entry ``y`` is True when some column ``x < y`` has
    ``abs(corr_matrix[x, y]) > threshold``. Of every correlated pair the
    column with the higher index is therefore the one that gets flagged.
    """
    # Only the strict upper triangle is inspected: entry [x, y] with x < y.
    upper = np.triu(np.abs(np.atleast_2d(corr_matrix)), k=1)
    return (upper > threshold).any(axis=0)


#Reading and processing of Features
# The input does not depend on either threshold, so it is loaded once.
# `sep` must be passed by keyword (positional use is rejected by pandas 2.x),
# and the column filter compares for equality: the former `x not in 'NAME'`
# was a substring test that also discarded columns such as 'N' or 'ME'.
data = pd.read_csv("HSP90_Mordred_Features.csv", sep=';', low_memory=False,
                   usecols=lambda column: column != 'NAME')
encoded = pd.get_dummies(data)
labels = np.array(np.log10(encoded['K_off']))
encoded = encoded.drop('K_off', axis = 1)
all_feature_list = list(encoded.columns)
# Explicit float conversion so that dummy (bool/uint8) columns take part in
# the arithmetic below.
all_features = np.asarray(encoded, dtype=float)
all_variances = _minmax_scaled_variances(all_features)

# NOTE(review): both filters below are computed on all rows, i.e. before the
# train/test split -- confirm that this is intended.

for variance_threshold in np.arange(0,0.1,0.01):

    #**************************************************************************************************

    #Variance Thresholding
    # Keep features whose scaled variance exceeds the threshold and order them
    # by descending variance (stable, so ties keep their column order). The
    # order matters: the correlation filter keeps the earlier column of a pair.
    passed = np.flatnonzero(all_variances > variance_threshold)
    by_variance = passed[np.argsort(-all_variances[passed], kind='stable')]
    variance_feature_list = [all_feature_list[k] for k in by_variance]
    variance_features = all_features[:, by_variance]

    # The correlation matrix only depends on the variance threshold, so it is
    # computed once per outer iteration.
    corr = np.corrcoef(variance_features.transpose())

    for correlation_threshold in 0.5,0.7, 0.75, 0.8, 0.85, 0.9, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 0.995:

        print(f'Variance threshold: {variance_threshold}')

        #**************************************************************************************************

        #Calculate correlations and delete features with correlation above correlation_threshold

        print(f'Correlation threshold: {correlation_threshold}')

        redundant = _correlated_with_earlier(corr, correlation_threshold)
        feature_list = [
            name for name, is_redundant in zip(variance_feature_list, redundant)
            if not is_redundant
        ]
        features = variance_features[:, ~redundant]

        print(f'Number of features: {len(feature_list)}')
        print('')

        #**************************************************************************************************

        #Splitting of Data
        train_features, test_features, train_labels, test_labels = train_test_split(
            features, labels, test_size = 0.20, random_state = 7)


        #Random Forest Regressor
        rf = RandomForestRegressor(
            n_estimators = 220, random_state = 7, max_depth = 6, max_features='log2')
        rf.fit(train_features, train_labels)
        predictions_test = rf.predict(test_features)
        predictions_train = rf.predict(train_features)


        #Calculation of Metrics
        mae_train = metrics.mean_absolute_error(train_labels, predictions_train)
        mse_train = metrics.mean_squared_error(train_labels, predictions_train)
        rmse_train = mse_train**0.5
        r2_train = r2_score(train_labels, predictions_train)

        mae_test = metrics.mean_absolute_error(test_labels, predictions_test)
        mse_test = metrics.mean_squared_error(test_labels, predictions_test)
        rmse_test = mse_test**0.5
        r2_test = r2_score(test_labels, predictions_test)

        print('MAE (Training Set): ',mae_train)
        print('RMSE (Training Set): ',rmse_train)
        print('r2 score (Training Set): ', r2_train)
        print('')
        print('MAE (Test Set): ',mae_test)
        print('RMSE (Test Set): ',rmse_test)
        print('r2 score (Test Set): ', r2_test)


        #Calculation of Cross Validation Scores
        # cross_val_score fits fresh clones of `rf` on the training folds only.
        n_scores = cross_val_score(rf, train_features, train_labels, cv=7, scoring='r2')
        n_score = n_scores.mean()

        print('')
        print('Cross Validation Values: ',n_scores)
        print('Cross Validation Score: ',n_score)
        print('')
        print('***********************************')
        print('')

        # Key layout: (variance_threshold, correlation_threshold, r2_test, r2_train, cv_score)
        List_features[variance_threshold,correlation_threshold,r2_test,r2_train,n_score]=feature_list

In [None]:
# Run keys ordered by test-set r2 (tuple position 2), best first.
sorted(List_features.keys(), key=lambda run_key: run_key[2], reverse=True)

In [None]:
# Run keys ordered by mean cross-validation r2 (tuple position 4), best first.
sorted(List_features.keys(), key=lambda run_key: run_key[4], reverse=True)

In [None]:
# Dump the whole result mapping: every key
# (variance_threshold, correlation_threshold, r2_test, r2_train, cv_score)
# together with the full list of feature names kept for that run.
# One entry exists per threshold combination, so the output can be very long.
print(List_features)