In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import RANSACRegressor, TheilSenRegressor

# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF22',
    'OF25',
    'OF26',
    'F3_1',
    'F3_2',
    'F3_3',
    'F3_4',
    'F3_5',
    'F3_6',
    'F3_7',
    'F20',
    'F21',
    'F22',
    'F28',
    'F31',
    'F41',
    'F42',
    'F44',
    'F48',
    'F49',
]

results_columns = ['WS']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(), AdaBoostRegressor(), 
    KNeighborsRegressor(), MLPRegressor(max_iter=200), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000), 
    SVR(cache_size=1000), BayesianRidge(max_iter=1000), KernelRidge(), LinearRegression(), RANSACRegressor(), 
    TheilSenRegressor()
]
# Create dictionaries to store feature importances for all models
all_feature_importances_avg = {col: [] for col in X.columns}
points_per_feature = {col: 0 for col in X.columns}
top_bottom_features_per_model = {type(model).__name__: {'Top 3': [], 'Bottom 3': []} for model in models}

# Iterate over models
for model in models:
    print(model)
    # Train the model
    model.fit(X_train, y_train)

    # Perform permutation feature importance analysis
    perm_importance = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=42)

    # Get feature importances
    feature_importances = perm_importance.importances_mean

    # Get indices of features sorted by importance
    most_important_indices = feature_importances.argsort()[-3:][::-1]
    least_important_indices = feature_importances.argsort()[:3]

    
    important_indices = feature_importances.argsort()[:]

    # Calculate points for each feature
    for i in range(len(important_indices)):
        points_per_feature[X.columns[important_indices[i]]] += i



    # Store top 3 and bottom 3 important features
    for idx in most_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Top 3'].append((X.columns[idx], feature_importances[idx]))
    for idx in least_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Bottom 3'].append((X.columns[idx], feature_importances[idx]))

    # Append the feature importances to the dictionary
    for idx, col in enumerate(X.columns):
        all_feature_importances_avg[col].append(feature_importances[idx])

# Calculate average feature importance across all models
for col, importances in all_feature_importances_avg.items():
    if len(importances) > 0:
        all_feature_importances_avg[col] = np.mean(importances)

# Get indices of features sorted by average importance
sorted_indices_avg = np.argsort(list(all_feature_importances_avg.values()))[::-1]

# Print the top 3 and bottom 3 important features for each model
for model, features in top_bottom_features_per_model.items():
    print(f"\nModel: {model}")
    print("Top 3 Important Features:")
    for feature, importance in features['Top 3']:
        print(f"'{feature}': {importance}")
    print("Bottom 3 Important Features:")
    for feature, importance in features['Bottom 3']:
        print(f"'{feature}': {importance}")

# Print the average importance of features
print("\nAverage Importance of Features:")
for idx in sorted_indices_avg:
    print(f"Feature '{list(all_feature_importances_avg.keys())[idx]}': {list(all_feature_importances_avg.values())[idx]}")

# Calculate points per feature
print("\nPoints per Feature:")
sorted_points = sorted(points_per_feature.items(), key=lambda x: x[1], reverse=True)
for feature, points in sorted_points:
    print(f"Feature '{feature}': {points} points")


Ridge()
DecisionTreeRegressor()
GradientBoostingRegressor()
RandomForestRegressor()
AdaBoostRegressor()
KNeighborsRegressor()
MLPRegressor()




ElasticNet()
SGDRegressor()
SVR(cache_size=1000)
BayesianRidge(max_iter=1000)
KernelRidge()
LinearRegression()
RANSACRegressor()
TheilSenRegressor()

Model: Ridge
Top 3 Important Features:
'F48': 0.19474891027688837
'F41': 0.1873442207855662
'F44': 0.12057217616733026
Bottom 3 Important Features:
'F22': -0.019666791926622733
'F28': -0.012287819772225999
'F3_3': -0.00907327737717127

Model: DecisionTreeRegressor
Top 3 Important Features:
'F41': 0.831380186133945
'F31': 0.44218190627748594
'F21': 0.4232167376234584
Bottom 3 Important Features:
'F3_7': -0.013747695275318331
'OF26': -0.0074437094778184425
'F3_3': -0.00646071989198049

Model: GradientBoostingRegressor
Top 3 Important Features:
'F41': 0.2624876911421128
'F31': 0.13098117823078811
'F44': 0.08023063165155067
Bottom 3 Important Features:
'F28': -0.0010810354693398935
'F42': 0.0
'F3_1': 0.0012710486050913443

Model: RandomForestRegressor
Top 3 Important Features:
'F41': 0.540114519202654
'F31': 0.21343532054264133
'F44': 0.16773

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import RANSACRegressor, TheilSenRegressor

# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF22',
    'OF26',
    'OF27',
    'F17',
    'F20',
    'F22',
    'F28',
    'F29',
    'F31',
    'F33',
    'F34',
    'F35',
    'F36',
    'F41',
    'F42',
    'F44',
    'F49',
    'S5',
]

results_columns = ['SR']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(), AdaBoostRegressor(), 
    KNeighborsRegressor(), MLPRegressor(max_iter=200), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000), 
    SVR(cache_size=1000), BayesianRidge(max_iter=1000), KernelRidge(), LinearRegression(), RANSACRegressor(), 
    TheilSenRegressor()
]

# Create dictionaries to store feature importances for all models
all_feature_importances_avg = {col: [] for col in X.columns}
points_per_feature = {col: 0 for col in X.columns}
top_bottom_features_per_model = {type(model).__name__: {'Top 3': [], 'Bottom 3': []} for model in models}

# Iterate over models
for model in models:
    print(model)
    # Train the model
    model.fit(X_train, y_train)

    # Perform permutation feature importance analysis
    perm_importance = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=42)

    # Get feature importances
    feature_importances = perm_importance.importances_mean

    # Get indices of features sorted by importance
    most_important_indices = feature_importances.argsort()[-3:][::-1]
    least_important_indices = feature_importances.argsort()[:3]

    
    important_indices = feature_importances.argsort()[:]

    # Calculate points for each feature
    for i in range(len(important_indices)):
        points_per_feature[X.columns[important_indices[i]]] += i



    # Store top 3 and bottom 3 important features
    for idx in most_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Top 3'].append((X.columns[idx], feature_importances[idx]))
    for idx in least_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Bottom 3'].append((X.columns[idx], feature_importances[idx]))

    # Append the feature importances to the dictionary
    for idx, col in enumerate(X.columns):
        all_feature_importances_avg[col].append(feature_importances[idx])

# Calculate average feature importance across all models
for col, importances in all_feature_importances_avg.items():
    if len(importances) > 0:
        all_feature_importances_avg[col] = np.mean(importances)

# Get indices of features sorted by average importance
sorted_indices_avg = np.argsort(list(all_feature_importances_avg.values()))[::-1]

# Print the top 3 and bottom 3 important features for each model
for model, features in top_bottom_features_per_model.items():
    print(f"\nModel: {model}")
    print("Top 3 Important Features:")
    for feature, importance in features['Top 3']:
        print(f"'{feature}': {importance}")
    print("Bottom 3 Important Features:")
    for feature, importance in features['Bottom 3']:
        print(f"'{feature}': {importance}")

# Print the average importance of features
print("\nAverage Importance of Features:")
for idx in sorted_indices_avg:
    print(f"Feature '{list(all_feature_importances_avg.keys())[idx]}': {list(all_feature_importances_avg.values())[idx]}")

# Calculate points per feature
print("\nPoints per Feature:")
sorted_points = sorted(points_per_feature.items(), key=lambda x: x[1], reverse=True)
for feature, points in sorted_points:
    print(f"Feature '{feature}': {points} points")


Ridge()
DecisionTreeRegressor()
GradientBoostingRegressor()
RandomForestRegressor()
AdaBoostRegressor()
KNeighborsRegressor()
MLPRegressor()
ElasticNet()
SGDRegressor()
SVR(cache_size=1000)
BayesianRidge(max_iter=1000)
KernelRidge()
LinearRegression()
RANSACRegressor()
TheilSenRegressor()

Model: Ridge
Top 3 Important Features:
'F41': 0.1569375880452713
'F44': 0.1515156674079351
'F28': 0.02626338001924428
Bottom 3 Important Features:
'OF22': -0.033587990769614
'OF27': -0.028110955509000415
'S5': -0.018309398965431874

Model: DecisionTreeRegressor
Top 3 Important Features:
'F41': 0.7936518140002362
'F31': 0.7564830298923497
'OF26': 0.34163105385643927
Bottom 3 Important Features:
'OF27': -0.10517860700221486
'F34': -0.02303797432510955
'F22': -0.009824606107376318

Model: GradientBoostingRegressor
Top 3 Important Features:
'F31': 0.24197636741863532
'F44': 0.19427219264793127
'F41': 0.1661924238871593
Bottom 3 Important Features:
'F35': -0.005454165006583101
'F42': 0.0001125585444115710

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import RANSACRegressor, TheilSenRegressor

# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF9',
    'OF10',
    'OF11',
    'OF19',
    'OF20',
    'OF21',
    'OF22',
    'OF23',
    'OF24',
    'F13',
    'F40',
    'F50',
    'F51',
    'F52',
    'F66',
    'S2',
]

results_columns = ['NR']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(), AdaBoostRegressor(), 
    KNeighborsRegressor(), MLPRegressor(max_iter=200), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000), 
    SVR(cache_size=1000), BayesianRidge(max_iter=1000), KernelRidge(), LinearRegression(), RANSACRegressor(), 
    TheilSenRegressor()
]

# Create dictionaries to store feature importances for all models
all_feature_importances_avg = {col: [] for col in X.columns}
points_per_feature = {col: 0 for col in X.columns}
top_bottom_features_per_model = {type(model).__name__: {'Top 3': [], 'Bottom 3': []} for model in models}

# Iterate over models
for model in models:
    print(model)
    # Train the model
    model.fit(X_train, y_train)

    # Perform permutation feature importance analysis
    perm_importance = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=42)

    # Get feature importances
    feature_importances = perm_importance.importances_mean

    # Get indices of features sorted by importance
    most_important_indices = feature_importances.argsort()[-3:][::-1]
    least_important_indices = feature_importances.argsort()[:3]

    
    important_indices = feature_importances.argsort()[:]

    # Calculate points for each feature
    for i in range(len(important_indices)):
        points_per_feature[X.columns[important_indices[i]]] += i



    # Store top 3 and bottom 3 important features
    for idx in most_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Top 3'].append((X.columns[idx], feature_importances[idx]))
    for idx in least_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Bottom 3'].append((X.columns[idx], feature_importances[idx]))

    # Append the feature importances to the dictionary
    for idx, col in enumerate(X.columns):
        all_feature_importances_avg[col].append(feature_importances[idx])

# Calculate average feature importance across all models
for col, importances in all_feature_importances_avg.items():
    if len(importances) > 0:
        all_feature_importances_avg[col] = np.mean(importances)

# Get indices of features sorted by average importance
sorted_indices_avg = np.argsort(list(all_feature_importances_avg.values()))[::-1]

# Print the top 3 and bottom 3 important features for each model
for model, features in top_bottom_features_per_model.items():
    print(f"\nModel: {model}")
    print("Top 3 Important Features:")
    for feature, importance in features['Top 3']:
        print(f"'{feature}': {importance}")
    print("Bottom 3 Important Features:")
    for feature, importance in features['Bottom 3']:
        print(f"'{feature}': {importance}")

# Print the average importance of features
print("\nAverage Importance of Features:")
for idx in sorted_indices_avg:
    print(f"Feature '{list(all_feature_importances_avg.keys())[idx]}': {list(all_feature_importances_avg.values())[idx]}")

# Calculate points per feature
print("\nPoints per Feature:")
sorted_points = sorted(points_per_feature.items(), key=lambda x: x[1], reverse=True)
for feature, points in sorted_points:
    print(f"Feature '{feature}': {points} points")


Ridge()
DecisionTreeRegressor()
GradientBoostingRegressor()
RandomForestRegressor()
AdaBoostRegressor()
KNeighborsRegressor()
MLPRegressor()
ElasticNet()
SGDRegressor()
SVR(cache_size=1000)
BayesianRidge(max_iter=1000)
KernelRidge()
LinearRegression()
RANSACRegressor()
TheilSenRegressor()

Model: Ridge
Top 3 Important Features:
'F13': 0.2633045182506485
'OF24': 0.208328654465134
'OF10': 0.140631666902676
Bottom 3 Important Features:
'S2': -0.0134054885283717
'OF19': -0.0024938735746130704
'OF20': 0.0

Model: DecisionTreeRegressor
Top 3 Important Features:
'F50': 0.025201194789933455
'F13': 0.021783264702260167
'OF10': 0.01976586380598892
Bottom 3 Important Features:
'F66': -0.4538123484516632
'OF9': -0.173120267536922
'OF19': -0.14381257201494055

Model: GradientBoostingRegressor
Top 3 Important Features:
'F13': 0.3103453206621778
'OF24': 0.18176460535463512
'OF11': 0.18124055136508985
Bottom 3 Important Features:
'S2': -0.020654592792847863
'OF20': 0.0
'OF21': 0.0

Model: RandomForest

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import RANSACRegressor, TheilSenRegressor

# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF22',
    'OF26',
    'OF27',
    'F17',
    'F20',
    'F21',
    'F23',
    'F24',
    'F28',
    'F29',
    'F33',
    'F34',
    'F36',
    'F38',
    'F41',
    'F42',
    'F44',
    'F49',
    'F63',
    'F65',
]

results_columns = ['PR']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(), AdaBoostRegressor(), 
    KNeighborsRegressor(), MLPRegressor(max_iter=200), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000), 
    SVR(cache_size=1000), BayesianRidge(max_iter=1000), KernelRidge(), LinearRegression(), RANSACRegressor(), 
    TheilSenRegressor()
]

# Create dictionaries to store feature importances for all models
all_feature_importances_avg = {col: [] for col in X.columns}
points_per_feature = {col: 0 for col in X.columns}
top_bottom_features_per_model = {type(model).__name__: {'Top 3': [], 'Bottom 3': []} for model in models}

# Iterate over models
for model in models:
    print(model)
    # Train the model
    model.fit(X_train, y_train)

    # Perform permutation feature importance analysis
    perm_importance = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=42)

    # Get feature importances
    feature_importances = perm_importance.importances_mean

    # Get indices of features sorted by importance
    most_important_indices = feature_importances.argsort()[-3:][::-1]
    least_important_indices = feature_importances.argsort()[:3]

    
    important_indices = feature_importances.argsort()[:]

    # Calculate points for each feature
    for i in range(len(important_indices)):
        points_per_feature[X.columns[important_indices[i]]] += i



    # Store top 3 and bottom 3 important features
    for idx in most_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Top 3'].append((X.columns[idx], feature_importances[idx]))
    for idx in least_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Bottom 3'].append((X.columns[idx], feature_importances[idx]))

    # Append the feature importances to the dictionary
    for idx, col in enumerate(X.columns):
        all_feature_importances_avg[col].append(feature_importances[idx])

# Calculate average feature importance across all models
for col, importances in all_feature_importances_avg.items():
    if len(importances) > 0:
        all_feature_importances_avg[col] = np.mean(importances)

# Get indices of features sorted by average importance
sorted_indices_avg = np.argsort(list(all_feature_importances_avg.values()))[::-1]

# Print the top 3 and bottom 3 important features for each model
for model, features in top_bottom_features_per_model.items():
    print(f"\nModel: {model}")
    print("Top 3 Important Features:")
    for feature, importance in features['Top 3']:
        print(f"'{feature}': {importance}")
    print("Bottom 3 Important Features:")
    for feature, importance in features['Bottom 3']:
        print(f"'{feature}': {importance}")

# Print the average importance of features
print("\nAverage Importance of Features:")
for idx in sorted_indices_avg:
    print(f"Feature '{list(all_feature_importances_avg.keys())[idx]}': {list(all_feature_importances_avg.values())[idx]}")

# Calculate points per feature
print("\nPoints per Feature:")
sorted_points = sorted(points_per_feature.items(), key=lambda x: x[1], reverse=True)
for feature, points in sorted_points:
    print(f"Feature '{feature}': {points} points")


Ridge()
DecisionTreeRegressor()
GradientBoostingRegressor()
RandomForestRegressor()
AdaBoostRegressor()
KNeighborsRegressor()
MLPRegressor()
ElasticNet()
SGDRegressor()
SVR(cache_size=1000)
BayesianRidge(max_iter=1000)
KernelRidge()
LinearRegression()
RANSACRegressor()
TheilSenRegressor()

Model: Ridge
Top 3 Important Features:
'F44': 0.2312628281318235
'F24': 0.10313288586173087
'F23': 0.0941967218198077
Bottom 3 Important Features:
'F21': -0.013004331211253791
'F20': -0.0037148731457689265
'OF22': -0.003390889382650693

Model: DecisionTreeRegressor
Top 3 Important Features:
'F28': 0.5078366933511352
'F23': 0.3772867160300349
'F44': 0.25411408695992926
Bottom 3 Important Features:
'F36': -0.00538495125713188
'F29': -0.001631248122456567
'F24': -0.0011078251966758971

Model: GradientBoostingRegressor
Top 3 Important Features:
'F44': 0.35084453507567304
'F41': 0.14023342378673265
'F28': 0.12732479674082417
Bottom 3 Important Features:
'F33': -0.004194473866954799
'F20': -0.0009507062195