In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import RANSACRegressor, TheilSenRegressor

# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF22',
    'OF25',
    'OF26',
    'F3_1',
    'F3_2',
    'F3_3',
    'F3_4',
    'F3_5',
    'F3_6',
    'F3_7',
    'F20',
    'F21',
    'F22',
    'F28',
    'F31',
    'F41',
    'F42',
    'F44',
    'F48',
    'F49',
    'PC',
    'FC',
    'WRI',
    'SVT',
    'VCHWC',
    'HWCC',
    'MC',
    'PP',
    'ST',
    'SWP',
    'DP',
    'ADLM',
    'ATDO',
    'AOD'
]

results_columns = ['WS']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(), AdaBoostRegressor(), 
    KNeighborsRegressor(), MLPRegressor(max_iter=200), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000), 
    SVR(cache_size=1000), BayesianRidge(max_iter=1000), KernelRidge(), LinearRegression(), RANSACRegressor(), 
    TheilSenRegressor()
]
# Create dictionaries to store feature importances for all models
all_feature_importances_avg = {col: [] for col in X.columns}
points_per_feature = {col: 0 for col in X.columns}
top_bottom_features_per_model = {type(model).__name__: {'Top 3': [], 'Bottom 3': []} for model in models}

# Iterate over models
for model in models:
    print(model)
    # Train the model
    model.fit(X_train, y_train)

    # Perform permutation feature importance analysis
    perm_importance = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=42)

    # Get feature importances
    feature_importances = perm_importance.importances_mean

    # Get indices of features sorted by importance
    most_important_indices = feature_importances.argsort()[-3:][::-1]
    least_important_indices = feature_importances.argsort()[:3]

    
    important_indices = feature_importances.argsort()[:]

    # Calculate points for each feature
    for i in range(len(important_indices)):
        points_per_feature[X.columns[important_indices[i]]] += i



    # Store top 3 and bottom 3 important features
    for idx in most_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Top 3'].append((X.columns[idx], feature_importances[idx]))
    for idx in least_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Bottom 3'].append((X.columns[idx], feature_importances[idx]))

    # Append the feature importances to the dictionary
    for idx, col in enumerate(X.columns):
        all_feature_importances_avg[col].append(feature_importances[idx])

# Calculate average feature importance across all models
for col, importances in all_feature_importances_avg.items():
    if len(importances) > 0:
        all_feature_importances_avg[col] = np.mean(importances)

# Get indices of features sorted by average importance
sorted_indices_avg = np.argsort(list(all_feature_importances_avg.values()))[::-1]

# Print the top 3 and bottom 3 important features for each model
for model, features in top_bottom_features_per_model.items():
    print(f"\nModel: {model}")
    print("Top 3 Important Features:")
    for feature, importance in features['Top 3']:
        print(f"'{feature}': {importance}")
    print("Bottom 3 Important Features:")
    for feature, importance in features['Bottom 3']:
        print(f"'{feature}': {importance}")

# Print the average importance of features
print("\nAverage Importance of Features:")
for idx in sorted_indices_avg:
    print(f"Feature '{list(all_feature_importances_avg.keys())[idx]}': {list(all_feature_importances_avg.values())[idx]}")

# Calculate points per feature
print("\nPoints per Feature:")
sorted_points = sorted(points_per_feature.items(), key=lambda x: x[1], reverse=True)
for feature, points in sorted_points:
    print(f"Feature '{feature}': {points} points")


Ridge()
DecisionTreeRegressor()
GradientBoostingRegressor()
RandomForestRegressor()
AdaBoostRegressor()
KNeighborsRegressor()
MLPRegressor()




ElasticNet()
SGDRegressor()
SVR(cache_size=1000)
BayesianRidge(max_iter=1000)
KernelRidge()
LinearRegression()
RANSACRegressor()
TheilSenRegressor()

Model: Ridge
Top 3 Important Features:
'F41': 0.20927085109330273
'F48': 0.10867141674043689
'F44': 0.10367071151119145
Bottom 3 Important Features:
'ADLM': -0.044115080672420974
'F22': -0.01622959352484662
'F3_3': -0.013285127165714576

Model: DecisionTreeRegressor
Top 3 Important Features:
'F28': 0.9149378308833084
'F31': 0.530395315676556
'F41': 0.5183916711430278
Bottom 3 Important Features:
'ADLM': -0.09600698048112613
'FC': -0.015422927281222184
'SWP': -0.010489182695707526

Model: GradientBoostingRegressor
Top 3 Important Features:
'F41': 0.172108771900187
'F31': 0.10412660196431137
'F49': 0.09075554359445229
Bottom 3 Important Features:
'ATDO': -0.03504352342201253
'F3_3': -0.020305776435032966
'F3_5': -0.004167069745776723

Model: RandomForestRegressor
Top 3 Important Features:
'F41': 0.21584656441841915
'F31': 0.1275933097446765

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import RANSACRegressor, TheilSenRegressor

# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF22',
    'OF26',
    'OF27',
    'F17',
    'F20',
    'F22',
    'F28',
    'F29',
    'F31',
    'F33',
    'F34',
    'F35',
    'F36',
    'F41',
    'F42',
    'F44',
    'F49',
    'S5',
    'PC',
    'FC',
    'WRI',
    'SVT',
    'VCHWC',
    'HWCC',
    'MC',
    'PP',
    'ST',
    'SWP',
    'DP',
    'ADLM',
    'ATDO',
    'AOD'
]

results_columns = ['SR']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(), AdaBoostRegressor(), 
    KNeighborsRegressor(), MLPRegressor(max_iter=200), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000), 
    SVR(cache_size=1000), BayesianRidge(max_iter=1000), KernelRidge(), LinearRegression(), RANSACRegressor(), 
    TheilSenRegressor()
]

# Create dictionaries to store feature importances for all models
all_feature_importances_avg = {col: [] for col in X.columns}
points_per_feature = {col: 0 for col in X.columns}
top_bottom_features_per_model = {type(model).__name__: {'Top 3': [], 'Bottom 3': []} for model in models}

# Iterate over models
for model in models:
    print(model)
    # Train the model
    model.fit(X_train, y_train)

    # Perform permutation feature importance analysis
    perm_importance = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=42)

    # Get feature importances
    feature_importances = perm_importance.importances_mean

    # Get indices of features sorted by importance
    most_important_indices = feature_importances.argsort()[-3:][::-1]
    least_important_indices = feature_importances.argsort()[:3]

    
    important_indices = feature_importances.argsort()[:]

    # Calculate points for each feature
    for i in range(len(important_indices)):
        points_per_feature[X.columns[important_indices[i]]] += i



    # Store top 3 and bottom 3 important features
    for idx in most_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Top 3'].append((X.columns[idx], feature_importances[idx]))
    for idx in least_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Bottom 3'].append((X.columns[idx], feature_importances[idx]))

    # Append the feature importances to the dictionary
    for idx, col in enumerate(X.columns):
        all_feature_importances_avg[col].append(feature_importances[idx])

# Calculate average feature importance across all models
for col, importances in all_feature_importances_avg.items():
    if len(importances) > 0:
        all_feature_importances_avg[col] = np.mean(importances)

# Get indices of features sorted by average importance
sorted_indices_avg = np.argsort(list(all_feature_importances_avg.values()))[::-1]

# Print the top 3 and bottom 3 important features for each model
for model, features in top_bottom_features_per_model.items():
    print(f"\nModel: {model}")
    print("Top 3 Important Features:")
    for feature, importance in features['Top 3']:
        print(f"'{feature}': {importance}")
    print("Bottom 3 Important Features:")
    for feature, importance in features['Bottom 3']:
        print(f"'{feature}': {importance}")

# Print the average importance of features
print("\nAverage Importance of Features:")
for idx in sorted_indices_avg:
    print(f"Feature '{list(all_feature_importances_avg.keys())[idx]}': {list(all_feature_importances_avg.values())[idx]}")

# Calculate points per feature
print("\nPoints per Feature:")
sorted_points = sorted(points_per_feature.items(), key=lambda x: x[1], reverse=True)
for feature, points in sorted_points:
    print(f"Feature '{feature}': {points} points")


Ridge()
DecisionTreeRegressor()
GradientBoostingRegressor()
RandomForestRegressor()
AdaBoostRegressor()
KNeighborsRegressor()
MLPRegressor()
ElasticNet()
SGDRegressor()
SVR(cache_size=1000)
BayesianRidge(max_iter=1000)
KernelRidge()
LinearRegression()
RANSACRegressor()
TheilSenRegressor()

Model: Ridge
Top 3 Important Features:
'ATDO': 0.3692529558780715
'AOD': 0.2827133617008372
'F44': 0.10982359560907519
Bottom 3 Important Features:
'OF22': -0.04205994818487987
'FC': -0.023916988517724846
'S5': -0.011198760795855744

Model: DecisionTreeRegressor
Top 3 Important Features:
'F44': 0.655673431126347
'F31': 0.6489398109613667
'SWP': 0.46065324925804163
Bottom 3 Important Features:
'HWCC': -0.09863419057158845
'F29': -0.07313768180642734
'PP': -0.06251659218950412

Model: GradientBoostingRegressor
Top 3 Important Features:
'F31': 0.29134785603026064
'F44': 0.24524020772421048
'S5': 0.05277014668997375
Bottom 3 Important Features:
'ADLM': -0.035330367931165355
'OF26': -0.01173601009983057
'

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import RANSACRegressor, TheilSenRegressor

# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF9',
    'OF10',
    'OF11',
    'OF19',
    'OF20',
    'OF21',
    'OF22',
    'OF23',
    'OF24',
    'F13',
    'F40',
    'F50',
    'F51',
    'F52',
    'F66',
    'S2',
    'PC',
    'FC',
    'WRI',
    'SVT',
    'VCHWC',
    'HWCC',
    'MC',
    'PP',
    'ST',
    'SWP',
    'DP',
    'ADLM',
    'ATDO',
    'AOD'
]

results_columns = ['NR']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(), AdaBoostRegressor(), 
    KNeighborsRegressor(), MLPRegressor(max_iter=200), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000), 
    SVR(cache_size=1000), BayesianRidge(max_iter=1000), KernelRidge(), LinearRegression(), RANSACRegressor(), 
    TheilSenRegressor()
]

# Create dictionaries to store feature importances for all models
all_feature_importances_avg = {col: [] for col in X.columns}
points_per_feature = {col: 0 for col in X.columns}
top_bottom_features_per_model = {type(model).__name__: {'Top 3': [], 'Bottom 3': []} for model in models}

# Iterate over models
for model in models:
    print(model)
    # Train the model
    model.fit(X_train, y_train)

    # Perform permutation feature importance analysis
    perm_importance = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=42)

    # Get feature importances
    feature_importances = perm_importance.importances_mean

    # Get indices of features sorted by importance
    most_important_indices = feature_importances.argsort()[-3:][::-1]
    least_important_indices = feature_importances.argsort()[:3]

    
    important_indices = feature_importances.argsort()[:]

    # Calculate points for each feature
    for i in range(len(important_indices)):
        points_per_feature[X.columns[important_indices[i]]] += i



    # Store top 3 and bottom 3 important features
    for idx in most_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Top 3'].append((X.columns[idx], feature_importances[idx]))
    for idx in least_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Bottom 3'].append((X.columns[idx], feature_importances[idx]))

    # Append the feature importances to the dictionary
    for idx, col in enumerate(X.columns):
        all_feature_importances_avg[col].append(feature_importances[idx])

# Calculate average feature importance across all models
for col, importances in all_feature_importances_avg.items():
    if len(importances) > 0:
        all_feature_importances_avg[col] = np.mean(importances)

# Get indices of features sorted by average importance
sorted_indices_avg = np.argsort(list(all_feature_importances_avg.values()))[::-1]

# Print the top 3 and bottom 3 important features for each model
for model, features in top_bottom_features_per_model.items():
    print(f"\nModel: {model}")
    print("Top 3 Important Features:")
    for feature, importance in features['Top 3']:
        print(f"'{feature}': {importance}")
    print("Bottom 3 Important Features:")
    for feature, importance in features['Bottom 3']:
        print(f"'{feature}': {importance}")

# Print the average importance of features
print("\nAverage Importance of Features:")
for idx in sorted_indices_avg:
    print(f"Feature '{list(all_feature_importances_avg.keys())[idx]}': {list(all_feature_importances_avg.values())[idx]}")

# Calculate points per feature
print("\nPoints per Feature:")
sorted_points = sorted(points_per_feature.items(), key=lambda x: x[1], reverse=True)
for feature, points in sorted_points:
    print(f"Feature '{feature}': {points} points")


Ridge()
DecisionTreeRegressor()
GradientBoostingRegressor()
RandomForestRegressor()
AdaBoostRegressor()
KNeighborsRegressor()
MLPRegressor()




ElasticNet()
SGDRegressor()
SVR(cache_size=1000)
BayesianRidge(max_iter=1000)
KernelRidge()
LinearRegression()
RANSACRegressor()
TheilSenRegressor()

Model: Ridge
Top 3 Important Features:
'AOD': 0.32962725939425114
'OF24': 0.11754979834122753
'MC': 0.11477394003465104
Bottom 3 Important Features:
'ADLM': -0.037039288042329414
'S2': -0.01709458322068775
'OF22': -0.011268978842271023

Model: DecisionTreeRegressor
Top 3 Important Features:
'SWP': 0.4793325628356261
'MC': 0.36736086828416514
'F13': 0.33031137795756815
Bottom 3 Important Features:
'F66': -0.17100823071347168
'OF22': -0.1581659538322384
'AOD': -0.1527057724063874

Model: GradientBoostingRegressor
Top 3 Important Features:
'SWP': 0.2722807557029092
'MC': 0.11653717142883997
'PC': 0.11077063526372104
Bottom 3 Important Features:
'F66': -0.021861956137611063
'ATDO': -0.019966093340447397
'PP': -0.0018228883258027925

Model: RandomForestRegressor
Top 3 Important Features:
'SWP': 0.1929691322995531
'MC': 0.19076465133729975
'PC'

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Ridge, ElasticNet, SGDRegressor, LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import RANSACRegressor, TheilSenRegressor

# Load the data
file_path = "data_all_numerical_select_reduced.xlsx"
data = pd.read_excel(file_path)

# Define columns
data_columns = [
    'OF22',
    'OF26',
    'OF27',
    'F17',
    'F20',
    'F21',
    'F23',
    'F24',
    'F28',
    'F29',
    'F33',
    'F34',
    'F36',
    'F38',
    'F41',
    'F42',
    'F44',
    'F49',
    'F63',
    'F65',
    'PC',
    'FC',
    'WRI',
    'SVT',
    'VCHWC',
    'HWCC',
    'MC',
    'PP',
    'ST',
    'SWP',
    'DP',
    'ADLM',
    'ATDO',
    'AOD'
]

results_columns = ['PR']

# Prepare data for regression
X = data[data_columns]
y = data[results_columns[0]]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = [
    Ridge(), DecisionTreeRegressor(), GradientBoostingRegressor(), RandomForestRegressor(), AdaBoostRegressor(), 
    KNeighborsRegressor(), MLPRegressor(max_iter=200), ElasticNet(max_iter=1000), SGDRegressor(max_iter=1000), 
    SVR(cache_size=1000), BayesianRidge(max_iter=1000), KernelRidge(), LinearRegression(), RANSACRegressor(), 
    TheilSenRegressor()
]

# Create dictionaries to store feature importances for all models
all_feature_importances_avg = {col: [] for col in X.columns}
points_per_feature = {col: 0 for col in X.columns}
top_bottom_features_per_model = {type(model).__name__: {'Top 3': [], 'Bottom 3': []} for model in models}

# Iterate over models
for model in models:
    print(model)
    # Train the model
    model.fit(X_train, y_train)

    # Perform permutation feature importance analysis
    perm_importance = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=42)

    # Get feature importances
    feature_importances = perm_importance.importances_mean

    # Get indices of features sorted by importance
    most_important_indices = feature_importances.argsort()[-3:][::-1]
    least_important_indices = feature_importances.argsort()[:3]

    
    important_indices = feature_importances.argsort()[:]

    # Calculate points for each feature
    for i in range(len(important_indices)):
        points_per_feature[X.columns[important_indices[i]]] += i



    # Store top 3 and bottom 3 important features
    for idx in most_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Top 3'].append((X.columns[idx], feature_importances[idx]))
    for idx in least_important_indices:
        top_bottom_features_per_model[type(model).__name__]['Bottom 3'].append((X.columns[idx], feature_importances[idx]))

    # Append the feature importances to the dictionary
    for idx, col in enumerate(X.columns):
        all_feature_importances_avg[col].append(feature_importances[idx])

# Calculate average feature importance across all models
for col, importances in all_feature_importances_avg.items():
    if len(importances) > 0:
        all_feature_importances_avg[col] = np.mean(importances)

# Get indices of features sorted by average importance
sorted_indices_avg = np.argsort(list(all_feature_importances_avg.values()))[::-1]

# Print the top 3 and bottom 3 important features for each model
for model, features in top_bottom_features_per_model.items():
    print(f"\nModel: {model}")
    print("Top 3 Important Features:")
    for feature, importance in features['Top 3']:
        print(f"'{feature}': {importance}")
    print("Bottom 3 Important Features:")
    for feature, importance in features['Bottom 3']:
        print(f"'{feature}': {importance}")

# Print the average importance of features
print("\nAverage Importance of Features:")
for idx in sorted_indices_avg:
    print(f"Feature '{list(all_feature_importances_avg.keys())[idx]}': {list(all_feature_importances_avg.values())[idx]}")

# Calculate points per feature
print("\nPoints per Feature:")
sorted_points = sorted(points_per_feature.items(), key=lambda x: x[1], reverse=True)
for feature, points in sorted_points:
    print(f"Feature '{feature}': {points} points")


Ridge()
DecisionTreeRegressor()
GradientBoostingRegressor()
RandomForestRegressor()
AdaBoostRegressor()
KNeighborsRegressor()
MLPRegressor()
ElasticNet()
SGDRegressor()
SVR(cache_size=1000)
BayesianRidge(max_iter=1000)
KernelRidge()
LinearRegression()
RANSACRegressor()
TheilSenRegressor()

Model: Ridge
Top 3 Important Features:
'F44': 0.17533408694267252
'AOD': 0.11046211931017584
'F41': 0.08958364922071164
Bottom 3 Important Features:
'F21': -0.01277514531732787
'WRI': -0.007669183635915544
'F20': -0.007235373029328854

Model: DecisionTreeRegressor
Top 3 Important Features:
'FC': 0.5302465286387221
'F44': 0.33831647390352243
'F23': 0.3269040830780739
Bottom 3 Important Features:
'OF27': -0.013558253977844947
'F63': -0.010821933231821679
'F36': -0.0059129103986966325

Model: GradientBoostingRegressor
Top 3 Important Features:
'F44': 0.2905581506475662
'F41': 0.19200674133812373
'F23': 0.07689232034525686
Bottom 3 Important Features:
'F33': -0.0013843663854785895
'ATDO': -0.001132821847