# Heimbach Features Regression FELT_LIFE_NET

## Code

In [1]:
from src import database
from src.Dataset import Dataset
from src import plot

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb

In [2]:
# Fixed seed so the stochastic estimators below give the same metrics on every
# Restart & Run All.
SEED = 42
# Also seed NumPy's global RNG.
# NOTE(review): this only makes the shuffled split done later via Dataset
# reproducible if Dataset draws from the global NumPy RNG -- confirm in src.Dataset.
np.random.seed(SEED)

# Load the preprocessed data from CSV.
df = pd.read_csv("data/df_preprocessed_all.csv", sep = ",")

# Keep only those Heimbach features that actually exist in the loaded frame;
# preprocessing may have dropped some of the columns returned by
# database.get_heimbach_important_features().
heimbach_features = [col for col in database.get_heimbach_important_features() if col in df.columns]
# Explicit copy so that later processing works on a frame independent of `df`.
df_heimbach = df[heimbach_features].copy()

# Positional index of the label column (raises KeyError if 'FELT_LIFE_NET'
# is not among the selected columns) and the column header.
label = df_heimbach.columns.get_loc('FELT_LIFE_NET')
header = df_heimbach.columns

# Machine learning algorithms to evaluate, as (display name, estimator) pairs.
# The two tree-based estimators are randomised, so they get a fixed random_state.
MLA = [
    ('Linear Regressor', LinearRegression()),
    ('SVR', SVR()),
    ('Random Forest', RandomForestRegressor(random_state=SEED)),
    ('Gradient Boosting', xgb.XGBRegressor(random_state=SEED)),
]

# Filled by the evaluation loop: model name -> metrics / targets and predictions.
results = {}
predictions = {}

# For every model in MLA: collect train, cross-validation, validation and test metrics.

# Every column position of df_heimbach is recorded as a selected feature. The
# value does not depend on the model, so it is computed once before the loop.
# NOTE(review): this range also contains the position of the label column
# ('FELT_LIFE_NET') -- confirm that this is intended.
selected_features = list(range(df_heimbach.shape[1]))
num_selected_features = len(selected_features)

# CV entries that are copied into `results` under their original key.
cv_metric_keys = ('CV_TrainMAE', 'CV_TrainRMSE', 'CV_TestMAE', 'CV_TestRMSE')

for model_name, model_instance in MLA:

    # A fresh Dataset per model; the split is triggered explicitly on the next line.
    dataset = Dataset(df_heimbach, 'df', label, divide_dataset=False, header=header)
    dataset.divide_dataset(
        model_instance,
        normalize=False,
        shuffle=True,
        all_features=True,
        all_instances=True,
        evaluate=True,
        partial_sample=False,
        folds=5,
    )

    # Cross-validation scores first, then the individual metric dicts.
    dataset.set_CV()
    cv_scores = dataset.get_CV()

    train_metrics = dataset.get_train_metrics()
    validation_metrics = dataset.get_validation_metrics()
    test_metrics = dataset.get_test_metrics()
    train_time = dataset.get_traintime()

    # One flat record per model; the key order defines the CSV column order.
    results[model_name] = {
        'Model_name': model_name,
        **{key: cv_scores[key] for key in cv_metric_keys},
        'CV_fit_time_ges': cv_scores['CV_fit_time'],
        'TrainRMSE': train_metrics['TrainRMSE'],
        'TrainMAE': train_metrics['TrainMAE'],
        'ValidationRMSE': validation_metrics['ValidationRMSE'],
        'ValidationMAE': validation_metrics['ValidationMAE'],
        'TestRMSE': test_metrics['TestRMSE'],
        'TestMAE': test_metrics['TestMAE'],
        'TrainTime_ges': train_time,
        'Features': selected_features,
        'Feature-Anzahl': num_selected_features,
    }

    # Targets and predictions for each split, kept for the export further down.
    predictions[model_name] = {
        'Model_name': model_name,
        'y_train': dataset.get_y_train(),
        'y_val': dataset.get_y_val(),
        'y_test': dataset.get_y_test(),
        'pred_train': dataset.get_y_pred_train(),
        'pred_val': dataset.get_y_pred_val(),
        'pred_test': dataset.get_y_pred_test(),
    }

    # Progress marker: name of the model that was just processed.
    print(model_name)

{'CV_TrainMAE': 33.74829898454959, 'CV_TrainRMSE': 71.23153326207685, 'CV_TestMAE': 33.92389002562479, 'CV_TestRMSE': 71.02406493645681, 'CV_fit_time': 0.062043190002441406}
Linear Regressor
{'CV_TrainMAE': 33.43864667898586, 'CV_TrainRMSE': 72.25570770519717, 'CV_TestMAE': 33.52920586642033, 'CV_TestRMSE': 71.91473191802228, 'CV_fit_time': 0.9816241264343262}
SVR
{'CV_TrainMAE': 16.073823745583038, 'CV_TrainRMSE': 41.83560603722462, 'CV_TestMAE': 35.83191296249204, 'CV_TestRMSE': 71.80100161261517, 'CV_fit_time': 9.83640685081482}
Random Forest
{'CV_TrainMAE': 15.654879270019242, 'CV_TrainRMSE': 36.46724452473604, 'CV_TestMAE': 38.147714087519944, 'CV_TestRMSE': 73.51113474213716, 'CV_fit_time': 0.5152637958526611}
Gradient Boosting


## Speichern der Ergebnisse

In [10]:
# Save the metrics.
# `results` maps model name -> metric dict, so the frame is first built with one
# column per model and then flipped to one row per model.
df_metrics = pd.DataFrame(results).T
# The row labels are dropped on export; the model name is still available
# through the 'Model_name' entry of every record.
df_metrics.to_csv('data/Heimbach/All_Heimbach_metrics.csv', index=False)

In [11]:
# Save the feature names used by each model.

# Largest number of selected features over all models; shorter lists are padded
# to this length so that every column of the exported table has the same height.
max_selected_features = max([len(results[model_name]['Features']) for model_name, _ in MLA])

# One column of feature names per model, collected first and turned into a
# DataFrame in a single step.
feature_names_by_model = {}
for model_name, model_instance in MLA:
    feature_names = list(df_heimbach.columns[results[model_name]['Features']])

    # Pad with NaN for models that selected fewer features than the maximum.
    missing = max_selected_features - len(feature_names)
    if missing > 0:
        feature_names = feature_names + [np.nan] * missing

    feature_names_by_model[model_name] = feature_names

df_features_heimbach = pd.DataFrame(feature_names_by_model)
df_features_heimbach.to_csv('data/Heimbach/All_Heimbach_Features.csv', index=False)

In [12]:
# Flatten `predictions` into long format: one record per
# (model, data field, single value).
flattened_data = []
for model_name, data in predictions.items():
    for data_type, values in data.items():
        # 'Model_name' only repeats the dictionary key and is not a data field.
        if data_type == 'Model_name':
            continue
        flattened_data.extend(
            {'Model_name': model_name, 'Data_type': data_type, 'Value': value}
            for value in values
        )

# NOTE(review): this rebinds `df`, which held the preprocessed input frame
# loaded at the top of the notebook; the name is kept here so that anything
# relying on it continues to work.
df = pd.DataFrame(flattened_data)
df.to_csv('data/Heimbach/All_Heimbach_predictions.csv', index=False)