# PCA Regression FELT_LIFE_NET

In [1]:
import src.database
from src.Dataset import Dataset
import numpy as np
import pandas as pd
import src.plot 
from src import featureselection
import shap
import os

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb

## Code

In [2]:
# Load the preprocessed data from CSV and discard the columns that are not used here
df = pd.read_csv("data/df_preprocessed_all.csv", sep=",").drop(
    columns=['FELT_LIFE', 'REMOVAL_DATE', 'INSTALLATION_DATE', 'REPORT_DATE']
)

# Column headers and the positional index of the label column
header = df.columns
label = header.get_loc('FELT_LIFE_NET')

# Machine learning algorithms to evaluate, as (display name, estimator) pairs
MLA = [
    ('Linear Regressor', LinearRegression()),
    ('SVR', SVR()),
    ('Random Forest', RandomForestRegressor()),
    ('Gradient Boosting', xgb.XGBRegressor()),
]

# Metrics and predictions, keyed first by component count and then by model name
all_results = {}
all_predictions = {}

# Numbers of principal components to test: 10, 20, 30, 40
# (the counts 1-9 are currently not evaluated)
feature_counts = list(range(10, 41, 10))

# Cross-validation entries that are copied unchanged into the result record
cv_keys = ('CV_TrainMAE', 'CV_TrainRMSE', 'CV_TestMAE', 'CV_TestRMSE', 'CV_fit_time')

for num_features in feature_counts:

    # Fresh containers for the results and predictions of this component count
    results_for_count = all_results[num_features] = {}
    predictions_for_count = all_predictions[num_features] = {}

    # Train, cross-validate and test every model in MLA
    for model_name, model_instance in MLA:

        # Prepare the dataset
        dataset = Dataset(df, 'df', label, divide_dataset=False, header=header)
        dataset.divide_dataset(
            model_instance,
            normalize=False,
            shuffle=True,
            all_features=True,
            all_instances=True,
            evaluate=False,
            partial_sample=False,
            folds=5,
        )

        # Dimensionality reduction with PCA; the projected features replace the originals
        x_train_pca, x_test_pca, duration, pca = featureselection.pca(
            dataset.get_X_train(), dataset.get_X_test(), num_features
        )
        dataset.set_X_train(x_train_pca)
        dataset.set_X_test(x_test_pca)

        # The projected columns are addressed by position 0 .. num_features - 1
        dataset.set_features(list(range(num_features)))

        # Train the regressor
        dataset.fit_classifier()

        # Cross-validation
        dataset.set_CV()
        cv_metrics = dataset.get_CV()

        # Metrics on the training set, plus the training time
        dataset.set_train_metrics()
        train_metrics = dataset.get_train_metrics()
        train_time = dataset.get_traintime()

        # Metrics on the test set
        dataset.set_test_metrics()
        test_metrics = dataset.get_test_metrics()

        # Assemble the result record for this model and component count.
        # Insertion order matters: it becomes the column order of the metrics CSV.
        record = {'Model_name': model_name}
        record.update((key, cv_metrics[key]) for key in cv_keys)
        record['CV_fit_time_ges'] = cv_metrics['CV_fit_time'] + duration
        record.update({
            'TrainRMSE': train_metrics['TrainRMSE'],
            'TrainMAE': train_metrics['TrainMAE'],
            'TestRMSE': test_metrics['TestRMSE'],
            'TestMAE': test_metrics['TestMAE'],
            'TrainTime_ges': train_time,
            'Hauptkomponenten': pca.explained_variance_ratio_,
            'Hauptkomponenten-Anzahl': num_features,
            'FS-Laufzeit': duration,
        })
        results_for_count[model_name] = record

        # Keep targets and predictions for this model and component count
        predictions_for_count[model_name] = dict(
            Model_name=model_name,
            y_train=dataset.get_y_train(),
            y_test=dataset.get_y_test(),
            pred_train=dataset.get_y_pred_train(),
            pred_test=dataset.get_y_pred_test(),
        )

        print(f"Finished: Model {model_name}, Components: {num_features}")

        
# Write the collected metrics and predictions to CSV files

def _flatten_predictions(model_predictions):
    """Yield one long-format row per stored value, skipping the 'Model_name' entry."""
    for model_name, data in model_predictions.items():
        for data_type, values in data.items():
            if data_type == 'Model_name':
                continue
            for value in values:
                yield {
                    'Model_name': model_name,
                    'Data_type': data_type,
                    'Value': value,
                }


output_dir = "data/PCA/"
os.makedirs(output_dir, exist_ok=True)

# One metrics file and one predictions file per component count
for num_features, models_results in all_results.items():
    # 1. Metrics: transposed so that every model ends up as one row
    df_metrics = pd.DataFrame(models_results).T
    df_metrics.to_csv(f'{output_dir}{num_features}_PCA_metrics.csv', index=False)

    # 2. Predictions in flat (long) format
    flattened_predictions = list(_flatten_predictions(all_predictions[num_features]))
    df_predictions = pd.DataFrame(flattened_predictions)
    df_predictions.to_csv(f'{output_dir}{num_features}_PCA_predictions.csv', index=False)

    print(f"Saved metrics and predictions for {num_features} components.")

Finished: Model Linear Regressor, Components: 10
Finished: Model SVR, Components: 10
Finished: Model Random Forest, Components: 10
Finished: Model Gradient Boosting, Components: 10
Finished: Model Linear Regressor, Components: 20
Finished: Model SVR, Components: 20
Finished: Model Random Forest, Components: 20
Finished: Model Gradient Boosting, Components: 20
Finished: Model Linear Regressor, Components: 30
Finished: Model SVR, Components: 30
Finished: Model Random Forest, Components: 30
Finished: Model Gradient Boosting, Components: 30
Finished: Model Linear Regressor, Components: 40
Finished: Model SVR, Components: 40
Finished: Model Random Forest, Components: 40
Finished: Model Gradient Boosting, Components: 40
Saved metrics and predictions for 10 components.
Saved metrics and predictions for 20 components.
Saved metrics and predictions for 30 components.
Saved metrics and predictions for 40 components.


## Shapley Values

In [None]:
# Load the preprocessed data from CSV and discard the columns that are not used here
df = pd.read_csv("data/df_preprocessed_all.csv", sep=",").drop(
    columns=['FELT_LIFE', 'REMOVAL_DATE', 'INSTALLATION_DATE', 'REPORT_DATE']
)

# Column headers and the positional index of the label column
header = df.columns
label = header.get_loc('FELT_LIFE_NET')

# Only gradient boosting is analysed here; the other regressors
# (Linear Regressor, SVR, Random Forest) are disabled for this cell
MLA = [('Gradient Boosting', xgb.XGBRegressor())]

# Numbers of principal components to test; only 5 is active
# (10, 20, 30 and 40 are disabled for this cell)
feature_counts = [5]

for num_features in feature_counts:

    # Fit every model in MLA on the PCA-reduced data and explain it with SHAP
    for model_name, model_instance in MLA:

        # Prepare the dataset
        dataset = Dataset(df, 'df', label, divide_dataset=False, header=header)
        dataset.divide_dataset(model_instance, normalize=False, shuffle=True, all_features=True, all_instances=True, evaluate=False, partial_sample=False, folds=5)

        # Keep the untransformed training features; the summary plot shows them
        x_train_org = dataset.get_X_train()

        # Dimensionality reduction with PCA.
        # Called as (X_train, X_test, n_components), the same way as in the
        # regression cell; the earlier extra validation-set argument did not
        # match that call.
        x_train_pca, x_test_pca, duration, pca = featureselection.pca(x_train_org, dataset.get_X_test(), num_features)

        # Replace train AND test features with their projections so both live
        # in the space the model is fitted on (mirrors the regression cell)
        dataset.set_X_train(x_train_pca)
        dataset.set_X_test(x_test_pca)
        dataset.set_features(list(range(num_features)))

        # Train the regressor
        dataset.fit_classifier()

        # SHAP values are computed per principal component and then distributed
        # back onto the original features through the PCA loadings:
        # (n_samples, n_components) . (n_components, n_original_features)
        shap_values_pca = dataset.shapley_values()
        shap_values_original = shap_values_pca.dot(pca.components_)

        # Feature names must not contain the label column, otherwise the names
        # are off by one relative to the feature matrix.
        # NOTE(review): assumes Dataset keeps the non-label columns in header
        # order - confirm against the Dataset implementation.
        feature_names = [name for position, name in enumerate(header) if position != label]

        # Visualisation
        shap.summary_plot(shap_values_original, x_train_org, feature_names=feature_names)