Daten einlesen, Zielvariable festlegen und X-Daten normalisieren

In [15]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Pfad zur Datei
file_path = r"K:\Team\Böhmer_Michael\TSA\ML\ml_algorithmus_finden.xlsx"

# Excel-Datei einlesen
try:
    # Direkt die Datei einlesen (erstes Tabellenblatt standardmäßig)
    df = pd.read_excel(file_path)

    # Informationen anzeigen
    print("\nErste Zeilen der Tabelle:")
    print(df.head())  # Zeigt die ersten 5 Zeilen an

    print("\nInformationen über die Tabelle:")
    print(df.info())  # Struktur der Tabelle

    # Zielvariable (y) und Features (X) extrahieren
    y = df['Verletzungsstatus']
    X = df.drop(columns=['Verletzungsstatus'])  # Entferne die Zielvariable aus den Features

    # Skalierung der Features (X) mit StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)  # Normiert X auf den Standardbereich (Mittelwert = 0, Std = 1)

    # Ausgabe nach Skalierung (optional)
    print("\nBeispiel nach Skalierung (erste Zeilen):")
    print(pd.DataFrame(X_scaled, columns=X.columns).head())

except FileNotFoundError:
    print("Die Datei wurde nicht gefunden. Bitte überprüfen Sie den Pfad.")
except Exception as e:
    print(f"Ein Fehler ist aufgetreten: {e}")



Erste Zeilen der Tabelle:
   Verletzungsstatus  CMJ_Jump Height flighttime  CMJ_Net Impulse  \
0                  1                      22.961         2.153492   
1                  1                      39.377         2.692762   
2                  1                      36.429         2.587946   
3                  1                      36.080         2.570528   
4                  1                      49.831         3.084610   

   CMJ_Vertical Takeoff velocity  CMJ_Jump Height impulse  \
0                       0.032683                 0.342905   
1                       0.042159                 0.570762   
2                       0.027645                 0.362215   
3                       0.036000                 0.475528   
4                       0.036707                 0.563049   

   CMJ_Rel. peak loading force  CMJ_Rel. peak loading force left  \
0                        2.299                             0.995   
1                        2.075                         

Modelle vergleichen mit Standardeinstellungen

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

def repeated_k_fold(model, X, y, n_splits=5, n_repeats=10):
    """Führt repeated k-fold cross-validation durch und berechnet die Metriken."""
    # Repeated Stratified K-Fold
    rkf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)

    # Metriken definieren
    accuracy_train = []
    accuracy_test = []
    f1 = []
    roc_auc = []

    for train_index, test_index in rkf.split(X, y):
        # Splitte die Daten in Trainings- und Testdaten
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Trainiere das Modell
        model.fit(X_train, y_train)

        # Vorhersagen für Trainings- und Testdaten
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Berechne die Metriken
        accuracy_train.append(accuracy_score(y_train, y_train_pred))
        accuracy_test.append(accuracy_score(y_test, y_test_pred))
        f1.append(f1_score(y_test, y_test_pred))
        roc_auc.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

    # Durchschnittswerte berechnen
    return {
        "Train Accuracy": np.mean(accuracy_train),
        "Test Accuracy": np.mean(accuracy_test),
        "F1-Score": np.mean(f1),
        "ROC-AUC": np.mean(roc_auc),
    }


# Pfad zur Datei
file_path = r"K:\Team\Böhmer_Michael\TSA\ML\ml_algorithmus_finden.xlsx"

# Excel-Datei einlesen
try:
    # Direkt die Datei einlesen (erstes Tabellenblatt standardmäßig)
    df = pd.read_excel(file_path)

    # Zielvariable (y) und Features (X) extrahieren
    y = df['Verletzungsstatus']
    X = df.drop(columns=['Verletzungsstatus'])  # Entferne die Zielvariable aus den Features

    # Skalierung der Features (X) mit StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)  # Normiert X auf den Standardbereich (Mittelwert = 0, Std = 1)

    # Modelle definieren
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
        "LightGBM": LGBMClassifier(random_state=42),
        "SVC": SVC(probability=True, random_state=42),
        "k-Nearest Neighbors": KNeighborsClassifier(),
        "MLP Classifier": MLPClassifier(max_iter=1000, random_state=42),
        "Gaussian Naive Bayes": GaussianNB(),
        "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
        "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
        "Bagging Classifier": BaggingClassifier(random_state=42),
        "Extra Trees": ExtraTreesClassifier(random_state=42),
    }

    # Ergebnisse speichern
    results = []

    # Validierung jedes Modells
    for model_name, model in models.items():
        print(f"Modell wird validiert: {model_name}")
        metrics = repeated_k_fold(model, X_scaled, y)  # Verwende X_scaled statt X
        results.append({"Model": model_name, **metrics})

    # Ergebnisse in DataFrame konvertieren und sortieren
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(by="ROC-AUC", ascending=False)

    # Ergebnisse anzeigen
    print("\nErgebnisse der Modelle:")
    print(results_df)

except FileNotFoundError:
    print("Die Datei wurde nicht gefunden. Bitte überprüfen Sie den Pfad.")
except Exception as e:
    print(f"Ein Fehler ist aufgetreten: {e}")


Modell wird validiert: Logistic Regression
Modell wird validiert: Decision Tree
Modell wird validiert: Random Forest
Modell wird validiert: Gradient Boosting
Modell wird validiert: XGBoost


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Modell wird validiert: LightGBM
[LightGBM] [Info] Number of positive: 52, number of negative: 42
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3380
[LightGBM] [Info] Number of data points in the train set: 94, number of used features: 104
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.553191 -> initscore=0.213574
[LightGBM] [Info] Start training from score 0.213574
[LightGBM] [Info] Number of positive: 53, number of negative: 41
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000232 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3381
[LightGBM] [Info] Number of data points in the train set: 94, number of used features: 104
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.563830 -> initscore=0.256720
[LightGBM] [Info] Start training from score 0.256720




Modell wird validiert: Bagging Classifier
Modell wird validiert: Extra Trees

Ergebnisse der Modelle:
                              Model  Train Accuracy  Test Accuracy  F1-Score  \
0               Logistic Regression        0.992376       0.791739  0.808561   
3                 Gradient Boosting        1.000000       0.792790  0.810489   
13                      Extra Trees        1.000000       0.789239  0.817600   
6                               SVC        0.946206       0.786413  0.818121   
4                           XGBoost        1.000000       0.775000  0.795767   
5                          LightGBM        1.000000       0.769348  0.788978   
2                     Random Forest        1.000000       0.802210  0.830249   
12               Bagging Classifier        0.986004       0.764058  0.777338   
8                    MLP Classifier        1.000000       0.765435  0.788704   
9              Gaussian Naive Bayes        0.842387       0.748551  0.778506   
7               k-

In [21]:
results_df

Unnamed: 0,Model,Train Accuracy,Test Accuracy,F1-Score,ROC-AUC
0,Logistic Regression,0.992376,0.791739,0.808561,0.878547
3,Gradient Boosting,1.0,0.79279,0.810489,0.867768
13,Extra Trees,1.0,0.789239,0.8176,0.86103
6,SVC,0.946206,0.786413,0.818121,0.856066
4,XGBoost,1.0,0.775,0.795767,0.855231
5,LightGBM,1.0,0.769348,0.788978,0.852844
2,Random Forest,1.0,0.80221,0.830249,0.84624
12,Bagging Classifier,0.986004,0.764058,0.777338,0.840698
8,MLP Classifier,1.0,0.765435,0.788704,0.831708
9,Gaussian Naive Bayes,0.842387,0.748551,0.778506,0.815904


Auto ML mit TPOT

In [27]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Daten einlesen
file_path = r"K:\Team\Böhmer_Michael\TSA\ML\ml_algorithmus_finden.xlsx"
df = pd.read_excel(file_path)

# Zielvariable (y) und Features (X) extrahieren
y = df['Verletzungsstatus']
X = df.drop(columns=['Verletzungsstatus'])

# Skalierung der Features (X)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Trainings- und Testdaten splitten
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# TPOTClassifier erstellen und anpassen
tpot = TPOTClassifier(verbosity=2, generations=5, population_size=20, random_state=42)

# Modell mit dem Trainingsdatensatz optimieren
tpot.fit(X_train, y_train)

# Beste Lösung ausgeben
print("Beste Lösung:", tpot.fitted_pipeline_)

# Evaluation des besten Modells auf den Testdaten
print(f"Test Accuracy: {tpot.score(X_test, y_test)}")

# Optional: Export des besten Modells als Python-Code
tpot.export('best_model.py')


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8309941520467836

Generation 2 - Current best internal CV score: 0.8309941520467836

Generation 3 - Current best internal CV score: 0.8309941520467836

Generation 4 - Current best internal CV score: 0.8309941520467836

Generation 5 - Current best internal CV score: 0.852046783625731

Best pipeline: ExtraTreesClassifier(ZeroCount(input_matrix), bootstrap=False, criterion=gini, max_features=0.3, min_samples_leaf=4, min_samples_split=9, n_estimators=100)
Beste Lösung: Pipeline(steps=[('zerocount', ZeroCount()),
                ('extratreesclassifier',
                 ExtraTreesClassifier(max_features=0.3, min_samples_leaf=4,
                                      min_samples_split=9, random_state=42))])
Test Accuracy: 0.8333333333333334


Einzelnes Modell validieren

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

def repeated_k_fold(model, X, y, n_splits=5, n_repeats=10):
    """Führt repeated k-fold cross-validation durch und berechnet die Metriken."""
    # Repeated Stratified K-Fold
    rkf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)

    # Metriken definieren
    accuracy_train = []
    accuracy_test = []
    f1 = []
    roc_auc = []

    for train_index, test_index in rkf.split(X, y):
        # Splitte die Daten in Trainings- und Testdaten
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Trainiere das Modell
        model.fit(X_train, y_train)

        # Vorhersagen für Trainings- und Testdaten
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Berechne die Metriken
        accuracy_train.append(accuracy_score(y_train, y_train_pred))
        accuracy_test.append(accuracy_score(y_test, y_test_pred))
        f1.append(f1_score(y_test, y_test_pred))
        roc_auc.append(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

    # Durchschnittswerte berechnen
    return {
        "Train Accuracy": np.mean(accuracy_train),
        "Test Accuracy": np.mean(accuracy_test),
        "F1-Score": np.mean(f1),
        "ROC-AUC": np.mean(roc_auc),
    }


# Pfad zur Datei
file_path = r"K:\Team\Böhmer_Michael\TSA\ML\ml_algorithmus_finden.xlsx"

# Excel-Datei einlesen
try:
    # Direkt die Datei einlesen (erstes Tabellenblatt standardmäßig)
    df = pd.read_excel(file_path)

    # Zielvariable (y) und Features (X) extrahieren
    y = df['Verletzungsstatus']
    X = df.drop(columns=['Verletzungsstatus'])  # Entferne die Zielvariable aus den Features

    # Skalierung der Features (X) mit StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)  # Normiert X auf den Standardbereich (Mittelwert = 0, Std = 1)

    # Modelle definieren
    models = {
    "Extra Trees": ExtraTreesClassifier(
        bootstrap=False, 
        criterion="gini", 
        max_features=0.3, 
        min_samples_leaf=4, 
        min_samples_split=9, 
        n_estimators=100,
        random_state=42
    ),}



    # Ergebnisse speichern
    results = []

    # Validierung jedes Modells
    for model_name, model in models.items():
        print(f"Modell wird validiert: {model_name}")
        metrics = repeated_k_fold(model, X_scaled, y)  # Verwende X_scaled statt X
        results.append({"Model": model_name, **metrics})

    # Ergebnisse in DataFrame konvertieren und sortieren
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(by="ROC-AUC", ascending=False)

    # Ergebnisse anzeigen
    print("\nErgebnisse der Modelle:")
    print(results_df)

except FileNotFoundError:
    print("Die Datei wurde nicht gefunden. Bitte überprüfen Sie den Pfad.")
except Exception as e:
    print(f"Ein Fehler ist aufgetreten: {e}")


Modell wird validiert: Extra Trees

Ergebnisse der Modelle:
         Model  Train Accuracy  Test Accuracy  F1-Score   ROC-AUC
0  Extra Trees        0.991946        0.81837  0.841412  0.884207


In [31]:
results_df

Unnamed: 0,Model,Train Accuracy,Test Accuracy,F1-Score,ROC-AUC
0,Extra Trees,0.991946,0.81837,0.841412,0.884207


Grid Search für ein ausgewähltes Modell

In [33]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import ExtraTreesClassifier
    

# Pfad zur Datei
file_path = r"K:\Team\Böhmer_Michael\TSA\ML\ml_algorithmus_finden.xlsx"

# Excel-Datei einlesen
try:
    # Direkt die Datei einlesen (erstes Tabellenblatt standardmäßig)
    df = pd.read_excel(file_path)

    # Zielvariable (y) und Features (X) extrahieren
    y = df['Verletzungsstatus']
    X = df.drop(columns=['Verletzungsstatus'])  # Entferne die Zielvariable aus den Features

    # Skalierung der Features (X) mit StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)  # Normiert X auf den Standardbereich (Mittelwert = 0, Std = 1)

    # Ausgabe nach Skalierung (optional)
    print("\nBeispiel nach Skalierung (erste Zeilen):")
    print(pd.DataFrame(X_scaled, columns=X.columns).head())
    
    # Train-Test-Split durchführen (80% Training, 20% Test)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    # Modell initialisieren
    model = ExtraTreesClassifier(random_state=42)
    
    # Parameterbereich für GridSearch definieren
    param_grid = {
        'n_estimators': [50, 100, 200, 300, 500],
        'max_features': ['auto', 'sqrt', 'log2', 0.1, 0.3],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5, 10],
        'bootstrap': [True, False]
    }
    
    # GridSearchCV initialisieren
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                               cv=5, n_jobs=-1, scoring='roc_auc', verbose=1)
    
    # Führe GridSearchCV aus auf dem Trainingsdatensatz
    grid_search.fit(X_train, y_train)
    
    # Beste Parameter und beste Leistung anzeigen
    print(f"Beste Parameter: {grid_search.best_params_}")
    print(f"Beste ROC-AUC: {grid_search.best_score_}")
    
    # Optional: Testdaten mit dem besten Modell evaluieren
    best_model = grid_search.best_estimator_
    
    # Vorhersage und Bewertung auf dem Testdatensatz
    from sklearn.metrics import roc_auc_score
    y_test_pred = best_model.predict(X_test)
    y_test_prob = best_model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_test_prob)
    
    print(f"ROC-AUC auf den Testdaten: {roc_auc}")
    

except FileNotFoundError:
    print("Die Datei wurde nicht gefunden. Bitte überprüfen Sie den Pfad.")
except Exception as e:
    print(f"Ein Fehler ist aufgetreten: {e}")



Beispiel nach Skalierung (erste Zeilen):
   CMJ_Jump Height flighttime  CMJ_Net Impulse  CMJ_Vertical Takeoff velocity  \
0                   -1.332146        -1.101240                      -0.495440   
1                    0.664907         0.621206                       0.896304   
2                    0.306275         0.286421                      -1.235268   
3                    0.263818         0.230786                      -0.008213   
4                    1.936666         1.872782                       0.095669   

   CMJ_Jump Height impulse  CMJ_Rel. peak loading force  \
0                -1.025456                     0.048163   
1                 0.906800                    -0.858569   
2                -0.861703                     1.108716   
3                 0.099203                    -0.348532   
4                 0.841391                     1.173483   

   CMJ_Rel. peak loading force left  CMJ_Rel. peak loading force right  \
0                         -1.015552       

6000 fits failed out of a total of 30000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2675 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\boehmer\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\boehmer\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\boehmer\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\boehmer\AppData\Local\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py"

Beste Parameter: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 0.3, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Beste ROC-AUC: 0.9024747474747474
ROC-AUC auf den Testdaten: 0.9020979020979021


In [35]:
import os
print(os.getcwd())  # Gibt den aktuellen Arbeitsverzeichnis-Pfad aus


C:\Users\boehmer
