In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [2]:
url = 'https://raw.githubusercontent.com/IvaroEkel/Probabilistic-Machine-Learning_lecture-PROJECTS/refs/heads/main/projects/05-1PLXXXX_political_color_posts/data/final-features-hsv.csv'
df = pd.read_csv(url)
X_full = df[[f'feature_{i}' for i in range(13)]]
y_full = df['party']
relevant_full = df['relevant']

In [3]:
# choose data for training and test sets (depending on column 'relevant')
train_relevant = 1
test_relevant = 1

if train_relevant == test_relevant:
    mask = (relevant_full == train_relevant)
    X_selected = X_full[mask]
    y_selected = y_full[mask]
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y_selected, stratify=y_selected, test_size=0.2, random_state=42)
else:
    X_train = X_full[relevant_full == train_relevant]
    y_train = y_full[relevant_full == train_relevant]
    X_test = X_full[relevant_full == test_relevant]
    y_test = y_full[relevant_full == test_relevant]

In [4]:
def extract_metrics(report):
    metrics = {}
    lines = report.split("\n")
    for line in lines:
        parts = line.split()
        if len(parts) >= 5:
            label = parts[0]
            try:
                precision = float(parts[1])
                recall = float(parts[2])
                f1_score = float(parts[3])
                support = int(parts[4])
                metrics[label] = {
                    'Precision': precision,
                    'Recall': recall,
                    'F1-Score': f1_score,
                    'Support': support
                }
            except ValueError:
                continue
    return metrics

In [5]:
# 1. Random Forest
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
report_rf = classification_report(y_test, y_pred_rf)
metrics_rf = extract_metrics(report_rf)
print("=== Random Forest ===")
print(report_rf)

=== Random Forest ===
              precision    recall  f1-score   support

         afd       0.84      0.45      0.58       182
         cdu       0.61      0.60      0.61       512
         csu       0.61      0.85      0.71       765
         fdp       0.88      0.90      0.89       611
      gruene       0.99      0.69      0.81       297
       linke       0.72      0.60      0.65       364
         spd       0.82      0.59      0.69       272

    accuracy                           0.72      3003
   macro avg       0.78      0.67      0.71      3003
weighted avg       0.75      0.72      0.72      3003



In [6]:
# 2. XGBoost
# Encode the target variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb.fit(X_train, y_train_encoded)
y_pred_xgb_encoded = xgb.predict(X_test)

# Decode the predictions back to original labels for classification report
y_pred_xgb = le.inverse_transform(y_pred_xgb_encoded)

report_xgb = classification_report(y_test, y_pred_xgb)
metrics_xgb = extract_metrics(report_xgb)
print("=== XGBoost ===")
print(report_xgb)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== XGBoost ===
              precision    recall  f1-score   support

         afd       0.75      0.49      0.59       182
         cdu       0.62      0.64      0.63       512
         csu       0.64      0.81      0.71       765
         fdp       0.91      0.89      0.90       611
      gruene       0.91      0.74      0.82       297
       linke       0.70      0.63      0.66       364
         spd       0.76      0.61      0.68       272

    accuracy                           0.73      3003
   macro avg       0.76      0.69      0.71      3003
weighted avg       0.74      0.73      0.73      3003



In [7]:
# 3. Support Vector Machine (SVM)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm = SVC(kernel='rbf', class_weight='balanced', random_state=42)
svm.fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)
report_svm = classification_report(y_test, y_pred_svm)
metrics_svm = extract_metrics(report_svm)
print("=== Support Vector Machine (SVM) ===")
print(report_svm)

=== Support Vector Machine (SVM) ===
              precision    recall  f1-score   support

         afd       0.44      0.65      0.52       182
         cdu       0.49      0.68      0.57       512
         csu       0.73      0.62      0.67       765
         fdp       0.90      0.83      0.86       611
      gruene       0.84      0.71      0.77       297
       linke       0.60      0.46      0.52       364
         spd       0.65      0.65      0.65       272

    accuracy                           0.67      3003
   macro avg       0.66      0.66      0.65      3003
weighted avg       0.69      0.67      0.67      3003



In [8]:
# 4. MLP Classifier
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    learning_rate_init=0.001,
    max_iter=1000,
    alpha=0.0001,
    random_state=42
)

mlp.fit(X_train_scaled, y_train)
y_pred_mlp = mlp.predict(X_test_scaled)
report_mlp = classification_report(y_test, y_pred_mlp)
metrics_mlp = extract_metrics(report_mlp)
print("=== MLP Classifier ===")
print(report_mlp)

=== MLP Classifier ===
              precision    recall  f1-score   support

         afd       0.57      0.52      0.54       182
         cdu       0.55      0.62      0.58       512
         csu       0.66      0.72      0.69       765
         fdp       0.88      0.85      0.86       611
      gruene       0.81      0.73      0.77       297
       linke       0.58      0.55      0.57       364
         spd       0.68      0.57      0.62       272

    accuracy                           0.68      3003
   macro avg       0.68      0.65      0.66      3003
weighted avg       0.69      0.68      0.68      3003



In [9]:
# 5. Linear Discriminant Analysis (LDA)
lda = LinearDiscriminantAnalysis()
lda.fit(X_train_scaled, y_train)
y_pred_lda = lda.predict(X_test_scaled)
report_lda = classification_report(y_test, y_pred_lda)
metrics_lda = extract_metrics(report_lda)
print("=== Linear Discriminant Analysis (LDA) ===")
print(report_lda)

=== Linear Discriminant Analysis (LDA) ===
              precision    recall  f1-score   support

         afd       0.64      0.10      0.17       182
         cdu       0.36      0.19      0.24       512
         csu       0.45      0.80      0.57       765
         fdp       0.76      0.62      0.69       611
      gruene       0.58      0.49      0.53       297
       linke       0.42      0.34      0.38       364
         spd       0.52      0.56      0.54       272

    accuracy                           0.51      3003
   macro avg       0.53      0.44      0.45      3003
weighted avg       0.52      0.51      0.48      3003



In [10]:
metrics_all = {}

for label in metrics_rf:
    metrics_all[label] = {
        'Random Forest': metrics_rf[label]['F1-Score'],
        'SVM': metrics_svm.get(label, {}).get('F1-Score', 0),
        'MLP': metrics_mlp.get(label, {}).get('F1-Score', 0),
        'XGBoost': metrics_xgb[label]['F1-Score'],
        'LDA': metrics_lda[label]['F1-Score']
    }

metrics_df = pd.DataFrame(metrics_all).T

metrics_df["Average"] = metrics_df.mean(axis=1)

average_row = metrics_df.mean(numeric_only=True)
average_row.name = "Average"
metrics_df = pd.concat([metrics_df, average_row.to_frame().T])

metrics_df = metrics_df.sort_values(by="Average", ascending=False)

# Output
print("=== Modellvergleich ===")
print(metrics_df.round(2))

=== Modellvergleich ===
         Random Forest   SVM   MLP  XGBoost   LDA  Average
fdp               0.89  0.86  0.86     0.90  0.69     0.84
gruene            0.81  0.77  0.77     0.82  0.53     0.74
csu               0.71  0.67  0.69     0.71  0.57     0.67
spd               0.69  0.65  0.62     0.68  0.54     0.64
Average           0.71  0.65  0.66     0.71  0.45     0.64
linke             0.65  0.52  0.57     0.66  0.38     0.56
cdu               0.61  0.57  0.58     0.63  0.24     0.53
afd               0.58  0.52  0.54     0.59  0.17     0.48
