In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_predict, StratifiedKFold

# Load the colour-palette feature table from disk.
df = pd.read_csv('rgb_color_palettes.csv')

# Feature matrix from the columns feature_0 .. feature_11; labels from 'party'.
feature_columns = ['feature_' + str(i) for i in range(12)]
X = df.loc[:, feature_columns].values
y = df['party'].values

# Encode the party labels as integer class ids.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Random forest and stratified 5-fold splitter, both seeded with 42.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validated class probabilities, one row per row of X.
probs_cv = cross_val_predict(clf, X, y_encoded, cv=cv, method='predict_proba')

# Confidence of a row = its largest class probability; stored back on df.
max_probs_cv = np.max(probs_cv, axis=1)
df['confidence'] = max_probs_cv


# Histogram of the cross-validated confidence values stored in df['confidence'].
# The title says RGB (it previously said HSV) because this notebook reads
# 'rgb_color_palettes.csv'; the old label looked like a copy-paste leftover.
fig, ax = plt.subplots()
ax.hist(df['confidence'], bins=20, color='skyblue', edgecolor='black')
ax.set_xlabel('Confidence (Cross-Validated)')
ax.set_ylabel('Number of Posts')
ax.set_title('Confidence Distribution RGB (Cross-Validation)')
plt.show()

# Single source of truth for the cut-off used by both filters and the message.
CONFIDENCE_THRESHOLD = 0.4

# Rows below the threshold are treated as "neutral"; rows at or above it
# as party-typical ("parteitypisch").
df_neutral = df[df['confidence'] < CONFIDENCE_THRESHOLD]
df_parteitypisch = df[df['confidence'] >= CONFIDENCE_THRESHOLD]

# Report how many rows fell below the threshold and preview the first few.
print(f'Anzahl neutraler Posts (Confidence < {CONFIDENCE_THRESHOLD}): {len(df_neutral)}')
print(df_neutral[['party', 'filename', 'confidence']].head())

In [None]:
# Add a 0/1 'relevant' column (1 when confidence is at least 0.4) and write
# the full table to a new CSV without the index column.
is_relevant = df['confidence'].ge(0.4)
df['relevant'] = is_relevant.astype(int)
df.to_csv('final-features-rgb.csv', index=False)