V1

In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# === 1. Ordnernamen anpassen ===
folders = ["Arbeit", "Finanzen", "Werbung"]  # Hier einfach deine Ordnernamen eintragen
base_path = "data"  # Der Pfad zu deinem Hauptverzeichnis mit den Ordnern

# === 2. Datenimport: CSV-Dateien laden ===
def load_data_from_folders(base_path, folders):
    data = []
    for folder in folders:
        folder_path = os.path.join(base_path, folder)
        for file in os.listdir(folder_path):
            if file.endswith(".csv"):
                file_path = os.path.join(folder_path, file)
                # CSV-Datei lesen
                csv_data = pd.read_csv(file_path)
                for _, row in csv_data.iterrows():
                    data.append({
                        "Betreff": row.get("Betreff", ""),
                        "Inhalt": row.get("Inhalt", ""),
                        "Label": folder  # Ordnername als Label
                    })
    return pd.DataFrame(data)

data = load_data_from_folders(base_path, folders)

# === 3. Datenaufteilung in Training und Test ===
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# === 4. Textdaten vorbereiten: Tokenisierung ===
# Tokenizer erstellen und auf Trainingsdaten anpassen
tokenizer = Tokenizer(num_words=5000, oov_token="<UNK>")
tokenizer.fit_on_texts(train_data["Inhalt"])

# Textdaten in Sequenzen umwandeln
x_train = tokenizer.texts_to_matrix(train_data["Inhalt"], mode="tfidf")
x_test = tokenizer.texts_to_matrix(test_data["Inhalt"], mode="tfidf")

# Labels in numerische Werte umwandeln
label_map = {folder: idx for idx, folder in enumerate(folders)}  # Ordner zu Index
y_train = to_categorical(train_data["Label"].map(label_map), num_classes=len(folders))
y_test = to_categorical(test_data["Label"].map(label_map), num_classes=len(folders))

# === 5. Modell erstellen ===
model = Sequential([
    Dense(128, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(len(folders), activation='softmax')  # Ausgabe hat so viele Klassen wie Ordner
])

# Modell kompilieren
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# === 6. Modell trainieren ===
model.fit(x_train, y_train, epochs=10, batch_size=16, validation_data=(x_test, y_test))

# === 7. Belohnungssystem simulieren ===
def reward_system(predicted_folder, true_folder):
    if predicted_folder == true_folder:
        return 1  # Richtige Zuordnung
    else:
        return -1  # Falsche Zuordnung

# === 8. Aufgaben: Vorhersagen und Belohnung auswerten ===
def evaluate_model(data, tokenizer, model, label_map, folders):
    score = 0
    for _, row in data.iterrows():
        # Text in Matrix umwandeln
        x = tokenizer.texts_to_matrix([row["Inhalt"]], mode="tfidf")
        # Vorhersage durchführen
        predictions = model.predict(x)
        predicted_label = folders[np.argmax(predictions)]  # Index zurück in Ordnername
        true_label = row["Label"]

        # Belohnung berechnen
        reward = reward_system(predicted_label, true_label)
        score += reward

        print(f"Betreff: {row['Betreff']}, Vorhergesagt: {predicted_label}, Korrekt: {true_label}, Belohnung: {reward}")
    return score

# Modell bewerten
final_score = evaluate_model(test_data, tokenizer, model, label_map, folders)
print(f"Gesamte Belohnung: {final_score}")


FileNotFoundError: [WinError 3] Das System kann den angegebenen Pfad nicht finden: 'data\\Arbeit'