In [2]:
import pickle
import pandas as pd
import numpy as np
from scipy.stats import uniform
from sklearn.svm import SVC
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score


##### Using RandomizedSearchCV instead of GridSearchCV because an exhaustive grid search was too slow

## sigmoid

In [None]:
# # Daten laden
# filepath_name = ("..\\..\\..\\data\\mixed_dataset\\train_cleaned.csv")
# df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')
# df_cleaned = df_cleaned.dropna(subset=['tweet_cleaned'])

# # TF-IDF Vektorisierung
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(df_cleaned['tweet_cleaned'])
# y = df_cleaned["label"]

# # Train-Test-Split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Vektorisierungsmodell speichern
# with open("svm_tfidf_vectorizer.pkl", "wb") as file:
#     pickle.dump(vectorizer, file)

# # Parameterraum definieren
# param_dist = {
#     'kernel': ['sigmoid'],
#     'C': uniform(0.01, 1),
#     'gamma': uniform(0.01, 0.1),
#     'random_state': [40, 42],
#     'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
# }

# # SVM erstellen
# svm = SVC()


# random_search = RandomizedSearchCV(
#     estimator=svm, 
#     param_distributions=param_dist, 
#     n_iter=50,
#     scoring='f1', 
#     cv=3,
#     verbose=2, 
#     n_jobs=-1,
#     random_state=42
# )

# # SVM Training
# random_search.fit(X_train, y_train)

# cv_results = random_search.cv_results_
# params = cv_results['params']

# model_results = []

# for idx, param in enumerate(params):
#     model = SVC(**param)
#     model.fit(X_train, y_train)
    
#     y_test_pred = model.predict(X_test)
#     f1 = f1_score(y_test, y_test_pred)
#     recall = recall_score(y_test, y_test_pred)
    
#     model_results.append({
#         'index': idx,
#         'params': param,
#         'f1_score': f1,
#         'recall': recall,
#         'model': model
#     })

# # Nach Metrik sortieren
# top_3_f1 = sorted(model_results, key=lambda x: x['f1_score'], reverse=True)[:3]
# top_3_recall = sorted(model_results, key=lambda x: x['recall'], reverse=True)[:3]

# # Modelle und Konfigurationen speichern
# model_configs = []

# for rank, model_info in enumerate(top_3_f1, start=1):
#     filename = f"svm_tfidf_sigmoid_top_f1_{rank}.pkl"
#     with open(filename, "wb") as file:
#         pickle.dump(model_info['model'], file)
    
#     model_configs.append({
#         'Rank': rank,
#         'Metric': 'F1-Score',
#         'Model Filename': filename,
#         'F1 Score': model_info['f1_score'],
#         'Recall': model_info['recall'],
#         'Kernel': model_info['params']['kernel'],
#         'C': model_info['params']['C'],
#         'Gamma': model_info['params']['gamma'],
#         'Random State': model_info['params']['random_state'],
#         'Class Weight': model_info['params']['class_weight']
#     })

# for rank, model_info in enumerate(top_3_recall, start=1):
#     filename = f"svm_tfidf_sigmoid_top_recall_{rank}.pkl"
#     with open(filename, "wb") as file:
#         pickle.dump(model_info['model'], file)
    
#     model_configs.append({
#         'Rank': rank,
#         'Metric': 'Recall',
#         'Model Filename': filename,
#         'F1 Score': model_info['f1_score'],
#         'Recall': model_info['recall'],
#         'Kernel': model_info['params']['kernel'],
#         'C': model_info['params']['C'],
#         'Gamma': model_info['params']['gamma'],
#         'Random State': model_info['params']['random_state'],
#         'Class Weight': model_info['params']['class_weight']
#     })

# config_df = pd.DataFrame(model_configs)
# config_df.to_csv("top_models_config.csv", index=False)

# print("Top 6 Modelle gespeichert!")
# print(config_df)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [None]:
# Randomized hyper-parameter search for a sigmoid-kernel SVM on TF-IDF features.
# Every sampled configuration is refit on the training split, scored on the
# training and test splits, pickled, and summarised in two CSV files
# (all results + the top-3 models by test F1 and by test recall).

# Load the data and drop rows without a cleaned tweet
df_cleaned = pd.read_csv("data/mixed_dataset/train_cleaned.csv")
df_cleaned = df_cleaned.dropna(subset=['tweet_cleaned'])

# TF-IDF vectorisation
# NOTE(review): the vectorizer is fitted on the full dataset before the split,
# so test-set vocabulary/IDF statistics leak into the features — consider
# fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_cleaned['tweet_cleaned'])
y = df_cleaned["label"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Persist the fitted vectorizer
with open("svm_tfidf_sigmoid_vectorizer.pkl", "wb") as file:
    pickle.dump(vectorizer, file)

# Search space
param_dist = {
    'kernel': ['sigmoid'],
    'C': uniform(0.01, 1),
    'gamma': uniform(0.01, 0.1),
    'random_state': [40, 42],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

# Base estimator
svm = SVC()

random_search = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_dist,
    n_iter=50,
    scoring='f1',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Run the search
random_search.fit(X_train, y_train)

# Collect results: one flat metrics row per sampled configuration
# (results_list) plus the fitted model object and its file name (model_results).
# Both lists are filled in the same order, so position i refers to the same model.
results_list = []
model_results = []

for idx, params in enumerate(random_search.cv_results_['params']):
    # cv_results_ does not keep the fitted estimators, so refit each candidate
    model = SVC(**params)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Per-class entries are keyed by the string form of the label; the lookups
    # below assume the labels are 0 and 1 (as the class_weight dicts do).
    train_report = classification_report(y_train, y_train_pred, output_dict=True)
    test_report = classification_report(y_test, y_test_pred, output_dict=True)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    result_dict = {
        'vectorization': 'tfidf',
        'kernel': params['kernel'],
        # 'X' marks "not applicable" when degree is not part of the search space
        'degree': params.get('degree', 'X'),
        'C': params['C'],
        'gamma': params['gamma'],
        'random_state': params['random_state'],
        'class_weight': params['class_weight'],

        # Train metrics
        'train_precision_0': train_report['0']['precision'],
        'train_recall_0': train_report['0']['recall'],
        'train_f1_0': train_report['0']['f1-score'],
        'train_support_0': train_report['0']['support'],
        'train_precision_1': train_report['1']['precision'],
        'train_recall_1': train_report['1']['recall'],
        'train_f1_1': train_report['1']['f1-score'],
        'train_support_1': train_report['1']['support'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,

        # Test metrics
        'test_precision_0': test_report['0']['precision'],
        'test_recall_0': test_report['0']['recall'],
        'test_f1_0': test_report['0']['f1-score'],
        'test_support_0': test_report['0']['support'],
        'test_precision_1': test_report['1']['precision'],
        'test_recall_1': test_report['1']['recall'],
        'test_f1_1': test_report['1']['f1-score'],
        'test_support_1': test_report['1']['support'],
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1
    }

    results_list.append(result_dict)

    # Persist every fitted candidate
    filename = f"svm_tfidf_sigmoid_model_{idx}.pkl"
    with open(filename, "wb") as file:
        pickle.dump(model, file)

    model_results.append({
        'index': idx,
        'params': params,
        'f1_score': test_f1,
        'recall': test_recall,
        'model': model,
        'filename': filename
    })

# Build the results table BEFORE ranking. Previously results_df was only
# created at the very end, so the ranking either raised NameError (fresh
# kernel) or silently used a stale frame from another experiment.
results_df = pd.DataFrame(results_list)
results_df.to_csv("svm_tfidf_sigmoid_results.csv", index=False)

# Rank the candidates.
# NOTE(review): ranking on test-set metrics means the test split is used for
# model selection; the reported test scores of the winners are optimistic.
top_3_f1 = results_df.nlargest(3, 'test_f1')
top_3_recall = results_df.nlargest(3, 'test_recall')

# Save the top models and record their configuration. results_df holds only
# metrics, so the fitted model, its parameters and its file name are looked up
# in model_results via the row index (row i of results_df == model_results[i]).
model_configs = []

for metric_label, file_tag, top_rows in (
    ('F1-Score', 'f1', top_3_f1),
    ('Recall', 'recall', top_3_recall),
):
    for rank, row_idx in enumerate(top_rows.index, start=1):
        model_info = model_results[row_idx]

        top_filename = f"svm_tfidf_sigmoid_top_{file_tag}_{rank}.pkl"
        with open(top_filename, "wb") as file:
            pickle.dump(model_info['model'], file)

        model_configs.append({
            'Rank': rank,
            'Metric': metric_label,
            'Model Filename': model_info['filename'],
            'F1 Score': model_info['f1_score'],
            'Recall': model_info['recall'],
            'Kernel': model_info['params']['kernel'],
            'C': model_info['params']['C'],
            'Gamma': model_info['params']['gamma'],
            'Random State': model_info['params']['random_state'],
            'Class Weight': model_info['params']['class_weight']
        })

# Save the top-model overview as CSV
config_df = pd.DataFrame(model_configs)
config_df.to_csv("svm_tfidf_sigmoid_top_models_config.csv", index=False)



## linear

In [None]:
# '# Daten laden
# filepath_name = ("..\\..\\..\\data\\mixed_dataset\\train_cleaned.csv")
# df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')
# df_cleaned = df_cleaned.dropna(subset=['tweet_cleaned'])

# # TF-IDF Vektorisierung
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(df_cleaned['tweet_cleaned'])
# y = df_cleaned["label"]

# # Train-Test-Split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Vektorisierungsmodell speichern
# with open("svm_tfidf_vectorizer.pkl", "wb") as file:
#     pickle.dump(vectorizer, file)

# # Parameterraum definieren
# param_dist = {
#     'kernel': ['linear'],
#     'C': uniform(0.01, 1),
#     'gamma': uniform(0.01, 0.1),
#     'random_state': [40, 42],
#     'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
# }

# # SVM erstellen
# svm = SVC()


# random_search = RandomizedSearchCV(
#     estimator=svm, 
#     param_distributions=param_dist, 
#     n_iter=50,
#     scoring='f1', 
#     cv=3,
#     verbose=2, 
#     n_jobs=-1,
#     random_state=42
# )

# # SVM Training
# random_search.fit(X_train, y_train)

# cv_results = random_search.cv_results_
# params = cv_results['params']

# model_results = []

# for idx, param in enumerate(params):
#     model = SVC(**param)
#     model.fit(X_train, y_train)
    
#     y_test_pred = model.predict(X_test)
#     f1 = f1_score(y_test, y_test_pred)
#     recall = recall_score(y_test, y_test_pred)
    
#     model_results.append({
#         'index': idx,
#         'params': param,
#         'f1_score': f1,
#         'recall': recall,
#         'model': model
#     })

# # Nach Metrik sortieren
# top_3_f1 = sorted(model_results, key=lambda x: x['f1_score'], reverse=True)[:3]
# top_3_recall = sorted(model_results, key=lambda x: x['recall'], reverse=True)[:3]

# # Modelle und Konfigurationen speichern
# model_configs = []

# for rank, model_info in enumerate(top_3_f1, start=1):
#     filename = f"svm_tfidf_linear_top_f1_{rank}.pkl"
#     with open(filename, "wb") as file:
#         pickle.dump(model_info['model'], file)
    
#     model_configs.append({
#         'Rank': rank,
#         'Metric': 'F1-Score',
#         'Model Filename': filename,
#         'F1 Score': model_info['f1_score'],
#         'Recall': model_info['recall'],
#         'Kernel': model_info['params']['kernel'],
#         'C': model_info['params']['C'],
#         'Gamma': model_info['params']['gamma'],
#         'Random State': model_info['params']['random_state'],
#         'Class Weight': model_info['params']['class_weight']
#     })

# for rank, model_info in enumerate(top_3_recall, start=1):
#     filename = f"svm_tfidf_linear_top_recall_{rank}.pkl"
#     with open(filename, "wb") as file:
#         pickle.dump(model_info['model'], file)
    
#     model_configs.append({
#         'Rank': rank,
#         'Metric': 'Recall',
#         'Model Filename': filename,
#         'F1 Score': model_info['f1_score'],
#         'Recall': model_info['recall'],
#         'Kernel': model_info['params']['kernel'],
#         'C': model_info['params']['C'],
#         'Gamma': model_info['params']['gamma'],
#         'Random State': model_info['params']['random_state'],
#         'Class Weight': model_info['params']['class_weight']
#     })

# config_df = pd.DataFrame(model_configs)
# config_df.to_csv("top_models_config.csv", index=False)

# print("Top 6 Modelle gespeichert!")
# print(config_df)
# '

In [None]:
# Randomized hyper-parameter search for a linear-kernel SVM on TF-IDF features.
# Every sampled configuration is refit on the training split, scored on the
# training and test splits, pickled, and summarised in two CSV files
# (all results + the top-3 models by test F1 and by test recall).

# Load the data and drop rows without a cleaned tweet
df_cleaned = pd.read_csv("data/mixed_dataset/train_cleaned.csv")
df_cleaned = df_cleaned.dropna(subset=['tweet_cleaned'])

# TF-IDF vectorisation
# NOTE(review): the vectorizer is fitted on the full dataset before the split,
# so test-set vocabulary/IDF statistics leak into the features — consider
# fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_cleaned['tweet_cleaned'])
y = df_cleaned["label"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Persist the fitted vectorizer
with open("svm_tfidf_linear_vectorizer.pkl", "wb") as file:
    pickle.dump(vectorizer, file)

# Search space
# NOTE(review): per the scikit-learn docs gamma only applies to the 'rbf',
# 'poly' and 'sigmoid' kernels, so sampling it here presumably has no effect
# on the linear models — kept so the result files keep the same columns.
param_dist = {
    'kernel': ['linear'],
    'C': uniform(0.01, 1),
    'gamma': uniform(0.01, 0.1),
    'random_state': [40, 42],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

# Base estimator
svm = SVC()

random_search = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_dist,
    n_iter=50,
    scoring='f1',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Run the search
random_search.fit(X_train, y_train)

# Collect results: one flat metrics row per sampled configuration
# (results_list) plus the fitted model object and its file name (model_results).
# Both lists are filled in the same order, so position i refers to the same model.
results_list = []
model_results = []

for idx, params in enumerate(random_search.cv_results_['params']):
    # cv_results_ does not keep the fitted estimators, so refit each candidate
    model = SVC(**params)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Per-class entries are keyed by the string form of the label; the lookups
    # below assume the labels are 0 and 1 (as the class_weight dicts do).
    train_report = classification_report(y_train, y_train_pred, output_dict=True)
    test_report = classification_report(y_test, y_test_pred, output_dict=True)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    result_dict = {
        'vectorization': 'tfidf',
        'kernel': params['kernel'],
        # 'X' marks "not applicable" when degree is not part of the search space
        'degree': params.get('degree', 'X'),
        'C': params['C'],
        'gamma': params['gamma'],
        'random_state': params['random_state'],
        'class_weight': params['class_weight'],

        # Train metrics
        'train_precision_0': train_report['0']['precision'],
        'train_recall_0': train_report['0']['recall'],
        'train_f1_0': train_report['0']['f1-score'],
        'train_support_0': train_report['0']['support'],
        'train_precision_1': train_report['1']['precision'],
        'train_recall_1': train_report['1']['recall'],
        'train_f1_1': train_report['1']['f1-score'],
        'train_support_1': train_report['1']['support'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,

        # Test metrics
        'test_precision_0': test_report['0']['precision'],
        'test_recall_0': test_report['0']['recall'],
        'test_f1_0': test_report['0']['f1-score'],
        'test_support_0': test_report['0']['support'],
        'test_precision_1': test_report['1']['precision'],
        'test_recall_1': test_report['1']['recall'],
        'test_f1_1': test_report['1']['f1-score'],
        'test_support_1': test_report['1']['support'],
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1
    }

    results_list.append(result_dict)

    # Persist every fitted candidate
    filename = f"svm_tfidf_linear_model_{idx}.pkl"
    with open(filename, "wb") as file:
        pickle.dump(model, file)

    model_results.append({
        'index': idx,
        'params': params,
        'f1_score': test_f1,
        'recall': test_recall,
        'model': model,
        'filename': filename
    })

# Build the results table BEFORE ranking. Previously results_df was only
# created at the very end, so the ranking either raised NameError (fresh
# kernel) or silently used a stale frame from another experiment.
results_df = pd.DataFrame(results_list)
results_df.to_csv("svm_tfidf_linear_results.csv", index=False)

# Rank the candidates.
# NOTE(review): ranking on test-set metrics means the test split is used for
# model selection; the reported test scores of the winners are optimistic.
top_3_f1 = results_df.nlargest(3, 'test_f1')
top_3_recall = results_df.nlargest(3, 'test_recall')

# Save the top models and record their configuration. results_df holds only
# metrics, so the fitted model, its parameters and its file name are looked up
# in model_results via the row index (row i of results_df == model_results[i]).
model_configs = []

for metric_label, file_tag, top_rows in (
    ('F1-Score', 'f1', top_3_f1),
    ('Recall', 'recall', top_3_recall),
):
    for rank, row_idx in enumerate(top_rows.index, start=1):
        model_info = model_results[row_idx]

        top_filename = f"svm_tfidf_linear_top_{file_tag}_{rank}.pkl"
        with open(top_filename, "wb") as file:
            pickle.dump(model_info['model'], file)

        model_configs.append({
            'Rank': rank,
            'Metric': metric_label,
            'Model Filename': model_info['filename'],
            'F1 Score': model_info['f1_score'],
            'Recall': model_info['recall'],
            'Kernel': model_info['params']['kernel'],
            'C': model_info['params']['C'],
            'Gamma': model_info['params']['gamma'],
            'Random State': model_info['params']['random_state'],
            'Class Weight': model_info['params']['class_weight']
        })

# Save the top-model overview as CSV
config_df = pd.DataFrame(model_configs)
config_df.to_csv("svm_tfidf_linear_top_models_config.csv", index=False)



## poly

In [None]:
# # Daten laden
# filepath_name = ("..\\..\\..\\data\\mixed_dataset\\train_cleaned.csv")
# df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')
# df_cleaned = df_cleaned.dropna(subset=['tweet_cleaned'])

# # TF-IDF Vektorisierung
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(df_cleaned['tweet_cleaned'])
# y = df_cleaned["label"]

# # Train-Test-Split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Vektorisierungsmodell speichern
# with open("svm_tfidf_vectorizer.pkl", "wb") as file:
#     pickle.dump(vectorizer, file)

# # Parameterraum definieren
# param_dist = {
#     'kernel': ['poly'],
#     'degree': [3, 4, 5],
#     'C': uniform(0.01, 1),
#     'gamma': uniform(0.01, 0.1),
#     'random_state': [40, 42],
#     'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
# }

# # SVM erstellen
# svm = SVC()


# random_search = RandomizedSearchCV(
#     estimator=svm, 
#     param_distributions=param_dist, 
#     n_iter=50,
#     scoring='f1', 
#     cv=3,
#     verbose=2, 
#     n_jobs=-1,
#     random_state=42
# )

# # SVM Training
# random_search.fit(X_train, y_train)

# cv_results = random_search.cv_results_
# params = cv_results['params']

# model_results = []

# for idx, param in enumerate(params):
#     model = SVC(**param)
#     model.fit(X_train, y_train)
    
#     y_test_pred = model.predict(X_test)
#     f1 = f1_score(y_test, y_test_pred)
#     recall = recall_score(y_test, y_test_pred)
    
#     model_results.append({
#         'index': idx,
#         'params': param,
#         'f1_score': f1,
#         'recall': recall,
#         'model': model
#     })

# # Nach Metrik sortieren
# top_3_f1 = sorted(model_results, key=lambda x: x['f1_score'], reverse=True)[:3]
# top_3_recall = sorted(model_results, key=lambda x: x['recall'], reverse=True)[:3]

# # Modelle und Konfigurationen speichern
# model_configs = []

# for rank, model_info in enumerate(top_3_f1, start=1):
#     filename = f"svm_top_f1_{rank}.pkl"
#     with open(filename, "wb") as file:
#         pickle.dump(model_info['model'], file)
    
#     model_configs.append({
#         'Rank': rank,
#         'Metric': 'F1-Score',
#         'Model Filename': filename,
#         'F1 Score': model_info['f1_score'],
#         'Recall': model_info['recall'],
#         'Kernel': model_info['params']['kernel'],
#         'C': model_info['params']['C'],
#         'Gamma': model_info['params']['gamma'],
#         'Random State': model_info['params']['random_state'],
#         'Class Weight': model_info['params']['class_weight']
#     })

# for rank, model_info in enumerate(top_3_recall, start=1):
#     filename = f"svm_tfidf_poly_top_recall_{rank}.pkl"
#     with open(filename, "wb") as file:
#         pickle.dump(model_info['model'], file)
    
#     model_configs.append({
#         'Rank': rank,
#         'Metric': 'Recall',
#         'Model Filename': filename,
#         'F1 Score': model_info['f1_score'],
#         'Recall': model_info['recall'],
#         'Kernel': model_info['params']['kernel'],
#         'C': model_info['params']['C'],
#         'Gamma': model_info['params']['gamma'],
#         'Random State': model_info['params']['random_state'],
#         'Class Weight': model_info['params']['class_weight']
#     })

# config_df = pd.DataFrame(model_configs)
# config_df.to_csv("top_tfidf_poly_models_config.csv", index=False)

# print("Top 6 Modelle gespeichert!")
# print(config_df)


NameError: name 'pd' is not defined

In [None]:
# Randomized hyper-parameter search for a polynomial-kernel SVM on TF-IDF
# features. Every sampled configuration is refit on the training split, scored
# on the training and test splits, pickled, and summarised in two CSV files
# (all results + the top-3 models by test F1 and by test recall).

# Load the data and drop rows without a cleaned tweet
df_cleaned = pd.read_csv("data/mixed_dataset/train_cleaned.csv")
df_cleaned = df_cleaned.dropna(subset=['tweet_cleaned'])

# TF-IDF vectorisation
# NOTE(review): the vectorizer is fitted on the full dataset before the split,
# so test-set vocabulary/IDF statistics leak into the features — consider
# fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_cleaned['tweet_cleaned'])
y = df_cleaned["label"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Persist the fitted vectorizer
with open("svm_tfidf_poly_vectorizer.pkl", "wb") as file:
    pickle.dump(vectorizer, file)

# Search space (degree is sampled here in addition to the shared parameters)
param_dist = {
    'kernel': ['poly'],
    'degree': [3, 4, 5],
    'C': uniform(0.01, 1),
    'gamma': uniform(0.01, 0.1),
    'random_state': [40, 42],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

# Base estimator
svm = SVC()

random_search = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_dist,
    n_iter=50,
    scoring='f1',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Run the search
random_search.fit(X_train, y_train)

# Collect results: one flat metrics row per sampled configuration
# (results_list) plus the fitted model object and its file name (model_results).
# Both lists are filled in the same order, so position i refers to the same model.
results_list = []
model_results = []

for idx, params in enumerate(random_search.cv_results_['params']):
    # cv_results_ does not keep the fitted estimators, so refit each candidate
    model = SVC(**params)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Per-class entries are keyed by the string form of the label; the lookups
    # below assume the labels are 0 and 1 (as the class_weight dicts do).
    train_report = classification_report(y_train, y_train_pred, output_dict=True)
    test_report = classification_report(y_test, y_test_pred, output_dict=True)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    result_dict = {
        'vectorization': 'tfidf',
        'kernel': params['kernel'],
        # Record the sampled polynomial degree; this used to be the hard-coded
        # placeholder 'X', which made the runs indistinguishable by degree.
        'degree': params.get('degree', 'X'),
        'C': params['C'],
        'gamma': params['gamma'],
        'random_state': params['random_state'],
        'class_weight': params['class_weight'],

        # Train metrics
        'train_precision_0': train_report['0']['precision'],
        'train_recall_0': train_report['0']['recall'],
        'train_f1_0': train_report['0']['f1-score'],
        'train_support_0': train_report['0']['support'],
        'train_precision_1': train_report['1']['precision'],
        'train_recall_1': train_report['1']['recall'],
        'train_f1_1': train_report['1']['f1-score'],
        'train_support_1': train_report['1']['support'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,

        # Test metrics
        'test_precision_0': test_report['0']['precision'],
        'test_recall_0': test_report['0']['recall'],
        'test_f1_0': test_report['0']['f1-score'],
        'test_support_0': test_report['0']['support'],
        'test_precision_1': test_report['1']['precision'],
        'test_recall_1': test_report['1']['recall'],
        'test_f1_1': test_report['1']['f1-score'],
        'test_support_1': test_report['1']['support'],
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1
    }

    results_list.append(result_dict)

    # Persist every fitted candidate
    filename = f"svm_tfidf_poly_model_{idx}.pkl"
    with open(filename, "wb") as file:
        pickle.dump(model, file)

    model_results.append({
        'index': idx,
        'params': params,
        'f1_score': test_f1,
        'recall': test_recall,
        'model': model,
        'filename': filename
    })

# Build the results table BEFORE ranking. Previously results_df was only
# created at the very end, so the ranking either raised NameError (fresh
# kernel) or silently used a stale frame from another experiment.
results_df = pd.DataFrame(results_list)
results_df.to_csv("svm_tfidf_poly_results.csv", index=False)

# Rank the candidates.
# NOTE(review): ranking on test-set metrics means the test split is used for
# model selection; the reported test scores of the winners are optimistic.
top_3_f1 = results_df.nlargest(3, 'test_f1')
top_3_recall = results_df.nlargest(3, 'test_recall')

# Save the top models and record their configuration. results_df holds only
# metrics, so the fitted model, its parameters and its file name are looked up
# in model_results via the row index (row i of results_df == model_results[i]).
model_configs = []

for metric_label, file_tag, top_rows in (
    ('F1-Score', 'f1', top_3_f1),
    ('Recall', 'recall', top_3_recall),
):
    for rank, row_idx in enumerate(top_rows.index, start=1):
        model_info = model_results[row_idx]

        top_filename = f"svm_tfidf_poly_top_{file_tag}_{rank}.pkl"
        with open(top_filename, "wb") as file:
            pickle.dump(model_info['model'], file)

        model_configs.append({
            'Rank': rank,
            'Metric': metric_label,
            'Model Filename': model_info['filename'],
            'F1 Score': model_info['f1_score'],
            'Recall': model_info['recall'],
            'Kernel': model_info['params']['kernel'],
            'C': model_info['params']['C'],
            'Gamma': model_info['params']['gamma'],
            'Random State': model_info['params']['random_state'],
            'Class Weight': model_info['params']['class_weight']
        })

# Save the top-model overview as CSV
config_df = pd.DataFrame(model_configs)
config_df.to_csv("svm_tfidf_poly_top_models_config.csv", index=False)


## rbf

In [None]:
# # Daten laden
# filepath_name = ("..\\..\\..\\data\\mixed_dataset\\train_cleaned.csv")
# df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')
# df_cleaned = df_cleaned.dropna(subset=['tweet_cleaned'])

# # TF-IDF Vektorisierung
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(df_cleaned['tweet_cleaned'])
# y = df_cleaned["label"]

# # Train-Test-Split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Vektorisierungsmodell speichern
# with open("svm_tfidf_vectorizer.pkl", "wb") as file:
#     pickle.dump(vectorizer, file)

# # Parameterraum definieren
# param_dist = {
#     'kernel': ['rbf'],
#     'C': uniform(0.01, 1),
#     'gamma': uniform(0.01, 0.1),
#     'random_state': [40, 42],
#     'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
# }

# # SVM erstellen
# svm = SVC()


# random_search = RandomizedSearchCV(
#     estimator=svm, 
#     param_distributions=param_dist, 
#     n_iter=50,
#     scoring='f1', 
#     cv=3,
#     verbose=2, 
#     n_jobs=-1,
#     random_state=42
# )

# # SVM Training
# random_search.fit(X_train, y_train)

# cv_results = random_search.cv_results_
# params = cv_results['params']

# model_results = []

# for idx, param in enumerate(params):
#     model = SVC(**param)
#     model.fit(X_train, y_train)
    
#     y_test_pred = model.predict(X_test)
#     f1 = f1_score(y_test, y_test_pred)
#     recall = recall_score(y_test, y_test_pred)
    
#     model_results.append({
#         'index': idx,
#         'params': param,
#         'f1_score': f1,
#         'recall': recall,
#         'model': model
#     })

# # Nach Metrik sortieren
# top_3_f1 = sorted(model_results, key=lambda x: x['f1_score'], reverse=True)[:3]
# top_3_recall = sorted(model_results, key=lambda x: x['recall'], reverse=True)[:3]

# # Modelle und Konfigurationen speichern
# model_configs = []

# for rank, model_info in enumerate(top_3_f1, start=1):
#     filename = f"svm_tfidf_rbf_top_f1_{rank}.pkl"
#     with open(filename, "wb") as file:
#         pickle.dump(model_info['model'], file)
    
#     model_configs.append({
#         'Rank': rank,
#         'Metric': 'F1-Score',
#         'Model Filename': filename,
#         'F1 Score': model_info['f1_score'],
#         'Recall': model_info['recall'],
#         'Kernel': model_info['params']['kernel'],
#         'C': model_info['params']['C'],
#         'Gamma': model_info['params']['gamma'],
#         'Random State': model_info['params']['random_state'],
#         'Class Weight': model_info['params']['class_weight']
#     })

# for rank, model_info in enumerate(top_3_recall, start=1):
#     filename = f"svm_tfidf_rbf_top_recall_{rank}.pkl"
#     with open(filename, "wb") as file:
#         pickle.dump(model_info['model'], file)
    
#     model_configs.append({
#         'Rank': rank,
#         'Metric': 'Recall',
#         'Model Filename': filename,
#         'F1 Score': model_info['f1_score'],
#         'Recall': model_info['recall'],
#         'Kernel': model_info['params']['kernel'],
#         'C': model_info['params']['C'],
#         'Gamma': model_info['params']['gamma'],
#         'Random State': model_info['params']['random_state'],
#         'Class Weight': model_info['params']['class_weight']
#     })

# config_df = pd.DataFrame(model_configs)
# config_df.to_csv("top_models_config.csv", index=False)

# print("Top 6 Modelle gespeichert!")
# print(config_df)


In [None]:
# Load the cleaned training tweets and drop rows that have no cleaned text.
df_cleaned = pd.read_csv("data/mixed_dataset/train_cleaned.csv").dropna(subset=['tweet_cleaned'])

# Targets and TF-IDF features.
# NOTE(review): the vectorizer is fitted on all rows *before* the train/test
# split, so its vocabulary and IDF weights also reflect the held-out rows --
# confirm this is intended.
y = df_cleaned["label"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_cleaned['tweet_cleaned'])

# Hold out 20 % of the rows for testing; the split is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Persist the fitted vectorizer so the same feature mapping can be reloaded later.
with open("svm_tfidf_rbf_vectorizer.pkl", "wb") as file:
    pickle.dump(vectorizer, file)

# Search space for the RBF kernel.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], i.e.
# C from [0.01, 1.01] and gamma from [0.01, 0.11].
# NOTE(review): scikit-learn documents SVC's random_state as ignored when
# probability=False, so this dimension presumably has no effect -- confirm.
param_dist = dict(
    kernel=['rbf'],
    C=uniform(0.01, 1),
    gamma=uniform(0.01, 0.1),
    random_state=[40, 42],
    class_weight=[{0: 1, 1: weight_1} for weight_1 in (1, 3, 5)],
)

# Base estimator; its hyperparameters are filled in by the search.
svm = SVC()

# 50 sampled candidates, 3-fold cross-validation, scored by F1, using all cores.
random_search = RandomizedSearchCV(
    svm,
    param_dist,
    n_iter=50,
    scoring='f1',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

# Run the randomized hyperparameter search on the training split.
random_search.fit(X_train, y_train)

# Collect results: one flat metrics record per candidate (results_list) and one
# record that keeps the fitted model and its pickle path (model_results).
results_list = []
model_results = []


def _split_metric_columns(split, y_true, y_pred):
    """Return the flat metric columns for one data split.

    Produces, in this order, the per-class precision/recall/f1/support taken
    from ``classification_report`` for the classes keyed '0' and '1', followed
    by the overall accuracy, recall, precision and F1. Every key is prefixed
    with ``split`` (e.g. 'train' or 'test').
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    per_class_fields = (
        ('precision', 'precision'),
        ('recall', 'recall'),
        ('f1', 'f1-score'),
        ('support', 'support'),
    )
    columns = {}
    for class_key in ('0', '1'):
        for column_name, report_field in per_class_fields:
            columns[f'{split}_{column_name}_{class_key}'] = report[class_key][report_field]
    columns[f'{split}_accuracy'] = accuracy_score(y_true, y_pred)
    columns[f'{split}_recall'] = recall_score(y_true, y_pred)
    columns[f'{split}_precision'] = precision_score(y_true, y_pred)
    columns[f'{split}_f1'] = f1_score(y_true, y_pred)
    return columns


# Refit every sampled parameter combination on the full training split and
# evaluate it on both the training and the held-out test split.
for idx, params in enumerate(random_search.cv_results_['params']):
    model = SVC(**params).fit(X_train, y_train)

    train_metrics = _split_metric_columns('train', y_train, model.predict(X_train))
    test_metrics = _split_metric_columns('test', y_test, model.predict(X_test))

    # Key order matters: it defines the column order of the results CSV.
    results_list.append({
        'vectorization': 'tfidf',
        'kernel': params['kernel'],
        'degree': 'X',  # placeholder value; 'degree' is not part of this search space
        'C': params['C'],
        'gamma': params['gamma'],
        'random_state': params['random_state'],
        'class_weight': params['class_weight'],
        **train_metrics,
        **test_metrics,
    })

    # Save the fitted model of this candidate.
    filename = f"svm_tfidf_rbf_model_{idx}.pkl"
    with open(filename, "wb") as file:
        pickle.dump(model, file)

    model_results.append({
        'index': idx,
        'params': params,
        'f1_score': test_metrics['test_f1'],
        'recall': test_metrics['test_recall'],
        'model': model,
        'filename': filename,
    })

# Save the complete evaluation results as CSV.
# results_df is created first so that it exists before anything below runs
# (previously it was referenced before it was defined).
results_df = pd.DataFrame(results_list)
results_df.to_csv("svm_tfidf_rbf_results.csv", index=False)

# Rank the candidates by test F1 and by test recall.
# The ranking works on model_results because those records carry the fitted
# model, its parameters and its pickle filename; results_df has none of these.
# sorted() is stable, so ties keep their original candidate order.
top_3_f1 = sorted(model_results, key=lambda entry: entry['f1_score'], reverse=True)[:3]
top_3_recall = sorted(model_results, key=lambda entry: entry['recall'], reverse=True)[:3]

# Pickle the top models under rank-based names and collect their configurations.
model_configs = []

for metric_label, file_tag, top_models in (
    ('F1-Score', 'f1', top_3_f1),
    ('Recall', 'recall', top_3_recall),
):
    for rank, model_info in enumerate(top_models, start=1):
        filename = f"svm_tfidf_rbf_top_{file_tag}_{rank}.pkl"
        with open(filename, "wb") as file:
            pickle.dump(model_info['model'], file)

        model_configs.append({
            'Rank': rank,
            'Metric': metric_label,
            # The 'filename' entry stored in model_results for this candidate.
            'Model Filename': model_info['filename'],
            'F1 Score': model_info['f1_score'],
            'Recall': model_info['recall'],
            'Kernel': model_info['params']['kernel'],
            'C': model_info['params']['C'],
            'Gamma': model_info['params']['gamma'],
            'Random State': model_info['params']['random_state'],
            'Class Weight': model_info['params']['class_weight'],
        })

# Save the configurations of the top models as CSV.
config_df = pd.DataFrame(model_configs)
config_df.to_csv("svm_tfidf_rbf_top_models_config.csv", index=False)
