In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn import metrics



from sklearn.model_selection import cross_val_score

import vectorize_functions

from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


Einladen der vektorisierten Daten

In [3]:
X_train, X_test, y_train, y_test = vectorize_functions.vectorize_tfidf()

In [4]:
positive = np.count_nonzero(y_train == 1)
negative = np.count_nonzero(y_train == 0)
print("Train")
print("- Positive:", positive)
print("- Negative:", negative)
print("- Verhältnis:", negative / positive)

positive = np.count_nonzero(y_test == 1)
negative = np.count_nonzero(y_test == 0)
print("Test")
print("- Positive:", positive)
print("- Negative:", negative)
print("- Verhältnis:", negative / positive)

Train
- Positive: 1272
- Negative: 18055
- Verhältnis: 14.194182389937106
Test
- Positive: 533
- Negative: 7750
- Verhältnis: 14.540337711069418


In [5]:
print("X_train shape", X_train.shape)
print("y_train shape", y_train.shape)

print("X_test shape", X_test.shape)
print("y_test shape", y_test.shape)

X_train shape (19327, 6319)
y_train shape (19327,)
X_test shape (8283, 6319)
y_test shape (8283,)


# Evaluation

In [6]:
evaluation = pd.DataFrame(
    columns=["model", "variant", "train_acc", "train_prec", "train_rec", "train_f1", "test_acc", "test_prec",
             "test_rec", "test_f1"])

In [7]:
def add_to_eval_df(model, model_name, variant, x_data_train, y_data_train, x_data_test, y_data_test):
    train_acc = model.score(x_data_train, y_data_train)
    train_precision = precision_score(y_data_train, model.predict(x_data_train))
    train_recall = recall_score(y_data_train, model.predict(x_data_train))
    train_f1 = f1_score(y_data_train, model.predict(x_data_train))

    test_acc = model.score(x_data_test, y_data_test)
    test_precision = precision_score(y_data_test, model.predict(x_data_test))
    test_recall = recall_score(y_data_test, model.predict(x_data_test))
    test_f1 = f1_score(y_data_test, model.predict(x_data_test))

    evaluation.loc[len(evaluation.index)] = [model_name, variant, train_acc, train_precision, train_recall, train_f1,
                                             test_acc, test_precision, test_recall, test_f1]

In [8]:
def evaluate_model(model, x_test, y_test, sampling_method):
    pred = model.predict(x_test)
    accscore = metrics.accuracy_score(pred, y_test)

    print(f'{sampling_method} model accuracy for classification is =', str('{:04.2f}'.format(accscore * 100)) + '%')
    print('------------------------------------------------')
    print('Confusion Matrix:')
    print(pd.DataFrame(confusion_matrix(y_test, pred)))
    print('------------------------------------------------')
    print('Classification Report:')
    print(classification_report(y_test, pred))

    # probs = model.predict_proba(x_test)
    # preds = probs[:, 1]
    # fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
    # roc_auc = metrics.auc(fpr, tpr)

    # plt.title('Receiver Operating Characteristic')
    # plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    # plt.legend(loc='lower right')
    # plt.plot([0, 1], [0, 1], 'r--')
    # plt.xlim([0, 1])
    # plt.ylim([0, 1])
    # plt.ylabel('True Positive Rate')
    # plt.xlabel('False Positive Rate')
    # plt.show()


# SVM


In [9]:
# Define parameter grid
param_grid = {
    'kernel': ['linear', 'sigmoid'],
    'degree': [3, 4, 5],
    'C': [0.01, 0.1, 1],
    'gamma': [0.01, 0.1, 1],
    'random_state': [42],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}]
}
# Initialize SVM model
svm = SVC()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best model and parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Evaluate the best model
evaluate_model(best_model, X_test, y_test, "GridSearchCV")
add_to_eval_df(best_model, "SVM", f"best_params={best_params}", X_train, y_train, X_test, y_test)

# Print best parameters
print("Best parameters:", best_params)

# Show the evaluation DataFrame
print(evaluation)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
GridSearchCV model accuracy for classification is = 95.55%
------------------------------------------------
Confusion Matrix:
      0    1
0  7590  160
1   209  324
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      7750
           1       0.67      0.61      0.64       533

    accuracy                           0.96      8283
   macro avg       0.82      0.79      0.81      8283
weighted avg       0.95      0.96      0.95      8283

Best parameters: {'C': 1, 'class_weight': {0: 1, 1: 3}, 'degree': 3, 'gamma': 0.01, 'kernel': 'linear', 'random_state': 42}
  model                                            variant  train_acc  \
0   SVM  best_params={'C': 1, 'class_weight': {0: 1, 1:...   0.981528   

   train_prec  train_rec  train_f1  test_acc  test_prec  test_rec   test_f1  
0    0.851113   0.871855

In [10]:
# Save the evaluation results as a CSV file
evaluation.to_csv("evaluation_results.csv", index=False)