In [1]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn import metrics



from sklearn.model_selection import cross_val_score

import vectorize_functions


Laden der vektorisierten Daten

In [2]:
# Unpack the four values returned by the project helper into train/test features and labels.
# NOTE(review): presumably TF-IDF features that are already split into train and test —
# confirm against vectorize_functions.vectorize_tfidf.
X_train, X_test, y_train, y_test = vectorize_functions.vectorize_tfidf()

In [3]:
# Report the class balance of each split; the ratio is negatives per positive sample.
for split_name, labels in (("Train", y_train), ("Test", y_test)):
    positive = np.count_nonzero(labels == 1)
    negative = np.count_nonzero(labels == 0)
    print(split_name)
    print("- Positive:", positive)
    print("- Negative:", negative)
    print("- Verhältnis:", negative / positive)

Train
- Positive: 1270
- Negative: 18084
- Verhältnis: 14.239370078740157
Test
- Positive: 541
- Negative: 7754
- Verhältnis: 14.33271719038817


In [4]:
# Sanity check: feature matrices and label vectors must agree on the number of rows.
for array_name, array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{array_name} shape", array.shape)

X_train shape (19354, 6293)
y_train shape (19354,)
X_test shape (8295, 6293)
y_test shape (8295,)


# Evaluation

In [5]:
# Empty results table: one row per evaluated model, with the same four metrics
# (accuracy, precision, recall, F1) recorded for the train and the test split.
evaluation = pd.DataFrame(
    columns=["model", "variant"]
    + [f"{split}_{metric}" for split in ("train", "test") for metric in ("acc", "prec", "rec", "f1")]
)

In [6]:
def add_to_eval_df(model, model_name, variant, x_data_train, y_data_train, x_data_test, y_data_test):
    """Append one row of train/test metrics for a fitted classifier to ``evaluation``.

    Args:
        model: Fitted estimator exposing ``predict``.
        model_name: Value stored in the ``model`` column.
        variant: Value stored in the ``variant`` column.
        x_data_train, y_data_train: Training features and true labels.
        x_data_test, y_data_test: Test features and true labels.

    Side effects:
        Writes a new row at position ``len(evaluation.index)`` of the
        module-level ``evaluation`` DataFrame. Nothing is returned.

    Notes:
        Predictions are computed once per split and reused for every metric.
        Accuracy is computed with ``accuracy_score`` on those predictions, which
        is what ``score`` returns for scikit-learn classifiers.
    """
    # Predicting is the expensive part for kernel SVMs, so do it exactly once per split.
    train_pred = model.predict(x_data_train)
    test_pred = model.predict(x_data_test)

    row = [model_name, variant]
    # Order must match the columns of ``evaluation``: train metrics first, then test metrics.
    for y_true, y_pred in ((y_data_train, train_pred), (y_data_test, test_pred)):
        row.extend([
            accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred),
        ])

    evaluation.loc[len(evaluation.index)] = row

In [7]:
def evaluate_model(model, x_test, y_test, sampling_method):
    """Print accuracy, confusion matrix and classification report for a fitted model.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_test: Features to predict on.
        y_test: True labels for ``x_test``.
        sampling_method: Label used as the prefix of the printed accuracy line.

    Returns:
        None. All results are written to stdout.
    """
    pred = model.predict(x_test)
    # scikit-learn metrics take (y_true, y_pred); accuracy is symmetric, so the value is unchanged.
    accscore = metrics.accuracy_score(y_test, pred)

    print(f'{sampling_method} model accuracy for classification is = {accscore * 100:04.2f}%')
    print('------------------------------------------------')
    print('Confusion Matrix:')
    # Rows are true labels, columns are predicted labels.
    print(pd.DataFrame(confusion_matrix(y_test, pred)))
    print('------------------------------------------------')
    print('Classification Report:')
    print(classification_report(y_test, pred))

    # NOTE(review): an ROC/AUC plot based on model.predict_proba used to be sketched here
    # as commented-out code. SVC only provides predict_proba when constructed with
    # probability=True, so re-enabling the plot needs that flag (or decision_function scores).


# SVM


# 1. kernel='linear'

default settings

In [8]:
# Linear-kernel SVM with C=1.0 and no class weighting.
svm = SVC(kernel='linear', C=1.0, random_state=42).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)


none model accuracy for classification is = 95.78%
------------------------------------------------
Confusion Matrix:
      0    1
0  7721   33
1   317  224
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      7754
           1       0.87      0.41      0.56       541

    accuracy                           0.96      8295
   macro avg       0.92      0.70      0.77      8295
weighted avg       0.95      0.96      0.95      8295



tuning

In [25]:
# Linear kernel, C=0.5, positive class (label 1) weighted 10x to counter the class imbalance.
svm = SVC(
    kernel='linear',
    C=0.5,
    random_state=42,
    class_weight={0: 1, 1: 10},
).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)

none model accuracy for classification is = 92.72%
------------------------------------------------
Confusion Matrix:
      0    1
0  7328  426
1   178  363
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96      7754
           1       0.46      0.67      0.55       541

    accuracy                           0.93      8295
   macro avg       0.72      0.81      0.75      8295
weighted avg       0.94      0.93      0.93      8295



In [26]:
# Linear kernel, C=0.9, positive class (label 1) weighted 3x.
svm = SVC(
    kernel='linear',
    C=0.9,
    random_state=42,
    class_weight={0: 1, 1: 3},
).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)

none model accuracy for classification is = 95.44%
------------------------------------------------
Confusion Matrix:
      0    1
0  7601  153
1   225  316
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      7754
           1       0.67      0.58      0.63       541

    accuracy                           0.95      8295
   macro avg       0.82      0.78      0.80      8295
weighted avg       0.95      0.95      0.95      8295



In [28]:
# Linear kernel, C=0.8, positive class (label 1) weighted 3x.
svm = SVC(
    kernel='linear',
    C=0.8,
    random_state=42,
    class_weight={0: 1, 1: 3},
).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)

none model accuracy for classification is = 95.50%
------------------------------------------------
Confusion Matrix:
      0    1
0  7607  147
1   226  315
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      7754
           1       0.68      0.58      0.63       541

    accuracy                           0.96      8295
   macro avg       0.83      0.78      0.80      8295
weighted avg       0.95      0.96      0.95      8295



In [27]:
# Linear kernel, C=0.8, positive class (label 1) weighted 4x.
svm = SVC(
    kernel='linear',
    C=0.8,
    random_state=42,
    class_weight={0: 1, 1: 4},
).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)

none model accuracy for classification is = 95.08%
------------------------------------------------
Confusion Matrix:
      0    1
0  7554  200
1   208  333
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      7754
           1       0.62      0.62      0.62       541

    accuracy                           0.95      8295
   macro avg       0.80      0.79      0.80      8295
weighted avg       0.95      0.95      0.95      8295



# 2. kernel='rbf'

default settings

In [9]:
# RBF-kernel SVM with C=1.0; gamma and class weights are left at the library defaults.
svm = SVC(kernel='rbf', C=1.0, random_state=42).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)


none model accuracy for classification is = 95.36%
------------------------------------------------
Confusion Matrix:
      0    1
0  7743   11
1   374  167
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      7754
           1       0.94      0.31      0.46       541

    accuracy                           0.95      8295
   macro avg       0.95      0.65      0.72      8295
weighted avg       0.95      0.95      0.94      8295



tuning

In [None]:
# RBF kernel, C=1.0, gamma fixed at 1.
# NOTE(review): this cell has no execution count and its recorded output is identical to the
# default-rbf cell, so the output looks stale — re-run to get the gamma=1 results.
svm = SVC(
    kernel='rbf',
    C=1.0,
    gamma=1,
    random_state=42,
).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)


none model accuracy for classification is = 95.36%
------------------------------------------------
Confusion Matrix:
      0    1
0  7743   11
1   374  167
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      7754
           1       0.94      0.31      0.46       541

    accuracy                           0.95      8295
   macro avg       0.95      0.65      0.72      8295
weighted avg       0.95      0.95      0.94      8295



In [21]:
# RBF kernel, C=1.0, gamma=0.01, no class weighting.
# In the recorded run this model predicted only the negative class (label 0).
svm = SVC(
    kernel='rbf',
    C=1.0,
    gamma=0.01,
    random_state=42,
).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)

none model accuracy for classification is = 93.48%
------------------------------------------------
Confusion Matrix:
      0  1
0  7754  0
1   541  0
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.97      7754
           1       0.00      0.00      0.00       541

    accuracy                           0.93      8295
   macro avg       0.47      0.50      0.48      8295
weighted avg       0.87      0.93      0.90      8295



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# RBF kernel, C=1.0, gamma=0.01, positive class (label 1) weighted 3x.
svm = SVC(
    kernel='rbf',
    C=1.0,
    gamma=0.01,
    random_state=42,
    class_weight={0: 1, 1: 3},
).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)

none model accuracy for classification is = 93.51%
------------------------------------------------
Confusion Matrix:
      0  1
0  7753  1
1   537  4
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      7754
           1       0.80      0.01      0.01       541

    accuracy                           0.94      8295
   macro avg       0.87      0.50      0.49      8295
weighted avg       0.93      0.94      0.90      8295



# 3. kernel='poly'

default settings

In [10]:
# Polynomial-kernel SVM with C=1.0; degree, gamma and class weights are left at the library defaults.
svm = SVC(kernel='poly', C=1.0, random_state=42).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)


none model accuracy for classification is = 93.95%
------------------------------------------------
Confusion Matrix:
      0   1
0  7753   1
1   501  40
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      7754
           1       0.98      0.07      0.14       541

    accuracy                           0.94      8295
   macro avg       0.96      0.54      0.55      8295
weighted avg       0.94      0.94      0.91      8295



# 4. kernel='sigmoid'

default settings

In [11]:
# Sigmoid-kernel SVM with C=1.0 and no class weighting.
svm = SVC(kernel='sigmoid', C=1.0, random_state=42).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)


none model accuracy for classification is = 95.61%
------------------------------------------------
Confusion Matrix:
      0    1
0  7724   30
1   334  207
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      7754
           1       0.87      0.38      0.53       541

    accuracy                           0.96      8295
   macro avg       0.92      0.69      0.75      8295
weighted avg       0.95      0.96      0.95      8295



tuning

In [23]:
# Sigmoid kernel, C=0.5, positive class (label 1) weighted 10x.
svm = SVC(
    kernel='sigmoid',
    C=0.5,
    random_state=42,
    class_weight={0: 1, 1: 10},
).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)

none model accuracy for classification is = 90.66%
------------------------------------------------
Confusion Matrix:
      0    1
0  7118  636
1   139  402
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.92      0.95      7754
           1       0.39      0.74      0.51       541

    accuracy                           0.91      8295
   macro avg       0.68      0.83      0.73      8295
weighted avg       0.94      0.91      0.92      8295



In [24]:
# Sigmoid kernel, C=0.9, positive class (label 1) weighted 10x.
svm = SVC(
    kernel='sigmoid',
    C=0.9,
    random_state=42,
    class_weight={0: 1, 1: 10},
).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)

none model accuracy for classification is = 89.69%
------------------------------------------------
Confusion Matrix:
      0    1
0  7042  712
1   143  398
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.91      0.94      7754
           1       0.36      0.74      0.48       541

    accuracy                           0.90      8295
   macro avg       0.67      0.82      0.71      8295
weighted avg       0.94      0.90      0.91      8295



In [16]:
# Sigmoid kernel, C=0.9, positive class (label 1) weighted 3x.
svm = SVC(
    kernel='sigmoid',
    C=0.9,
    random_state=42,
    class_weight={0: 1, 1: 3},
).fit(X_train, y_train)

evaluate_model(model=svm, x_test=X_test, y_test=y_test, sampling_method="none")
add_to_eval_df(svm, "SVM", "clean", X_train, y_train, X_test, y_test)



none model accuracy for classification is = 94.90%
------------------------------------------------
Confusion Matrix:
      0    1
0  7551  203
1   220  321
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      7754
           1       0.61      0.59      0.60       541

    accuracy                           0.95      8295
   macro avg       0.79      0.78      0.79      8295
weighted avg       0.95      0.95      0.95      8295

