In [1]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import vectorize_functions
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.datasets import make_classification


Einladen der vektorisierten Daten

In [2]:
X_train, X_test, y_train, y_test = vectorize_functions.vectorize_tfidf()

In [3]:
positive = np.count_nonzero(y_train == 1)
negative = np.count_nonzero(y_train == 0)
print("Train")
print("- Positive:", positive)
print("- Negative:", negative)
print("- Verhältnis:", negative / positive)

positive = np.count_nonzero(y_test == 1)
negative = np.count_nonzero(y_test == 0)
print("Test")
print("- Positive:", positive)
print("- Negative:", negative)
print("- Verhältnis:", negative / positive)

Train
- Positive: 930
- Negative: 12807
- Verhältnis: 13.770967741935484
Test
- Positive: 374
- Negative: 5514
- Verhältnis: 14.743315508021391


In [4]:
print("X_train shape", X_train.shape)
print("y_train shape", y_train.shape)

print("X_test shape", X_test.shape)
print("y_test shape", y_test.shape)

X_train shape (13737, 4925)
y_train shape (13737,)
X_test shape (5888, 4925)
y_test shape (5888,)


# Evaluation

# SVM


In [None]:
results_list = []

param_grid = {
    'kernel': ['sigmoid'],
    #'degree': [3, 4, 5],
    'C': [0.01, 0.1, 1],
    'gamma': [0.01, 0.1, 1],
    'random_state': [40, 42, 44],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

svm = SVC()

grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

for params in grid_search.cv_results_['params']:
    model = SVC(**params)  
    model.fit(X_train, y_train)  

    y_train_pred = model.predict(X_train)

    y_test_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    result_dict = {
        'vectorization': 'tfidf',
        'kernel': params['kernel'],
        'degree': 'X',
        'C': params['C'],
        'gamma': params['gamma'],
        'class_weight': params['class_weight'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1
    }

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

results_df.to_csv('svm_grid_tfidf_sigmoid.csv', index=False)

print(results_df)


Fitting 3 folds for each of 81 candidates, totalling 243 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

     kernel degree     C  gamma  class_weight  train_accuracy  train_recall  \
0   sigmoid      X  0.01   0.01  {0: 1, 1: 1}        0.932300      0.000000   
1   sigmoid      X  0.01   0.01  {0: 1, 1: 1}        0.932300      0.000000   
2   sigmoid      X  0.01   0.01  {0: 1, 1: 1}        0.932300      0.000000   
3   sigmoid      X  0.01   0.10  {0: 1, 1: 1}        0.932300      0.000000   
4   sigmoid      X  0.01   0.10  {0: 1, 1: 1}        0.932300      0.000000   
..      ...    ...   ...    ...           ...             ...           ...   
76  sigmoid      X  1.00   0.10  {0: 1, 1: 5}        0.961928      0.604301   
77  sigmoid      X  1.00   0.10  {0: 1, 1: 5}        0.961928      0.604301   
78  sigmoid      X  1.00   1.00  {0: 1, 1: 5}        0.949916      0.833333   
79  sigmoid      X  1.00   1.00  {0: 1, 1: 5}        0.949916      0.833333   
80  sigmoid      X  1.00   1.00  {0: 1, 1: 5}        0.949916      0.833333   

    train_precision  train_f1  test_accuracy  test_

In [None]:
results_list = []

param_grid = {
    'kernel': ['linear'],
    #'degree': [3, 4, 5],
    'C': [0.01, 0.1, 1],
    'gamma': [0.01, 0.1, 1],
    'random_state': [40, 42, 44],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

svm = SVC()

grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

for params in grid_search.cv_results_['params']:
    model = SVC(**params)  
    model.fit(X_train, y_train)  

    y_train_pred = model.predict(X_train)

    y_test_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    result_dict = {
        'vectorization': 'tfidf',
        'kernel': params['kernel'],
        'degree': 'X',
        'C': params['C'],
        'gamma': params['gamma'],
        'class_weight': params['class_weight'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1
    }

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

results_df.to_csv('svm_grid_tfidf_linear.csv', index=False)

print(results_df)


Fitting 3 folds for each of 81 candidates, totalling 243 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

    kernel degree     C  gamma  class_weight  train_accuracy  train_recall  \
0   linear      X  0.01   0.01  {0: 1, 1: 1}        0.932300      0.000000   
1   linear      X  0.01   0.01  {0: 1, 1: 1}        0.932300      0.000000   
2   linear      X  0.01   0.01  {0: 1, 1: 1}        0.932300      0.000000   
3   linear      X  0.01   0.10  {0: 1, 1: 1}        0.932300      0.000000   
4   linear      X  0.01   0.10  {0: 1, 1: 1}        0.932300      0.000000   
..     ...    ...   ...    ...           ...             ...           ...   
76  linear      X  1.00   0.10  {0: 1, 1: 5}        0.978525      0.951613   
77  linear      X  1.00   0.10  {0: 1, 1: 5}        0.978525      0.951613   
78  linear      X  1.00   1.00  {0: 1, 1: 5}        0.978525      0.951613   
79  linear      X  1.00   1.00  {0: 1, 1: 5}        0.978525      0.951613   
80  linear      X  1.00   1.00  {0: 1, 1: 5}        0.978525      0.951613   

    train_precision  train_f1  test_accuracy  test_recall  test

In [None]:
results_list = []

param_grid = {
    'kernel': ['poly'],
    'degree': [3, 4, 5],
    'C': [0.01, 0.1, 1],
    'gamma': [0.01, 0.1, 1],
    'random_state': [40, 42, 44],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 3}]
}

svm = SVC()

grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

for params in grid_search.cv_results_['params']:
    model = SVC(**params)  
    model.fit(X_train, y_train)  

    y_train_pred = model.predict(X_train)

    y_test_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    result_dict = {
        'vectorization': 'tfidf',
        'kernel': params['kernel'],
        'degree': params['degree'],
        'C': params['C'],
        'gamma': params['gamma'],
        'class_weight': params['class_weight'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1
    }

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

results_df.to_csv('svm_grid_tfidf_poly.csv', index=False)

print(results_df)


Fitting 3 folds for each of 243 candidates, totalling 729 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

    kernel  degree     C  gamma  class_weight  train_accuracy  train_recall  \
0     poly       3  0.01   0.01  {0: 1, 1: 1}        0.932300           0.0   
1     poly       3  0.01   0.01  {0: 1, 1: 1}        0.932300           0.0   
2     poly       3  0.01   0.01  {0: 1, 1: 1}        0.932300           0.0   
3     poly       3  0.01   0.10  {0: 1, 1: 1}        0.932300           0.0   
4     poly       3  0.01   0.10  {0: 1, 1: 1}        0.932300           0.0   
..     ...     ...   ...    ...           ...             ...           ...   
238   poly       5  1.00   0.10  {0: 1, 1: 3}        0.932300           0.0   
239   poly       5  1.00   0.10  {0: 1, 1: 3}        0.932300           0.0   
240   poly       5  1.00   1.00  {0: 1, 1: 3}        0.999927           1.0   
241   poly       5  1.00   1.00  {0: 1, 1: 3}        0.999927           1.0   
242   poly       5  1.00   1.00  {0: 1, 1: 3}        0.999927           1.0   

     train_precision  train_f1  test_accuracy  test

In [None]:
results_list = []

param_grid = {
    'kernel': ['rbf'],
    #'degree': [],
    'C': [0.01, 0.1, 1],
    'gamma': [0.01, 0.1, 1],
    'random_state': [40, 42, 44],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 3}]
}

svm = SVC()

grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

for params in grid_search.cv_results_['params']:
    model = SVC(**params)  
    model.fit(X_train, y_train)  

    y_train_pred = model.predict(X_train)

    y_test_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    result_dict = {
        'vectorization': 'tfidf',
        'kernel': params['kernel'],
        'degree': 'X',
        'C': params['C'],
        'gamma': params['gamma'],
        'class_weight': params['class_weight'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1
    }

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

results_df.to_csv('svm_grid_tfidf_rbf.csv', index=False)

print(results_df)


Fitting 3 folds for each of 81 candidates, totalling 243 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

   kernel degree     C  gamma  class_weight  train_accuracy  train_recall  \
0     rbf      X  0.01   0.01  {0: 1, 1: 1}        0.932300      0.000000   
1     rbf      X  0.01   0.01  {0: 1, 1: 1}        0.932300      0.000000   
2     rbf      X  0.01   0.01  {0: 1, 1: 1}        0.932300      0.000000   
3     rbf      X  0.01   0.10  {0: 1, 1: 1}        0.932300      0.000000   
4     rbf      X  0.01   0.10  {0: 1, 1: 1}        0.932300      0.000000   
..    ...    ...   ...    ...           ...             ...           ...   
76    rbf      X  1.00   0.10  {0: 1, 1: 3}        0.963529      0.531183   
77    rbf      X  1.00   0.10  {0: 1, 1: 3}        0.963529      0.531183   
78    rbf      X  1.00   1.00  {0: 1, 1: 3}        0.999490      0.996774   
79    rbf      X  1.00   1.00  {0: 1, 1: 3}        0.999490      0.996774   
80    rbf      X  1.00   1.00  {0: 1, 1: 3}        0.999490      0.996774   

    train_precision  train_f1  test_accuracy  test_recall  test_precision  