In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.datasets import make_classification
import joblib
from sklearn.metrics import matthews_corrcoef

Einlesen und Vektorisieren der Daten

In [10]:
# Execute the shared helper script inside this kernel's namespace.
# NOTE(review): presumably this is what defines vectorize_glove, which is called
# further down without being imported anywhere in this notebook — confirm.
%run ../../../../functions/vectorize_functions.py

In [11]:
# Relative path (from this notebook) to the cleaned training CSV.
filepath_name = '../../../../../data/twitter_hate-speech/train_cleaned.csv'

# Read the file as UTF-8 into the DataFrame used for vectorization.
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

In [12]:
# Vectorize the cleaned tweets with the GloVe helper. The call hands back the
# train/test features, the train/test labels, and a fifth object (kept as glv_mode)
# that is persisted with joblib in the next cell.
X_train, X_test, y_train, y_test, glv_mode = vectorize_glove(
    df=df_cleaned,
    text_column='tweet_cleaned',
    label_column="label",
)

In [13]:
# Persist the object returned by vectorize_glove next to this notebook so it can
# be reloaded later without re-running the vectorization.
vectorizer_path = "svm_glove_grid_original_datavectorizer.joblib"
with open(vectorizer_path, "wb") as vectorizer_file:
    joblib.dump(glv_mode, vectorizer_file)

In [14]:
# Print the class balance of each split: number of label-1 rows, number of
# label-0 rows, and the negative-to-positive ratio.
for split_name, split_labels in (("Train", y_train), ("Test", y_test)):
    positive = np.count_nonzero(split_labels == 1)
    negative = np.count_nonzero(split_labels == 0)
    print(split_name)
    print("- Positive:", positive)
    print("- Negative:", negative)
    print("- Verhältnis:", negative / positive)

Train
- Positive: 964
- Negative: 13160
- Verhältnis: 13.651452282157676
Test
- Positive: 367
- Negative: 5687
- Verhältnis: 15.49591280653951


In [15]:
# Sanity check: show the dimensions of the feature matrices and label vectors.
for message, array in (
    ("X_train shape", X_train),
    ("y_train shape", y_train),
    ("X_test shape", X_test),
    ("y_test shape", y_test),
):
    print(message, array.shape)

X_train shape (14124, 100)
y_train shape (14124,)
X_test shape (6054, 100)
y_test shape (6054,)


# SVM


In [16]:
# Sigmoid-kernel SVM on the GloVe features.
# A 3-fold grid search (scored by F1) is run first; afterwards every candidate
# parameter set is refitted on the full training split and its train/test metrics
# are collected into one row of svm_grid_glove_sigmoid.csv.
results_list = []

# 'degree' is not searched here (the poly cell covers it), so rows carry 'X' instead.
# NOTE(review): scikit-learn documents SVC's random_state as ignored unless
# probability=True, so the three seeds presumably produce identical rows and triple
# the runtime — confirm before trimming (it changes the number of rows in the CSV).
param_grid = {
    'kernel': ['sigmoid'],
    'C': [0.01, 0.1, 1],
    'gamma': [0.01, 0.1, 1],
    'random_state': [40, 42, 44],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

svm = SVC()

grid_search = GridSearchCV(
    estimator=svm, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1
)

# NOTE(review): only the candidate list of this search is reused below; its
# cross-validated scores are not written to the CSV.
grid_search.fit(X_train, y_train)

# (key in classification_report, fragment used in the CSV column name)
report_fields = (
    ('precision', 'precision'),
    ('recall', 'recall'),
    ('f1-score', 'f1'),
    ('support', 'support'),
)

for params in grid_search.cv_results_['params']:
    model = SVC(**params)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    result_dict = {
        'vectorization': 'glove',
        'kernel': params['kernel'],
        'degree': 'X',
        'C': params['C'],
        'gamma': params['gamma'],
        'random_state': params['random_state'],
        'class_weight': params['class_weight'],
    }

    # Train metrics first, then test metrics, so the CSV column order is unchanged.
    for split, y_true, y_pred in (
        ('train', y_train, y_train_pred),
        ('test', y_test, y_test_pred),
    ):
        # zero_division=0 yields the same 0.0 as the default "warn" setting when a
        # model never predicts a class, but without one warning per call.
        report = classification_report(
            y_true, y_pred, output_dict=True, zero_division=0
        )
        for label in ('0', '1'):
            for report_key, column in report_fields:
                result_dict[split + '_' + column + '_' + label] = report[label][report_key]

        result_dict[split + '_accuracy'] = accuracy_score(y_true, y_pred)
        result_dict[split + '_recall'] = recall_score(y_true, y_pred, zero_division=0)
        result_dict[split + '_precision'] = precision_score(y_true, y_pred, zero_division=0)
        result_dict[split + '_f1'] = f1_score(y_true, y_pred, zero_division=0)
        result_dict[split + '_mcc'] = matthews_corrcoef(y_true, y_pred)

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)
results_df.to_csv('svm_grid_glove_sigmoid.csv', index=False)

print(results_df)


Fitting 3 folds for each of 81 candidates, totalling 243 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

   vectorization   kernel degree     C  gamma  random_state  class_weight  \
0          glove  sigmoid      X  0.01   0.01            40  {0: 1, 1: 1}   
1          glove  sigmoid      X  0.01   0.01            42  {0: 1, 1: 1}   
2          glove  sigmoid      X  0.01   0.01            44  {0: 1, 1: 1}   
3          glove  sigmoid      X  0.01   0.10            40  {0: 1, 1: 1}   
4          glove  sigmoid      X  0.01   0.10            42  {0: 1, 1: 1}   
..           ...      ...    ...   ...    ...           ...           ...   
76         glove  sigmoid      X  1.00   0.10            42  {0: 1, 1: 5}   
77         glove  sigmoid      X  1.00   0.10            44  {0: 1, 1: 5}   
78         glove  sigmoid      X  1.00   1.00            40  {0: 1, 1: 5}   
79         glove  sigmoid      X  1.00   1.00            42  {0: 1, 1: 5}   
80         glove  sigmoid      X  1.00   1.00            44  {0: 1, 1: 5}   

    train_precision_0  train_recall_0  train_f1_0  ...  test_support_0  \
0

In [20]:
# Linear-kernel SVM on the GloVe features.
# A 3-fold grid search (scored by F1) is run first; afterwards every candidate
# parameter set is refitted on the full training split and its train/test metrics
# are collected into one row of svm_grid_glove_linear.csv.
results_list = []

# 'degree' is not searched here (the poly cell covers it), so rows carry 'X' instead.
# NOTE(review): scikit-learn documents gamma as a coefficient for the rbf, poly and
# sigmoid kernels only, and random_state as ignored unless probability=True. Both
# axes presumably just repeat identical linear fits — confirm before trimming the
# grid (it changes the number of rows in the CSV).
param_grid = {
    'kernel': ['linear'],
    'C': [0.01, 0.1, 1],
    'gamma': [0.01, 0.1, 1],
    'random_state': [40, 42, 44],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

svm = SVC()

grid_search = GridSearchCV(
    estimator=svm, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1
)

# NOTE(review): only the candidate list of this search is reused below; its
# cross-validated scores are not written to the CSV.
grid_search.fit(X_train, y_train)

# (key in classification_report, fragment used in the CSV column name)
report_fields = (
    ('precision', 'precision'),
    ('recall', 'recall'),
    ('f1-score', 'f1'),
    ('support', 'support'),
)

for params in grid_search.cv_results_['params']:
    model = SVC(**params)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    result_dict = {
        'vectorization': 'glove',
        'kernel': params['kernel'],
        'degree': 'X',
        'C': params['C'],
        'gamma': params['gamma'],
        'random_state': params['random_state'],
        'class_weight': params['class_weight'],
    }

    # Train metrics first, then test metrics, so the CSV column order is unchanged.
    for split, y_true, y_pred in (
        ('train', y_train, y_train_pred),
        ('test', y_test, y_test_pred),
    ):
        # zero_division=0 yields the same 0.0 as the default "warn" setting when a
        # model never predicts a class, but without one warning per call.
        report = classification_report(
            y_true, y_pred, output_dict=True, zero_division=0
        )
        for label in ('0', '1'):
            for report_key, column in report_fields:
                result_dict[split + '_' + column + '_' + label] = report[label][report_key]

        result_dict[split + '_accuracy'] = accuracy_score(y_true, y_pred)
        result_dict[split + '_recall'] = recall_score(y_true, y_pred, zero_division=0)
        result_dict[split + '_precision'] = precision_score(y_true, y_pred, zero_division=0)
        result_dict[split + '_f1'] = f1_score(y_true, y_pred, zero_division=0)
        result_dict[split + '_mcc'] = matthews_corrcoef(y_true, y_pred)

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

results_df.to_csv('svm_grid_glove_linear.csv', index=False)

print(results_df)


Fitting 3 folds for each of 81 candidates, totalling 243 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

   vectorization  kernel degree     C  gamma  random_state  class_weight  \
0          glove  linear      X  0.01   0.01            40  {0: 1, 1: 1}   
1          glove  linear      X  0.01   0.01            42  {0: 1, 1: 1}   
2          glove  linear      X  0.01   0.01            44  {0: 1, 1: 1}   
3          glove  linear      X  0.01   0.10            40  {0: 1, 1: 1}   
4          glove  linear      X  0.01   0.10            42  {0: 1, 1: 1}   
..           ...     ...    ...   ...    ...           ...           ...   
76         glove  linear      X  1.00   0.10            42  {0: 1, 1: 5}   
77         glove  linear      X  1.00   0.10            44  {0: 1, 1: 5}   
78         glove  linear      X  1.00   1.00            40  {0: 1, 1: 5}   
79         glove  linear      X  1.00   1.00            42  {0: 1, 1: 5}   
80         glove  linear      X  1.00   1.00            44  {0: 1, 1: 5}   

    train_precision_0  train_recall_0  train_f1_0  ...  test_support_0  \
0            

In [21]:
# Polynomial-kernel SVM on the GloVe features.
# A 3-fold grid search (scored by F1) is run first; afterwards every candidate
# parameter set is refitted on the full training split and its train/test metrics
# are collected into one row of svm_grid_glove_poly.csv.
results_list = []

# NOTE(review): scikit-learn documents SVC's random_state as ignored unless
# probability=True, so the three seeds presumably produce identical rows and triple
# the runtime — confirm before trimming (it changes the number of rows in the CSV).
param_grid = {
    'kernel': ['poly'],
    'degree': [3, 4, 5],
    'C': [0.01, 0.1, 1],
    'gamma': [0.01, 0.1, 1],
    'random_state': [40, 42, 44],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

svm = SVC()

grid_search = GridSearchCV(
    estimator=svm, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1
)

# NOTE(review): only the candidate list of this search is reused below; its
# cross-validated scores are not written to the CSV.
grid_search.fit(X_train, y_train)

# (key in classification_report, fragment used in the CSV column name)
report_fields = (
    ('precision', 'precision'),
    ('recall', 'recall'),
    ('f1-score', 'f1'),
    ('support', 'support'),
)

for params in grid_search.cv_results_['params']:
    model = SVC(**params)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    result_dict = {
        'vectorization': 'glove',
        'kernel': params['kernel'],
        'degree': params['degree'],
        'C': params['C'],
        'gamma': params['gamma'],
        'random_state': params['random_state'],
        'class_weight': params['class_weight'],
    }

    # Train metrics first, then test metrics, so the CSV column order is unchanged.
    for split, y_true, y_pred in (
        ('train', y_train, y_train_pred),
        ('test', y_test, y_test_pred),
    ):
        # zero_division=0 yields the same 0.0 as the default "warn" setting when a
        # model never predicts a class, but without one warning per call.
        report = classification_report(
            y_true, y_pred, output_dict=True, zero_division=0
        )
        for label in ('0', '1'):
            for report_key, column in report_fields:
                result_dict[split + '_' + column + '_' + label] = report[label][report_key]

        result_dict[split + '_accuracy'] = accuracy_score(y_true, y_pred)
        result_dict[split + '_recall'] = recall_score(y_true, y_pred, zero_division=0)
        result_dict[split + '_precision'] = precision_score(y_true, y_pred, zero_division=0)
        result_dict[split + '_f1'] = f1_score(y_true, y_pred, zero_division=0)
        result_dict[split + '_mcc'] = matthews_corrcoef(y_true, y_pred)

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

results_df.to_csv('svm_grid_glove_poly.csv', index=False)

print(results_df)
 

Fitting 3 folds for each of 243 candidates, totalling 729 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

    vectorization kernel  degree     C  gamma  random_state  class_weight  \
0           glove   poly       3  0.01   0.01            40  {0: 1, 1: 1}   
1           glove   poly       3  0.01   0.01            42  {0: 1, 1: 1}   
2           glove   poly       3  0.01   0.01            44  {0: 1, 1: 1}   
3           glove   poly       3  0.01   0.10            40  {0: 1, 1: 1}   
4           glove   poly       3  0.01   0.10            42  {0: 1, 1: 1}   
..            ...    ...     ...   ...    ...           ...           ...   
238         glove   poly       5  1.00   0.10            42  {0: 1, 1: 5}   
239         glove   poly       5  1.00   0.10            44  {0: 1, 1: 5}   
240         glove   poly       5  1.00   1.00            40  {0: 1, 1: 5}   
241         glove   poly       5  1.00   1.00            42  {0: 1, 1: 5}   
242         glove   poly       5  1.00   1.00            44  {0: 1, 1: 5}   

     train_precision_0  train_recall_0  train_f1_0  ...  test_support_0  \


In [18]:
# RBF-kernel SVM on the GloVe features.
# A 3-fold grid search (scored by F1) is run first; afterwards every candidate
# parameter set is refitted on the full training split and its train/test metrics
# are collected into one row of svm_grid_glove_rbf.csv.
results_list = []

# 'degree' is not searched here (the poly cell covers it), so rows carry 'X' instead.
# NOTE(review): scikit-learn documents SVC's random_state as ignored unless
# probability=True, so the three seeds presumably produce identical rows and triple
# the runtime — confirm before trimming (it changes the number of rows in the CSV).
param_grid = {
    'kernel': ['rbf'],
    'C': [0.01, 0.1, 1],
    'gamma': [0.01, 0.1, 1],
    'random_state': [40, 42, 44],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

svm = SVC()

grid_search = GridSearchCV(
    estimator=svm, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1
)

# NOTE(review): only the candidate list of this search is reused below; its
# cross-validated scores are not written to the CSV.
grid_search.fit(X_train, y_train)

# (key in classification_report, fragment used in the CSV column name)
report_fields = (
    ('precision', 'precision'),
    ('recall', 'recall'),
    ('f1-score', 'f1'),
    ('support', 'support'),
)

for params in grid_search.cv_results_['params']:
    model = SVC(**params)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    result_dict = {
        'vectorization': 'glove',
        'kernel': params['kernel'],
        'degree': 'X',
        'C': params['C'],
        'gamma': params['gamma'],
        'random_state': params['random_state'],
        'class_weight': params['class_weight'],
    }

    # Train metrics first, then test metrics, so the CSV column order is unchanged.
    for split, y_true, y_pred in (
        ('train', y_train, y_train_pred),
        ('test', y_test, y_test_pred),
    ):
        # zero_division=0 yields the same 0.0 as the default "warn" setting when a
        # model never predicts a class, but without one warning per call.
        report = classification_report(
            y_true, y_pred, output_dict=True, zero_division=0
        )
        for label in ('0', '1'):
            for report_key, column in report_fields:
                result_dict[split + '_' + column + '_' + label] = report[label][report_key]

        result_dict[split + '_accuracy'] = accuracy_score(y_true, y_pred)
        result_dict[split + '_recall'] = recall_score(y_true, y_pred, zero_division=0)
        result_dict[split + '_precision'] = precision_score(y_true, y_pred, zero_division=0)
        result_dict[split + '_f1'] = f1_score(y_true, y_pred, zero_division=0)
        result_dict[split + '_mcc'] = matthews_corrcoef(y_true, y_pred)

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

results_df.to_csv('svm_grid_glove_rbf.csv', index=False)

print(results_df)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

   vectorization kernel degree     C  gamma  random_state  class_weight  \
0          glove    rbf      X  0.01   0.01            40  {0: 1, 1: 1}   
1          glove    rbf      X  0.01   0.01            42  {0: 1, 1: 1}   
2          glove    rbf      X  0.01   0.01            44  {0: 1, 1: 1}   
3          glove    rbf      X  0.01   0.10            40  {0: 1, 1: 1}   
4          glove    rbf      X  0.01   0.10            42  {0: 1, 1: 1}   
..           ...    ...    ...   ...    ...           ...           ...   
76         glove    rbf      X  1.00   0.10            42  {0: 1, 1: 5}   
77         glove    rbf      X  1.00   0.10            44  {0: 1, 1: 5}   
78         glove    rbf      X  1.00   1.00            40  {0: 1, 1: 5}   
79         glove    rbf      X  1.00   1.00            42  {0: 1, 1: 5}   
80         glove    rbf      X  1.00   1.00            44  {0: 1, 1: 5}   

    train_precision_0  train_recall_0  train_f1_0  ...  test_support_0  \
0            0.931747    