In [1]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.datasets import make_classification


Einladen der vektorisierten Daten

In [2]:
%run ../../functions/vectorize_functions.py

In [3]:
filepath_name = ("..\\..\\..\\data\\new_datasets\\train_cleaned.csv")
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

In [4]:
X_train, X_test, y_train, y_test = vectorize_tfidf(df=df_cleaned, text_column='tweet_cleaned', 
                                                                                 label_column="label")

In [5]:
positive = np.count_nonzero(y_train == 1)
negative = np.count_nonzero(y_train == 0)
print("Train")
print("- Positive:", positive)
print("- Negative:", negative)
print("- Verhältnis:", negative / positive)

positive = np.count_nonzero(y_test == 1)
negative = np.count_nonzero(y_test == 0)
print("Test")
print("- Positive:", positive)
print("- Negative:", negative)
print("- Verhältnis:", negative / positive)

Train
- Positive: 9184
- Negative: 33695
- Verhältnis: 3.668880662020906
Test
- Positive: 3919
- Negative: 14459
- Verhältnis: 3.689461597346262


In [6]:
print("X_train shape", X_train.shape)
print("y_train shape", y_train.shape)

print("X_test shape", X_test.shape)
print("y_test shape", y_test.shape)

X_train shape (42879, 10992)
y_train shape (42879,)
X_test shape (18378, 10992)
y_test shape (18378,)


# Evaluation

# SVM


In [None]:
results_list = []

param_combinations = [
    {'C': 1, 'gamma': 1, 'class_weight': {0: 1, 1: 3}},
    {'C': 1, 'gamma': 0.1, 'class_weight': {0: 1, 1: 5}},
    {'C': 0.1, 'gamma': 1, 'class_weight': {0: 1, 1: 5}}
]

svm = SVC(kernel='sigmoid', random_state=42)  

for params in param_combinations:
    model = SVC(kernel='sigmoid', C=params['C'], gamma=params['gamma'], 
                class_weight=params['class_weight'], random_state=42)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_report = classification_report(y_train, y_train_pred, output_dict=True)
    test_report = classification_report(y_test, y_test_pred, output_dict=True)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    result_dict = {
        'vectorization': 'w2v',
        'kernel': 'sigmoid',
        'degree': 'X',   
        'C': params['C'],
        'gamma': params['gamma'],
        'class_weight': params['class_weight'],

        # Train metrics
        'train_precision_0': train_report['0']['precision'],
        'train_recall_0': train_report['0']['recall'],
        'train_f1_0': train_report['0']['f1-score'],
        'train_support_0': train_report['0']['support'],
        'train_precision_1': train_report['1']['precision'],
        'train_recall_1': train_report['1']['recall'],
        'train_f1_1': train_report['1']['f1-score'],
        'train_support_1': train_report['1']['support'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,

        # Test metrics
        'test_precision_0': test_report['0']['precision'],
        'test_recall_0': test_report['0']['recall'],
        'test_f1_0': test_report['0']['f1-score'],
        'test_support_0': test_report['0']['support'],
        'test_precision_1': test_report['1']['precision'],
        'test_recall_1': test_report['1']['recall'],
        'test_f1_1': test_report['1']['f1-score'],
        'test_support_1': test_report['1']['support'],
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1
    }

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

results_df.to_csv('svm_tfidf_sigmoid_new_data', index=False)

print(results_df)


In [None]:
results_list = []

param_combinations = [
    {'kernel': 'linear', 'C': 1, 'gamma': 0.1, 'class_weight': {0: 1, 1: 3}},#random state?
    {'kernel': 'linear', 'C': 1, 'gamma': 1, 'class_weight': {0: 1, 1: 3}},
    {'kernel': 'linear', 'C': 1, 'gamma': 0.01, 'class_weight': {0: 1, 1: 3}}
]

for params in param_combinations:
    model = SVC(**params)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_report = classification_report(y_train, y_train_pred, output_dict=True)
    test_report = classification_report(y_test, y_test_pred, output_dict=True)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    result_dict = {
        'vectorization': 'w2v',
        'kernel': params['kernel'],
        'degree': 'X',
        'C': params['C'],
        'gamma': params['gamma'],
        'class_weight': params['class_weight'],

        # Train metrics
        'train_precision_0': train_report['0']['precision'],
        'train_recall_0': train_report['0']['recall'],
        'train_f1_0': train_report['0']['f1-score'],
        'train_support_0': train_report['0']['support'],
        'train_precision_1': train_report['1']['precision'],
        'train_recall_1': train_report['1']['recall'],
        'train_f1_1': train_report['1']['f1-score'],
        'train_support_1': train_report['1']['support'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,

        # Test metrics
        'test_precision_0': test_report['0']['precision'],
        'test_recall_0': test_report['0']['recall'],
        'test_f1_0': test_report['0']['f1-score'],
        'test_support_0': test_report['0']['support'],
        'test_precision_1': test_report['1']['precision'],
        'test_recall_1': test_report['1']['recall'],
        'test_f1_1': test_report['1']['f1-score'],
        'test_support_1': test_report['1']['support'],
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1
    }

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)
results_df.to_csv('svm_tfidf_linear_new_data.csv', index=False)

print(results_df)


  vectorization  kernel degree  C  gamma  class_weight  train_precision_0  \
0           w2v  linear      X  1   0.10  {0: 1, 1: 3}           0.943711   
1           w2v  linear      X  1   1.00  {0: 1, 1: 3}           0.943711   
2           w2v  linear      X  1   0.01  {0: 1, 1: 3}           0.943711   

   train_recall_0  train_f1_0  train_support_0  ...  test_f1_0  \
0        0.853331    0.896249            33695  ...   0.849625   
1        0.853331    0.896249            33695  ...   0.849625   
2        0.853331    0.896249            33695  ...   0.849625   

   test_support_0  test_precision_1  test_recall_1  test_f1_1  test_support_1  \
0           14459          0.477334        0.63409   0.544658            3919   
1           14459          0.477334        0.63409   0.544658            3919   
2           14459          0.477334        0.63409   0.544658            3919   

   test_accuracy  test_recall  test_precision   test_f1  
0       0.773914      0.63409        0.4773

In [10]:
results_list = []

param_grid = {
    'kernel': ['poly'],
    'degree': [3, 4],
    'C': [1],
    'gamma': [1],
    'random_state': [42],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3},]
}

svm = SVC()

grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

for params in grid_search.cv_results_['params']:
    model = SVC(**params)  
    model.fit(X_train, y_train)  

    y_train_pred = model.predict(X_train)

    y_test_pred = model.predict(X_test)

    train_report = classification_report(y_train, y_train_pred, output_dict=True)
    test_report = classification_report(y_test, y_test_pred, output_dict=True)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)

    result_dict = {
        'vectorization': 'tfidf',
        'kernel': params['kernel'],
        'degree': params['degree'],
        'C': params['C'],
        'gamma': params['gamma'],
        'random_state': params['random_state'],
        'class_weight': params['class_weight'],

        # Train metrics
        'train_precision_0': train_report['0']['precision'],
        'train_recall_0': train_report['0']['recall'],
        'train_f1_0': train_report['0']['f1-score'],
        'train_support_0': train_report['0']['support'],
        'train_precision_1': train_report['1']['precision'],
        'train_recall_1': train_report['1']['recall'],
        'train_f1_1': train_report['1']['f1-score'],
        'train_support_1': train_report['1']['support'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,

        # Test metrics
        'test_precision_0': test_report['0']['precision'],
        'test_recall_0': test_report['0']['recall'],
        'test_f1_0': test_report['0']['f1-score'],
        'test_support_0': test_report['0']['support'],
        'test_precision_1': test_report['1']['precision'],
        'test_recall_1': test_report['1']['recall'],
        'test_f1_1': test_report['1']['f1-score'],
        'test_support_1': test_report['1']['support'],
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1
    }

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

results_df.to_csv('svm_tfidf_poly_new_data.csv', index=False)


print(results_df)
 

Fitting 3 folds for each of 4 candidates, totalling 12 fits
  vectorization kernel  degree  C  gamma  random_state  class_weight  \
0         tfidf   poly       3  1      1            42  {0: 1, 1: 1}   
1         tfidf   poly       4  1      1            42  {0: 1, 1: 1}   
2         tfidf   poly       3  1      1            42  {0: 1, 1: 3}   
3         tfidf   poly       4  1      1            42  {0: 1, 1: 3}   

   train_precision_0  train_recall_0  train_f1_0  ...  test_f1_0  \
0           0.987689        0.997655    0.992647  ...   0.886927   
1           0.993595        0.999110    0.996345  ...   0.883838   
2           0.999672        0.994391    0.997024  ...   0.888064   
3           0.999851        0.997448    0.998648  ...   0.883942   

   test_support_0  test_precision_1  test_recall_1  test_f1_1  test_support_1  \
0           14459          0.741472       0.105384   0.184540            3919   
1           14459          0.779661       0.046951   0.088568            391

In [11]:
results_list = []

param_grid = {
    'kernel': ['rbf'],
    #'degree': [],
    'C': [1],
    'gamma': [0.1, 1],
    'random_state': [42],
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}]
}


svm = SVC()

grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

for params in grid_search.cv_results_['params']:
    model = SVC(**params)  
    model.fit(X_train, y_train)  

    y_train_pred = model.predict(X_train)

    y_test_pred = model.predict(X_test)

    train_report = classification_report(y_train, y_train_pred, output_dict=True)
    test_report = classification_report(y_test, y_test_pred, output_dict=True)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    
    result_dict = {
        'vectorization': 'tfidf',
        'kernel': params['kernel'],
        'degree': 'X',
        'C': params['C'],
        'gamma': params['gamma'],
        'random_state': params['random_state'],
        'class_weight': params['class_weight'],

        # Train metrics
        'train_precision_0': train_report['0']['precision'],
        'train_recall_0': train_report['0']['recall'],
        'train_f1_0': train_report['0']['f1-score'],
        'train_support_0': train_report['0']['support'],
        'train_precision_1': train_report['1']['precision'],
        'train_recall_1': train_report['1']['recall'],
        'train_f1_1': train_report['1']['f1-score'],
        'train_support_1': train_report['1']['support'],
        'train_accuracy': train_accuracy,
        'train_recall': train_recall,
        'train_precision': train_precision,
        'train_f1': train_f1,

        # Test metrics
        'test_precision_0': test_report['0']['precision'],
        'test_recall_0': test_report['0']['recall'],
        'test_f1_0': test_report['0']['f1-score'],
        'test_support_0': test_report['0']['support'],
        'test_precision_1': test_report['1']['precision'],
        'test_recall_1': test_report['1']['recall'],
        'test_f1_1': test_report['1']['f1-score'],
        'test_support_1': test_report['1']['support'],
        'test_accuracy': test_accuracy,
        'test_recall': test_recall,
        'test_precision': test_precision,
        'test_f1': test_f1
    }

    results_list.append(result_dict)

results_df = pd.DataFrame(results_list)

results_df.to_csv('svm_tfidf_rbf_new_data.csv', index=False)

print(results_df)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
  vectorization kernel degree  C  gamma  random_state  class_weight  \
0         tfidf    rbf      X  1    0.1            42  {0: 1, 1: 1}   
1         tfidf    rbf      X  1    1.0            42  {0: 1, 1: 1}   
2         tfidf    rbf      X  1    0.1            42  {0: 1, 1: 3}   
3         tfidf    rbf      X  1    1.0            42  {0: 1, 1: 3}   

   train_precision_0  train_recall_0  train_f1_0  ...  test_f1_0  \
0           0.816219        0.987238    0.893620  ...   0.892505   
1           0.902826        0.993471    0.945982  ...   0.897413   
2           0.912750        0.869625    0.890665  ...   0.869234   
3           0.998909        0.978038    0.988363  ...   0.890199   

   test_support_0  test_precision_1  test_recall_1  test_f1_1  test_support_1  \
0           14459          0.759240       0.183465   0.295520            3919   
1           14459          0.741201       0.274050   0.400149            3919   
