In [1]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import uniform
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


## Data

In [2]:
%run ../../../../../functions/vectorize_functions.py

In [3]:
filepath_name = ("..\\..\\..\\..\\..\\..\\data\\mixed_dataset\\train_cleaned.csv")
df_cleaned = pd.read_csv(filepath_name, encoding='utf-8')

In [4]:
X_train, X_test, y_train, y_test, w2v_vectorizer = vectorize_w2v(df=df_cleaned, text_column='tweet_cleaned', 
                                                                                 label_column="label")

In [5]:
with open("svm_w2v_vectorizer_opt.joblib", "wb") as file:
    joblib.dump(w2v_vectorizer, file)

In [6]:
positive = np.count_nonzero(y_train == 1)
negative = np.count_nonzero(y_train == 0)
print("Train")
print("- Positive:", positive)
print("- Negative:", negative)
print("- Verhältnis:", negative / positive)

positive = np.count_nonzero(y_test == 1)
negative = np.count_nonzero(y_test == 0)
print("Test")
print("- Positive:", positive)
print("- Negative:", negative)
print("- Verhältnis:", negative / positive)

Train
- Positive: 10328
- Negative: 47004
- Verhältnis: 4.551123160340821
Test
- Positive: 4417
- Negative: 20155
- Verhältnis: 4.563051845143763


In [7]:
print("X_train shape", X_train.shape)
print("y_train shape", y_train.shape)

print("X_test shape", X_test.shape)
print("y_test shape", y_test.shape)

X_train shape (57332, 300)
y_train shape (57332,)
X_test shape (24572, 300)
y_test shape (24572,)


## SVM

In [8]:
# Hyperparameter search for SVC-model
param_dist = {
    'kernel': ['sigmoid'],
    'C': uniform(0.01, 1),
    'gamma': uniform(0.01, 0.1),
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

random_search = RandomizedSearchCV(
    SVC(),
    param_distributions=param_dist,
    n_iter=10, 
    scoring='f1',
    cv=2, 
    verbose=1,  
    n_jobs=-1,  
    random_state=42,
    refit=True 
)

random_search.fit(X_train, y_train)

# Retrieing best model found
best_model = random_search.best_estimator_

# Load test data
df_test = pd.read_csv("..\\..\\..\\..\\..\\..\\data\\mixed_dataset\\test_cleaned.csv")
df_test = df_test.dropna(subset=['tweet_cleaned'])

# Use vectorizer to transform test data
X_test, _, y_test, _, _ = vectorize_w2v(df_test, "tweet", "label", vector_size=300)

# Evaluate model on test set
y_test_pred = best_model.predict(X_test)
test_report = classification_report(y_test, y_test_pred, output_dict=True)
cm = confusion_matrix(y_test, y_test_pred)

#--------------------------------------
# Calculate additional metrics and add them to classification report df
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')
f1 = f1_score(y_test, y_test_pred, average='weighted')

classification_report_df = pd.DataFrame(test_report).transpose()

classification_report_df['accuracy'] = accuracy
classification_report_df['precision'] = precision
classification_report_df['recall'] = recall
classification_report_df['f1_score'] = f1
#----------------------

print("Test Classification Report:")
print(test_report)

# Save best model and hyperparameter search results
with open("svm_w2v_sigmoid_best_model_opt.joblib", "wb") as file:
    joblib.dump(best_model, file)

results_df = pd.DataFrame(random_search.cv_results_)
results_df.to_csv("svm_w2v_sigmoid_results_opt.csv", index=False)

# Save configuration of best model
best_model_config = best_model.get_params()
best_model_config_df = pd.DataFrame([best_model_config])
best_model_config_df.to_csv("svm_w2v_sigmoid_best_model_config_opt.csv", index=False)

# Save confusion matrix
unique_labels = sorted(y_test.unique())
cm_df = pd.DataFrame(cm, 
                     index=[f"Actual_{label}" for label in unique_labels],
                     columns=[f"Predicted_{label}" for label in unique_labels])
cm_df.to_csv("svm_w2v_sigmoid_confusion_matrix_opt.csv", index=True)

# Save classification report
classification_report_df = pd.DataFrame(test_report).transpose()
classification_report_df.to_csv("svm_w2v_sigmoid_classification_report_opt.csv", index=True)


Fitting 2 folds for each of 10 candidates, totalling 20 fits
Test Classification Report:
{'0': {'precision': 0.8036832412523021, 'recall': 0.24290326171657575, 'f1-score': 0.3730552231150624, 'support': 17966}, '1': {'precision': 0.2135753931544866, 'recall': 0.7760504201680672, 'f1-score': 0.33496554225607544, 'support': 4760}, 'accuracy': 0.3545718560239373, 'macro avg': {'precision': 0.5086293172033943, 'recall': 0.5094768409423215, 'f1-score': 0.35401038268556895, 'support': 22726}, 'weighted avg': {'precision': 0.6800841319965772, 'recall': 0.3545718560239373, 'f1-score': 0.3650772735907828, 'support': 22726}}


In [9]:
# Hyperparameter search for SVC-model
param_dist = {
    'kernel': ['linear'],
    'C': uniform(0.01, 1),
    'gamma': uniform(0.01, 0.1),
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

random_search = RandomizedSearchCV(
    SVC(),
    param_distributions=param_dist,
    n_iter=10, 
    scoring='f1',
    cv=2, 
    verbose=1,  
    n_jobs=-1,  
    random_state=42,
    refit=True 
)

random_search.fit(X_train, y_train)

# Retrieing best model found
best_model = random_search.best_estimator_

# Load test data
df_test = pd.read_csv("..\\..\\..\\..\\..\\..\\data\\mixed_dataset\\test_cleaned.csv")
df_test = df_test.dropna(subset=['tweet_cleaned'])

# Use vectorizer to transform test data
X_test, _, y_test, _, _ = vectorize_w2v(df_test, "tweet", "label", vector_size=300)

# Evaluate model on test set
y_test_pred = best_model.predict(X_test)
test_report = classification_report(y_test, y_test_pred, output_dict=True)
cm = confusion_matrix(y_test, y_test_pred)

#--------------------------------------
# Calculate additional metrics and add them to classification report df
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')
f1 = f1_score(y_test, y_test_pred, average='weighted')

classification_report_df = pd.DataFrame(test_report).transpose()

classification_report_df['accuracy'] = accuracy
classification_report_df['precision'] = precision
classification_report_df['recall'] = recall
classification_report_df['f1_score'] = f1
#----------------------

print("Test Classification Report:")
print(test_report)

# Save best model and hyperparameter search results
with open("svm_w2v_linear_best_model_opt.joblib", "wb") as file:
    joblib.dump(best_model, file)

results_df = pd.DataFrame(random_search.cv_results_)
results_df.to_csv("svm_w2v_linear_results_opt.csv", index=False)

# Save configuration of best model
best_model_config = best_model.get_params()
best_model_config_df = pd.DataFrame([best_model_config])
best_model_config_df.to_csv("svm_w2v_linear_best_model_config_opt.csv", index=False)

# Save confusion matrix
unique_labels = sorted(y_test.unique())
cm_df = pd.DataFrame(cm, 
                     index=[f"Actual_{label}" for label in unique_labels],
                     columns=[f"Predicted_{label}" for label in unique_labels])
cm_df.to_csv("svm_w2v_linear_confusion_matrix_opt.csv", index=True)
 
# Save classification report
classification_report_df = pd.DataFrame(test_report).transpose()
classification_report_df.to_csv("svm_w2v_linear_classification_report_opt.csv", index=True)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
Test Classification Report:
{'0': {'precision': 0.7875560814475177, 'recall': 0.8891239007013247, 'f1-score': 0.8352636670239746, 'support': 17966}, '1': {'precision': 0.18460908718788374, 'recall': 0.09474789915966386, 'f1-score': 0.12522560044425934, 'support': 4760}, 'accuracy': 0.7227404734665142, 'macro avg': {'precision': 0.4860825843177007, 'recall': 0.4919358999304943, 'f1-score': 0.48024463373411697, 'support': 22726}, 'weighted avg': {'precision': 0.6612677908255051, 'recall': 0.7227404734665142, 'f1-score': 0.686544966112268, 'support': 22726}}


In [8]:
# Hyperparameter search for SVC-model
param_dist = {
    'kernel': ['poly'],
    'degree': [3, 4, 5],
    'C': uniform(0.01, 1),
    'gamma': uniform(0.01, 0.1),
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

random_search = RandomizedSearchCV(
    SVC(),
    param_distributions=param_dist,
    n_iter=10, 
    scoring='f1',
    cv=2, 
    verbose=1,  
    n_jobs=-1,  
    random_state=42,
    refit=True 
)

random_search.fit(X_train, y_train)

# Retrieing best model found
best_model = random_search.best_estimator_

# Load test data
df_test = pd.read_csv("..\\..\\..\\..\\..\\..\\data\\mixed_dataset\\test_cleaned.csv")
df_test = df_test.dropna(subset=['tweet_cleaned'])

# Use vectorizer to transform test data
X_test, _, y_test, _, _ = vectorize_w2v(df_test, "tweet", "label", vector_size=300)

# Evaluate model on test set
y_test_pred = best_model.predict(X_test)
test_report = classification_report(y_test, y_test_pred, output_dict=True)
cm = confusion_matrix(y_test, y_test_pred)

#--------------------------------------
# Calculate additional metrics and add them to classification report df
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')
f1 = f1_score(y_test, y_test_pred, average='weighted')

classification_report_df = pd.DataFrame(test_report).transpose()

classification_report_df['accuracy'] = accuracy
classification_report_df['precision'] = precision
classification_report_df['recall'] = recall
classification_report_df['f1_score'] = f1
#----------------------

print("Test Classification Report:")
print(test_report)

# Save best model and hyperparameter search results
with open("svm_w2v_poly_best_model_opt.joblib", "wb") as file:
    joblib.dump(best_model, file)

results_df = pd.DataFrame(random_search.cv_results_)
results_df.to_csv("svm_w2v_poly_results_opt.csv", index=False)

# Save configuration of best model
best_model_config = best_model.get_params()
best_model_config_df = pd.DataFrame([best_model_config])
best_model_config_df.to_csv("svm_w2v_poly_best_model_config_opt.csv", index=False)

# Save confusion matrix
unique_labels = sorted(y_test.unique())
cm_df = pd.DataFrame(cm, 
                     index=[f"Actual_{label}" for label in unique_labels],
                     columns=[f"Predicted_{label}" for label in unique_labels])
cm_df.to_csv("svm_w2v_poly_confusion_matrix_opt.csv", index=True)

# Save classification report
classification_report_df = pd.DataFrame(test_report).transpose()
classification_report_df.to_csv("svm_w2v_poly_classification_report_opt.csv", index=True)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
Test Classification Report:
{'0': {'precision': 0.8347448874199546, 'recall': 0.22492485806523432, 'f1-score': 0.35436488797299076, 'support': 17966}, '1': {'precision': 0.2214145932345541, 'recall': 0.8319327731092437, 'f1-score': 0.34974608081254144, 'support': 4760}, 'accuracy': 0.3520637155680718, 'macro avg': {'precision': 0.5280797403272544, 'recall': 0.528428815587239, 'f1-score': 0.3520554843927661, 'support': 22726}, 'weighted avg': {'precision': 0.7062817966726824, 'recall': 0.3520637155680718, 'f1-score': 0.353397470825946, 'support': 22726}}


In [9]:
# Hyperparameter search for SVC-model
param_dist = {
    'kernel': ['rbf'],
    'degree': [3, 4, 5],
    'C': uniform(0.01, 1),
    'gamma': uniform(0.01, 0.1),
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}]
}

random_search = RandomizedSearchCV(
    SVC(),
    param_distributions=param_dist,
    n_iter=10, 
    scoring='f1',
    cv=2, 
    verbose=1,  
    n_jobs=-1,  
    random_state=42,
    refit=True 
)

random_search.fit(X_train, y_train)

# Retrieing best model found
best_model = random_search.best_estimator_

# Load test data
df_test = pd.read_csv("..\\..\\..\\..\\..\\..\\data\\mixed_dataset\\test_cleaned.csv")
df_test = df_test.dropna(subset=['tweet_cleaned'])

# Use vectorizer to transform test data
X_test, _, y_test, _, _ = vectorize_w2v(df_test, "tweet", "label", vector_size=300)

# Evaluate model on test set
y_test_pred = best_model.predict(X_test)
test_report = classification_report(y_test, y_test_pred, output_dict=True)
cm = confusion_matrix(y_test, y_test_pred)

#--------------------------------------
# Calculate additional metrics and add them to classification report df
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')
f1 = f1_score(y_test, y_test_pred, average='weighted')

classification_report_df = pd.DataFrame(test_report).transpose()

classification_report_df['accuracy'] = accuracy
classification_report_df['precision'] = precision
classification_report_df['recall'] = recall
classification_report_df['f1_score'] = f1
#----------------------

print("Test Classification Report:")
print(test_report)

# Save best model and hyperparameter search results
with open("svm_w2v_rbf_best_model_opt.joblib", "wb") as file:
    joblib.dump(best_model, file)

results_df = pd.DataFrame(random_search.cv_results_)
results_df.to_csv("svm_w2v_rbf_results_opt.csv", index=False)

# Save configuration of best model
best_model_config = best_model.get_params()
best_model_config_df = pd.DataFrame([best_model_config])
best_model_config_df.to_csv("svm_w2v_rbf_best_model_config_opt.csv", index=False)

# Save confusion matrix
unique_labels = sorted(y_test.unique())
cm_df = pd.DataFrame(cm, 
                     index=[f"Actual_{label}" for label in unique_labels],
                     columns=[f"Predicted_{label}" for label in unique_labels])
cm_df.to_csv("svm_w2v_rbf_confusion_matrix_opt.csv", index=True)

# Save classification report
classification_report_df = pd.DataFrame(test_report).transpose()
classification_report_df.to_csv("svm_w2v_rbf_classification_report_opt.csv", index=True)


Fitting 2 folds for each of 10 candidates, totalling 20 fits
Test Classification Report:
{'0': {'precision': 0.7856745994999537, 'recall': 0.9445062896582433, 'f1-score': 0.8578000202204024, 'support': 17966}, '1': {'precision': 0.11613475177304965, 'recall': 0.027521008403361344, 'f1-score': 0.04449728260869565, 'support': 4760}, 'accuracy': 0.7524421367596585, 'macro avg': {'precision': 0.45090467563650166, 'recall': 0.48601364903080235, 'f1-score': 0.451148651414549, 'support': 22726}, 'weighted avg': {'precision': 0.6454383205604103, 'recall': 0.7524421367596585, 'f1-score': 0.6874523553857758, 'support': 22726}}
