In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from scipy.stats import uniform
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import matthews_corrcoef

### evaluation metric = f1 score

#### TF-IDF via custom function (by Sven)

In [2]:
!pip install gensim

%run ../../../../../functions/vectorize_functions.py


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [3]:
# Read the cleaned training tweets, discarding rows whose cleaned text is missing
df_train = (
    pd.read_csv("../../../../../../data/mixed_dataset/train_cleaned.csv")
    .dropna(subset=['tweet_cleaned'])
)

In [4]:
# Build TF-IDF features from the training frame with the project helper.
# NOTE(review): presumably the helper fits the vectorizer and splits the frame
# into train/test parts - confirm in vectorize_functions.py. The X_test / y_test
# returned here are replaced further down by features built from test_cleaned.csv.
(X_train, X_test, y_train, y_test, tfidf_vectorizer) = vectorize_tfidf(
    df=df_train,
    text_column="tweet_cleaned",
    label_column="label",
)

# Persist the vectorizer so the same transformation can be reloaded later
with open("svm_tfidf_vectorizer_opt.joblib", "wb") as vectorizer_file:
    joblib.dump(tfidf_vectorizer, vectorizer_file)

In [5]:
# Read the cleaned test tweets, discarding rows whose cleaned text is missing
df_test = (
    pd.read_csv("../../../../../../data/mixed_dataset/test_cleaned.csv")
    .dropna(subset=['tweet_cleaned'])
)

In [6]:
# Labels of the separate test set
y_test = df_test["label"]

# Map the test tweets into the feature space of the already-fitted vectorizer
# (transform only - the vectorizer is not refitted here)
X_test = tfidf_vectorizer.transform(df_test['tweet_cleaned'])

## scoring = recall

In [7]:
# Randomized hyperparameter search for an SVC with a sigmoid kernel, followed by
# train/test evaluation and export of the model, metrics and reports to disk.
results_list = []

# Search space. scipy's uniform(loc, scale) draws from [loc, loc + scale],
# so C is sampled from [0.01, 1.01] and gamma from [0.01, 0.11].
param_dist = {
    'kernel': ['sigmoid'],
    'C': uniform(loc=0.01, scale=1),
    'gamma': uniform(loc=0.01, scale=0.1),
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}],
}

# 10 sampled candidates x 2 CV folds, ranked by recall
random_search = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_dist,
    n_iter=10,
    scoring='recall',
    cv=2,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)

# Retrieve the best model found (refit=True: already refitted on all of X_train)
best_model = random_search.best_estimator_

# Persist the best estimator
with open("svm_tfidf_sigmoid_best_model_opt.joblib", "wb") as model_file:
    joblib.dump(best_model, model_file)

# Predictions on the data used for fitting and on the separate test set
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Per-class reports (as dicts) and the test confusion matrix
train_report = classification_report(y_train, y_train_pred, output_dict=True)
test_report = classification_report(y_test, y_test_pred, output_dict=True)
cm = confusion_matrix(y_test, y_test_pred)

# Aggregate scores: the same metric functions are applied to both splits
overall_metrics = (accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef)
train_accuracy, train_recall, train_precision, train_f1, train_mcc = [
    metric(y_train, y_train_pred) for metric in overall_metrics
]
test_accuracy, test_recall, test_precision, test_f1, test_mcc = [
    metric(y_test, y_test_pred) for metric in overall_metrics
]

# Shortcuts to the per-class sections of the two reports
train_cls0, train_cls1 = train_report['0'], train_report['1']
test_cls0, test_cls1 = test_report['0'], test_report['1']

# One flat record holding every train and test metric
result_dict = {
    # Train metrics
    'train_precision_0': train_cls0['precision'],
    'train_recall_0': train_cls0['recall'],
    'train_f1_0': train_cls0['f1-score'],
    'train_support_0': train_cls0['support'],
    'train_precision_1': train_cls1['precision'],
    'train_recall_1': train_cls1['recall'],
    'train_f1_1': train_cls1['f1-score'],
    'train_support_1': train_cls1['support'],
    'train_accuracy': train_accuracy,
    'train_recall': train_recall,
    'train_precision': train_precision,
    'train_f1': train_f1,
    'train_mcc': train_mcc,
    # Test metrics
    'test_precision_0': test_cls0['precision'],
    'test_recall_0': test_cls0['recall'],
    'test_f1_0': test_cls0['f1-score'],
    'test_support_0': test_cls0['support'],
    'test_precision_1': test_cls1['precision'],
    'test_recall_1': test_cls1['recall'],
    'test_f1_1': test_cls1['f1-score'],
    'test_support_1': test_cls1['support'],
    'test_accuracy': test_accuracy,
    'test_recall': test_recall,
    'test_precision': test_precision,
    'test_f1': test_f1,
    'test_mcc': test_mcc,
}
results_list.append(result_dict)

# Metric summary of the best model
pd.DataFrame(results_list).to_csv('svm_tfidf_sigmoid_results.csv', index=False)

# Full log of the randomized search
results_df = pd.DataFrame(random_search.cv_results_)
results_df.to_csv("svm_tfidf_sigmoid_results_opt.csv", index=False)

# Configuration of the best model
best_model_config = best_model.get_params()
best_model_config_df = pd.DataFrame([best_model_config])
best_model_config_df.to_csv("svm_tfidf_sigmoid_best_model_config_opt.csv", index=False)

# Confusion matrix, with rows/columns named after the labels present in y_test
unique_labels = sorted(y_test.unique())
actual_names = [f"Actual_{label}" for label in unique_labels]
predicted_names = [f"Predicted_{label}" for label in unique_labels]
cm_df = pd.DataFrame(cm, index=actual_names, columns=predicted_names)
cm_df.to_csv("svm_tfidf_sigmoid_confusion_matrix_opt.csv", index=True)

# Test classification report, one row per class / average
classification_report_df = pd.DataFrame(test_report).T
classification_report_df.to_csv("svm_tfidf_sigmoid_classification_report_opt.csv", index=True)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


In [7]:
# Randomized hyperparameter search for an SVC with a linear kernel, followed by
# train/test evaluation and export of the model, metrics and reports to disk.
results_list = []

# Search space. scipy's uniform(loc, scale) draws from [loc, loc + scale],
# so C is sampled from [0.01, 1.01] and gamma from [0.01, 0.11].
# NOTE(review): SVC only uses gamma for the rbf, poly and sigmoid kernels, so the
# sampled gamma has no effect on a linear model; it is kept here so that the
# sampled candidates stay exactly as they were.
param_dist = {
    'kernel': ['linear'],
    'C': uniform(loc=0.01, scale=1),
    'gamma': uniform(loc=0.01, scale=0.1),
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}],
}

# 10 sampled candidates x 2 CV folds, ranked by recall
random_search = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_dist,
    n_iter=10,
    scoring='recall',
    cv=2,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)

# Retrieve the best model found (refit=True: already refitted on all of X_train)
best_model = random_search.best_estimator_

# Persist the best estimator
with open("svm_tfidf_linear_best_model_opt.joblib", "wb") as model_file:
    joblib.dump(best_model, model_file)

# Predictions on the data used for fitting and on the separate test set
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Per-class reports (as dicts) and the test confusion matrix
train_report = classification_report(y_train, y_train_pred, output_dict=True)
test_report = classification_report(y_test, y_test_pred, output_dict=True)
cm = confusion_matrix(y_test, y_test_pred)

# Aggregate scores: the same metric functions are applied to both splits
overall_metrics = (accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef)
train_accuracy, train_recall, train_precision, train_f1, train_mcc = [
    metric(y_train, y_train_pred) for metric in overall_metrics
]
test_accuracy, test_recall, test_precision, test_f1, test_mcc = [
    metric(y_test, y_test_pred) for metric in overall_metrics
]

# Shortcuts to the per-class sections of the two reports
train_cls0, train_cls1 = train_report['0'], train_report['1']
test_cls0, test_cls1 = test_report['0'], test_report['1']

# One flat record holding every train and test metric
result_dict = {
    # Train metrics
    'train_precision_0': train_cls0['precision'],
    'train_recall_0': train_cls0['recall'],
    'train_f1_0': train_cls0['f1-score'],
    'train_support_0': train_cls0['support'],
    'train_precision_1': train_cls1['precision'],
    'train_recall_1': train_cls1['recall'],
    'train_f1_1': train_cls1['f1-score'],
    'train_support_1': train_cls1['support'],
    'train_accuracy': train_accuracy,
    'train_recall': train_recall,
    'train_precision': train_precision,
    'train_f1': train_f1,
    'train_mcc': train_mcc,
    # Test metrics
    'test_precision_0': test_cls0['precision'],
    'test_recall_0': test_cls0['recall'],
    'test_f1_0': test_cls0['f1-score'],
    'test_support_0': test_cls0['support'],
    'test_precision_1': test_cls1['precision'],
    'test_recall_1': test_cls1['recall'],
    'test_f1_1': test_cls1['f1-score'],
    'test_support_1': test_cls1['support'],
    'test_accuracy': test_accuracy,
    'test_recall': test_recall,
    'test_precision': test_precision,
    'test_f1': test_f1,
    'test_mcc': test_mcc,
}
results_list.append(result_dict)

# Metric summary of the best model
pd.DataFrame(results_list).to_csv('svm_tfidf_linear_results.csv', index=False)

# Full log of the randomized search
results_df = pd.DataFrame(random_search.cv_results_)
results_df.to_csv("svm_tfidf_linear_results_opt.csv", index=False)

# Configuration of the best model
best_model_config = best_model.get_params()
best_model_config_df = pd.DataFrame([best_model_config])
best_model_config_df.to_csv("svm_tfidf_linear_best_model_config_opt.csv", index=False)

# Confusion matrix, with rows/columns named after the labels present in y_test
unique_labels = sorted(y_test.unique())
actual_names = [f"Actual_{label}" for label in unique_labels]
predicted_names = [f"Predicted_{label}" for label in unique_labels]
cm_df = pd.DataFrame(cm, index=actual_names, columns=predicted_names)
cm_df.to_csv("svm_tfidf_linear_confusion_matrix_opt.csv", index=True)

# Test classification report, one row per class / average
classification_report_df = pd.DataFrame(test_report).T
classification_report_df.to_csv("svm_tfidf_linear_classification_report_opt.csv", index=True)


Fitting 2 folds for each of 10 candidates, totalling 20 fits


In [8]:
# Randomized hyperparameter search for an SVC with a polynomial kernel, followed by
# train/test evaluation and export of the model, metrics and reports to disk.
results_list = []

# Search space. scipy's uniform(loc, scale) draws from [loc, loc + scale],
# so C is sampled from [0.01, 1.01] and gamma from [0.01, 0.11].
param_dist = {
    'kernel': ['poly'],
    'degree': [3, 4, 5],
    'C': uniform(loc=0.01, scale=1),
    'gamma': uniform(loc=0.01, scale=0.1),
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}],
}

# 10 sampled candidates x 2 CV folds, ranked by recall
random_search = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_dist,
    n_iter=10,
    scoring='recall',
    cv=2,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)

# Retrieve the best model found (refit=True: already refitted on all of X_train)
best_model = random_search.best_estimator_

# Persist the best estimator
with open("svm_tfidf_poly_best_model_opt.joblib", "wb") as model_file:
    joblib.dump(best_model, model_file)

# Predictions on the data used for fitting and on the separate test set
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Per-class reports (as dicts) and the test confusion matrix
train_report = classification_report(y_train, y_train_pred, output_dict=True)
test_report = classification_report(y_test, y_test_pred, output_dict=True)
cm = confusion_matrix(y_test, y_test_pred)

# Aggregate scores: the same metric functions are applied to both splits
overall_metrics = (accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef)
train_accuracy, train_recall, train_precision, train_f1, train_mcc = [
    metric(y_train, y_train_pred) for metric in overall_metrics
]
test_accuracy, test_recall, test_precision, test_f1, test_mcc = [
    metric(y_test, y_test_pred) for metric in overall_metrics
]

# Shortcuts to the per-class sections of the two reports
train_cls0, train_cls1 = train_report['0'], train_report['1']
test_cls0, test_cls1 = test_report['0'], test_report['1']

# One flat record holding every train and test metric
result_dict = {
    # Train metrics
    'train_precision_0': train_cls0['precision'],
    'train_recall_0': train_cls0['recall'],
    'train_f1_0': train_cls0['f1-score'],
    'train_support_0': train_cls0['support'],
    'train_precision_1': train_cls1['precision'],
    'train_recall_1': train_cls1['recall'],
    'train_f1_1': train_cls1['f1-score'],
    'train_support_1': train_cls1['support'],
    'train_accuracy': train_accuracy,
    'train_recall': train_recall,
    'train_precision': train_precision,
    'train_f1': train_f1,
    'train_mcc': train_mcc,
    # Test metrics
    'test_precision_0': test_cls0['precision'],
    'test_recall_0': test_cls0['recall'],
    'test_f1_0': test_cls0['f1-score'],
    'test_support_0': test_cls0['support'],
    'test_precision_1': test_cls1['precision'],
    'test_recall_1': test_cls1['recall'],
    'test_f1_1': test_cls1['f1-score'],
    'test_support_1': test_cls1['support'],
    'test_accuracy': test_accuracy,
    'test_recall': test_recall,
    'test_precision': test_precision,
    'test_f1': test_f1,
    'test_mcc': test_mcc,
}
results_list.append(result_dict)

# Metric summary of the best model
pd.DataFrame(results_list).to_csv('svm_tfidf_poly_results.csv', index=False)

# Full log of the randomized search
results_df = pd.DataFrame(random_search.cv_results_)
results_df.to_csv("svm_tfidf_poly_results_opt.csv", index=False)

# Configuration of the best model
best_model_config = best_model.get_params()
best_model_config_df = pd.DataFrame([best_model_config])
best_model_config_df.to_csv("svm_tfidf_poly_best_model_config_opt.csv", index=False)

# Confusion matrix, with rows/columns named after the labels present in y_test
unique_labels = sorted(y_test.unique())
actual_names = [f"Actual_{label}" for label in unique_labels]
predicted_names = [f"Predicted_{label}" for label in unique_labels]
cm_df = pd.DataFrame(cm, index=actual_names, columns=predicted_names)
cm_df.to_csv("svm_tfidf_poly_confusion_matrix_opt.csv", index=True)

# Test classification report, one row per class / average.
# The file name follows the svm_tfidf_poly_* pattern shared by every other
# output of this cell, so all poly artefacts can be found under one prefix.
classification_report_df = pd.DataFrame(test_report).T
classification_report_df.to_csv("svm_tfidf_poly_classification_report_opt.csv", index=True)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Randomized hyperparameter search for an SVC with an RBF kernel, followed by
# train/test evaluation and export of the model, metrics and reports to disk.
results_list = []

# Search space. scipy's uniform(loc, scale) draws from [loc, loc + scale],
# so C is sampled from [0.01, 1.01] and gamma from [0.01, 0.11].
param_dist = {
    'kernel': ['rbf'],
    'C': uniform(loc=0.01, scale=1),
    'gamma': uniform(loc=0.01, scale=0.1),
    'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}],
}

# 10 sampled candidates x 2 CV folds, ranked by recall
random_search = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_dist,
    n_iter=10,
    scoring='recall',
    cv=2,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)

# Retrieve the best model found (refit=True: already refitted on all of X_train)
best_model = random_search.best_estimator_

# Persist the best estimator
with open("svm_tfidf_rbf_best_model_opt.joblib", "wb") as model_file:
    joblib.dump(best_model, model_file)

# Predictions on the data used for fitting and on the separate test set
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Per-class reports (as dicts) and the test confusion matrix
train_report = classification_report(y_train, y_train_pred, output_dict=True)
test_report = classification_report(y_test, y_test_pred, output_dict=True)
cm = confusion_matrix(y_test, y_test_pred)

# Aggregate scores: the same metric functions are applied to both splits
overall_metrics = (accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef)
train_accuracy, train_recall, train_precision, train_f1, train_mcc = [
    metric(y_train, y_train_pred) for metric in overall_metrics
]
test_accuracy, test_recall, test_precision, test_f1, test_mcc = [
    metric(y_test, y_test_pred) for metric in overall_metrics
]

# Shortcuts to the per-class sections of the two reports
train_cls0, train_cls1 = train_report['0'], train_report['1']
test_cls0, test_cls1 = test_report['0'], test_report['1']

# One flat record holding every train and test metric
result_dict = {
    # Train metrics
    'train_precision_0': train_cls0['precision'],
    'train_recall_0': train_cls0['recall'],
    'train_f1_0': train_cls0['f1-score'],
    'train_support_0': train_cls0['support'],
    'train_precision_1': train_cls1['precision'],
    'train_recall_1': train_cls1['recall'],
    'train_f1_1': train_cls1['f1-score'],
    'train_support_1': train_cls1['support'],
    'train_accuracy': train_accuracy,
    'train_recall': train_recall,
    'train_precision': train_precision,
    'train_f1': train_f1,
    'train_mcc': train_mcc,
    # Test metrics
    'test_precision_0': test_cls0['precision'],
    'test_recall_0': test_cls0['recall'],
    'test_f1_0': test_cls0['f1-score'],
    'test_support_0': test_cls0['support'],
    'test_precision_1': test_cls1['precision'],
    'test_recall_1': test_cls1['recall'],
    'test_f1_1': test_cls1['f1-score'],
    'test_support_1': test_cls1['support'],
    'test_accuracy': test_accuracy,
    'test_recall': test_recall,
    'test_precision': test_precision,
    'test_f1': test_f1,
    'test_mcc': test_mcc,
}
results_list.append(result_dict)

# Metric summary of the best model
pd.DataFrame(results_list).to_csv('svm_tfidf_rbf_results.csv', index=False)

# Full log of the randomized search
results_df = pd.DataFrame(random_search.cv_results_)
results_df.to_csv("svm_tfidf_rbf_results_opt.csv", index=False)

# Configuration of the best model
best_model_config = best_model.get_params()
best_model_config_df = pd.DataFrame([best_model_config])
best_model_config_df.to_csv("svm_tfidf_rbf_best_model_config_opt.csv", index=False)

# Confusion matrix, with rows/columns named after the labels present in y_test
unique_labels = sorted(y_test.unique())
actual_names = [f"Actual_{label}" for label in unique_labels]
predicted_names = [f"Predicted_{label}" for label in unique_labels]
cm_df = pd.DataFrame(cm, index=actual_names, columns=predicted_names)
cm_df.to_csv("svm_tfidf_rbf_confusion_matrix_opt.csv", index=True)

# Test classification report, one row per class / average
classification_report_df = pd.DataFrame(test_report).T
classification_report_df.to_csv("svm_tfidf_rbf_classification_report_opt.csv", index=True)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
