In [None]:
!pip install nlpaug

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): none of the cells visible in this notebook read from the mount —
# the CSVs are uploaded into the working directory instead. Confirm whether this
# cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Upload the two dataset CSVs (one upload dialog per file)

from google.colab import files

for expected_name in ("edos_labelled_aggregated.csv", "edos_test_category_5.csv"):
    print(f"Upload {expected_name}")
    # Bind the result to a name: a bare `files.upload()` as the last expression
    # of the cell would echo the returned mapping as the cell output.
    uploaded = files.upload()
    # Flag a wrong or renamed file now instead of failing later in read_csv.
    if expected_name not in uploaded:
        print(f"Warning: expected '{expected_name}' but received {list(uploaded)}")

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import re
import string
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
import time
import nlpaug.augmenter.word as naw

In [None]:
# Read the labelled corpus and the separate test file from the working directory
train_df = pd.read_csv('edos_labelled_aggregated.csv')
test_df = pd.read_csv('edos_test_category_5.csv')

# Keep only the columns the rest of the notebook needs
wanted_columns = ['text', 'label_sexist', 'split', 'label_category']
train_df = train_df.loc[:, wanted_columns]

# Restrict to rows labelled as sexist
is_sexist = train_df['label_sexist'] == 'sexist'
sexist_df = train_df[is_sexist]

# Partition by the dataset's own 'split' column
train_sexist_df = sexist_df[sexist_df['split'] == 'train']
val_sexist_df = sexist_df[sexist_df['split'] == 'dev']

# Plain Python lists of texts and category labels for each partition
train_sexist_texts, train_sexist_labels = (
    train_sexist_df['text'].tolist(),
    train_sexist_df['label_category'].tolist(),
)
val_sexist_texts, val_sexist_labels = (
    val_sexist_df['text'].tolist(),
    val_sexist_df['label_category'].tolist(),
)
test_texts, test_labels = (
    test_df['text'].tolist(),
    test_df['label_category'].tolist(),
)

In [None]:
# Turn category names into integer ids.
# The encoder is fitted on the training labels only; the validation and test
# labels are mapped with that same fitted encoder.

label_encoder = LabelEncoder()
label_encoder.fit(train_sexist_labels)
train_sexist_labels = label_encoder.transform(train_sexist_labels)
val_sexist_labels = label_encoder.transform(val_sexist_labels)
test_labels = label_encoder.transform(test_labels)

In [None]:
# Fetch the NLTK resources used below: the stop-word list and the WordNet data
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Shared lemmatizer instance used by the preprocessing function
lemmatizer = WordNetLemmatizer()

# English stop words, held in a set for membership tests
stop_words = {*stopwords.words('english')}

# Text preprocessing function
def preprocess_text(text):
    """Normalize one text for the TF-IDF vectorizer.

    Steps, in order: lowercase, delete every character in
    ``string.punctuation``, split on whitespace, drop tokens found in the
    module-level ``stop_words`` set, lemmatize the rest with the module-level
    ``lemmatizer``, and re-join with single spaces.

    Punctuation is stripped before the stop-word lookup, so tokens are
    compared against ``stop_words`` in their punctuation-free form.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string (empty if every token was removed).
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    kept_lemmas = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_lemmas)


# Run every partition through preprocess_text (each list is replaced by its cleaned version)
train_sexist_texts = list(map(preprocess_text, train_sexist_texts))
val_sexist_texts = list(map(preprocess_text, val_sexist_texts))
test_texts = list(map(preprocess_text, test_texts))

In [None]:
# Merge the train and validation partitions into a single training pool
combined_train_texts = [*train_sexist_texts, *val_sexist_texts]
combined_train_labels = [*train_sexist_labels, *val_sexist_labels]

# Hold the pooled texts and labels side by side in a DataFrame
combined_train_dataframe = pd.DataFrame(
    {
        'text': combined_train_texts,
        'label': combined_train_labels,
    }
)

# Synonym-based augmentation helper
def augment_text(text, aug_max):
    """Augment ``text`` with a WordNet-backed ``naw.SynonymAug``.

    A new augmenter is constructed on every call, configured with
    ``aug_src='wordnet'`` and the given ``aug_max``.

    Args:
        text: The text to augment.
        aug_max: Forwarded to ``SynonymAug`` as its ``aug_max`` setting.

    Returns:
        Whatever ``SynonymAug.augment`` returns for ``text``, unmodified.
    """
    synonym_augmenter = naw.SynonymAug(aug_src='wordnet', aug_max=aug_max)
    return synonym_augmenter.augment(text)



# Build the augmented training set in one pass: every original row is followed
# by three synonym-augmented variants (aug_max = 1, 2, 3) carrying the same label.
# Rows are collected in a list and the DataFrame is created once, instead of
# enlarging the frame row by row while iterating over it.
augmented_records = []
for original_text, label in zip(combined_train_dataframe['text'],
                                combined_train_dataframe['label']):
    augmented_records.append({'text': original_text, 'label': label})

    for aug_max in (1, 2, 3):
        augmented = augment_text(original_text, aug_max=aug_max)

        if isinstance(augmented, str):
            # Augmenter returned a bare string: use it whole rather than
            # indexing out its first character.
            variant = augmented
        elif augmented:
            variant = augmented[0]
        else:
            # Empty result (e.g. the text was empty after preprocessing):
            # fall back to the unmodified text instead of raising IndexError.
            variant = original_text

        augmented_records.append({'text': variant, 'label': label})

# Originals and their variants are already interleaved in the intended order
combined_train_dataframe = pd.DataFrame(augmented_records, columns=['text', 'label'])

combined_train_texts = combined_train_dataframe['text'].tolist()
combined_train_labels = combined_train_dataframe['label'].tolist()

In [None]:
start_time = time.time()  # Record the start time

# TF-IDF features feeding a linear-kernel SVM
svm_pipeline = make_pipeline(
    TfidfVectorizer(),
    SVC(kernel='linear', C=1.0)  # Linear kernel
)

# 3-fold cross-validation.
# KFold is used without shuffling, so each fold is a contiguous slice of the
# training list.
kf = KFold(n_splits=3)

# Per-fold scores
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

for train_index, val_index in kf.split(combined_train_texts):
    train_texts = [combined_train_texts[i] for i in train_index]
    val_texts = [combined_train_texts[i] for i in val_index]
    train_labels = [combined_train_labels[i] for i in train_index]
    val_labels = [combined_train_labels[i] for i in val_index]

    # Fit on this fold's training part (fit() re-trains the pipeline from scratch)
    svm_pipeline.fit(train_texts, train_labels)

    # Predict on this fold's held-out part
    val_predicted_labels = svm_pipeline.predict(val_texts)

    # Class-frequency-weighted metrics for the fold
    accuracy_list.append(accuracy_score(val_labels, val_predicted_labels))
    precision_list.append(precision_score(val_labels, val_predicted_labels, average='weighted'))
    recall_list.append(recall_score(val_labels, val_predicted_labels, average='weighted'))
    f1_list.append(f1_score(val_labels, val_predicted_labels, average='weighted'))

# Average the per-fold scores
average_accuracy = sum(accuracy_list) / len(accuracy_list)
average_precision = sum(precision_list) / len(precision_list)
average_recall = sum(recall_list) / len(recall_list)
average_f1 = sum(f1_list) / len(f1_list)

print("Average Accuracy:", average_accuracy)
print("Average Precision:", average_precision)
print("Average Recall:", average_recall)
print("Average F1 Score:", average_f1)

end_time = time.time()  # Record the end time
execution_time = end_time - start_time  # Calculate the execution time
print(f"SVM with k-fold cross-validation execution time: {execution_time} seconds")

# Train the final model on the entire training set.
# Without this refit, `svm_pipeline` would keep the weights from the last
# cross-validation fold (trained on only part of the data), and any later
# prediction would silently use that partial model.
svm_pipeline.fit(combined_train_texts, combined_train_labels)

In [None]:
# Predict on the test data with the pipeline in its current fitted state
predicted_labels_svm = svm_pipeline.predict(test_texts)

# Class-frequency-weighted metrics on the test set
accuracy = accuracy_score(test_labels, predicted_labels_svm)
precision = precision_score(test_labels, predicted_labels_svm, average='weighted')
recall = recall_score(test_labels, predicted_labels_svm, average='weighted')
f1 = f1_score(test_labels, predicted_labels_svm, average='weighted')

# Pin the matrix rows/columns to every class known to the encoder. Otherwise
# confusion_matrix infers the label set from the data, and a class missing from
# both the test labels and the predictions would leave the matrix smaller than
# the list of display labels.
class_ids = label_encoder.transform(label_encoder.classes_)
conf_matrix_svm = confusion_matrix(test_labels, predicted_labels_svm, labels=class_ids)

print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)
print("Test F1 Score:", f1)

# Visualize the confusion matrix with the original category names on the axes
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_svm, display_labels=label_encoder.classes_)
fig, ax = plt.subplots()
disp.plot(ax=ax)
ax.set_title("SVM confusion matrix (test set)")
plt.xticks(rotation=90)
plt.show()