# Combining the best tested approaches

*   WordPiece Tokenization
*   Hyperparameter tuning
*   Larger Electra Model
*   Data Augmentation
*   Cross-Validation


Not included (possible future additions):
*   Word Embeddings
*   Advanced or custom Models
*   Advanced algorithms for Hyperparameter optimization

Libraries

In [None]:
!pip install torch transformers[torch] datasets accelerate evaluate tensorboard scikit-learn nlpaug alive-progress

In [None]:
import torch
# Report the device that is actually available instead of assuming a GPU exists.
# This is informational only: no later cell in this notebook reads `device`.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV paths used by later
# cells live under this mount point. Requires the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Loading Data

In [None]:
import pandas as pd

# Load the dataset
file_path = '/content/drive/MyDrive/nlp/edos_labelled_aggregated.csv'
df = pd.read_csv(file_path)

# Filter the relevant columns. The explicit .copy() makes `df` an independent
# frame, so the column assignment below does not raise SettingWithCopyWarning.
df = df[['text', 'label_sexist', 'split', 'label_category']].copy()

# Encode the labels for the first task (1 = 'sexist', 0 = anything else)
df['label_sexist'] = df['label_sexist'].eq('sexist').astype(int)

# Filter sexist sentences
sexist_df = df[df['label_sexist'] == 1]

# Distinct category names present among the sexist rows (shown for inspection)
classes_labels = sexist_df['label_category'].unique().copy()

print(classes_labels)

# Split the data using the dataset's own 'split' column
train_sexist_df = sexist_df[sexist_df['split'] == 'train']
val_sexist_df = sexist_df[sexist_df['split'] == 'dev']

train_sexist_texts = train_sexist_df['text'].tolist()
train_sexist_labels = train_sexist_df['label_category'].tolist()
val_sexist_texts = val_sexist_df['text'].tolist()
val_sexist_labels = val_sexist_df['label_category'].tolist()

Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the category -> integer mapping from the training labels only,
# then apply that same mapping to both the train and dev labels.
label_encoder = LabelEncoder()
label_encoder.fit(train_sexist_labels)
train_sexist_labels = label_encoder.transform(train_sexist_labels)
val_sexist_labels = label_encoder.transform(val_sexist_labels)

# Number of distinct categories seen in training
num_labels = len(label_encoder.classes_)

Text preprocessing

In [None]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Download the NLTK resources used below
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Lemmatizer used by preprocess_text
lemmatizer = WordNetLemmatizer()

# English stopwords, kept as a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Text preprocessing function
def preprocess_text(text):
    """Normalize one text for modelling.

    Steps, in order: lowercase, delete every character listed in
    ``string.punctuation``, split on whitespace, drop tokens found in
    ``stop_words``, lemmatize the remaining tokens with ``lemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    kept_tokens = []
    for token in cleaned.split():
        # The stopword check runs on the punctuation-stripped token
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Preprocess the texts (the cleaned lists replace the raw ones)
train_sexist_texts = list(map(preprocess_text, train_sexist_texts))
val_sexist_texts = list(map(preprocess_text, val_sexist_texts))

# Unigram + bigram count features; the vocabulary is fitted on the training
# texts and then reused unchanged for the dev texts.
vectorizer = CountVectorizer(ngram_range=(1, 2))
train_features = vectorizer.fit_transform(train_sexist_texts)
val_features = vectorizer.transform(val_sexist_texts)

print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

# Densify the count matrices and wrap features and labels as PyTorch tensors
train_features, val_features = (
    torch.tensor(matrix.toarray(), dtype=torch.float32)
    for matrix in (train_features, val_features)
)
train_labels, val_labels = (
    torch.tensor(encoded, dtype=torch.long)
    for encoded in (train_sexist_labels, val_sexist_labels)
)

# Persist (features, labels) pairs to the working directory
torch.save((train_features, train_labels), 'train_data.pt')
torch.save((val_features, val_labels), 'val_data.pt')

print("Data preprocessing complete.")

Augmented Data

In [None]:
import nlpaug.augmenter.word as naw

# Number of synonym replacements allowed for each augmented copy of a text.
AUG_MAX_LEVELS = (1, 2, 3)

# One SynonymAug per aug_max value, built on first use and then reused, so the
# augmenter is not reconstructed for every single sentence.
_synonym_augmenters = {}


# Define a synonym augmentation function
def augment_text(text, aug_max):
    """Run WordNet synonym augmentation on one text.

    Args:
        text: The text to augment.
        aug_max: Passed to ``naw.SynonymAug`` as its ``aug_max`` setting.

    Returns:
        Whatever ``SynonymAug.augment`` returns for ``text``; the call sites
        in this notebook index it with ``[0]``.
    """
    aug = _synonym_augmenters.get(aug_max)
    if aug is None:
        aug = naw.SynonymAug(aug_src='wordnet', aug_max=aug_max)
        _synonym_augmenters[aug_max] = aug
    return aug.augment(text)


def _first_augmentation(text, aug_max):
    """Return a single augmented string for ``text``, falling back to ``text``.

    Guards the ``[0]`` lookup: an empty result (e.g. for a text that became
    empty after stopword removal) would otherwise raise IndexError.
    NOTE(review): some nlpaug releases appear to return a plain string rather
    than a list for a single input; that case is passed through unchanged.
    """
    result = augment_text(text, aug_max)
    if isinstance(result, str):
        return result if result else text
    if result is None or len(result) == 0:
        return text
    return result[0]


# Collect all rows first and build the DataFrame once. Growing a frame with
# .loc inside a loop copies it on every insertion (quadratic overall).
# Row order is: original text, then one augmented copy per AUG_MAX_LEVELS entry.
augmented_records = []
for text, label in zip(train_sexist_texts, train_sexist_labels):
    augmented_records.append({'text': text, 'label': label})
    for aug_max in AUG_MAX_LEVELS:
        augmented_records.append({'text': _first_augmentation(text, aug_max), 'label': label})

combined_train_dataframe = pd.DataFrame(augmented_records, columns=['text', 'label'])


combined_train_texts = combined_train_dataframe['text'].tolist()
combined_train_labels = combined_train_dataframe['label'].tolist()



Preparation for Cross-Validation


In [None]:
import numpy as np
# Hold texts and labels as numpy arrays so the cross-validation index arrays
# can select fold subsets directly.
combined_train_texts_np, combined_train_labels_np = (
    np.array(values) for values in (combined_train_texts, combined_train_labels)
)

In [None]:
# Create a custom dataset class
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to a per-example sequence or tensor
            (anything exposing ``.items()``/``.values()``), indexed by example.
        labels: Sequence of integer class ids, one per example.

    Raises:
        ValueError: If any encoding field has a different number of examples
            than ``labels``.
    """

    def __init__(self, encodings, labels):
        n_labels = len(labels)
        for key, values in encodings.items():
            if len(values) != n_labels:
                raise ValueError(
                    f"Encoding field '{key}' has {len(values)} examples but {n_labels} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a tensor that is independent of the stored data."""
        if torch.is_tensor(value):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning;
            # clone().detach() is the warning-free way to copy a tensor.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx`` plus a ``labels`` tensor."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)


In [None]:
from transformers import ElectraForSequenceClassification, Trainer, TrainingArguments

import inspect

# transformers renamed the `evaluation_strategy` argument to `eval_strategy`,
# and newer releases reject the old name. The install cell does not pin a
# version, so use whichever keyword the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = 'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=200,
    save_strategy='epoch',
    # Evaluate once per epoch, matching the save strategy above
    **{_eval_strategy_key: 'epoch'},
)

# Initialize the model with one output per category found by the label encoder
model = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator', num_labels=num_labels)

Tokenizer

In [None]:
# Initialize the ELECTRA tokenizer (WordPiece) from the same checkpoint as the
# model ('google/electra-base-discriminator'), so tokenizer and model always
# agree on vocabulary and special tokens.

from transformers import BertTokenizer, ElectraTokenizer
tokenizer_wordpiece = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

def tokenizer(texts):
    """Tokenize a batch of texts into PyTorch tensors.

    Args:
        texts: The texts to encode (the notebook passes lists of strings).

    Returns:
        The tokenizer output with padding enabled, truncation to at most
        128 tokens, and ``return_tensors='pt'``.
    """
    return tokenizer_wordpiece(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')

Cross-Validation

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

# Cross-validation loop
#
# Two sources of leakage are avoided here:
#   1. Every fold starts from freshly loaded pretrained weights. Reusing one
#      model across folds would validate later folds on data it was already
#      trained on in earlier folds.
#   2. Folds are drawn over source sentences, not rows, so an original text and
#      its augmented variants never end up on opposite sides of a split.
MODEL_CHECKPOINT = 'google/electra-base-discriminator'

# The augmentation cell stores each original text immediately followed by its
# three augmented copies, i.e. consecutive groups of four rows per source text.
# Update this constant if the number of augmented copies changes.
ROWS_PER_SOURCE_TEXT = 4


def build_model():
    """Load a fresh ELECTRA sequence classifier with ``num_labels`` outputs."""
    return ElectraForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=num_labels)


n_rows = len(combined_train_texts_np)
if n_rows % ROWS_PER_SOURCE_TEXT != 0:
    raise ValueError(
        f"Expected {ROWS_PER_SOURCE_TEXT} consecutive rows per source text, got {n_rows} rows in total"
    )
# row_groups[i] is the id of the source sentence that row i was derived from
row_groups = np.arange(n_rows) // ROWS_PER_SOURCE_TEXT
group_ids = np.arange(n_rows // ROWS_PER_SOURCE_TEXT)

kf = KFold(n_splits=3, shuffle=True, random_state=42)
fold_f1_scores = []
fold = 0
for train_groups, val_groups in kf.split(group_ids):
    fold += 1
    print(f"Training fold {fold}...")

    # Expand the selected source-sentence ids back to row indices
    train_index = np.flatnonzero(np.isin(row_groups, train_groups))
    val_index = np.flatnonzero(np.isin(row_groups, val_groups))

    # Split the data
    train_texts_fold = combined_train_texts_np[train_index].tolist()
    val_texts_fold = combined_train_texts_np[val_index].tolist()
    train_labels_fold = combined_train_labels_np[train_index].tolist()
    val_labels_fold = combined_train_labels_np[val_index].tolist()

    # Tokenize the data
    train_encodings_fold = tokenizer(train_texts_fold)
    val_encodings_fold = tokenizer(val_texts_fold)

    # Create dataset
    train_dataset_fold = TextDataset(train_encodings_fold, train_labels_fold)
    val_dataset_fold = TextDataset(val_encodings_fold, val_labels_fold)

    # Initialize the trainer with a model that has not seen any fold yet
    model = build_model()
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_fold,
        eval_dataset=val_dataset_fold,
    )

    # Train the model
    trainer.train()
    eval_results = trainer.evaluate()

    # Predictions
    predictions = trainer.predict(val_dataset_fold)
    predicted_labels = torch.tensor(predictions.predictions).argmax(dim=-1)

    # F1 Score and Confusion Matrix
    precision = precision_score(val_labels_fold, predicted_labels, average='weighted')
    recall = recall_score(val_labels_fold, predicted_labels, average='weighted')
    f1 = f1_score(val_labels_fold, predicted_labels, average='weighted')
    conf_matrix = confusion_matrix(val_labels_fold, predicted_labels)
    fold_f1_scores.append(f1)

    print(f"Fold {fold} F1 Score: {f1}")

    # Visualize the confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_encoder.classes_)
    fig, ax = plt.subplots()
    disp.plot(ax=ax)
    plt.xticks(rotation=90)
    plt.show()

print(f"Mean F1 Score across {fold} folds: {np.mean(fold_f1_scores)}")

# Final fit: cross-validation only estimates performance. The model evaluated
# on the test set afterwards is trained once on the full augmented training
# set, with the preprocessed dev split as the per-epoch evaluation set.
print("Training final model on the full training set...")
full_train_dataset = TextDataset(tokenizer(combined_train_texts), combined_train_labels)
dev_dataset = TextDataset(tokenizer(val_sexist_texts), val_sexist_labels)

model = build_model()
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=dev_dataset,
)
trainer.train()

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import ElectraTokenizer, ElectraForSequenceClassification, TrainingArguments, Trainer
import torch
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Load the dataset
file_path = '/content/drive/MyDrive/nlp/edos_test_category_5.csv'
df = pd.read_csv(file_path)

# Filter the relevant columns. The explicit .copy() makes `df` an independent
# frame, so the column assignment below does not raise SettingWithCopyWarning.
df = df[['text', 'label_sexist', 'split', 'label_category']].copy()

# Encode the labels for the first task
df['label_sexist'] = df['label_sexist'].apply(lambda x: 1 if x == 'sexist' else 0)

# Filter sexist sentences
sexist_df = df[df['label_sexist'] == 1]

# Split the data
test_df = sexist_df[sexist_df['split'] == 'test']
# Apply the same preprocessing the training texts went through, so the model
# is evaluated on inputs of the form it was trained on.
test_texts = [preprocess_text(text) for text in test_df['text'].tolist()]
test_labels = test_df['label_category'].tolist()

# Encode the categories with the mapping learned on the training labels.
# Re-fitting the encoder here could assign different ids to the categories
# than the ones the model was trained with; transform() instead raises if the
# test set contains a category that was never seen in training.
test_labels = label_encoder.transform(test_labels)

num_labels = len(label_encoder.classes_)

# Tokenize the test texts
test_encodings = tokenizer(test_texts)

test_dataset = TextDataset(test_encodings, test_labels)

# Initialize the trainer with the training arguments and model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=None,  # No need to train again
    eval_dataset=test_dataset  # Use test dataset for evaluation
)

# Evaluate the model
eval_results = trainer.evaluate()

# Predictions
predictions = trainer.predict(test_dataset)
predicted_labels = torch.tensor(predictions.predictions).argmax(dim=-1)

# F1 Score and Confusion Matrix
precision = precision_score(test_labels, predicted_labels, average='weighted')
recall = recall_score(test_labels, predicted_labels, average='weighted')
f1 = f1_score(test_labels, predicted_labels, average='weighted')
conf_matrix = confusion_matrix(test_labels, predicted_labels)

print("F1 Score on Test Data:", f1)

# Visualize the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_encoder.classes_)
fig, ax = plt.subplots()
disp.plot(ax=ax)
plt.xticks(rotation=90)
plt.show()
