In [None]:
!pip install torch transformers[torch] datasets accelerate evaluate tensorboard scikit-learn

In [None]:
import torch
# Select the first GPU when CUDA is available and fall back to the CPU
# otherwise. The bare constructor call neither checked availability nor kept
# the resulting device object.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Task 1

In [None]:
import pandas as pd

# Load the labelled dataset (CSV stored on the mounted Google Drive)
dataset_filename = '/content/drive/MyDrive/nlp/edos_labelled_aggregated.csv'
df = pd.read_csv(dataset_filename)

# Display the first few rows of the dataset to understand its structure
df.head()

In [None]:
len(df.index)

In [None]:
sexist_df = df[df['label_sexist'] == 'sexist']
len(sexist_df.index)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
import torch
from transformers import ElectraTokenizer, ElectraForSequenceClassification, Trainer, TrainingArguments

# Keep only the columns used for the binary task. The result gets its own
# name so the raw `df` is never modified: re-encoding an already-encoded
# column would silently turn every label into 0 when this cell is re-run.
binary_df = df[['text', 'label_sexist', 'split']]

# Display names for the confusion matrix, ordered to match the integer
# encoding below (0 = any value other than 'sexist', 1 = 'sexist').
# `unique()` alone returns values in order of first appearance, which is not
# guaranteed to match that encoding.
classes_labels = sorted(binary_df['label_sexist'].unique(), key=lambda name: name == 'sexist')

# Encode the labels: 1 for 'sexist', 0 otherwise
binary_df = binary_df.assign(
    label_sexist=binary_df['label_sexist'].apply(lambda x: 1 if x == 'sexist' else 0)
)

# Split the data using the dataset's own 'split' column
train_df = binary_df[binary_df['split'] == 'train']
val_df = binary_df[binary_df['split'] == 'dev']

train_texts = train_df['text'].tolist()
train_labels = train_df['label_sexist'].tolist()
val_texts = val_df['text'].tolist()
val_labels = val_df['label_sexist'].tolist()


In [None]:
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')

def tokenize_function(texts):
    """Tokenize `texts` with the notebook-level `tokenizer`.

    Sequences are truncated and padded using ``max_length=128`` and
    ``padding="max_length"``.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(texts, **tokenizer_options)

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)


In [None]:
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            container. Values may be Python lists or tensors (the latter is
            what the tokenizer returns with ``return_tensors='pt'``).
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # `as_tensor` converts lists and passes existing tensors through,
        # whereas `torch.tensor(existing_tensor)` copy-constructs (and warns)
        # on every item when the encodings are already tensors.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.as_tensor(self.labels[idx])
        return item

    def __len__(self):
        # One example per label
        return len(self.labels)

train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)


In [None]:
model = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator', num_labels=2)


In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)


In [None]:
trainer.train()
eval_results = trainer.evaluate()




In [None]:
# The validation metrics were already computed right after training in the
# previous cell; display them instead of running the evaluation loop again.
eval_results



In [None]:
# Predictions
predictions = trainer.predict(val_dataset)
predicted_labels = torch.tensor(predictions.predictions).argmax(dim=-1)

# F1 Score
f1 = f1_score(val_labels, predicted_labels)
conf_matrix = confusion_matrix(val_labels, predicted_labels, normalize='all')

print("F1 Score:", f1)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
                              display_labels=classes_labels
                              )

# Plot the confusion matrix
fig, ax = plt.subplots()
disp.plot(ax=ax)
# Rotate the x-axis labels
plt.xticks(rotation=90)
# Display the plot
plt.show()

# Task 2

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
import torch
from transformers import ElectraTokenizer, ElectraForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = '/content/drive/MyDrive/nlp/edos_labelled_aggregated.csv'
df = pd.read_csv(file_path)

# Filter the relevant columns
df = df[['text', 'label_sexist', 'split', 'label_category']]

# Encode the labels for the first task
df['label_sexist'] = df['label_sexist'].apply(lambda x: 1 if x == 'sexist' else 0)

# Filter sexist sentences
sexist_df = df[df['label_sexist'] == 1]

classes_labels = sexist_df['label_category'].unique().copy()

print(classes_labels)

# Split the data
train_sexist_df = sexist_df[sexist_df['split'] == 'train']
val_sexist_df = sexist_df[sexist_df['split'] == 'dev']

train_sexist_texts = train_sexist_df['text'].tolist()
train_sexist_labels = train_sexist_df['label_category'].tolist()
val_sexist_texts = val_sexist_df['text'].tolist()
val_sexist_labels = val_sexist_df['label_category'].tolist()

# Encode the categories
label_encoder = LabelEncoder()
train_sexist_labels = label_encoder.fit_transform(train_sexist_labels)
val_sexist_labels = label_encoder.transform(val_sexist_labels)

num_labels = len(label_encoder.classes_)


In [None]:
train_sexist_encodings = tokenizer(train_sexist_texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
val_sexist_encodings = tokenizer(val_sexist_texts, padding=True, truncation=True, max_length=128, return_tensors='pt')


In [None]:
train_sexist_dataset = TextDataset(train_sexist_encodings, train_sexist_labels)
val_sexist_dataset = TextDataset(val_sexist_encodings, val_sexist_labels)


In [None]:
model = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator', num_labels=num_labels)


In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_sexist_dataset,
    eval_dataset=val_sexist_dataset
)


In [None]:
trainer.train()
eval_results = trainer.evaluate()

In [None]:
# Predictions
predictions = trainer.predict(val_sexist_dataset)
predicted_labels = torch.tensor(predictions.predictions).argmax(dim=-1)

# F1 Score
f1 = f1_score(val_sexist_labels, predicted_labels, average='weighted')
conf_matrix = confusion_matrix(val_sexist_labels, predicted_labels, normalize='all')

print("F1 Score:", f1)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

classes_labels = np.unique(sexist_df['label_category'])


disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
                              display_labels=classes_labels
                              )

# Plot the confusion matrix
fig, ax = plt.subplots()
disp.plot(ax=ax)
# Rotate the x-axis labels
plt.xticks(rotation=90)
# Display the plot
plt.show()

# Step 3: Evaluate Different Preprocessing Techniques

## Run 1: Using Bigrams

In [None]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('punkt')

def create_bigrams(texts):
    """Fit a new unigram+bigram ``CountVectorizer`` on `texts` and return the counts.

    The vocabulary is learned from `texts` itself, so the column indices of
    the returned matrix are specific to this call.
    """
    return CountVectorizer(
        ngram_range=(1, 2),
        token_pattern=r'\b\w+\b',
        min_df=1,
    ).fit_transform(texts)


In [None]:

# Fit ONE vectorizer on the training texts and reuse it for validation.
# Fitting a second vectorizer on the validation texts assigns different
# column indices to the same n-grams, so the index strings built below would
# mean different things in the two splits. N-grams that only occur in the
# validation texts are dropped by `transform`.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
train_sexist_bigrams = bigram_vectorizer.fit_transform(train_sexist_texts)
val_sexist_bigrams = bigram_vectorizer.transform(val_sexist_texts)

# Represent each document as the space-separated vocabulary indices of the
# n-grams it contains, so it can be fed to the text tokenizer.
train_sexist_bigram_texts = [" ".join(map(str, list(train_sexist_bigrams[i].nonzero()[1]))) for i in range(train_sexist_bigrams.shape[0])]
val_sexist_bigram_texts = [" ".join(map(str, list(val_sexist_bigrams[i].nonzero()[1]))) for i in range(val_sexist_bigrams.shape[0])]



In [None]:

train_sexist_bigram_encodings = tokenize_function(train_sexist_bigram_texts)
val_sexist_bigram_encodings = tokenize_function(val_sexist_bigram_texts)

In [None]:

train_sexist_bigram_dataset = TextDataset(train_sexist_bigram_encodings, train_sexist_labels)
val_sexist_bigram_dataset = TextDataset(val_sexist_bigram_encodings, val_sexist_labels)


In [None]:

# Start this run from the pretrained checkpoint again. Reusing `model` would
# continue training the weights already fine-tuned in the previous run, so
# the F1 scores of the different runs would not be comparable.
model = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator', num_labels=num_labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_sexist_bigram_dataset,
    eval_dataset=val_sexist_bigram_dataset
)

In [None]:
trainer.train()
eval_results_bigram = trainer.evaluate()

In [None]:
# Predictions
predictions_bigram = trainer.predict(val_sexist_bigram_dataset)
predicted_labels_bigram = torch.tensor(predictions_bigram.predictions).argmax(dim=-1)

# F1 Score and Confusion Matrix for bigrams
f1_bigram = f1_score(val_sexist_labels, predicted_labels_bigram, average='weighted')
conf_matrix_bigram = confusion_matrix(val_sexist_labels, predicted_labels_bigram, normalize='all')

print("F1 Score with Bigrams:", f1_bigram)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_bigram,
                              display_labels=classes_labels
                              )

# Plot the confusion matrix
fig, ax = plt.subplots()
disp.plot(ax=ax)
# Rotate the x-axis labels
plt.xticks(rotation=90)
# Display the plot
plt.show()

## Run 2: Using WordPiece Tokenization

In [None]:
from transformers import BertTokenizer

tokenizer_wordpiece = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_wordpiece(texts):
    """Encode `texts` with `tokenizer_wordpiece`, returning PyTorch tensors.

    Uses ``padding=True`` and truncation with ``max_length=128``.
    """
    wordpiece_options = dict(padding=True, truncation=True, max_length=128, return_tensors='pt')
    return tokenizer_wordpiece(texts, **wordpiece_options)


In [None]:
train_sexist_wp_encodings = tokenize_wordpiece(train_sexist_texts)
val_sexist_wp_encodings = tokenize_wordpiece(val_sexist_texts)


In [None]:
train_sexist_wp_dataset = TextDataset(train_sexist_wp_encodings, train_sexist_labels)
val_sexist_wp_dataset = TextDataset(val_sexist_wp_encodings, val_sexist_labels)


In [None]:
# Start this run from the pretrained checkpoint again. Reusing `model` would
# continue training the weights already fine-tuned in the previous run, so
# the F1 scores of the different runs would not be comparable.
model = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator', num_labels=num_labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_sexist_wp_dataset,
    eval_dataset=val_sexist_wp_dataset
)




In [None]:
trainer.train()
eval_results_wp = trainer.evaluate()

In [None]:
# Predictions
predictions_wp = trainer.predict(val_sexist_wp_dataset)
predicted_labels_wp = torch.tensor(predictions_wp.predictions).argmax(dim=-1)

# F1 Score and Confusion Matrix for WordPiece
f1_wp = f1_score(val_sexist_labels, predicted_labels_wp, average='weighted')
conf_matrix_wp = confusion_matrix(val_sexist_labels, predicted_labels_wp, normalize='all')

print("F1 Score with WordPiece:", f1_wp)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np


disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_wp,
                              display_labels=classes_labels
                              )

# Plot the confusion matrix
fig, ax = plt.subplots()
disp.plot(ax=ax)
# Rotate the x-axis labels
plt.xticks(rotation=90)
# Display the plot
plt.show()

## Run 3: Using Different Text Preprocessing (e.g., Lowercasing, Removing Punctuation)

In [None]:
import re
import string

def preprocess_text(text):
    """Lowercase `text` and remove every character in ``string.punctuation``."""
    lowered = text.lower()
    return "".join(ch for ch in lowered if ch not in string.punctuation)

# Apply preprocessing
train_sexist_preprocessed_texts = [preprocess_text(text) for text in train_sexist_texts]
val_sexist_preprocessed_texts = [preprocess_text(text) for text in val_sexist_texts]


In [None]:
train_sexist_preprocessed_encodings = tokenize_function(train_sexist_preprocessed_texts)
val_sexist_preprocessed_encodings = tokenize_function(val_sexist_preprocessed_texts)


In [None]:
train_sexist_preprocessed_dataset = TextDataset(train_sexist_preprocessed_encodings, train_sexist_labels)
val_sexist_preprocessed_dataset = TextDataset(val_sexist_preprocessed_encodings, val_sexist_labels)


In [None]:
# Start this run from the pretrained checkpoint again. Reusing `model` would
# continue training the weights already fine-tuned in the previous run, so
# the F1 scores of the different runs would not be comparable.
model = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator', num_labels=num_labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_sexist_preprocessed_dataset,
    eval_dataset=val_sexist_preprocessed_dataset
)



In [None]:
trainer.train()
eval_results_preprocessed = trainer.evaluate()

In [None]:
# Predictions
predictions_preprocessed = trainer.predict(val_sexist_preprocessed_dataset)
predicted_labels_preprocessed = torch.tensor(predictions_preprocessed.predictions).argmax(dim=-1)

# F1 Score and Confusion Matrix for preprocessed texts
f1_preprocessed = f1_score(val_sexist_labels, predicted_labels_preprocessed, average='weighted')
conf_matrix_preprocessed = confusion_matrix(val_sexist_labels, predicted_labels_preprocessed, normalize='all')

print("F1 Score with Preprocessed Texts:", f1_preprocessed)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Build the confusion-matrix display; tick labels come from `classes_labels`
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_preprocessed,
                              display_labels=classes_labels
                              )

# Plot the confusion matrix
fig, ax = plt.subplots()
disp.plot(ax=ax)
# Rotate the x-axis labels
plt.xticks(rotation=90)
# Display the plot
plt.show()

In [None]:
# Print the F1 scores and confusion matrices for comparison
print("F1 Score without Bigrams:", f1)
print("Confusion Matrix without Bigrams:\n", conf_matrix)

print("F1 Score with Bigrams:", f1_bigram)
print("Confusion Matrix with Bigrams:\n", conf_matrix_bigram)

print("F1 Score with WordPiece:", f1_wp)
print("Confusion Matrix with WordPiece:\n", conf_matrix_wp)

print("F1 Score with Preprocessed Texts:", f1_preprocessed)
print("Confusion Matrix with Preprocessed Texts:\n", conf_matrix_preprocessed)


# 4 Improving performance


## 1 Hyperparameter tuning

In [None]:
from transformers import ElectraForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    # `load_best_model_at_end` needs checkpoints saved on the same schedule
    # as evaluation; with the default step-based save strategy
    # TrainingArguments raises a ValueError.
    save_strategy='epoch',
    load_best_model_at_end=True,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy='epoch'
)


In [None]:

# Keep the small checkpoint here so this run isolates the effect of the new
# hyperparameters. The larger `electra-base` checkpoint is evaluated on its
# own in the "Using a Larger Model" section below; loading it here as well
# made the two runs identical.
model = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator', num_labels=num_labels)


In [None]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_sexist_preprocessed_dataset,
    eval_dataset=val_sexist_preprocessed_dataset
)

In [None]:

trainer.train()
eval_results_preprocessed_tuned = trainer.evaluate()

In [None]:
# Predictions
predictions_preprocessed_tuned = trainer.predict(val_sexist_preprocessed_dataset)
predicted_labels_preprocessed_tuned = torch.tensor(predictions_preprocessed_tuned.predictions).argmax(dim=-1)

# F1 Score and Confusion Matrix for tuned preprocessed texts
f1_preprocessed_tuned = f1_score(val_sexist_labels, predicted_labels_preprocessed_tuned, average='weighted')
conf_matrix_preprocessed_tuned = confusion_matrix(val_sexist_labels, predicted_labels_preprocessed_tuned, normalize='all')

print("F1 Score with Tuned Preprocessed Texts:", f1_preprocessed_tuned)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Build the confusion-matrix display; tick labels come from `classes_labels`
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_preprocessed_tuned,
                              display_labels=classes_labels
                              )

# Plot the confusion matrix
fig, ax = plt.subplots()
disp.plot(ax=ax)
# Rotate the x-axis labels
plt.xticks(rotation=90)
# Display the plot
plt.show()

## 2 Using a Larger Model

In [None]:
# Initialize the model
model = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator', num_labels=num_labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_sexist_preprocessed_dataset,
    eval_dataset=val_sexist_preprocessed_dataset
)


trainer.train()
eval_results_base = trainer.evaluate()


In [None]:
# Predictions
predictions_base = trainer.predict(val_sexist_preprocessed_dataset)
predicted_labels_base = torch.tensor(predictions_base.predictions).argmax(dim=-1)

# F1 Score and Confusion Matrix for base model
f1_base = f1_score(val_sexist_labels, predicted_labels_base, average='weighted')
conf_matrix_base = confusion_matrix(val_sexist_labels, predicted_labels_base, normalize='all')

print("F1 Score with Base Model:", f1_base)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Build the confusion-matrix display; tick labels come from `classes_labels`
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_base,
                              display_labels=classes_labels
                              )

# Plot the confusion matrix
fig, ax = plt.subplots()
disp.plot(ax=ax)
# Rotate the x-axis labels
plt.xticks(rotation=90)
# Display the plot
plt.show()

# 5 Combining the best-performing approaches



## 1 Preprocess the Text and Tokenize Using WordPiece

In [None]:
import re
import string
from transformers import BertTokenizer

# Define text preprocessing function
def preprocess_text(text):
    """Return `text` lowercased with all ``string.punctuation`` characters deleted."""
    # Mapping a code point to None tells str.translate to drop that character
    punctuation_table = dict.fromkeys(map(ord, string.punctuation))
    return text.lower().translate(punctuation_table)

# Apply preprocessing
train_sexist_preprocessed_texts = [preprocess_text(text) for text in train_sexist_texts]
val_sexist_preprocessed_texts = [preprocess_text(text) for text in val_sexist_texts]

# Initialize the BERT tokenizer (WordPiece)
tokenizer_wordpiece = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_wordpiece(texts):
    """Run `tokenizer_wordpiece` over `texts` and return the encodings as PyTorch tensors."""
    return tokenizer_wordpiece(
        texts,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )

# Tokenize the preprocessed texts
train_sexist_wp_encodings = tokenize_wordpiece(train_sexist_preprocessed_texts)
val_sexist_wp_encodings = tokenize_wordpiece(val_sexist_preprocessed_texts)


In [None]:
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            container; values may be tensors or plain Python lists.
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # `as_tensor` leaves tensor slices untouched and converts list
        # encodings (tokenizer called without `return_tensors`) to tensors,
        # so every item has the same type regardless of how it was tokenized.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.as_tensor(self.labels[idx])
        return item

    def __len__(self):
        # One example per label
        return len(self.labels)

train_sexist_wp_dataset = TextDataset(train_sexist_wp_encodings, train_sexist_labels)
val_sexist_wp_dataset = TextDataset(val_sexist_wp_encodings, val_sexist_labels)


In [None]:
from transformers import ElectraForSequenceClassification, Trainer, TrainingArguments

# Initialize the model
model = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator', num_labels=num_labels)


In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_sexist_wp_dataset,
    eval_dataset=val_sexist_wp_dataset
)


In [None]:
trainer.train()
eval_results_combined = trainer.evaluate()

In [None]:
# Predictions
predictions_combined = trainer.predict(val_sexist_wp_dataset)
predicted_labels_combined = torch.tensor(predictions_combined.predictions).argmax(dim=-1)

# F1 Score and Confusion Matrix for combined techniques
from sklearn.metrics import f1_score, confusion_matrix

f1_combined = f1_score(val_sexist_labels, predicted_labels_combined, average='weighted')
conf_matrix_combined = confusion_matrix(val_sexist_labels, predicted_labels_combined, normalize='all')

print("F1 Score with Combined Techniques:", f1_combined)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Build the confusion-matrix display; tick labels come from `classes_labels`
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_combined,
                              display_labels=classes_labels
                              )

# Plot the confusion matrix
fig, ax = plt.subplots()
disp.plot(ax=ax)
# Rotate the x-axis labels
plt.xticks(rotation=90)
# Display the plot
plt.show()

# 6 Others


## Trigrams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def create_trigrams(texts, vectorizer=None):
    """Vectorize `texts` into 1- to 3-gram counts.

    Args:
        texts: Iterable of raw text strings.
        vectorizer: Optional, already-fitted ``CountVectorizer``. When given,
            `texts` are transformed with its existing vocabulary so column
            indices agree with the data it was fitted on. When omitted, a new
            vectorizer is fitted on `texts`.

    Returns:
        Tuple ``(vectorizer, counts)`` with the vectorizer that was used and
        the sparse document-term matrix.
    """
    if vectorizer is None:
        vectorizer = CountVectorizer(ngram_range=(1, 3), token_pattern=r'\b\w+\b', min_df=1)
        return vectorizer, vectorizer.fit_transform(texts)
    return vectorizer, vectorizer.transform(texts)

# Fit the vocabulary on the training texts only and reuse it for validation.
# A vectorizer fitted separately on the validation texts would number the
# same n-grams differently, making the index strings below inconsistent
# between the two splits.
trigram_vectorizer, train_sexist_trigrams = create_trigrams(train_sexist_texts)
_, val_sexist_trigrams = create_trigrams(val_sexist_texts, vectorizer=trigram_vectorizer)

# Convert the trigrams to lists of strings for the tokenizer
train_sexist_trigram_texts = [" ".join(map(str, list(train_sexist_trigrams[i].nonzero()[1]))) for i in range(train_sexist_trigrams.shape[0])]
val_sexist_trigram_texts = [" ".join(map(str, list(val_sexist_trigrams[i].nonzero()[1]))) for i in range(val_sexist_trigrams.shape[0])]


In [None]:
train_sexist_trigram_encodings = tokenize_function(train_sexist_trigram_texts)
val_sexist_trigram_encodings = tokenize_function(val_sexist_trigram_texts)



In [None]:
train_sexist_trigram_dataset = TextDataset(train_sexist_trigram_encodings, train_sexist_labels)
val_sexist_trigram_dataset = TextDataset(val_sexist_trigram_encodings, val_sexist_labels)


In [None]:
# Start this run from the pretrained checkpoint again. Reusing `model` would
# continue training the weights already fine-tuned in the previous section,
# so the resulting F1 score would not be comparable with the other runs.
model = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator', num_labels=num_labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_sexist_trigram_dataset,
    eval_dataset=val_sexist_trigram_dataset
)




In [None]:
trainer.train()
eval_results_trigram = trainer.evaluate()


In [None]:
# Predictions
predictions_trigram = trainer.predict(val_sexist_trigram_dataset)
predicted_labels_trigram = torch.tensor(predictions_trigram.predictions).argmax(dim=-1)

# F1 Score and Confusion Matrix for trigrams
f1_trigram = f1_score(val_sexist_labels, predicted_labels_trigram, average='weighted')
conf_matrix_trigram = confusion_matrix(val_sexist_labels, predicted_labels_trigram, normalize='all')

print("F1 Score with Trigrams:", f1_trigram)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Build the confusion-matrix display; tick labels come from `classes_labels`
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_trigram,
                              display_labels=classes_labels
                              )

# Plot the confusion matrix
fig, ax = plt.subplots()
disp.plot(ax=ax)
# Rotate the x-axis labels
plt.xticks(rotation=90)
# Display the plot
plt.show()

## Using TF-IDF Vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def create_tfidf_vectors(texts, vectorizer=None):
    """Vectorize `texts` into TF-IDF weights over 1- to 3-grams.

    Args:
        texts: Iterable of raw text strings.
        vectorizer: Optional, already-fitted ``TfidfVectorizer``. When given,
            `texts` are transformed with its existing vocabulary and IDF
            weights. When omitted, a new vectorizer is fitted on `texts`.

    Returns:
        Tuple ``(vectorizer, vectors)`` with the vectorizer that was used and
        the sparse TF-IDF matrix.
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer(ngram_range=(1, 3), token_pattern=r'\b\w+\b', min_df=1)
        return vectorizer, vectorizer.fit_transform(texts)
    return vectorizer, vectorizer.transform(texts)

# Fit the vocabulary on the training texts only and reuse it for validation.
# A vectorizer fitted separately on the validation texts would number the
# same n-grams differently, making the index strings below inconsistent
# between the two splits.
tfidf_vectorizer, train_sexist_tfidf = create_tfidf_vectors(train_sexist_texts)
_, val_sexist_tfidf = create_tfidf_vectors(val_sexist_texts, vectorizer=tfidf_vectorizer)

# Convert the TF-IDF vectors to lists of strings for the tokenizer
train_sexist_tfidf_texts = [" ".join(map(str, list(train_sexist_tfidf[i].nonzero()[1]))) for i in range(train_sexist_tfidf.shape[0])]
val_sexist_tfidf_texts = [" ".join(map(str, list(val_sexist_tfidf[i].nonzero()[1]))) for i in range(val_sexist_tfidf.shape[0])]


In [None]:
train_sexist_tfidf_encodings = tokenize_function(train_sexist_tfidf_texts)
val_sexist_tfidf_encodings = tokenize_function(val_sexist_tfidf_texts)


In [None]:
train_sexist_tfidf_dataset = TextDataset(train_sexist_tfidf_encodings, train_sexist_labels)
val_sexist_tfidf_dataset = TextDataset(val_sexist_tfidf_encodings, val_sexist_labels)


In [None]:
# Start this run from the pretrained checkpoint again. Reusing `model` would
# continue training the weights already fine-tuned in the previous run, so
# the resulting F1 score would not be comparable with the other runs.
model = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator', num_labels=num_labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_sexist_tfidf_dataset,
    eval_dataset=val_sexist_tfidf_dataset
)



In [None]:
trainer.train()
eval_results_tfidf = trainer.evaluate()


In [None]:
# Predictions
predictions_tfidf = trainer.predict(val_sexist_tfidf_dataset)
predicted_labels_tfidf = torch.tensor(predictions_tfidf.predictions).argmax(dim=-1)

# F1 Score and Confusion Matrix for TF-IDF vectors
f1_tfidf = f1_score(val_sexist_labels, predicted_labels_tfidf, average='weighted')
conf_matrix_tfidf = confusion_matrix(val_sexist_labels, predicted_labels_tfidf, normalize='all')

print("F1 Score with TF-IDF Vectors:", f1_tfidf)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Build the confusion-matrix display; tick labels come from `classes_labels`
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_tfidf,
                              display_labels=classes_labels
                              )

# Plot the confusion matrix
fig, ax = plt.subplots()
disp.plot(ax=ax)
# Rotate the x-axis labels
plt.xticks(rotation=90)
# Display the plot
plt.show()

## Using Word Embeddings (e.g., GloVe)

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [None]:
import numpy as np

# Load GloVe embeddings
def load_glove_embeddings(filepath):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-empty line is split on whitespace: the first field is the token,
    the remaining fields are its float components.

    Args:
        filepath: Path to the embeddings text file (read as UTF-8).

    Returns:
        dict mapping each token to a 1-D ``float32`` NumPy array.
    """
    embeddings_index = {}
    with open(filepath, encoding="utf8") as f:
        for line in f:
            values = line.split()
            # Blank lines carry no token; indexing values[0] on them would
            # raise IndexError, so skip them.
            if not values:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

glove_filepath = 'glove.6B.300d.txt'  # Replace with the actual path
embeddings_index = load_glove_embeddings(glove_filepath)

In [None]:
from transformers import ElectraTokenizer

tokenizer_wordpiece = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
word_index = tokenizer_wordpiece.get_vocab()

In [None]:
embedding_dim = 300  # Dimension of GloVe embeddings

def create_embedding_matrix(word_index, embeddings_index, embedding_dim):
    """Build a dense lookup table of pretrained vectors indexed by token id.

    Args:
        word_index: Mapping ``token -> row index``.
        embeddings_index: Mapping ``token -> vector`` of length `embedding_dim`.
        embedding_dim: Width of every embedding vector.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``; rows of
        tokens that are absent from `embeddings_index` remain all zeros.
    """
    matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for token, row in word_index.items():
        vector = embeddings_index.get(token)
        if vector is None:
            continue
        matrix[row] = vector
    return matrix

word_index = tokenizer_wordpiece.get_vocab()
embedding_matrix = create_embedding_matrix(word_index, embeddings_index, embedding_dim)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import ElectraModel, ElectraConfig

class CustomElectraModel(nn.Module):
    """ELECTRA encoder combined with frozen pretrained word vectors.

    The encoder's hidden state at the first sequence position is concatenated
    with the average of the pretrained vectors looked up by token id, and one
    linear layer maps the result to `num_labels` logits.

    Args:
        electra_model: Encoder module whose output exposes
            ``last_hidden_state`` and whose ``config.hidden_size`` gives the
            hidden width.
        embedding_matrix: 2-D array ``(vocab_size, embedding_dim)`` of
            pretrained vectors, indexed by token id.
        num_labels: Number of output classes.
    """

    def __init__(self, electra_model, embedding_matrix, num_labels):
        super().__init__()
        self.electra = electra_model
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        embedding_dim = pretrained.shape[1]
        # freeze=True keeps the pretrained vectors fixed during training
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)
        self.fc = nn.Linear(electra_model.config.hidden_size + embedding_dim, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return classification logits of shape ``(batch, num_labels)``.

        Args:
            input_ids: Integer token ids, ``(batch, seq_len)``.
            attention_mask: Optional mask with 1 for real tokens and 0 for
                padding, same shape as `input_ids`.
            token_type_ids: Optional segment ids forwarded to the encoder.
                Accepted because the tokenizer emits them and the trainer
                passes every batch field to ``forward``.
        """
        electra_output = self.electra(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        electra_embeddings = electra_output.last_hidden_state[:, 0, :]

        glove_embeddings = self.embedding(input_ids)
        if attention_mask is None:
            glove_embeddings = glove_embeddings.mean(dim=1)
        else:
            # Average over real tokens only, so the pooled vector does not
            # depend on how much padding the batch happens to contain.
            mask = attention_mask.unsqueeze(-1).to(glove_embeddings.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            glove_embeddings = (glove_embeddings * mask).sum(dim=1) / token_counts

        combined_embeddings = torch.cat((electra_embeddings, glove_embeddings), dim=1)
        return self.fc(combined_embeddings)


# Initialize the ELECTRA model
electra_model = ElectraModel.from_pretrained('google/electra-base-discriminator')
num_labels = 4
model = CustomElectraModel(electra_model, embedding_matrix, num_labels)


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import ElectraTokenizer

# Load the dataset
# NOTE(review): this reads the `individual_annotations` CSV, whereas Tasks 1
# and 2 read `edos_labelled_aggregated.csv` - confirm the switch is intended.
file_path = '/content/drive/MyDrive/nlp/edos_labelled_individual_annotations.csv'
df = pd.read_csv(file_path)

# Filter the relevant columns
df = df[['text', 'label_sexist', 'split', 'label_category']]

# Encode the labels for the first task
df['label_sexist'] = df['label_sexist'].apply(lambda x: 1 if x == 'sexist' else 0)

# Filter sexist sentences
sexist_df = df[df['label_sexist'] == 1]

# Split the data
train_sexist_df = sexist_df[sexist_df['split'] == 'train']
val_sexist_df = sexist_df[sexist_df['split'] == 'dev']

train_sexist_texts = train_sexist_df['text'].tolist()
train_sexist_labels = train_sexist_df['label_category'].tolist()
val_sexist_texts = val_sexist_df['text'].tolist()
val_sexist_labels = val_sexist_df['label_category'].tolist()

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import ElectraTokenizer
import torch
from torch.utils.data import Dataset

# Encode the categories
label_encoder = LabelEncoder()
train_sexist_labels = label_encoder.fit_transform(train_sexist_labels)
val_sexist_labels = label_encoder.transform(val_sexist_labels)

num_labels = len(label_encoder.classes_)

# Initialize the tokenizer
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

# Tokenize the data
train_encodings = tokenizer(train_sexist_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_sexist_texts, truncation=True, padding=True, max_length=128)

# Create a custom dataset class
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            container; values may be Python lists or tensors.
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # `as_tensor` converts lists and passes existing tensors through,
        # whereas `torch.tensor(existing_tensor)` copy-constructs (and warns)
        # when the encodings were produced with `return_tensors='pt'`.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.as_tensor(self.labels[idx])
        return item

    def __len__(self):
        # One example per label
        return len(self.labels)

train_dataset = TextDataset(train_encodings, train_sexist_labels)
val_dataset = TextDataset(val_encodings, val_sexist_labels)

# Print the first item to verify labels
print(train_dataset[0])



In [None]:
# Load and preprocess data
# NOTE(review): the rest of this cell repeats the loading, label-encoding,
# tokenizing and dataset-building steps of the two preceding cells - looks
# redundant; confirm before removing.
file_path = '/content/drive/MyDrive/nlp/edos_labelled_individual_annotations.csv'
df = pd.read_csv(file_path)

# Filter the relevant columns
df = df[['text', 'label_sexist', 'split', 'label_category']]

# Encode the labels for the first task
df['label_sexist'] = df['label_sexist'].apply(lambda x: 1 if x == 'sexist' else 0)

# Filter sexist sentences
sexist_df = df[df['label_sexist'] == 1]

# Split the data
train_sexist_df = sexist_df[sexist_df['split'] == 'train']
val_sexist_df = sexist_df[sexist_df['split'] == 'dev']

train_sexist_texts = train_sexist_df['text'].tolist()
train_sexist_labels = train_sexist_df['label_category'].tolist()
val_sexist_texts = val_sexist_df['text'].tolist()
val_sexist_labels = val_sexist_df['label_category'].tolist()

# Encode the categories
label_encoder = LabelEncoder()
train_sexist_labels = label_encoder.fit_transform(train_sexist_labels)
val_sexist_labels = label_encoder.transform(val_sexist_labels)

num_labels = len(label_encoder.classes_)

# Initialize the tokenizer
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

# Tokenize the data
train_encodings = tokenizer(train_sexist_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_sexist_texts, truncation=True, padding=True, max_length=128)

# Create dataset
train_dataset = TextDataset(train_encodings, train_sexist_labels)
val_dataset = TextDataset(val_encodings, val_sexist_labels)

# Print the first item to verify labels
print(train_dataset[0])

In [None]:
from transformers import Trainer, TrainingArguments

class CustomTrainer(Trainer):
    """Trainer that computes the cross-entropy loss itself from the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: Model called as ``model(**inputs)`` once the labels are removed.
            inputs: Batch mapping; must contain a ``"labels"`` entry.
            return_outputs: When true, also return the model outputs.
            num_items_in_batch: Accepted (and ignored) because recent versions
                of ``Trainer`` pass this keyword to ``compute_loss``.

        Returns:
            ``loss``, or ``(loss, {"logits": logits})`` when ``return_outputs``.

        Raises:
            ValueError: If ``inputs`` has no ``"labels"`` entry.
        """
        # Local import so the class does not rely on a later cell importing `F`.
        import torch.nn.functional as F

        if "labels" not in inputs:
            raise ValueError("Labels are missing in the inputs")

        labels = inputs.pop("labels")
        outputs = model(**inputs)

        # Accept a dict-like output, a tuple whose first item is the logits,
        # or a bare logits tensor.
        if isinstance(outputs, dict):
            logits = outputs["logits"]
        elif isinstance(outputs, (tuple, list)):
            logits = outputs[0]
        else:
            logits = outputs

        loss = F.cross_entropy(logits, labels)
        # Return a dict: Trainer.prediction_step slices non-dict outputs with
        # `outputs[1:]`, which would drop the first row of a bare logits tensor.
        return (loss, {"logits": logits}) if return_outputs else loss

# Hyperparameters for the fine-tuning run
training_args = TrainingArguments(
    # schedule / optimisation
    num_train_epochs=5,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    # batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # outputs and logging
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
)

# Bind model, arguments and both data splits to the custom trainer
trainer = CustomTrainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune, then evaluate on the validation split
trainer.train()
eval_results = trainer.evaluate()

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import ElectraTokenizer, ElectraModel, TrainingArguments, Trainer
import torch.nn.functional as F
import torch

# Load the dataset (individual-annotation EDOS file)
file_path = '/content/drive/MyDrive/nlp/edos_labelled_individual_annotations.csv'
df = pd.read_csv(file_path)

# Keep only the relevant columns. `.copy()` makes this an independent frame so
# the column assignment below does not trigger SettingWithCopyWarning.
df = df[['text', 'label_sexist', 'split', 'label_category']].copy()

# Encode the labels for the first task (1 = 'sexist', 0 = anything else)
df['label_sexist'] = df['label_sexist'].apply(lambda x: 1 if x == 'sexist' else 0)

# Filter sexist sentences
sexist_df = df[df['label_sexist'] == 1]

# Split the data using the dataset's own 'split' column
train_sexist_df = sexist_df[sexist_df['split'] == 'train']
val_sexist_df = sexist_df[sexist_df['split'] == 'dev']

train_sexist_texts = train_sexist_df['text'].tolist()
train_sexist_labels = train_sexist_df['label_category'].tolist()
val_sexist_texts = val_sexist_df['text'].tolist()
val_sexist_labels = val_sexist_df['label_category'].tolist()

# Encode the categories. The encoder is fitted on the training labels only, so
# `transform` raises if the dev split contains a category unseen in training.
label_encoder = LabelEncoder()
train_sexist_labels = label_encoder.fit_transform(train_sexist_labels)
val_sexist_labels = label_encoder.transform(val_sexist_labels)

num_labels = len(label_encoder.classes_)

# Initialize the tokenizer
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

# Tokenize the data (truncate to 128 tokens, pad to the longest sequence)
train_encodings = tokenizer(train_sexist_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_sexist_texts, truncation=True, padding=True, max_length=128)

# Create a custom dataset class
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given.

        Args:
            encodings: Mapping from field name to an indexable of per-example values.
            labels: Indexable of class ids, one per example.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``"labels"``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force int64: cross_entropy needs long targets, and the label array
        # is not guaranteed to be int64 already.
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# Wrap the tokenised train/dev splits and their encoded labels as datasets.
train_dataset = TextDataset(train_encodings, train_sexist_labels)
val_dataset = TextDataset(val_encodings, val_sexist_labels)

# Verify the data structure
from torch.utils.data import DataLoader

# Sanity check only: print a single shuffled batch of 16 and stop.
# The Trainer below is given the datasets directly, not this loader.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
for batch in train_dataloader:
    print(batch)
    break

# Custom Trainer
class CustomTrainer(Trainer):
    """Trainer that computes the cross-entropy loss itself from the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: Model called as ``model(**inputs)`` once the labels are removed.
            inputs: Batch mapping; must contain a ``"labels"`` entry.
            return_outputs: When true, also return the model outputs.
            num_items_in_batch: Accepted (and ignored) because recent versions
                of ``Trainer`` pass this keyword to ``compute_loss``.

        Returns:
            ``loss``, or ``(loss, {"logits": logits})`` when ``return_outputs``.

        Raises:
            ValueError: If ``inputs`` has no ``"labels"`` entry.
        """
        if "labels" not in inputs:
            raise ValueError("Labels are missing in the inputs")

        labels = inputs.pop("labels")
        outputs = model(**inputs)

        # Accept a dict-like output, a tuple whose first item is the logits,
        # or a bare logits tensor.
        if isinstance(outputs, dict):
            logits = outputs["logits"]
        elif isinstance(outputs, (tuple, list)):
            logits = outputs[0]
        else:
            logits = outputs

        loss = F.cross_entropy(logits, labels)
        # Return a dict: Trainer.prediction_step slices non-dict outputs with
        # `outputs[1:]`, which would drop the first row of a bare logits tensor.
        return (loss, {"logits": logits}) if return_outputs else loss

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the ELECTRA model
# numpy is needed for the placeholder matrix below; nothing earlier in this
# cell imports it.
import numpy as np

electra_model = ElectraModel.from_pretrained('google/electra-base-discriminator')
embedding_matrix = np.zeros((tokenizer.vocab_size, 300))  # Placeholder for actual GloVe embeddings
# NOTE(review): CustomElectraModel is not defined in this cell; it must already
# exist in the kernel namespace from an earlier cell.
model = CustomElectraModel(electra_model, embedding_matrix, num_labels)

# Initialize the trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()
eval_results = trainer.evaluate()

# Predictions: arg-max over the last axis gives one class id per example
predictions = trainer.predict(val_dataset)
predicted_labels = torch.tensor(predictions.predictions).argmax(dim=-1)

# F1 Score and Confusion Matrix
from sklearn.metrics import f1_score, confusion_matrix

f1 = f1_score(val_sexist_labels, predicted_labels, average='weighted')
conf_matrix = confusion_matrix(val_sexist_labels, predicted_labels)

print("F1 Score with ELECTRA and GloVe:", f1)
print("Confusion Matrix with ELECTRA and GloVe:\n", conf_matrix)

# Visualize the confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# `classes_` is sorted, matching the integer codes used on the matrix axes.
classes_labels = label_encoder.classes_

disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=classes_labels)

fig, ax = plt.subplots()
disp.plot(ax=ax)
plt.xticks(rotation=90)
plt.show()


In [None]:
class CustomTrainer(Trainer):
    """Trainer that computes the cross-entropy loss itself from the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: Model called as ``model(**inputs)`` once the labels are removed.
            inputs: Batch mapping; ``"labels"`` is popped from it (KeyError if absent).
            return_outputs: When true, also return the model outputs.
            num_items_in_batch: Accepted (and ignored) because recent versions
                of ``Trainer`` pass this keyword to ``compute_loss``.

        Returns:
            ``loss``, or ``(loss, {"logits": logits})`` when ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)

        # Accept a dict-like output, a tuple whose first item is the logits,
        # or a bare logits tensor.
        if isinstance(outputs, dict):
            logits = outputs["logits"]
        elif isinstance(outputs, (tuple, list)):
            logits = outputs[0]
        else:
            logits = outputs

        loss = F.cross_entropy(logits, labels)
        # Return a dict: Trainer.prediction_step slices non-dict outputs with
        # `outputs[1:]`, which would drop the first row of a bare logits tensor.
        return (loss, {"logits": logits}) if return_outputs else loss


In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters for the fine-tuning run
training_args = TrainingArguments(
    # schedule / optimisation
    num_train_epochs=5,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    # batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # outputs and logging
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
)

# Bind model, arguments and both data splits to the custom trainer
trainer = CustomTrainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)


In [None]:
# Fine-tune with the trainer configured in the previous cell, then evaluate it
# on the eval_dataset that trainer was given; the metrics are kept in `eval_results`.
trainer.train()
eval_results = trainer.evaluate()


In [None]:
# Run the trained model over the validation split and take the arg-max class
predictions = trainer.predict(val_dataset)
predicted_labels = torch.argmax(torch.tensor(predictions.predictions), dim=-1)

# Weighted F1 and a confusion matrix normalised over all entries
from sklearn.metrics import confusion_matrix, f1_score

f1_electra_wb = f1_score(
    val_sexist_labels,
    predicted_labels,
    average='weighted',
)
conf_matrix_electra_wb = confusion_matrix(
    val_sexist_labels,
    predicted_labels,
    normalize='all',
)

print("F1 Score with ELECTRA and GloVe:", f1_electra_wb)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Build the display for the matrix computed in the previous cell (normalised
# with normalize='all'), using `classes_labels` as tick labels.
# NOTE(review): `classes_labels` comes from an earlier cell; confirm its order
# matches the encoded label order used for the matrix.
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_electra_wb,
                              display_labels=classes_labels
                              )

# Plot the confusion matrix
fig, ax = plt.subplots()
disp.plot(ax=ax)
# Rotate the x-axis labels
plt.xticks(rotation=90)
# Display the plot
plt.show()

## Hyperparameter tuning

In [None]:
!pip install optuna

In [None]:
from transformers import TrainerCallback
import optuna

class HyperparameterSearch(TrainerCallback):
    """Trainer callback that forwards the evaluation loss to an Optuna trial."""

    def __init__(self, trial):
        # The Optuna trial that receives the intermediate values.
        self.trial = trial

    def on_evaluate(self, args, state, control, **kwargs):
        """Report ``eval_loss`` at the current global step; prune if Optuna says so."""
        eval_loss = kwargs["metrics"]["eval_loss"]
        self.trial.report(eval_loss, step=state.global_step)
        if not self.trial.should_prune():
            return
        raise optuna.TrialPruned()

def objective(trial):
    """Optuna objective: fine-tune with sampled hyperparameters, return the eval loss.

    Args:
        trial: Optuna trial used to sample epochs, batch size, learning rate,
            warm-up steps and weight decay.

    Returns:
        The ``eval_loss`` reported by ``trainer.evaluate()`` after training.
    """
    import copy

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=trial.suggest_int('num_train_epochs', 2, 5),
        per_device_train_batch_size=trial.suggest_categorical('per_device_train_batch_size', [8, 16, 32]),
        learning_rate=trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True),
        warmup_steps=trial.suggest_int('warmup_steps', 0, 500),
        weight_decay=trial.suggest_float('weight_decay', 0.0, 0.3),
        logging_dir='./logs',
        logging_steps=10,
    )

    # Train a private copy: fine-tuning the shared `model` in place would make
    # every trial continue from the previous trial's weights, so the losses
    # of different hyperparameter sets would not be comparable.
    trial_model = copy.deepcopy(model)

    # NOTE(review): no evaluation schedule is configured above, so the pruning
    # callback presumably only fires on the final `evaluate()` call — confirm.
    # NOTE(review): this uses the stock Trainer; if `model` returns bare logits
    # (as CustomTrainer assumes), CustomTrainer may be required here — confirm.
    trainer = Trainer(
        model=trial_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[HyperparameterSearch(trial)]
    )

    trainer.train()
    return trainer.evaluate()["eval_loss"]

# Minimise the objective's return value (the eval loss) over 20 trials.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

# Report the best trial: `values` is the list of objective values for that
# trial, `params` the hyperparameters that were sampled for it.
best_trial = study.best_trial
print(f"Best trial: {best_trial.values}")
print(f"Best hyperparameters: {best_trial.params}")


## RoBERTa

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Encode the categories from the raw `label_category` column. Earlier cells
# already overwrote `train_sexist_labels` / `val_sexist_labels` with integer
# codes; refitting on those would replace the category names in
# `label_encoder.classes_` with integers and make this cell order-dependent.
label_encoder = LabelEncoder()
train_sexist_labels = label_encoder.fit_transform(train_sexist_df['label_category'])
val_sexist_labels = label_encoder.transform(val_sexist_df['label_category'])

num_labels = len(label_encoder.classes_)

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the data (truncate to 128 tokens, pad to the longest sequence)
train_encodings = tokenizer(train_sexist_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_sexist_texts, truncation=True, padding=True, max_length=128)


In [None]:
# Create a custom dataset class
class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given.

        Args:
            encodings: Mapping from field name to an indexable of per-example values.
            labels: Indexable of class ids, one per example.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``"labels"``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force int64: cross_entropy needs long targets, and the label array
        # is not guaranteed to be int64 already.
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# Wrap the RoBERTa-tokenised splits and their encoded labels as datasets.
train_dataset, val_dataset = (
    TextDataset(train_encodings, train_sexist_labels),
    TextDataset(val_encodings, val_sexist_labels),
)

In [None]:
# Pretrained RoBERTa-base with a classification head sized to the number of
# encoded categories (`num_labels` from the label-encoding cell).
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)


In [None]:
# Hyperparameters for the RoBERTa fine-tuning run
training_args = TrainingArguments(
    # schedule / optimisation
    num_train_epochs=5,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    # batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # outputs and logging
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
)


In [None]:
# Stock Trainer: bind the arguments, the model and both data splits.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)


In [None]:
# Train the model, then evaluate it on the trainer's eval_dataset.
# The metrics are kept under the name `eval_results_embeddings`.
trainer.train()
eval_results_embeddings = trainer.evaluate()


In [None]:
# Predictions on the RoBERTa-tokenised validation split. It must be the same
# `val_dataset` the trainer was built with: its rows line up with
# `val_sexist_labels` and it uses this model's tokenizer.
predictions_embeddings = trainer.predict(val_dataset)
predicted_labels_embeddings = torch.tensor(predictions_embeddings.predictions).argmax(dim=-1)

# Weighted F1 and a confusion matrix normalised over all entries.
# The `*_embeddings` names are kept because the plotting cell reads them.
f1_embeddings = f1_score(val_sexist_labels, predicted_labels_embeddings, average='weighted')
conf_matrix_embeddings = confusion_matrix(val_sexist_labels, predicted_labels_embeddings, normalize='all')

print("F1 Score with RoBERTa:", f1_embeddings)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Build the display for the matrix computed in the previous cell (normalised
# with normalize='all'), using `classes_labels` as tick labels.
# NOTE(review): `classes_labels` comes from an earlier cell; confirm its order
# matches the encoded label order used for the matrix.
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_embeddings,
                              display_labels=classes_labels
                              )

# Plot the confusion matrix
fig, ax = plt.subplots()
disp.plot(ax=ax)
# Rotate the x-axis labels
plt.xticks(rotation=90)
# Display the plot
plt.show()

# Data Augmentation


In [None]:
!pip install nlpaug
!pip install transformers[torch]
!pip install alive-progress

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset (aggregated EDOS labels)
file_path = '/content/drive/MyDrive/nlp/edos_labelled_aggregated.csv'
df = pd.read_csv(file_path)

# Keep only the relevant columns. `.copy()` makes this an independent frame so
# the column assignment below does not trigger SettingWithCopyWarning.
df = df[['text', 'label_sexist', 'split', 'label_category']].copy()

# Encode the labels for the first task (1 = 'sexist', 0 = anything else)
df['label_sexist'] = df['label_sexist'].apply(lambda x: 1 if x == 'sexist' else 0)

# Filter sexist sentences
sexist_df = df[df['label_sexist'] == 1]

# Category names in sorted order. LabelEncoder assigns integer codes in sorted
# order, so the names only line up with the codes (e.g. as confusion-matrix
# tick labels) when sorted; `unique()` alone returns them in order of appearance.
classes_labels = sexist_df['label_category'].sort_values().unique()

print(classes_labels)

# Split the data using the dataset's own 'split' column
train_sexist_df = sexist_df[sexist_df['split'] == 'train']
val_sexist_df = sexist_df[sexist_df['split'] == 'dev']

train_sexist_texts = train_sexist_df['text'].tolist()
train_sexist_labels = train_sexist_df['label_category'].tolist()
val_sexist_texts = val_sexist_df['text'].tolist()
val_sexist_labels = val_sexist_df['label_category'].tolist()

In [None]:
# Encode the categories from the raw `label_category` column rather than from
# `train_sexist_labels` / `val_sexist_labels`. This cell overwrites those
# variables, so re-running it would otherwise refit the encoder on integer
# codes and lose the category names in `label_encoder.classes_`.
label_encoder = LabelEncoder()
train_sexist_labels = label_encoder.fit_transform(train_sexist_df['label_category'])
val_sexist_labels = label_encoder.transform(val_sexist_df['label_category'])

num_labels = len(label_encoder.classes_)

In [None]:
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf


# One shared WordNet synonym augmenter, instead of a new one per sentence.
synonym_augmenter = naw.SynonymAug(aug_src='wordnet')


# Define a synonym augmentation function
def augment_text(text):
    """Return the synonym-augmented version(s) of `text` exactly as nlpaug returns them."""
    return synonym_augmenter.augment(text)


# Build all rows first and create the frame once. Each original sentence is
# followed immediately by its augmented copy, carrying the same label.
augmented_records = []
for original_text, label in zip(train_sexist_texts, train_sexist_labels):
    augmented_records.append({'text': original_text, 'label': label})
    # `[0]` takes the first augmentation (assumes `augment` returns a list — TODO confirm
    # against the installed nlpaug version).
    augmented_records.append({'text': augment_text(original_text)[0], 'label': label})

combined_train_dataframe = pd.DataFrame(augmented_records, columns=['text', 'label'])

combined_train_texts = combined_train_dataframe['text'].tolist()
combined_train_labels = combined_train_dataframe['label'].tolist()

combined_train_dataframe

# Cross-Validation


In [None]:
import numpy as np
# KFold yields integer index arrays, so keep texts and labels as numpy arrays
# that can be indexed with them.
combined_train_texts_np, combined_train_labels_np = (
    np.array(combined_train_texts),
    np.array(combined_train_labels),
)

In [None]:
# Create a custom dataset class
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with int64 class labels."""

    def __init__(self, encodings, labels):
        # Keep references to the inputs as given; nothing is copied.
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The label sequence defines the number of examples.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label as a long tensor.
        item = {}
        for field_name, column in self.encodings.items():
            item[field_name] = torch.tensor(column[idx])
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
from transformers import ElectraTokenizer, ElectraForSequenceClassification, TrainingArguments, Trainer


# Hyperparameters shared by every cross-validation fold
training_args = TrainingArguments(
    # schedule / optimisation
    num_train_epochs=5,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    # batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # outputs and logging
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=200,
)

# ELECTRA-base discriminator with a classification head of `num_labels` outputs
model = ElectraForSequenceClassification.from_pretrained(
    'google/electra-base-discriminator',
    num_labels=num_labels,
)

# Tokenizer from the same checkpoint
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')


In [None]:
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

# Cross-validation loop
# NOTE(review): the combined data holds each original sentence next to its
# augmented copy, and KFold splits rows independently, so a sentence's copy can
# land in the training fold while the original is validated — consider
# splitting by original sentence instead.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_f1_scores = []
for fold, (train_index, val_index) in enumerate(kf.split(combined_train_texts_np), start=1):
    print(f"Training fold {fold}...")

    # Split the data
    train_texts_fold = combined_train_texts_np[train_index].tolist()
    val_texts_fold = combined_train_texts_np[val_index].tolist()
    train_labels_fold = combined_train_labels_np[train_index].tolist()
    val_labels_fold = combined_train_labels_np[val_index].tolist()

    # Tokenize the data
    train_encodings_fold = tokenizer(train_texts_fold, truncation=True, padding=True, max_length=128)
    val_encodings_fold = tokenizer(val_texts_fold, truncation=True, padding=True, max_length=128)

    # Create dataset
    train_dataset_fold = TextDataset(train_encodings_fold, train_labels_fold)
    val_dataset_fold = TextDataset(val_encodings_fold, val_labels_fold)

    # Start every fold from the pretrained checkpoint. Reusing one model
    # object would carry over weights trained on earlier folds, whose
    # training data contains this fold's validation rows.
    fold_model = ElectraForSequenceClassification.from_pretrained(
        'google/electra-base-discriminator', num_labels=num_labels
    )

    # Initialize the trainer
    trainer = Trainer(
        model=fold_model,
        args=training_args,
        train_dataset=train_dataset_fold,
        eval_dataset=val_dataset_fold,
    )

    # Train the model
    trainer.train()
    eval_results = trainer.evaluate()

    # Predictions
    predictions = trainer.predict(val_dataset_fold)
    predicted_labels = torch.tensor(predictions.predictions).argmax(dim=-1)

    # F1 Score and Confusion Matrix
    f1 = f1_score(val_labels_fold, predicted_labels, average='weighted')
    conf_matrix = confusion_matrix(val_labels_fold, predicted_labels)
    fold_f1_scores.append(f1)

    print(f"Fold {fold} F1 Score: {f1}")

    # Visualize the confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_encoder.classes_)
    fig, ax = plt.subplots()
    disp.plot(ax=ax)
    plt.xticks(rotation=90)
    plt.show()

# Summary across folds
print(f"Mean F1 Score over {len(fold_f1_scores)} folds: {np.mean(fold_f1_scores)}")