## Importing important libraries

In [1]:
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import Dataset

## Hyperparameters 

Easy and straightforward hyperparameter tuning

In [2]:
# Hyperparameters
BATCH_SIZE = 128
LEARNING_RATE = 1e-5
EPOCHS = 3
TEST_SIZE = 0.2
RANDOM_STATE = 42
MAX_LEN = 128  # Maximum sequence length

## Setup and Data Preparation

Loading the dataset, preprocessing the text, and preparing the data for the model.

In [3]:
# Load the datasets
dataset1_file = 'dataset_stijn_generated.csv'
dataset2_file = 'cleaned_combined_dataset.csv'
dataset3_file = 'emotion_data_merged_4.csv'
try:
    dataset1 = pd.read_csv(dataset1_file)
    dataset2 = pd.read_csv(dataset2_file)
    dataset3 = pd.read_csv(dataset3_file)
except FileNotFoundError:
    print("One or both of the files not found.")
    exit()
except Exception as e:
    print(f"Error reading file(s): {e}")
    exit()

# Concatenate datasets vertically
data = pd.concat([dataset1, dataset2, dataset3], ignore_index=True)

data_combined = data.dropna(subset=['emotion']).query("emotion != 'neutral'")
data = data_combined.drop_duplicates()

# Shuffle the combined dataset
data = data.sample(frac=1).reset_index(drop=True)

# Display basic information about the combined dataset
print("Preview of the combined dataset:")
print(data.head())

print("\nSummary statistics of the combined dataset:")
print(data.describe())

print("\nInformation about columns in the combined dataset:")
print(data.info())

sentences = data['sentence'].values
labels = data['emotion'].values

One or both of the files not found.


NameError: name 'dataset1' is not defined

In [None]:
# This function expects numpy arrays or lists for y_true and y_pred
def compute_f1(y_true, y_pred):
    # Since y_pred would be logits or probabilities from the model, 
    # we need to convert these to discrete predictions.
    # Assuming y_pred is already the discrete predictions after using torch.argmax on logits
    return f1_score(y_true, y_pred, average='weighted')

In [None]:
# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

In [None]:
# Tokenization and Data Preparation
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
input_ids = []
attention_masks = []

In [None]:
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = MAX_LEN,      # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attention masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

In [None]:
# Convert lists into tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

In [None]:
# Split data into train and validation sets
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int((1 - TEST_SIZE) * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

## Model training

Setting up the RoBERTa model, defining the training loop, and initiating the training process.

In [None]:
# Model Setup
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training Loop
for epoch_i in range(0, EPOCHS):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    model.train()
    
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        model.zero_grad()        
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    
    print("Training loss: {0:.2f}".format(loss.item()))

print("Training complete.")

## Load and Preprocess the Test Set

Load the test set, preprocess it similarly to the training and validation datasets, ensuring the same tokenizer and sequence length.

In [None]:
class EmotionDataset(Dataset):
    def __init__(self, tokenizer, sentences, labels=None, max_len=128):
        self.tokenizer = tokenizer
        self.sentences = sentences
        self.labels = labels
        self.max_len = max_len
        
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, item):
        sentence = str(self.sentences[item])
        encoding = self.tokenizer.encode_plus(
          sentence,
          add_special_tokens=True,
          max_length=self.max_len,
          return_token_type_ids=False,
          pad_to_max_length=True,
          return_attention_mask=True,
          return_tensors='pt',
        )
        
        inputs = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
        
        if self.labels is not None:
            label = self.labels[item]
            return inputs, label
        
        return inputs

# Load your test set
df_test = pd.read_csv('test_stijn.csv')

# Preprocess labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df_test['emotion'].values)
sentences = df_test['sentence'].values

test_dataset = EmotionDataset(tokenizer, sentences, labels, max_len=MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)



## Predict Emotions on the Test Set
Predict the emotions for the test set sentences using the trained model.

In [None]:
def evaluate(model, data_loader, device):
    model = model.eval()

    true_labels = []
    predictions = []

    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)

            outputs = model(**inputs)
            _, preds = torch.max(outputs.logits, dim=1)

            true_labels.extend(labels.cpu().numpy())
            predictions.extend(preds.cpu().numpy())

    return true_labels, predictions

true_labels, predictions = evaluate(model, test_loader, device)


##  Generate the Confusion Matrix and Metrics

With the true labels and predictions, we can now generate a confusion matrix and calculate other evaluation metrics like precision, recall, and F1-score.

In [None]:
# Confusion Matrix
cm = confusion_matrix(true_labels, predictions)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(cm_normalized, annot=True, ax=ax, cmap='Blues', fmt='.2%')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(label_encoder.classes_, rotation=45)
ax.yaxis.set_ticklabels(label_encoder.classes_, rotation=45)
plt.show()

# Classification Report
print(classification_report(true_labels, predictions, target_names=label_encoder.classes_))


## Preprocessing and Data Loading for PyTorch

Loading and preprocessing the test set for kaggle competition

In [None]:
# Assuming `tokenizer` is your pre-trained tokenizer (e.g., RobertaTokenizer)
class TestDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_len):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, item):
        sentence = str(self.sentences[item])
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

# Load the test dataset
test_df = pd.read_csv('test (1).csv', sep='\t')

# Remove punctuation from the 'sentence' column
test_df['sentence'] = test_df['sentence'].str.replace(r'[^\w\s]', '', regex=True)

# Assume MAX_LEN and tokenizer are defined
test_dataset = TestDataset(test_df['sentence'].tolist(), tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=128)  # Match the batch size or adjust based on your GPU


## Make Predictions with the PyTorch Model

In [None]:
model.eval()  # Set the model to evaluation mode
predictions = []

with torch.no_grad():
    for data in test_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().numpy())

# Assuming `le` is your label encoder
predicted_emotions = le.inverse_transform(predictions)

## Prepare the Submission DataFrame and Save

In [None]:
# Prepare the submission DataFrame
submission_df = pd.DataFrame({
    'id': test_df.index,  # Use original test DataFrame index as ID
    'emotion': predicted_emotions
})

# Save the submission file
submission_df.to_csv('submission_torch.csv', index=False, sep=',')
