# Baseline

In [None]:
!pip install transformers datasets torch scikit-learn nltk

In [2]:
import torch
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
import pandas as pd
from collections import Counter
from nltk.translate.bleu_score import corpus_bleu

## 1. Grammatical Acceptability (CoLA)

In [5]:
# Fetch the CoLA task from GLUE; only the validation split is scored here.
dataset = load_dataset("glue", "cola")
validation_data = dataset['validation']

# Majority-class baseline: read the label column once, count both classes,
# and predict the more frequent one for every sentence (0 wins a tie).
true_labels = validation_data['label']
cola_label_counts = Counter(true_labels)
majority_class = 1 if cola_label_counts[1] > cola_label_counts[0] else 0
predicted_labels_baseline = [majority_class] * len(validation_data)

# Score the constant prediction. f1_score uses its default (binary) averaging.
baseline_cola_mcc = matthews_corrcoef(true_labels, predicted_labels_baseline)
baseline_cola_f1 = f1_score(true_labels, predicted_labels_baseline)
baseline_cola_accuracy = accuracy_score(true_labels, predicted_labels_baseline)

print(f"CoLA Majority class baseline accuracy: {baseline_cola_accuracy:.4f}")
print(f"CoLA Majority class baseline F1-score: {baseline_cola_f1:.4f}")
# A constant predictor carries no information about the labels, so MCC is expected to be 0.
print(f"CoLA Majority class baseline MCC: {baseline_cola_mcc:.4f}")


CoLA Majority class baseline accuracy: 0.6913
CoLA Majority class baseline F1-score: 0.8175
CoLA Majority class baseline MCC: 0.0000


## 2. Grammar Correction (Lang-8)

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Location of the preprocessed Lang-8 sentence pairs and the row counts used
# for the positional train/validation split below.
LANG8_CSV_PATH = "lang8.csv"
LANG8_TRAIN_SIZE = 30_000
LANG8_VALIDATION_SIZE = 5_000

# Load preprocessed Lang-8 data. A missing file is re-raised with a message that
# names the path actually read; calling exit() in a notebook would act on the
# kernel itself instead of simply failing this cell.
try:
    lang8_df = pd.read_csv(LANG8_CSV_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Error: {LANG8_CSV_PATH} not found. Make sure you have preprocessed the Lang-8 data."
    ) from err

# Positional split - IMPORTANT: the first LANG8_TRAIN_SIZE rows are the training
# portion and the next LANG8_VALIDATION_SIZE rows are the validation portion.
train_df = lang8_df.iloc[:LANG8_TRAIN_SIZE]
validation_df = lang8_df.iloc[LANG8_TRAIN_SIZE:LANG8_TRAIN_SIZE + LANG8_VALIDATION_SIZE]

# Identity baseline BLEU on the VALIDATION set: the uncorrected sentence is
# scored as if it were the system output, against the corrected sentence.
# Rows missing either sentence cannot be tokenised with .split(), so they are
# left out of the BLEU computation (validation_df itself is not modified).
bleu_pairs = validation_df.dropna(subset=["text", "corrected_text"])
references = [[text.split()] for text in bleu_pairs['corrected_text']]
candidates = [text.split() for text in bleu_pairs['text']]

baseline_bleu = corpus_bleu(references, candidates)
print(f"Lang-8 Identity Baseline BLEU: {baseline_bleu:.4f}")



def save_metrics_to_csv(metrics, filename="baseline_metrics.csv"):
    """Write a single row of metrics to a CSV file.

    Args:
        metrics: Mapping of metric name to value; becomes one CSV row whose
            header is the mapping's keys.
        filename: Destination path of the CSV file.

    Saving is best-effort: any exception raised while building or writing the
    frame is reported on stdout and not propagated. Returns None.
    """
    try:
        pd.DataFrame([metrics]).to_csv(filename, index=False)
    except Exception as err:
        print(f"Error saving metrics: {err}")
    else:
        print(f"Metrics saved to {filename}")

# Persist the Lang-8 identity-baseline BLEU as a one-row CSV.
baseline_metrics = dict(bleu=baseline_bleu)
save_metrics_to_csv(baseline_metrics, "lang8_baseline_metrics.csv")

Lang-8 Identity Baseline BLEU: 0.5360
Metrics saved to lang8_baseline_metrics.csv


## 3. Sentiment Analysis (SST-2)

In [None]:

# Fetch SST-2 from GLUE and keep its validation split.
dataset_sst = load_dataset("glue", "sst2")
validation_data = dataset_sst["validation"]

# Read the label column once; every metric below is computed from it.
true_labels = validation_data['label']

# Majority-class accuracy is the share of examples carrying the most common label.
sentiment_counts = Counter(true_labels)
majority_sentiment, majority_count = sentiment_counts.most_common(1)[0]
baseline_accuracy = majority_count / len(validation_data)

print(f"SST-2 Majority class baseline accuracy: {baseline_accuracy:.4f}")

# The constant predictor: the majority label repeated for every example.
predicted_labels_baseline = [majority_sentiment] * len(validation_data)

# Macro F1 weights the positive and negative class equally.
baseline_macro_f1 = f1_score(true_labels, predicted_labels_baseline, average='macro')

print(f"SST-2 Majority class baseline macro F1-score: {baseline_macro_f1:.4f}")

# Binary F1 (positive class only), for comparison with published SST-2 numbers.
baseline_f1 = f1_score(true_labels, predicted_labels_baseline, average='binary')

print(f"SST-2 Majority class baseline F1-score: {baseline_f1:.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

SST-2 Majority class baseline accuracy: 0.5092
SST-2 Majority class baseline macro F1-score: 0.3374
SST-2 Majority class baseline F1-score: 0.6748


## 4. Emotion Detection (GoEmotions)

In [9]:
# Fetch GoEmotions from the Hugging Face Hub.
emotions = load_dataset("google-research-datasets/go_emotions")

# The baseline below is computed on the validation split only.
validation_data = emotions["validation"]

# Preprocess labels for baseline calculation (multi-label to single-label for baseline purposes)
def preprocess_labels_for_baseline(example):
    """Attach a single-label target to a multi-label example.

    The first entry of ``example["labels"]`` is stored under
    ``example["baseline_label"]``. The example is modified in place and
    returned, which is the shape ``Dataset.map`` expects from its callback.
    """
    primary_label = example["labels"][0]
    example["baseline_label"] = primary_label
    return example


# GoEmotions is multi-label, so each example is first reduced to one target
# label (its first listed emotion) before a majority-class baseline is computed.
preprocessed_validation_data = validation_data.map(preprocess_labels_for_baseline)

# Read the single-label targets once and reuse them for counting and scoring.
baseline_targets = preprocessed_validation_data['baseline_label']

# The most common target in the VALIDATION set becomes the constant prediction.
baseline_label_counts = Counter(baseline_targets)
majority_baseline_label = baseline_label_counts.most_common(1)[0][0]
predicted_baseline_labels = [majority_baseline_label] * len(preprocessed_validation_data)

# Macro F1 averages over every emotion class; accuracy is the majority share.
baseline_macro_f1 = f1_score(baseline_targets, predicted_baseline_labels, average="macro")
baseline_accuracy = accuracy_score(baseline_targets, predicted_baseline_labels)

print(f"GoEmotions Baseline Macro-F1: {baseline_macro_f1:.4f}")
print(f"GoEmotions Baseline Accuracy: {baseline_accuracy:.4f}")


GoEmotions Baseline Macro-F1: 0.0162
GoEmotions Baseline Accuracy: 0.2934
