# In [None]:
import random
import numpy as np
import pandas as pd
import torch
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from imblearn.over_sampling import RandomOverSampler
from datasets import load_dataset
from huggingface_hub import login
from tqdm.auto import tqdm

def set_seed(seed_value=42):
    """Seed every random number generator this script relies on.

    The same value is applied, in order, to Python's ``random`` module,
    NumPy's global generator, ``torch.manual_seed`` and
    ``torch.cuda.manual_seed_all``.

    Args:
        seed_value: Seed handed to each generator. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_value)

# Seed all RNGs once, before any data shuffling or model initialisation.
set_seed(42)

# Training hyperparameters (read by the helper functions defined further down).
LR = 5e-5  # learning rate passed to the optimizer
EPS = 1e-8  # epsilon passed to the optimizer
EPOCHS = 50  # upper bound on epochs; early stopping can end training sooner
BATCH_SIZE = 32  # batch size used for every DataLoader
ACCUMULATION_STEPS = 4  # number of batches whose gradients are accumulated per optimizer step
MAX_LEN = 128  # tokenizer max_length (both padding and truncation target)
EARLY_STOPPING_PATIENCE = 10  # epochs without validation-accuracy improvement before stopping

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Log in to Hugging Face
# NOTE(review): the token is a hard-coded placeholder literal; avoid committing
# a real token here -- consider supplying it from outside the source file.
login(token='YOUR_HF_TOKEN')  # Replace with your actual Hugging Face token

# Load dataset from the Hugging Face Hub
dataset = load_dataset('onekat/lit-dataset')

# Materialise the 'train' split as a DataFrame
# (presumably the dataset's only split -- verify against the dataset card).
df = pd.DataFrame(dataset['train'])

# Translate the integer 'label' column into readable sense names ('sentido').
# Rows whose label is not 0, 1 or 2 would get NaN here.
label_names = {0: 'Intensifier', 1: 'Emphatic', 2: 'Etymological'}
df['sentido'] = df['label'].map(label_names)

# Load the spaCy Spanish pipeline used to split texts into sentences
# when extracting the sentences containing 'literalmente'.
nlp = spacy.load('es_core_news_sm')

def extract_literalmente_sentence(text):
    """Return only the sentences of ``text`` that mention 'literalmente'.

    The text is segmented with the module-level spaCy pipeline ``nlp``; the
    match is a case-insensitive substring test on each sentence.

    Args:
        text: Raw document text to segment.

    Returns:
        The matching sentences, each stripped of surrounding whitespace and
        joined with single spaces; an empty string when no sentence matches.
    """
    matching_sentences = [
        sentence.text.strip()
        for sentence in nlp(text).sents
        if 'literalmente' in sentence.text.lower()
    ]
    return ' '.join(matching_sentences)

# Reduce every document to the sentence(s) that actually mention 'literalmente'.
df['text_literalmente'] = df['text'].apply(extract_literalmente_sentence)

# Drop rows whose reduced text is empty, i.e. no sentence contained the word.
df = df.loc[df['text_literalmente'].str.strip() != '']

# One entry per classifier to train:
#   labels_to_include -- sense names whose rows are kept for that model
#   label_mapping     -- sense name -> integer class id used as the target
# Model_5 folds 'Emphatic' into the 'Intensifier' id (0); Model_6 folds
# 'Emphatic' into the 'Etymological' id (2).
# NOTE(review): for Model_3 to Model_6 the ids are not a contiguous 0..n-1 range.
model_label_configs = {
    'Model_1': {
        'labels_to_include': ['Etymological', 'Intensifier', 'Emphatic'],
        'label_mapping': {'Etymological': 2, 'Intensifier': 0, 'Emphatic': 1},
    },
    'Model_2': {
        'labels_to_include': ['Intensifier', 'Emphatic'],
        'label_mapping': {'Intensifier': 0, 'Emphatic': 1},
    },
    'Model_3': {
        'labels_to_include': ['Etymological', 'Intensifier'],
        'label_mapping': {'Etymological': 2, 'Intensifier': 0},
    },
    'Model_4': {
        'labels_to_include': ['Etymological', 'Emphatic'],
        'label_mapping': {'Etymological': 2, 'Emphatic': 1},
    },
    'Model_5': {
        'labels_to_include': ['Etymological', 'Intensifier', 'Emphatic'],
        'label_mapping': {'Etymological': 2, 'Intensifier': 0, 'Emphatic': 0},
    },
    'Model_6': {
        'labels_to_include': ['Etymological', 'Intensifier', 'Emphatic'],
        'label_mapping': {'Etymological': 2, 'Intensifier': 0, 'Emphatic': 2},
    },
}

# Fast tokenizer matching the Spanish BERT checkpoint that is fine-tuned below.
tokenizer = BertTokenizerFast.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-uncased',
    do_lower_case=True,
)

def tokenize_data(sentences, labels):
    """Tokenize ``sentences`` and package them with ``labels`` as tensors.

    Uses the module-level ``tokenizer``; every sequence is padded or truncated
    to ``MAX_LEN`` tokens and special tokens are added.

    Args:
        sentences: Iterable of input strings (converted to a list first).
        labels: Class ids, one per sentence, in the same order.

    Returns:
        A tuple ``(input_ids, attention_mask, labels)`` where the first two
        come from the tokenizer output and the last is ``torch.tensor(labels)``.
    """
    encode_options = {
        'add_special_tokens': True,
        'max_length': MAX_LEN,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
    }
    encoded = tokenizer(list(sentences), **encode_options)
    label_tensor = torch.tensor(labels)
    return encoded['input_ids'], encoded['attention_mask'], label_tensor

def create_dataloader(inputs, masks, labels, sampler):
    """Wrap aligned tensors in a ``DataLoader`` that batches by ``BATCH_SIZE``.

    Args:
        inputs: Tensor of token ids.
        masks: Tensor of attention masks aligned with ``inputs``.
        labels: Tensor of class ids aligned with ``inputs``.
        sampler: Sampler *class* (not an instance); it is instantiated with
            the dataset built here.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of the three tensors.
    """
    tensor_dataset = TensorDataset(inputs, masks, labels)
    batch_sampler = sampler(tensor_dataset)
    return DataLoader(
        tensor_dataset,
        sampler=batch_sampler,
        batch_size=BATCH_SIZE,
    )

def _contiguous_label_space(label_mapping):
    """Renumber the class ids of ``label_mapping`` as 0..n-1 and name each class.

    Args:
        label_mapping: Dict from label name to an integer class id. Ids may be
            sparse (e.g. ``{'Etymological': 2, 'Intensifier': 0}``) or shared
            by several names (merged classes).

    Returns:
        A pair ``(id_remap, target_names)``. ``id_remap`` maps each original id
        to its rank among the sorted distinct ids; ``target_names[i]`` is the
        '+'-joined list of label names that end up with contiguous id ``i``.
    """
    names_by_id = {}
    for name, raw_id in label_mapping.items():
        names_by_id.setdefault(raw_id, []).append(name)
    raw_ids = sorted(names_by_id)
    id_remap = {raw_id: index for index, raw_id in enumerate(raw_ids)}
    target_names = ['+'.join(names_by_id[raw_id]) for raw_id in raw_ids]
    return id_remap, target_names


def _predict(model, dataloader):
    """Run ``model`` over ``dataloader`` without gradients.

    Puts the model in eval mode (callers must switch back to train mode).

    Returns:
        A pair ``(predictions, gold_labels)`` of plain Python int lists, where
        each prediction is the argmax over the logits.
    """
    model.eval()
    predictions, gold_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
            gold_labels.extend(batch[2].tolist())
    return predictions, gold_labels


def train_and_evaluate(model_name, label_config):
    """Fine-tune one BERT classifier, push its best checkpoint, report test metrics.

    Rows of the module-level ``df`` are filtered to the configured labels, split
    80/10/10 (stratified) into train/validation/test, and only the training part
    is randomly oversampled. Each time validation accuracy improves, a local
    ``README_<model_name>.md`` is written and the model and tokenizer are pushed
    to ``onekat/<model_name>``. Training stops after ``EPOCHS`` epochs or after
    ``EARLY_STOPPING_PATIENCE`` epochs without improvement. Finally the best
    weights are restored and evaluated on the held-out test split.

    Args:
        model_name: Name used for log messages, the README file and the Hub repo.
        label_config: Dict with ``'labels_to_include'`` (label names to keep) and
            ``'label_mapping'`` (label name -> integer class id).

    Returns:
        None. Results are printed, written to the README and pushed to the Hub.

    Raises:
        ValueError: Propagated from scikit-learn / imbalanced-learn when a class
            has too few rows for the stratified splits or only one class remains.
    """
    print(f"\nTraining {model_name}...\n")

    # The classification head needs targets in 0..num_labels-1, but some
    # configurations use sparse ids such as {0, 2}; renumber them first.
    id_remap, target_names = _contiguous_label_space(label_config['label_mapping'])
    num_labels = len(target_names)
    class_ids = list(range(num_labels))

    # Filter, then copy, so that adding a column does not write into a view of df.
    df_model = df[df['sentido'].isin(label_config['labels_to_include'])].copy()
    df_model['label_mapped'] = df_model['sentido'].map(label_config['label_mapping'])
    df_model.dropna(subset=['text_literalmente', 'label_mapped'], inplace=True)
    sentences = df_model['text_literalmente'].tolist()
    labels = [id_remap[raw_id] for raw_id in df_model['label_mapped'].tolist()]

    # Split BEFORE oversampling: duplicating rows first would place copies of
    # the same sentence in train and in validation/test (data leakage).
    X_train, X_temp, y_train, y_temp = train_test_split(
        sentences, labels, test_size=0.2, random_state=42, stratify=labels)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

    # Apply random oversampling to the training portion only.
    train_df = pd.DataFrame({'text': X_train, 'label': y_train})
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(train_df[['text']], train_df['label'])
    X_train = X_resampled['text'].tolist()
    y_train = y_resampled.tolist()

    print(f"Number of training samples: {len(X_train)}")
    print(f"Number of validation samples: {len(X_val)}")
    print(f"Number of test samples: {len(X_test)}")

    train_inputs, train_masks, train_labels = tokenize_data(X_train, y_train)
    val_inputs, val_masks, val_labels = tokenize_data(X_val, y_val)
    test_inputs, test_masks, test_labels = tokenize_data(X_test, y_test)

    train_dataloader = create_dataloader(train_inputs, train_masks, train_labels, RandomSampler)
    val_dataloader = create_dataloader(val_inputs, val_masks, val_labels, SequentialSampler)
    test_dataloader = create_dataloader(test_inputs, test_masks, test_labels, SequentialSampler)

    model = BertForSequenceClassification.from_pretrained(
        'dccuchile/bert-base-spanish-wwm-uncased',
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False,
    )
    model.to(device)

    # transformers.AdamW is deprecated in favour of the PyTorch implementation;
    # weight_decay=0.0 keeps the default the transformers optimizer used.
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, eps=EPS, weight_decay=0.0)

    # One optimizer step per full accumulation window plus one for a trailing
    # partial window, i.e. ceil(batches / ACCUMULATION_STEPS) per epoch.
    num_batches = len(train_dataloader)
    steps_per_epoch = (num_batches + ACCUMULATION_STEPS - 1) // ACCUMULATION_STEPS
    total_steps = steps_per_epoch * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    best_val_accuracy = 0
    best_state = None  # CPU copy of the weights with the best validation accuracy
    epochs_no_improve = 0

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0
        optimizer.zero_grad()
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False)
        for step, batch in enumerate(progress_bar):
            batch_input_ids = batch[0].to(device)
            batch_input_mask = batch[1].to(device)
            batch_labels = batch[2].to(device)
            outputs = model(batch_input_ids, attention_mask=batch_input_mask, labels=batch_labels)

            # Track the unscaled loss for reporting; only the gradient is
            # scaled down so that accumulated gradients average over the window.
            batch_loss = outputs.loss.item()
            total_loss += batch_loss
            (outputs.loss / ACCUMULATION_STEPS).backward()

            if (step + 1) % ACCUMULATION_STEPS == 0 or (step + 1) == num_batches:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            progress_bar.set_postfix({'Loss': batch_loss})
        avg_train_loss = total_loss / max(num_batches, 1)

        # Validation
        val_preds, val_labels_list = _predict(model, val_dataloader)
        val_accuracy = np.mean(np.array(val_preds) == np.array(val_labels_list))
        print(f"Epoch {epoch + 1}/{EPOCHS} - Average Training Loss: {avg_train_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}")

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            epochs_no_improve = 0
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            model_name_hub = f'onekat/{model_name}'
            model_card_filename = f"README_{model_name}.md"

            # labels= pins the report to every class even if one is absent
            # from the validation predictions/gold labels.
            val_classification_report = classification_report(
                val_labels_list, val_preds, labels=class_ids,
                target_names=target_names, output_dict=True, zero_division=0
            )

            # Save README
            with open(model_card_filename, "w", encoding="utf-8") as f:
                f.write(f"# {model_name}\n")
                f.write(f"Validation Accuracy: {val_accuracy:.4f}\n")
                f.write(f"Classification Report:\n{val_classification_report}\n")

            # Push model to Hugging Face Hub using the credentials stored by
            # the earlier login() call instead of a token literal in the code.
            model.push_to_hub(model_name_hub)
            tokenizer.push_to_hub(model_name_hub)
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= EARLY_STOPPING_PATIENCE:
                print(f"Early stopping at epoch {epoch + 1}.")
                break

    # Final, unbiased estimate: restore the best checkpoint (if any epoch
    # improved) and score it once on the held-out test split.
    if best_state is not None:
        model.load_state_dict(best_state)
    test_preds, test_labels_list = _predict(model, test_dataloader)
    test_accuracy = np.mean(np.array(test_preds) == np.array(test_labels_list))
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(classification_report(
        test_labels_list, test_preds, labels=class_ids,
        target_names=target_names, zero_division=0
    ))

    print(f"Training of {model_name} completed.")

# Train all models: one train_and_evaluate() run per entry of
# model_label_configs, in dictionary order. Each run loads the pretrained
# checkpoint afresh, so the models do not share fine-tuned weights.
for model_name, label_config in model_label_configs.items():
    train_and_evaluate(model_name, label_config)

print("Training completed.")
