In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
import torch.nn as nn
import numpy as np
import re
import tensorflow as tf

  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


In [2]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [3]:
class EarlyStopping:
    def __init__(self, patience=3, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_score = None
        self.early_stop = False
        self.counter = 0
        self.best_model_state = None

    def __call__(self, val_loss, model):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.best_model_state = model.state_dict()
        elif score <= self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.best_model_state = model.state_dict()
            self.counter = 0

In [4]:
def train_epoch(model, data_loader, optimizer, device, scheduler, loss_fn):
    model = model.train()
    losses = []
    correct_predictions = 0
    total = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
#criterion
        _, preds = torch.max(outputs.logits, dim=1)
        loss = loss_fn(outputs.logits, labels)

        correct_predictions += torch.sum(preds == labels)
        total += len(labels)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / total, np.mean(losses)

In [5]:
def eval_model(model, data_loader, device, loss_fn):
    model = model.eval()
    losses = []
    correct_predictions = 0
    total = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, preds = torch.max(outputs.logits, dim=1)
            loss = loss_fn(outputs.logits, labels)

            correct_predictions += torch.sum(preds == labels)
            total += len(labels)
            losses.append(loss.item())

    return correct_predictions.double() / total, np.mean(losses)

In [6]:
# Main training function
def train_bert_model(train_df, test_df, text_column, label_column):
    # OneHotEncoder for labels
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    labels = ohe.fit_transform(train_df[label_column].values.reshape(-1, 1))
    label_map = {i: category for i, category in enumerate(ohe.categories_[0])}
    num_labels = len(ohe.categories_[0])

    # Split train into train and validation
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_df[text_column].values,
        np.argmax(labels, axis=1),
        test_size=0.2,
        random_state=42
    )

    # Initialize tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels
    )

    # Create datasets
    train_dataset = TextDataset(train_texts, train_labels, tokenizer)
    val_dataset = TextDataset(val_texts, val_labels, tokenizer)
    test_dataset = TextDataset(test_df[text_column].values,
                            np.argmax(ohe.transform(test_df[label_column].values.reshape(-1, 1)), axis=1),
                            tokenizer)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Training setup
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_loader) * 10  # 10 epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    loss_fn = CrossEntropyLoss().to(device)
    early_stopping = EarlyStopping(patience=3)
    
    # Training loop
    for epoch in range(20):
        train_acc, train_loss = train_epoch(model, train_loader, optimizer, device, scheduler, loss_fn)
        val_acc, val_loss = eval_model(model, val_loader, device, loss_fn)
        
        print(f'Epoch {epoch + 1}')
        print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc:.3f}')
        print(f'Val Loss: {val_loss:.3f} | Val Acc: {val_acc:.3f}')

        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping triggered")
            break

    # Load best model
    model.load_state_dict(early_stopping.best_model_state)

    # Test evaluation
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for d in test_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    # Convert numerical predictions back to original labels
    predictions = [label_map[pred] for pred in predictions]
    true_labels = [label_map[label] for label in true_labels]

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions))

    # save the model
    model.save_pretrained("./model2")

In [7]:
df = pd.read_csv("./input/simpsons_dataset.csv")
print(df.shape)
df.head(3)

(158314, 2)


Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...


In [8]:
df = df[df.raw_character_text.isin([
    'Lisa Simpson', 'Homer Simpson', 'Bart Simpson', 'Marge Simpson', 
    'Ned Flanders', 'Grampa Simpson', 'Milhouse Van Houten', 
    'Nelson Muntz', 'Groundskeeper Willie'])]

In [9]:
df.raw_character_text.value_counts()

raw_character_text
Homer Simpson           29782
Marge Simpson           14141
Bart Simpson            13759
Lisa Simpson            11489
Ned Flanders             2144
Grampa Simpson           1880
Milhouse Van Houten      1862
Nelson Muntz             1172
Groundskeeper Willie      534
Name: count, dtype: int64

In [10]:
df.dropna(inplace=True)
df.shape

(72020, 2)

In [11]:
df.raw_character_text.unique()

array(['Lisa Simpson', 'Bart Simpson', 'Nelson Muntz',
       'Milhouse Van Houten', 'Homer Simpson', 'Marge Simpson',
       'Ned Flanders', 'Grampa Simpson', 'Groundskeeper Willie'],
      dtype=object)

In [12]:
def preprocess(text):
    text = re.sub(r'[^\w\s\']', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip().lower()

In [13]:
df['spoken_words'] = df['spoken_words'].map(preprocess)

In [14]:
train_df, test_df = train_test_split(df, test_size=0.2)

In [15]:
train_df.shape, test_df.shape

((57616, 2), (14404, 2))

In [16]:
train_df.sample(1)

Unnamed: 0,raw_character_text,spoken_words
6970,Homer Simpson,yeah


In [17]:
train_bert_model(train_df, test_df, 'spoken_words', 'raw_character_text')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1
Train Loss: 1.459 | Train Acc: 0.464
Val Loss: 1.393 | Val Acc: 0.470
Epoch 2
Train Loss: 1.238 | Train Acc: 0.551
Val Loss: 1.350 | Val Acc: 0.506
Epoch 3
Train Loss: 0.995 | Train Acc: 0.648
Val Loss: 1.475 | Val Acc: 0.486
Epoch 4
Train Loss: 0.755 | Train Acc: 0.735
Val Loss: 1.754 | Val Acc: 0.487
Epoch 5
Train Loss: 0.575 | Train Acc: 0.802
Val Loss: 1.973 | Val Acc: 0.472
Early stopping triggered

Classification Report:
                      precision    recall  f1-score   support

        Bart Simpson       0.37      0.43      0.40      2628
      Grampa Simpson       0.25      0.15      0.19       346
Groundskeeper Willie       0.71      0.35      0.47        97
       Homer Simpson       0.58      0.62      0.60      5530
        Lisa Simpson       0.41      0.33      0.36      2172
       Marge Simpson       0.45      0.53      0.49      2636
 Milhouse Van Houten       0.20      0.06      0.10       347
        Ned Flanders       0.41      0.27      0.32       404
  