In [14]:
import logging
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from torch.utils.data import DataLoader, random_split
import sys
# Extend the import path two directory levels up — presumably where the
# project's `common` package lives (TODO confirm against the repo layout).
sys.path.append("../../")

from common.util import get_device

# Resolve the compute device once; later cells move the model and batches to it.
device = get_device()
print(f"device: {device}")

# Set to True when the notebook runs inside a Kaggle kernel.
is_on_kaggle = False
model_name = "google-bert/bert-base-uncased"

# Hyper-parameters are the same in both environments.
BATCH_SIZE = 16
MAX_LEN = 256

if not is_on_kaggle:
    # Local run: data files live under `folder_path`.
    folder_path = "../data/"
    LOCAL_MODEL_PATH = '../bert_base_uncased_model.pkl'
    TOKENIZER_PATH = "bert_base_uncased_tokenizer"
    model_path = f"{folder_path}bert_regressor.pth"
    train_data_path = f"{folder_path}train.csv"
    test_data_path = f"{folder_path}test.csv"
    output_file_path = f"{folder_path}submission.csv"
else:
    # Kaggle run: inputs come from attached datasets, output goes to /kaggle/working.
    LOCAL_MODEL_PATH = '/kaggle/input/bert-model/pytorch/bertmodel/2/bert_regressor (2)/local_bert_base_uncased_model'
    TOKENIZER_PATH = r"/kaggle/input/bert-model/pytorch/bertmodel/2/bert_regressor (2)/local_bert_base_uncased_tokenizer"
    model_path = r"/kaggle/input/bert-model/pytorch/bertmodel/2/bert_regressor/bert_regressor.pth"
    train_data_path = r"/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv"
    test_data_path = r"/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv"
    output_file_path = r"/kaggle/working/submission.csv"


device: mps


In [15]:
class EssayDataset(Dataset):
    """Map-style dataset that tokenizes one essay per requested item.

    Args:
        tokenizer: Callable tokenizer (e.g. a Hugging Face ``BertTokenizer``)
            whose result can be indexed by ``'input_ids'`` and
            ``'attention_mask'``.
        essays: Indexable collection of essay texts. Lists, arrays and pandas
            Series are all accepted; items are always fetched by position.
        max_length: Length every essay is padded or truncated to.
        labels: Indexable collection of targets, aligned position-by-position
            with ``essays``.
    """

    def __init__(self, tokenizer, essays, max_length, labels):
        self.tokenizer = tokenizer
        self.texts = essays
        self.max_length = max_length
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(collection, position):
        """Return the element at ``position``, ignoring any pandas index labels."""
        # A pandas Series indexed with a plain int looks the value up by *label*,
        # so a shuffled or filtered Series would raise KeyError or return the
        # wrong row. `.iloc` is always positional; other containers lack it.
        if hasattr(collection, "iloc"):
            return collection.iloc[position]
        return collection[position]

    def __getitem__(self, idx):
        """Return ``({'input_ids', 'attention_mask'}, label)`` for item ``idx``."""
        text = str(self._item_at(self.texts, idx))
        # Call the tokenizer directly: `encode_plus` is the deprecated spelling
        # of the same operation.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Flatten to 1-D so each sample carries no extra leading dimension.
        input_ids = inputs['input_ids'].flatten()
        attention_mask = inputs['attention_mask'].flatten()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }, self._item_at(self.labels, idx)



class CustomDataModule(pl.LightningDataModule):
    """Lightning data module that splits essays into train/val/test loaders.

    Args:
        data: Essay texts, forwarded unchanged to ``EssayDataset``.
        labels: Targets aligned with ``data``.
        batch_size: Batch size used by all three dataloaders.
        val_split: Fraction of the samples placed in the validation split.
        test_split: Fraction of the samples placed in the test split.
        seed: Seed for the generator that drives the random split, so the
            partition is the same every time ``setup`` runs.
    """

    def __init__(self, data, labels, batch_size=32, val_split=0.2, test_split=0.1, seed=42):
        super().__init__()
        self.data = data
        self.labels = labels
        self.batch_size = batch_size
        self.val_split = val_split
        self.test_split = test_split
        self.seed = seed

    def setup(self, stage=None):
        """Tokenizer-backed dataset construction and the train/val/test split."""
        tokenizer = BertTokenizer.from_pretrained(model_name)
        dataset = EssayDataset(tokenizer, self.data, MAX_LEN, self.labels)

        val_size = int(len(dataset) * self.val_split)
        test_size = int(len(dataset) * self.test_split)
        # The train split takes whatever remains so the three sizes always sum
        # to len(dataset) despite the int() truncation above.
        train_size = len(dataset) - val_size - test_size

        # A freshly seeded generator makes the partition identical on every
        # call. Drawing from the global RNG instead would let a second setup()
        # call produce a different split and move training rows into the
        # validation/test subsets.
        split_rng = torch.Generator().manual_seed(self.seed)
        self.train_dataset, self.val_dataset, self.test_dataset = random_split(
            dataset, [train_size, val_size, test_size], generator=split_rng
        )

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size)


class BertRegressor(nn.Module):
    """Pre-trained BERT encoder followed by a single-output linear layer."""

    def __init__(self, pre_trained_model_name):
        super().__init__()
        # The attribute names `bert` and `out` become the state-dict key
        # prefixes, so they must stay as they are for saved weights to load.
        self.bert = BertModel.from_pretrained(pre_trained_model_name)
        hidden_dim = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and map its pooled output to one value per sample."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.out(encoded.pooler_output)


In [10]:

# One seed for both the data split and torch, so a fresh kernel reproduces the
# same split and the same initial weights for the regression head.
SEED = 42
torch.manual_seed(SEED)

model = BertRegressor(model_name).to(device)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Load the training CSV; the essay text is the only feature, `score` the target.
df = pd.read_csv(train_data_path)
X = df[[ 'full_text']]
y = df['score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Plain lists give the dataset positional indexing regardless of the shuffled
# pandas index that train_test_split leaves behind.
train_dataset = EssayDataset(tokenizer, list(X_train["full_text"]), MAX_LEN, list(y_train))
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Accumulators used by the training loop for its periodic loss report.
running_loss = 0.
last_loss = 0.

loss_fn = nn.MSELoss()  # mean square error
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [11]:
LOG_EVERY = 1000  # number of batches between running-loss reports

# Put the module in training mode explicitly rather than relying on whatever
# mode an earlier cell may have left it in.
model.train()

# Wrap the loader itself (not `enumerate`) so tqdm can read its length and show
# a total and ETA instead of a bare iteration counter.
for i, data in enumerate(tqdm(train_dataloader)):
    # Every data instance is an input + label pair
    inputs, labels = data
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Cast to float32 *before* the transfer: if the scores collate to float64,
    # moving them to an MPS device first would fail because MPS has no float64.
    labels = labels.to(torch.float32).to(device)

    # Zero the gradients for every batch
    optimizer.zero_grad()

    # Make predictions for this batch
    outputs = model(**inputs)

    # The model returns one column per sample; flatten it to match the 1-D labels.
    loss = loss_fn(outputs.reshape(-1), labels)
    loss.backward()

    # Adjust learning weights
    optimizer.step()

    # Gather data and report the mean loss over the last LOG_EVERY batches
    running_loss += loss.item()
    if i % LOG_EVERY == LOG_EVERY - 1:
        last_loss = running_loss / LOG_EVERY  # loss per batch
        print('  batch {} loss: {}'.format(i + 1, last_loss))
        running_loss = 0.

# NOTE(review): this writes the regressor's state dict to LOCAL_MODEL_PATH, while
# the config cell also defines `model_path` ("bert_regressor.pth") — confirm
# which file is the intended destination.
torch.save(model.state_dict(), LOCAL_MODEL_PATH)


102it [05:29,  3.23s/it]


KeyboardInterrupt: 