In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
from sklearn.preprocessing import StandardScaler


config = {
    "train_data": "SMG2020/ch/train.txt",
    "dev_data": "SMG2020/ch/dev.txt",
    "ynorm": "jointscale",
    "model_type": "bert",
    "model_name": "statworx/bert-base-german-cased-finetuned-swiss",
    "lossfn": "MAELoss",
    "save_predictions": True,
    "num_train_epochs": 50,
    "train_batch_size": 32,
    "max_seq_length": 128,
    "epochs": 50,
    "lr": 2e-5,
    "seed": 42
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device: {device}')

torch.cuda.empty_cache()

Device: cuda


In [2]:
class GeolocationDataset(Dataset):
    def __init__(self, texts, coordinates):
        self.texts = texts
        self.coordinates = coordinates
        self.tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.texts[idx], padding='max_length', truncation=True, return_tensors='pt')
        inputs = {key: val.squeeze() for key, val in inputs.items()}  # Remove batch dimension
        coords = torch.tensor(self.coordinates[idx], dtype=torch.float)
        return inputs, coords


col_names = ['lat', 'lon', 'text']
train_data = pd.read_csv('vardial-shared-tasks/SMG2020/ch/train.txt',
                         delimiter='\t', header=None, names=col_names)
dev_data = pd.read_csv('vardial-shared-tasks/SMG2020/ch/dev.txt',
                       delimiter='\t', header=None, names=col_names)

scaler = StandardScaler()
train_coords = scaler.fit_transform(train_data[['lat', 'lon']].values)
dev_coords = scaler.transform(dev_data[['lat', 'lon']].values)

# Save scaler for later use
Path('data/ch').mkdir(exist_ok=True)
with open('data/ch/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

train_dataset = GeolocationDataset(train_data['text'].tolist(), train_coords)
dev_dataset = GeolocationDataset(dev_data['text'].tolist(), dev_coords)

train_loader = DataLoader(train_dataset, batch_size=config['train_batch_size'], shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=config['train_batch_size'], shuffle=False)

In [3]:
R = 6371  # Radius of Earth in kilometers

def haversine_distance(c1, c2):
    # c1 and c2 are arrays containing latitude and longitude in degrees
    c1 = np.radians(c1)
    c2 = np.radians(c2)
    dlat = c2[:, 0] - c1[:, 0]
    dlon = c2[:, 1] - c1[:, 1]

    a = np.sin(dlat / 2)**2 + np.cos(c1[:, 0]) * np.cos(c2[:, 0]) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))

    return R * c  # Distance in kilometers

def median_distance(preds, labels, scaler=scaler):
    if scaler:
        preds = scaler.inverse_transform(preds)
        labels = scaler.inverse_transform(labels)
    return np.median(haversine_distance(preds, labels))

def mean_distance(preds, labels, scaler):
    if scaler:
        preds = scaler.inverse_transform(preds)
        labels = scaler.inverse_transform(labels)
    return np.mean(haversine_distance(preds, labels))

In [4]:
from transformers import AutoModelForSequenceClassification, AdamW

model = AutoModelForSequenceClassification.from_pretrained(config['model_name'], num_labels=2)

optimizer = AdamW(model.parameters(), config['lr'])

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at statworx/bert-base-german-cased-finetuned-swiss and are newly initialized: ['bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [5]:
from torch.utils.tensorboard import SummaryWriter
import os

class TensorBoardCheckpoint:
    def __init__(self, log_dir, checkpoint_path, best_only=True):
        self.writer = SummaryWriter(log_dir)
        self.checkpoint_path = checkpoint_path
        self.best_metric = float('inf')
        self.best_only = best_only

    def log_metrics(self, metrics, step):
        for metric_name, metric_value in metrics.items():
            self.writer.add_scalar(metric_name, metric_value, step)

    def save_checkpoint(self, model, optimizer, epoch, metrics):
        checkpoint_path = os.path.join(self.checkpoint_path, 'best_model_checkpoint.pth')
        if metrics['Median_Distance/dev'] < self.best_metric:
            self.best_metric = metrics['Median_Distance/dev']
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'metrics': metrics,
            }, checkpoint_path)
            self.writer.add_text('Best_Checkpoint', self.checkpoint_path, epoch)
            print(f"New best checkpoint saved at {self.checkpoint_path}")
        elif not self.best_only:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'metrics': metrics,
            }, checkpoint_path)
            print(f"Checkpoint saved at {checkpoint_path}")

    def close(self):
        self.writer.close()

In [6]:
import torch
from torch.nn import L1Loss
from tqdm import tqdm
import numpy as np
import datetime

loss_function = L1Loss()

run_dir = f'data/ch/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}'
tb_checkpoint = TensorBoardCheckpoint(log_dir=f'{run_dir}/tensorboard_logs',
                                      checkpoint_path=f'{run_dir}')

def compute_median_distance(preds, labels, scaler):
    preds = scaler.inverse_transform(preds)
    labels = scaler.inverse_transform(labels)
    return median_distance(preds, labels)

def train(model, train_loader, dev_loader, optimizer, loss_function, scaler, epochs=10):
    loss_function = loss_function.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()

            # Forward pass
            inputs, labels = batch
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            outputs = model(**inputs)

            # Compute loss
            loss = loss_function(outputs.logits, labels)
            total_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs} - Training loss: {avg_train_loss:.4f}")

        # Evaluate on the development set
        model.eval()
        dev_preds = []
        dev_labels = []
        with torch.no_grad():
            for batch in tqdm(dev_loader):
                inputs, labels = batch
                inputs = {k: v.to(device) for k, v in inputs.items()}
                labels = labels.to(device)
                outputs = model(**inputs)
                dev_preds.append(outputs.logits.cpu().numpy())
                dev_labels.append(labels.cpu().numpy())

        # Concatenate all the predictions and labels
        dev_preds = np.vstack(dev_preds)
        dev_labels = np.vstack(dev_labels)

        median_dist = median_distance(dev_preds, dev_labels, scaler)
        mean_dist = mean_distance(dev_preds, dev_labels, scaler)

        metrics = {'Loss/train': avg_train_loss, 'Median_Distance/dev': median_dist, 'Mean_Distance/dev': mean_dist}
        tb_checkpoint.log_metrics(metrics, epoch)
        tb_checkpoint.save_checkpoint(model, optimizer, epoch, metrics)

with open('data/ch/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

train(model, train_loader, dev_loader, optimizer, loss_function, scaler, epochs=config['epochs'])

tb_checkpoint.close()

  0%|          | 3/707 [00:30<1:59:58, 10.23s/it]


KeyboardInterrupt: 