In [49]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
from sklearn.preprocessing import StandardScaler


config = {
    "train_data": "SMG2020/ch/train.txt",
    "dev_data": "SMG2020/ch/dev.txt",
    "scaler": "joint",
    "model_type": "bert",
    "model_name": "statworx/bert-base-german-cased-finetuned-swiss",
    "lossfn": "MAELoss",
    "save_predictions": True,
    "num_train_epochs": 50,
    "train_batch_size": 32,
    "max_seq_length": 128,
    "epochs": 50,
    "lr": 2e-5,
    "seed": 42,
    'projection': 'utm'
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device: {device}')

torch.cuda.empty_cache()

Device: cuda


In [43]:
class JointScaler():
	def __init__(self):
		self.means = None
		self.stddev = None

	def fit_transform(self, data):
		self.fit(data)
		return self.transform(data)
	
	def fit(self, data):
		self.means = np.mean(data, axis=0)
		centereddata = data - self.means
		self.stddev = np.std(centereddata)

	def transform(self, data):
		return (data - self.means) / self.stddev

	def inverse_transform(self, data):
		return (data * self.stddev) + self.means
	

scalers = {
	'independent': StandardScaler,
	'joint': JointScaler
}

scaler = scalers[config['scaler']]()

In [3]:
class GeolocationDataset(Dataset):
    def __init__(self, texts, coordinates):
        self.texts = texts
        self.coordinates = coordinates
        self.tokenizer = AutoTokenizer.from_pretrained(config['model_name'])

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.texts[idx], padding='max_length', truncation=True, return_tensors='pt')
        inputs = {key: val.squeeze() for key, val in inputs.items()}  # Remove batch dimension
        coords = torch.tensor(self.coordinates[idx], dtype=torch.float)
        return inputs, coords


col_names = ['lat', 'lon', 'text']

train_data = pd.read_csv('vardial-shared-tasks/SMG2020/ch/train.txt',
                         delimiter='\t', header=None, names=col_names)
dev_data = pd.read_csv('vardial-shared-tasks/SMG2020/ch/dev.txt',
                       delimiter='\t', header=None, names=col_names)



train_coords = scaler.fit_transform(train_data[['lat', 'lon']].values)
train_dataset = GeolocationDataset(train_data['text'].tolist(), train_coords)
train_loader = DataLoader(train_dataset, batch_size=config['train_batch_size'], shuffle=True)

Path('data/ch').mkdir(exist_ok=True)
with open('data/ch/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

dev_coords = scaler.transform(dev_data[['lat', 'lon']].values)
dev_dataset = GeolocationDataset(dev_data['text'].tolist(), dev_coords)
dev_loader = DataLoader(dev_dataset, batch_size=config['train_batch_size'], shuffle=False)

In [50]:
import utm

zone_number = 32
zone_letter = 'T'

train_data.apply(lambda row: utm.from_latlon(
    row['lat'], row['lon'], force_zone_number=zone_number, force_zone_letter=zone_letter), axis=1)



0         (381131.8048563974, 5230807.862731854, 32, T)
1         (439783.2134515929, 5189909.538309485, 32, T)
2         (438114.2812478363, 5248830.813624965, 32, T)
3         (483390.1911730778, 5246305.664604473, 32, T)
4         (427548.5019832254, 5248951.614359201, 32, T)
                              ...                      
22595     (487904.1636466054, 5238515.311577261, 32, T)
22596      (365623.15180610865, 5181109.5548293, 32, T)
22597     (543374.9405695203, 5199765.483507183, 32, T)
22598    (419850.32615028275, 5237936.389318025, 32, T)
22599     (446619.0354707878, 5270976.269557163, 32, T)
Length: 22600, dtype: object

In [4]:
R = 6371  # Radius of Earth [km]

def haversine_distance(c1, c2):
    # c1 and c2 are arrays containing lat/lon in degrees
    c1 = np.radians(c1)
    c2 = np.radians(c2)
    dlat = c2[:, 0] - c1[:, 0]
    dlon = c2[:, 1] - c1[:, 1]

    a = np.sin(dlat / 2)**2 + np.cos(c1[:, 0]) * np.cos(c2[:, 0]) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))

    return R * c  # Distance [km]

def median_distance(preds, labels, scaler=scaler):
    if scaler:
        preds = scaler.inverse_transform(preds)
        labels = scaler.inverse_transform(labels)
    return np.median(haversine_distance(preds, labels))

def mean_distance(preds, labels, scaler=scaler):
    if scaler:
        preds = scaler.inverse_transform(preds)
        labels = scaler.inverse_transform(labels)
    return np.mean(haversine_distance(preds, labels))

In [5]:
from transformers import AutoModelForSequenceClassification, AdamW

model = AutoModelForSequenceClassification.from_pretrained(config['model_name'], num_labels=2)
optimizer = AdamW(model.parameters(), config['lr'])
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at statworx/bert-base-german-cased-finetuned-swiss and are newly initialized: ['bert.pooler.dense.weight', 'classifier.weight', 'classifier.bias', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
from torch.utils.tensorboard import SummaryWriter
import datetime
import csv
import os

class TensorBoardCheckpoint:
    def __init__(self, log_dir, checkpoint_path, best_only=True):
        self.checkpoint_path = checkpoint_path
        self.best_metric = float('inf')
        self.best_only = best_only
        self.date_str = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        self.log_dir = f'{log_dir}/{self.date_str}'
        self.writer = SummaryWriter(self.log_dir)
        self.metric_path = f'{self.log_dir}/metrics.csv'

    def log_metrics(self, metrics, step):
        for metric_name, metric_value in metrics.items():
            self.writer.add_scalar(metric_name, metric_value, step)

        with open(self.metric_path, 'a', newline='') as f:
            writer = csv.writer(f)
            if not os.path.isfile(self.metric_path):
                writer.writerow(metrics.keys())

            writer.writerow(metrics.values()) 

    def save_checkpoint(self, model, optimizer, epoch, metrics, scaler):
        checkpoint_path = f'{self.checkpoint_path}/{self.date_str}_best_model.pth'
        if metrics['Median_Distance/dev'] < self.best_metric:
            self.best_metric = metrics['Median_Distance/dev']
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'metrics': metrics,
                'scaler': scaler
            }, checkpoint_path)
            self.writer.add_text('New_Best_Checkpoint', self.checkpoint_path, epoch)
            print(f"New best checkpoint saved at {self.checkpoint_path}")
        elif not self.best_only:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'metrics': metrics,
            }, checkpoint_path)
            print(f"Checkpoint saved at {checkpoint_path}")

    def close(self):
        self.writer.close()

In [None]:
import torch
from torch.nn import L1Loss
from tqdm import tqdm
import numpy as np

loss_function = L1Loss()

tb_checkpoint = TensorBoardCheckpoint(log_dir='data/ch/logs',
                                      checkpoint_path='data/ch/checkpoints')

def compute_median_distance(preds, labels, scaler):
    preds = scaler.inverse_transform(preds)
    labels = scaler.inverse_transform(labels)
    return median_distance(preds, labels)

def train(model, train_loader, dev_loader, optimizer, loss_function, scaler, epochs=10):
    loss_function = loss_function.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()

            # Forward pass
            inputs, labels = batch
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            outputs = model(**inputs)

            loss = loss_function(outputs.logits, labels)
            total_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs} - Training loss: {avg_train_loss:.4f}")

        # Evaluate on dev (validation)
        model.eval()
        dev_preds = []
        dev_labels = []
        with torch.no_grad():
            for batch in tqdm(dev_loader):
                inputs, labels = batch
                inputs = {k: v.to(device) for k, v in inputs.items()}
                labels = labels.to(device)
                outputs = model(**inputs)
                dev_preds.append(outputs.logits.cpu().numpy())
                dev_labels.append(labels.cpu().numpy())

        dev_preds = np.vstack(dev_preds)
        dev_labels = np.vstack(dev_labels)

        median_dist = median_distance(dev_preds, dev_labels, scaler)
        mean_dist = mean_distance(dev_preds, dev_labels, scaler)

        metrics = {'Loss/train': avg_train_loss, 'Median_Distance/dev': median_dist, 'Mean_Distance/dev': mean_dist}
        tb_checkpoint.log_metrics(metrics, epoch)
        tb_checkpoint.save_checkpoint(model, optimizer, epoch, metrics, scaler)

with open('data/ch/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

train(model, train_loader, dev_loader, optimizer, loss_function, scaler, epochs=config['epochs'])

tb_checkpoint.close()

In [48]:
import os
import torch
import numpy as np
from tqdm import tqdm

checkpoint_dir = 'data/ch/checkpoints'

best_checkpoint = None
best_results = {
    'median_distance': float('inf'),
    'mean_distance': float('inf')
}

test_gold_data = pd.read_csv('vardial-shared-tasks/SMG2020/ch/test_gold.txt',
                       delimiter='\t', header=None, names=col_names)

for checkpoint_file in os.listdir(checkpoint_dir):
    checkpoint_path = os.path.join(checkpoint_dir, checkpoint_file)
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])

    checkpoint_scaler = checkpoint['scaler']()
    checkpoint_scaler.fit(train_data[['lat', 'lon']].values)

    test_gold_coords = checkpoint_scaler.transform(test_gold_data[['lat', 'lon']].values)
    test_gold_dataset = GeolocationDataset(test_gold_data['text'].tolist(), test_gold_coords)
    test_gold_loader = DataLoader(test_gold_dataset, batch_size=config['train_batch_size'], shuffle=False)

    model.eval()

    with torch.no_grad():
        test_preds = []
        for batch in tqdm(test_gold_loader):
            inputs, labels = batch
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            
            outputs = model(**inputs)
            logits = outputs.logits
            test_preds.append(logits.cpu().numpy())

    test_preds = np.concatenate(test_preds, axis=0)

    results = {
        'median_distance': median_distance(test_gold_coords, test_preds, checkpoint_scaler),
        'mean_distance': mean_distance(test_gold_coords, test_preds, checkpoint_scaler)
    }

    print(f'{checkpoint_file}: {results}\n')

    if results['median_distance'] < best_results['median_distance']:
        best_checkpoint = checkpoint_file
        best_results['median_distance'] = results['median_distance']
        best_results['mean_distance'] = results['mean_distance']

print("\nBest Checkpoint:", best_checkpoint)
print("Best Results:", best_results)

100%|██████████| 97/97 [00:14<00:00,  6.66it/s]


20231107-230346_best_model.pth: {'median_distance': 16.44642298143306, 'mean_distance': 23.11484565124471}


100%|██████████| 97/97 [00:14<00:00,  6.72it/s]

20231108-155200_best_model.pth: {'median_distance': 16.570519292930843, 'mean_distance': 23.857210359269036}
Best Checkpoint: data/ch/checkpoints\20231107-230346_best_model.pth
Best Results: {'median_distance': 16.44642298143306, 'mean_distance': 23.11484565124471}



