# TDT13 Project - Oskar Holm (F2023)

This project is based on the Social Media Geolocation (SMG) shared task from VarDial 2020 and 2021, the Workshop on Natural Language Processing (NLP) for Similar Languages, Varieties, and Dialects. Unlike typical VarDial tasks, which involve choosing from a set of variety labels, this task focuses on predicting the latitude and longitude from which a social media post was made.

The task remained the same in both 2020 and 2021, covering three language areas: Bosnian-Croatian-Montenegrin-Serbian, German (Germany and Austria), and German-speaking Switzerland. This project is limited to the German-speaking Switzerland area due to time constraints and resource availability.

The goal of the project is to replicate the results of a study that used a BERT-based classifier for this double regression task. The dataset from the 2020 VarDial challenge is used because the 2020 edition received more submissions than the 2021 edition.

## Dependencies and Imports

In [2]:
%load_ext autoreload

In [17]:
%autoreload 2

import torch
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import json 
from transformers import AutoModelForSequenceClassification, AdamW
from torch.nn import L1Loss
from tqdm import tqdm

from lib.scalers import scalers
from lib.train_utils import TensorBoardCheckpoint
from lib.geo_utils import mean_distance, median_distance, to_projection, GeolocationDataset

## Load Config and Datasets

In [14]:
# Name of the experiment entry to select from configs.json.
config_name = 'utm_lr2e-5'

# configs.json maps experiment names to their settings; keep only the
# entry for the experiment selected above.
with open('./configs.json', 'r') as f:
    all_configs = json.load(f)
config = all_configs[config_name]

# Display the selected configuration as the cell output.
config

{'train_data': 'SMG2020/ch/train.txt',
 'dev_data': 'SMG2020/ch/dev.txt',
 'scaler': 'joint',
 'model_type': 'bert',
 'model_name': 'statworx/bert-base-german-cased-finetuned-swiss',
 'lossfn': 'MAELoss',
 'save_predictions': True,
 'train_batch_size': 32,
 'max_seq_length': 128,
 'epochs': 50,
 'lr': 2e-05,
 'seed': 42,
 'projection': 'utm',
 'zone_number': 32,
 'zone_letter': 'T'}

In [18]:
# Read the train/dev splits. The files have no header row, so the three
# columns are named explicitly: latitude, longitude and the post text.
train_data = pd.read_table('vardial-shared-tasks/SMG2020/ch/train.txt', header=None, names=['lat', 'lon', 'text'])
dev_data = pd.read_table('vardial-shared-tasks/SMG2020/ch/dev.txt', header=None, names=['lat', 'lon', 'text'])

# Convert the coordinates according to the configured projection.
# to_projection also returns the column names of the converted frame; the
# first two of them are used as the coordinate columns below.
train_data, col_names = to_projection(train_data, config)
dev_data, _ = to_projection(dev_data, config)

# Instantiate the scaler selected by the config.
scaler = scalers[config['scaler']]()

# Fit the scaler on the training coordinates only; the dev split is
# transformed with the same fitted scaler further down.
train_coords = scaler.fit_transform(train_data[col_names[:2]].values)
train_dataset = GeolocationDataset(train_data['text'].tolist(), train_coords, config)
train_loader = DataLoader(train_dataset, batch_size=config['train_batch_size'], shuffle=True)

# Persist the fitted scaler. parents=True creates the intermediate 'data'
# directory as well, so this also works in a fresh checkout where 'data/'
# does not exist yet (mkdir would otherwise raise FileNotFoundError).
Path('data/ch').mkdir(parents=True, exist_ok=True)
with open('data/ch/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Dev data: same scaler, same batch size, but no shuffling.
dev_coords = scaler.transform(dev_data[col_names[:2]].values)
dev_dataset = GeolocationDataset(dev_data['text'].tolist(), dev_coords, config)
dev_loader = DataLoader(dev_dataset, batch_size=config['train_batch_size'], shuffle=False)

train_data

Unnamed: 0,easting,northing,text
0,381131.804856,5.230808e+06,Dr Chester Bennington isch tot 😔😔😔 #rip #linki...
1,439783.213452,5.189910e+06,Mini Fründin hed Lust uf Doktorspieli gha... ....
2,438114.281248,5.248831e+06,Slayer isch besser. Det han ich gescht mini Dr...
3,483390.191173,5.246306e+06,gaht au innere stund? bin grad am speck brate ...
4,427548.501983,5.248952e+06,sie: thy er: ? sie: thy= thank you er: player ...
...,...,...,...
22595,487904.163647,5.238515e+06,"Bin grad in Bus igstige, da seit de Buschauffe..."
22596,365623.151806,5.181110e+06,Rien ne surpassera Dragostea Din Tei de O-zone...
22597,543374.940570,5.199765e+06,het öpert au kei bock meh zum schaffa und lust...
22598,419850.326150,5.237936e+06,Oh wenn wedermol en jodel -5 het wos ned verdi...


## Training and Evaluation

### Load Model

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'Device: {device}')
torch.cuda.empty_cache()

# num_labels=2 gives the model head two output values; the training loop in
# this notebook compares them against the two scaled coordinates.
model = AutoModelForSequenceClassification.from_pretrained(
    config['model_name'],
    num_labels=2,
)
optimizer = AdamW(model.parameters(), lr=config['lr'])

# Last expression of the cell: moves the model to the device and displays it.
model.to(device)

### Training Loop

In [None]:
# Helper that logs metrics and stores checkpoints for this run; the run is
# named after the selected configuration.
tb_checkpoint = TensorBoardCheckpoint(
    log_dir='data/ch/logs',
    checkpoint_path='data/ch/checkpoints',
    run_name=config_name,
)

# L1 loss, i.e. mean absolute error between predictions and targets.
loss_function = L1Loss()

def train(model, train_loader, dev_loader, optimizer, loss_function, scaler, epochs=10):
    """Train ``model`` and evaluate it on the dev set after every epoch.

    Relies on two notebook-level globals: ``device`` (where batches and the
    loss module are moved) and ``tb_checkpoint`` (metric logging and
    checkpoint saving).

    Args:
        model: Model called as ``model(**inputs)`` whose output exposes ``.logits``.
        train_loader: Iterable of ``(inputs, labels)`` batches, where ``inputs``
            is a dict of tensors.
        dev_loader: Iterable of ``(inputs, labels)`` batches used for validation.
        optimizer: Optimizer stepping the model parameters.
        loss_function: Loss module applied to ``(logits, labels)``.
        scaler: Scaler handed to the distance metrics and stored with each checkpoint.
        epochs: Number of passes over ``train_loader``.
    """
    loss_function = loss_function.to(device)

    for epoch in range(epochs):
        # ---- Training pass ----
        model.train()
        running_loss = 0
        for features, targets in tqdm(train_loader):
            optimizer.zero_grad()

            # Move the batch to the target device and run the forward pass.
            features = {name: tensor.to(device) for name, tensor in features.items()}
            targets = targets.to(device)
            predictions = model(**features)

            # Accumulate the scalar loss for the per-epoch average.
            batch_loss = loss_function(predictions.logits, targets)
            running_loss += batch_loss.item()

            # Backpropagate and update the parameters.
            batch_loss.backward()
            optimizer.step()

        avg_train_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs} - Training loss: {avg_train_loss:.4f}")

        # ---- Validation pass (no gradients) ----
        model.eval()
        pred_chunks, label_chunks = [], []
        with torch.no_grad():
            for features, targets in tqdm(dev_loader):
                features = {name: tensor.to(device) for name, tensor in features.items()}
                targets = targets.to(device)
                predictions = model(**features)
                pred_chunks.append(predictions.logits.cpu().numpy())
                label_chunks.append(targets.cpu().numpy())

        # Stack the per-batch arrays into one array per split.
        dev_preds = np.vstack(pred_chunks)
        dev_labels = np.vstack(label_chunks)

        # Distance metrics receive the scaler alongside the scaled values.
        metrics = {
            'Loss/train': avg_train_loss,
            'Median_Distance/dev': median_distance(dev_preds, dev_labels, scaler),
            'Mean_Distance/dev': mean_distance(dev_preds, dev_labels, scaler),
        }
        tb_checkpoint.log_metrics(metrics, epoch)
        tb_checkpoint.save_checkpoint(model, optimizer, epoch, metrics, scaler)

# Reload the scaler that the data-preparation cell pickled, so the training
# run uses the same fitted scaler that was written to disk.
with open('data/ch/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

try:
    train(model, train_loader, dev_loader, optimizer, loss_function, scaler, epochs=config['epochs'])
finally:
    # Close the logger/checkpoint helper even when training fails or is
    # interrupted (e.g. a manual kernel interrupt during a long run), so the
    # close() call is never skipped.
    tb_checkpoint.close()

### Model Evaluation

In [None]:
def evaluate_geolocation_model(checkpoint_dir, checkpoint_file, train_data, test_gold_data, config, model, device):
    """Evaluate a run's best checkpoint on a gold-labelled test set.

    Loads ``{checkpoint_dir}/{checkpoint_file}_best_model.pth`` into ``model``,
    re-fits the checkpointed scaler on the training coordinates, predicts the
    test set and reports the median and mean distance.

    Args:
        checkpoint_dir: Directory containing the checkpoint files.
        checkpoint_file: Checkpoint name without the ``_best_model.pth`` suffix.
        train_data: DataFrame with ``easting`` and ``northing`` columns, used
            to fit the scaler.
        test_gold_data: DataFrame with ``easting``, ``northing`` and ``text``
            columns to evaluate on.
        config: Experiment configuration; ``train_batch_size`` is read here and
            the whole dict is forwarded to the dataset and the distance metrics.
        model: Model whose weights are overwritten with the checkpoint's.
        device: Device the checkpoint tensors and the input batches are mapped to.
            The model itself is not moved here; the caller must place it.

    Returns:
        Dict with the keys ``'median_distance'`` and ``'mean_distance'``
        (the same values that are printed).
    """
    checkpoint_path = f'{checkpoint_dir}/{checkpoint_file}_best_model.pth'
    # map_location lets a checkpoint written on a GPU machine be restored on a
    # CPU-only one (and vice versa) instead of failing during deserialisation.
    # NOTE(review): the checkpoint also carries a pickled scaler object, so
    # only load checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

    print(checkpoint['metrics'])

    checkpoint_scaler = checkpoint['scaler']
    # NOTE(review): the column names are hard-coded to the UTM output
    # ('easting', 'northing') — confirm before using another projection.
    checkpoint_scaler.fit(train_data[['easting', 'northing']].values)

    test_gold_coords = checkpoint_scaler.transform(test_gold_data[['easting', 'northing']].values)
    # Pass config like the train/dev datasets in this notebook do, so the test
    # texts are prepared with the same settings.
    test_gold_dataset = GeolocationDataset(test_gold_data['text'].tolist(), test_gold_coords, config)
    test_gold_loader = DataLoader(test_gold_dataset, batch_size=config['train_batch_size'], shuffle=False)

    model.eval()

    test_preds = []
    with torch.no_grad():
        # The gold labels in each batch are not needed for prediction; the
        # metrics below use test_gold_coords directly.
        for inputs, _ in tqdm(test_gold_loader):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = model(**inputs)
            test_preds.append(outputs.logits.cpu().numpy())

    test_preds = np.concatenate(test_preds, axis=0)

    results = {
        'median_distance': median_distance(test_gold_coords, test_preds, checkpoint_scaler, config),
        'mean_distance': mean_distance(test_gold_coords, test_preds, checkpoint_scaler, config)
    }

    print(f'{checkpoint_file}: {results}\n')

    # Returned as well as printed so callers can compare checkpoints.
    return results

In [None]:
# checkpoint_dir = 'data/ch/checkpoints'

# for checkpoint_file in os.listdir(checkpoint_dir):
#     checkpoint_path = os.path.join(checkpoint_dir, checkpoint_file)
#     checkpoint = torch.load(checkpoint_path)
#     model.load_state_dict(checkpoint['model_state_dict'])

#     checkpoint_scaler = checkpoint['scaler']()
#     checkpoint_scaler.fit(train_data[['lat', 'lon']].values)

#     test_gold_coords = checkpoint_scaler.transform(test_gold_data[['lat', 'lon']].values)
#     test_gold_dataset = GeolocationDataset(test_gold_data['text'].tolist(), test_gold_coords)
#     test_gold_loader = DataLoader(test_gold_dataset, batch_size=config['train_batch_size'], shuffle=False)

#     model.eval()

#     with torch.no_grad():
#         test_preds = []
#         for batch in tqdm(test_gold_loader):
#             inputs, labels = batch
#             inputs = {k: v.to(device) for k, v in inputs.items()}
#             labels = labels.to(device)
            
#             outputs = model(**inputs)
#             logits = outputs.logits
#             test_preds.append(logits.cpu().numpy())

#     test_preds = np.concatenate(test_preds, axis=0)

#     results = {
#         'median_distance': median_distance(test_gold_coords, test_preds, checkpoint_scaler),
#         'mean_distance': mean_distance(test_gold_coords, test_preds, checkpoint_scaler)
#     }

#     print(f'{checkpoint_file}: {results}\n')

#     if results['median_distance'] < best_results['median_distance']:
#         best_checkpoint = checkpoint_file
#         best_results['median_distance'] = results['median_distance']
#         best_results['mean_distance'] = results['mean_distance']

# print("\nBest Checkpoint:", best_checkpoint)
# print("Best Results:", best_results)