# Location detection

The task is to detect location names in Ukrainian and Russian texts.

In [None]:
!pip install datasets transformers evaluate seqeval pycld2 peft

In [None]:
!pip install accelerate -U

In [None]:
%matplotlib inline

import os
import re
import spacy
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from torch import nn
import string
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import TrainingArguments, get_linear_schedule_with_warmup
from transformers import Trainer, pipeline
from peft import get_peft_model, LoraConfig, TaskType, PeftConfig, PeftModel
import gc
import pycld2
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
!python -m spacy download uk_core_news_sm

In [None]:
# Use the GPU whenever torch can see one, otherwise stay on the CPU
global_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Root folder of every input/output file used in this notebook
DATA_DIR = "../data/location_detection/"
# Presumably switches off Weights & Biases reporting during training
os.environ.update(WANDB_DISABLED='true')

## Load data

In [None]:
# Load the Ukrainian and Russian datasets; the 'loc_markers' column is stored as
# text and parsed back into Python objects on load.
# NOTE(review): `eval` executes whatever expression the CSV cell contains. If the
# column only holds plain literals, `ast.literal_eval` would be the safe parser
# - confirm the column format before switching.
uk_dataset = pd.read_csv(DATA_DIR + "uk_geo_dataset.csv", converters={'loc_markers': eval})
ru_dataset = pd.read_csv(DATA_DIR + "ru_geo_dataset.csv", converters={'loc_markers': eval})

In [None]:
# Preview the first rows of the Ukrainian dataset
uk_dataset.head()

In [None]:
# Preview the first rows of the Russian dataset
ru_dataset.head()

## Evaluation

Using the metric function from the Kaggle competition:

In [None]:
def process_text(text):
    """Normalise a location string before it is compared in ``metric``.

    Removes ASCII punctuation and digits, drops single-character words,
    collapses every whitespace run to one space and lower-cases the result.

    Args:
        text: the raw location string.

    Returns:
        The normalised string.
    """
    # Must be an f-string: as a plain string the character class consisted of
    # the letters of the template text, so letters were deleted and the
    # punctuation was left in place.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'\d', '', text)
    # A single-character word plus one following whitespace character. The
    # previous pattern ended in a stray ']' and therefore never matched.
    text = re.sub(r'\b\w\b\s?', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower()

def metric(y_true, y_pred):
    """Score predicted location sets against the reference sets.

    Both arguments are equally long sequences holding one collection of
    location strings per sample. Every string is normalised with
    ``process_text`` and each sample is compared as a set, so duplicates and
    ordering inside a sample do not matter. The counts are pooled over all
    samples before the scores are derived.

    Returns:
        A dict with the keys 'precision', 'recall' and 'f1'.
    """
    assert len(y_true) == len(y_pred)

    hits = spurious = missed = expected = 0.0
    for gold_raw, guess_raw in zip(y_true, y_pred):
        gold = {process_text(s) for s in gold_raw}
        guess = {process_text(s) for s in guess_raw}

        hits += len(gold & guess)
        spurious += len(guess - gold)
        missed += len(gold - guess)
        expected += len(gold)

    # When a ratio has a zero denominator, score 1.0 if there was nothing to
    # find at all and 0.0 otherwise.
    undefined_score = 1.0 if expected == 0 else 0.0

    if hits + spurious != 0:
        precision = hits / (hits + spurious)
    else:
        precision = undefined_score

    if hits + missed != 0:
        recall = hits / (hits + missed)
    else:
        recall = undefined_score

    f1 = 0.0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)

    return {'precision': precision, 'recall': recall, 'f1': f1}

## Data preparation

Locations are marked with two tags: the beginning of a location (B-LOC) and the inside of a location (I-LOC).

This is necessary because, with a single LOC tag, two adjacent locations and one location made of two words would look the same.

In [None]:
# Only token texts and offsets are read from this pipeline, so the heavier
# components are disabled ('textcat' was previously misspelled as 'texcat')
nlp = spacy.load('uk_core_news_sm', disable=['tagger', 'parser', 'ner', 'textcat'])

def tokenize(texts, all_texts_loc_markers, tokenizer, batch_size=128, n_process=-1):
    """Split texts into tokens and tag every token with a location label.

    Args:
        texts: the raw strings to tokenize.
        all_texts_loc_markers: for each text, an iterable of (start, end)
            character offsets of the locations in that text.
        tokenizer: object whose ``pipe(texts, batch_size=..., n_process=...)``
            yields one token sequence per text; every token exposes ``text``
            and ``idx`` (its start offset in the text).
        batch_size: forwarded to ``tokenizer.pipe``.
        n_process: forwarded to ``tokenizer.pipe``.

    Returns:
        ``(tokens, labels)``: two parallel lists with one list of token
        strings and one list of 'O' / 'B-LOC' / 'I-LOC' tags per text.
    """
    docs = list(tokenizer.pipe(texts, batch_size=batch_size, n_process=n_process))

    result_tokens, result_labels = [], []
    for doc, spans in zip(docs, all_texts_loc_markers):
        words = []
        tags = []
        for tok in doc:
            tok_start = tok.idx
            tok_end = tok_start + len(tok.text)
            tag = 'O'
            for span_start, span_end in spans:
                # Only tokens lying completely inside the marker are tagged
                if tok_start < span_start or tok_end > span_end:
                    continue
                # A token starting exactly at the marker start opens a new
                # location name; any later token continues it
                tag = 'B-LOC' if tok_start == span_start else 'I-LOC'
            words.append(tok.text)
            tags.append(tag)
        result_tokens.append(words)
        result_labels.append(tags)

    return result_tokens, result_labels

In [None]:
def process_dataset(dataset, result_path, add_col=None, n_splits=100):
    """Tokenize and label a dataset chunk by chunk and store it as parquet.

    Each chunk is passed through ``tokenize`` with the module-level ``nlp``
    pipeline and appended to ``result_path`` with fastparquet. Once all chunks
    are written, the file is read back and re-saved with pandas' default
    parquet engine.

    Args:
        dataset: DataFrame with a 'text' and a 'loc_markers' column.
        result_path: destination parquet file; an existing file is replaced.
        add_col: optional name of one extra column to carry over unchanged.
        n_splits: upper bound on the number of chunks.
    """
    # Only a missing file is expected here. Any other failure to delete must
    # surface, otherwise the new rows would be appended to the stale file.
    try:
        os.remove(result_path)
    except FileNotFoundError:
        pass

    # Never create more chunks than there are rows (this avoids empty chunks),
    # but keep one chunk so that a file is always written.
    n_chunks = max(1, min(n_splits, len(dataset)))
    # Split row positions instead of the frame itself: np.array_split on a
    # DataFrame relies on the deprecated DataFrame.swapaxes.
    chunk_positions = np.array_split(np.arange(len(dataset)), n_chunks)

    for positions in tqdm(chunk_positions, total=n_chunks, bar_format='{l_bar}{bar:100}{r_bar}'):
        split = dataset.iloc[positions]
        tokens, labels = tokenize(split['text'].to_list(), split['loc_markers'].to_list(), nlp)
        columns = {'tokens': tokens, 'labels': labels}
        if add_col is not None:
            columns[add_col] = split[add_col].to_list()
        result_df = pd.DataFrame(columns)
        # The first chunk creates the file, later chunks are appended to it
        result_df.to_parquet(result_path, engine='fastparquet', append=os.path.isfile(result_path))

    # Save fastparquet as pyarrow
    result_df = pd.read_parquet(result_path, engine='fastparquet')
    result_df.to_parquet(result_path)

## Adversarial validation

In [None]:
# Competition test set, used here as the "test" class for adversarial validation.
# NOTE(review): `eval` executes whatever expression the CSV cell contains;
# `ast.literal_eval` would be the safe parser if 'locations' only holds plain
# literals - confirm the column format before switching.
av_test_dataset = pd.read_csv(DATA_DIR + 'competition/test.csv', converters = {'locations': eval})

In [None]:
# Draw as many Ukrainian training rows as there are test rows, label the origin
# of every row (0 = training data, 1 = competition test) and stack both parts
av_uk_dataset = uk_dataset.sample(n=len(av_test_dataset)).assign(is_test=0)
av_test_dataset['is_test'] = 1
av_dataset = pd.concat(objs=[av_uk_dataset, av_test_dataset])

In [None]:
# Fit the TF-IDF vocabulary on the texts of the combined train/test sample
av_vectorizer = TfidfVectorizer().fit(av_dataset['text'])

In [None]:
# Train a classifier that tries to tell competition-test texts from training
# texts; the closer its ROC AUC is to 0.5, the more alike the two sources are
X_train, X_test, y_train, y_test = train_test_split(
    av_vectorizer.transform(av_dataset['text']),
    av_dataset['is_test']
)
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
av_pred = gbc.predict(X_test)
# ROC AUC ranks samples by score, so it is computed from the predicted
# probability of the positive class (is_test == 1), not from the thresholded
# 0/1 predictions
roc_auc_score(y_test, gbc.predict_proba(X_test)[:, 1])

In [None]:
# Let the adversarial classifier label every Ukrainian row as train-like (0)
# or test-like (1)
uk_dataset['is_pred_test'] = gbc.predict(av_vectorizer.transform(uk_dataset['text']))

In [None]:
# Keep only the rows the classifier takes for competition-test data
uk_dataset = uk_dataset[uk_dataset['is_pred_test'] == 1]
len(uk_dataset)

## Save datasets

In [None]:
# Split the Ukrainian data on its 'is_valid' flag, then tokenize and store a
# 1% random sample of each part
uk_train_dataset = uk_dataset.loc[uk_dataset['is_valid'] == False].reset_index(drop=True)
uk_holdout_dataset = uk_dataset.loc[uk_dataset['is_valid'] == True].reset_index(drop=True)
process_dataset(
    uk_train_dataset.sample(frac=0.01),
    DATA_DIR + 'uk_geo_dataset_processed_train_av.parquet',
    add_col='location_count',
    n_splits=10,
)
process_dataset(
    uk_holdout_dataset.sample(frac=0.01),
    DATA_DIR + 'uk_geo_dataset_processed_holdout_av.parquet',
    n_splits=10,
)

In [None]:
# Split the Russian data on 'doc_id' (ids up to 700000 are used for training),
# then tokenize and store a 0.1% random sample of each part
ru_train_dataset = ru_dataset.loc[ru_dataset['doc_id'] <= 700000].reset_index(drop=True)
ru_holdout_dataset = ru_dataset.loc[ru_dataset['doc_id'] > 700000].reset_index(drop=True)
process_dataset(
    ru_train_dataset.sample(frac=0.001),
    DATA_DIR + 'ru_geo_dataset_processed_train.parquet',
    add_col='doc_id',
)
process_dataset(
    ru_holdout_dataset.sample(frac=0.001),
    DATA_DIR + 'ru_geo_dataset_processed_holdout.parquet',
)

In [None]:
# Load the processed Ukrainian train and holdout files, in that order
uk_train_processed_dataset, uk_holdout_processed_dataset = (
    load_dataset('parquet', data_files=DATA_DIR + file_name, split='train')
    for file_name in (
        'uk_geo_dataset_processed_train_av.parquet',
        'uk_geo_dataset_processed_holdout_av.parquet',
    )
)

In [None]:
# Load the processed Russian train and holdout files, in that order
ru_train_processed_dataset, ru_holdout_processed_dataset = (
    load_dataset('parquet', data_files=DATA_DIR + file_name, split='train')
    for file_name in (
        'ru_geo_dataset_processed_train.parquet',
        'ru_geo_dataset_processed_holdout.parquet',
    )
)

## Models

In [None]:
# Checkpoint name shared by the tokenizer and both models
model_name = 'xlm-roberta-base'

# Tag set: 'S' is given to special tokens, 'O' to tokens outside any location,
# 'B-LOC' / 'I-LOC' to the first / following tokens of a location
labels = ['S', 'O', 'B-LOC', 'I-LOC']
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))

# Sub-word tokenizer matching `model_name`; used for training and inference
bert_tokenizer = AutoTokenizer.from_pretrained(model_name)

## Aligning labels with BERT tokens

In [None]:
def align_labels_with_word_ids(labels, word_ids):
    """Expand word-level tags to one label id per sub-word token.

    Args:
        labels: the tag of every word ('O', 'B-LOC' or 'I-LOC').
        word_ids: for every sub-word token, the index of the word it belongs
            to, or None for a special token.

    Returns:
        A list of label ids (looked up in ``label2id``), one per entry of
        ``word_ids``.
    """
    aligned = []
    previous_word = None

    for word_id in word_ids:
        if word_id is None:
            # special tokens get their own tag
            tag = "S"
        else:
            tag = labels[word_id]
            # Only the first piece of a word may open a location; later
            # pieces of the same word continue it
            if word_id == previous_word and tag == "B-LOC":
                tag = "I-LOC"
        previous_word = word_id
        aligned.append(label2id[tag])

    return aligned

def align_labels(examples):
    """Tokenize a batch of pre-split texts and attach sub-word level labels.

    Args:
        examples: batch with a 'tokens' entry (one word list per text) and a
            'labels' entry (one tag list per text).

    Returns:
        The tokenizer output with an added 'labels' entry that holds one list
        of label ids per text.
    """
    encoded = bert_tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
    encoded['labels'] = [
        align_labels_with_word_ids(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples['labels'])
    ]
    return encoded

In [None]:
# Sub-word tokenize both Ukrainian splits and align their labels, batch-wise
uk_train_processed_dataset = uk_train_processed_dataset.map(function=align_labels, batched=True)
uk_holdout_processed_dataset = uk_holdout_processed_dataset.map(function=align_labels, batched=True)

In [None]:
# Sub-word tokenize both Russian splits and align their labels, batch-wise
ru_train_processed_dataset = ru_train_processed_dataset.map(function=align_labels, batched=True)
ru_holdout_processed_dataset = ru_holdout_processed_dataset.map(function=align_labels, batched=True)

## Training arguments

In [None]:
import evaluate

# Kept under its own name: binding the scorer to `metric` would overwrite the
# competition `metric(y_true, y_pred)` function defined earlier in the notebook.
seqeval_metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    """Compute entity-level precision, recall and F1 for one evaluation pass.

    Args:
        eval_preds: pair ``(logits, labels)``; the class axis of ``logits``
            is the last one and ``labels`` holds the reference label ids.

    Returns:
        A dict with the keys 'precision', 'recall' and 'f1'.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Positions that must not be scored: -100 (presumably the label padding
    # value added by the data collator) and the special-token tag 'S'. seqeval
    # reads a bare 'S' tag as a single-token entity, so scoring those positions
    # would count every special token as a found entity and inflate the
    # metrics.
    ignored_ids = {-100, label2id['S']}

    true_labels = []
    prediction_label = []
    for prediction, label in zip(predictions, labels):
        scored = [(p, l) for p, l in zip(prediction, label) if l not in ignored_ids]
        true_labels.append([id2label[l] for _, l in scored])
        prediction_label.append([id2label[p] for p, _ in scored])

    all_metrics = seqeval_metric.compute(predictions=prediction_label, references=true_labels)
    return {
        'precision': all_metrics['overall_precision'],
        'recall': all_metrics['overall_recall'],
        'f1': all_metrics['overall_f1']
    }

In [None]:
def get_model():
    """Build a fresh token-classification model wrapped with LoRA adapters.

    The base model is loaded from ``model_name`` with the notebook's label
    maps and then passed to ``get_peft_model`` together with the LoRA
    configuration.

    Returns:
        The object returned by ``get_peft_model``.
    """
    base_model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        label2id=label2id,
        id2label=id2label,
    )
    return get_peft_model(
        base_model,
        LoraConfig(
            task_type=TaskType.TOKEN_CLS,
            inference_mode=False,
            r=64,
            lora_alpha=32,
            lora_dropout=0.1,
        ),
    )

# Batch collator handed to the Trainer in `train`, built from the shared tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer=bert_tokenizer)

In [None]:
def get_opt_sched(model, dataset_len, batch_size):
    """Create the optimizer and learning-rate scheduler for a training run.

    Args:
        model: module whose parameters are optimized.
        dataset_len: currently unused.
        batch_size: currently unused.

    Returns:
        ``(optimizer, scheduler)``: an Adam optimizer with lr=1e-4 and a
        cosine-annealing scheduler with warm restarts (T_0=1000).
    """
    adam = torch.optim.Adam(params=model.parameters(), lr=1e-4)
    warm_restarts = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer=adam, T_0=1000)
    return adam, warm_restarts

## Training

Training models for best_epoch_number on all train dataset:

In [None]:
# Per-device batch size for both training and evaluation
batch_size = 16


def get_train_args(name):
    """Build the training configuration for one model.

    Evaluation, logging and checkpointing all happen every 100 steps and the
    model is trained for a single epoch.

    Args:
        name: output directory for checkpoints (existing content may be
            overwritten).

    Returns:
        The configured ``TrainingArguments``.
    """
    return TrainingArguments(
        name,
        overwrite_output_dir=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluation_strategy='steps',
        save_strategy='steps',
        logging_strategy='steps',
        eval_steps=100,
        logging_steps=100,
        save_steps=100,
        num_train_epochs=1,
        # Mixed precision needs a CUDA device; requesting it unconditionally
        # makes the arguments fail to construct on a CPU-only machine
        fp16=torch.cuda.is_available(),
        dataloader_pin_memory=False
    )

def train(model, train_dataset, test_dataset, args):
    """Train ``model`` with the Hugging Face ``Trainer``.

    Args:
        model: the model to train.
        train_dataset: dataset used for the training steps.
        test_dataset: dataset used for the periodic evaluation.
        args: the ``TrainingArguments`` of the run.

    The optimizer/scheduler pair comes from ``get_opt_sched``. Nothing is
    returned.
    """
    trainer_kwargs = dict(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=data_collator,
        tokenizer=bert_tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer_kwargs['optimizers'] = get_opt_sched(model, len(train_dataset), batch_size)
    Trainer(**trainer_kwargs).train()

In [None]:
# Train the Ukrainian model; checkpoints are written under models/uk-loc
uk_model = get_model()
uk_train_args = get_train_args(DATA_DIR + 'models/uk-loc')
train(uk_model, uk_train_processed_dataset, uk_holdout_processed_dataset, uk_train_args)

In [None]:
# Drop the references and release memory before the next model is trained
del uk_model, uk_train_processed_dataset
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Train the Russian model; checkpoints are written under models/ru-loc
ru_model = get_model()
ru_train_args = get_train_args(DATA_DIR + 'models/ru-loc')
train(ru_model, ru_train_processed_dataset, ru_holdout_processed_dataset, ru_train_args)

In [None]:
# Drop the reference and release memory before inference
del ru_model
gc.collect()
torch.cuda.empty_cache()

## Kaggle submission

In [None]:
# Checkpoints used for the submission (hand-picked step numbers)
uk_checkpoint = DATA_DIR + 'models/uk-loc/checkpoint-2000'
ru_checkpoint = DATA_DIR + 'models/ru-loc/checkpoint-1600'


def _load_merged_model(checkpoint):
    """Load the base model, apply the LoRA checkpoint and merge the adapters."""
    base_model = AutoModelForTokenClassification.from_pretrained(model_name, label2id=label2id, id2label=id2label)
    return PeftModel.from_pretrained(base_model, checkpoint).merge_and_unload()


def _build_classifier(model):
    """Wrap a merged model into a token-classification pipeline."""
    return pipeline(
        'token-classification', model=model, tokenizer=bert_tokenizer, aggregation_strategy='simple'
    )


uk_inference = _load_merged_model(uk_checkpoint)
ru_inference = _load_merged_model(ru_checkpoint)

uk_classifier = _build_classifier(uk_inference)
ru_classifier = _build_classifier(ru_inference)

In [None]:
def filter_text(text):
    """Clean a raw text before it is passed to the classifiers.

    Removes every character inside the listed code-point ranges (emoji and
    other symbols), removes http/https URLs, turns newlines into spaces and
    collapses runs of spaces into one.

    NOTE(review): the U+24C2..U+1F251 range is very wide and contains the
    other listed ranges plus many non-emoji characters - confirm this is
    intended.
    """
    symbol_runs = re.compile('['
        '\U0001F600-\U0001F64F'
        '\U0001F300-\U0001F5FF'
        '\U0001F680-\U0001F6FF'
        '\U00010000-\U00010FFF'
        '\U000024C2-\U0001F251'
        '\u2600-\u2B55'
    ']+')
    without_symbols = symbol_runs.sub('', text)
    without_urls = re.sub(r'https?://\S+', '', without_symbols)
    single_line = re.sub(r'\n', ' ', without_urls)
    return re.sub(r' +', ' ', single_line)

In [None]:
# Competition test set that the submission is produced for.
# NOTE(review): `eval` executes whatever expression the CSV cell contains;
# `ast.literal_eval` would be the safe parser if 'locations' only holds plain
# literals - confirm the column format before switching.
competition_test = pd.read_csv(DATA_DIR + 'competition/test.csv', converters = {'locations': eval})

In [None]:
# Preview the first rows of the test set
competition_test.head()

In [None]:
# Raw texts of the first rows
competition_test['text'].head().to_list()

In [None]:
# Store the cleaned texts next to the raw ones and preview the result
competition_test['filtered_text'] = competition_test['text'].apply(filter_text)
competition_test['filtered_text'].head().to_list()

In [None]:
def get_language_code(text):
    """Return 'ru' when pycld2 detects Russian, and 'uk' in every other case.

    Only the language code of the first entry in pycld2's detail list is
    inspected (presumably the most likely language - see the pycld2 docs).
    """
    details = pycld2.detect(text)[2]
    detected_code = details[0][1]
    if detected_code == 'ru':
        return 'ru'
    return 'uk'

# The language is detected on the raw 'text' column, not on 'filtered_text'
competition_test['language'] = competition_test['text'].apply(get_language_code)
# Show which language codes occur
set(competition_test['language'].to_list())

In [None]:
# Run each language-specific classifier over its share of the test set and
# store the predicted location strings per row.
for lang_code, lang_classifier in (('uk', uk_classifier), ('ru', ru_classifier)):
    lang_mask = competition_test['language'] == lang_code
    if not lang_mask.any():
        # No text in this language: nothing to send to the classifier
        continue

    lang_texts = competition_test.loc[lang_mask, 'filtered_text'].to_list()
    lang_locations = [
        # Keep location spans only; the model can also emit the special-token
        # tag 'S', which must not end up in the submission
        [span['word'] for span in spans if span['entity_group'] == 'LOC']
        for spans in lang_classifier(lang_texts)
    ]
    # An index-aligned object Series keeps every (possibly ragged) list in a
    # single cell instead of letting pandas treat the nested list as 2-D data
    competition_test.loc[lang_mask, 'locations'] = pd.Series(
        lang_locations, index=competition_test.index[lang_mask], dtype=object
    )

In [None]:
# Collects every location string rejected by post_process, for inspection
droped = []

def post_process(locations):
    """Filter the predicted locations of one text with simple heuristics.

    A location is kept when it is one of the abbreviations 'РФ', 'США', 'РБ',
    or when it is longer than three characters and contains at least one
    upper-case character. Everything else is appended to the module-level
    ``droped`` list.

    Returns:
        The kept locations, in their original order.
    """
    kept = []
    for loc in locations:
        looks_like_name = len(loc) > 3 and any(ch.isupper() for ch in loc)
        if looks_like_name or loc == 'РФ' or loc == 'США' or loc == 'РБ':
            kept.append(loc)
    droped.extend(loc for loc in locations if loc not in kept)
    return kept

# Apply the heuristic filter to the predicted locations of every row
competition_test['locations'] = competition_test['locations'].apply(post_process)

In [None]:
# Inspect everything the filter rejected
droped

In [None]:
# Spot-check ten random (text, locations) pairs
competition_test[['text', 'locations']].sample(10).to_numpy()

In [None]:
# Pick the row with the most predicted locations: argsort orders the rows by
# their location count and the last position is the largest one
example_with_many_locations = competition_test.iloc[
    np.argsort(competition_test['locations'].apply(len)).iloc[-1]]

print(f"""
        Text: {example_with_many_locations['text']},
        Pred locations: {example_with_many_locations['locations']}
""")

In [None]:
# Write the submission file (text id plus predicted locations).
# NOTE(review): the output path is hard-coded to the Kaggle working directory
# and does not use DATA_DIR.
competition_test[['text_id', 'locations']].to_csv('/kaggle/working/roberta_base_lora_av.csv', index=False)