# README

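This notebook pseudo-labels the additional (external) data. Before running it, run the external and competition data preparation notebooks; the code below also expects the precomputed corpus embeddings, the cross-validation fold files, and the fine-tuned fold models to exist under `BASE_PATH`.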

After this notebook, run the external relabeling notebook to label this data again.


# Setup

In [None]:
# Install the third-party packages imported by this notebook.
# No versions are pinned, so pip resolves whatever is current at run time
# (the recorded output below shows transformers 4.9.1 being downloaded).
!pip install torch
!pip install transformers
!pip install numpy
!pip install pandas
!pip install sentence-transformers


Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 13.0 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 71.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 58.5 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 67.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully 

In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch import nn
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import random

In [None]:
# Mount Google Drive at the relative path 'gdrive'; BASE_PATH is built on top
# of this mount point, so this cell has to run before any file is read.
from google.colab import drive
drive.mount('gdrive')

Mounted at gdrive


# Constants

In [None]:
# Project root inside the mounted Drive; embeddings, data and models are all
# resolved relative to it.
BASE_PATH = 'gdrive/MyDrive/Lit'
# Directory holding the per-fold checkpoints ('model_fold_<k>/best') that are
# used to predict pseudo-labels.
PSEUDO_LABEL_MODEL_PATH = os.path.join(BASE_PATH, 'models/roberta-base')

In [None]:
def seed_everything(seed=1234):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), and configures cuDNN for reproducible kernel selection.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the already-running
    # interpreter is fixed at startup and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU rather than only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

# Seed used for this run; it overrides seed_everything's default of 1234.
SEED = 28
seed_everything(seed=SEED)
# Maximum number of tokens per passage handed to the tokenizer in
# predict_fast; longer inputs are truncated there.
MAX_LENGTH = 256

# Functions

In [None]:
def search_similar_passages(queries, search_selection, top_k, model_name='paraphrase-TinyBERT-L6-v2', cuts=None):
  """Look up the corpus passages most similar to each query.

  For every corpus named in `search_selection`, the stored embeddings
  ('embeddings/encoded-<corpus>-<model_name>.pt') and the passage texts
  ('data/preprocessed/<corpus>.csv', column `text`) are loaded from below
  BASE_PATH and appended to one shared search bank. The queries are encoded
  with the same sentence-transformer and matched against that bank.

  Args:
    queries: Texts to search for.
    search_selection: Names of the corpora to search in.
    top_k: Number of hits requested per query.
    model_name: Sentence-transformer used to encode the queries; it is also
      part of the embedding file name.
    cuts: Optional (start, stop) pair; when given, only that slice of each
      query's hit list is kept.

  Returns:
    One list of passage texts per query, in the order returned by the search.

  Raises:
    FileNotFoundError: If an embedding file or passage file is missing.
  """
  encoder = SentenceTransformer(model_name)
  corpus_embeddings, sentences = [], []
  for dataset in search_selection:
    embedding_file = os.path.join(BASE_PATH, 'embeddings/encoded-' + dataset + '-' + model_name + '.pt')
    if not os.path.isfile(embedding_file):
      raise FileNotFoundError(f'{dataset} embeddings could not be found.')
    corpus_embeddings.extend(torch.load(embedding_file))

    passage_file = os.path.join(BASE_PATH, 'data/preprocessed/' + dataset + '.csv')
    if not os.path.isfile(passage_file):
      raise FileNotFoundError(f'{dataset} passages could not be found.')
    sentences.extend(pd.read_csv(passage_file).text.values)

    # Embeddings and texts are matched by position, so they must stay aligned.
    assert len(corpus_embeddings) == len(sentences)

  print(f'Starting to search within {len(sentences)} text fragments...')

  query_embeddings = encoder.encode(queries, convert_to_tensor=True)
  ranked = util.semantic_search(query_embeddings, corpus_embeddings, top_k=top_k, corpus_chunk_size=80000)

  def _texts_for(query_hits):
    # Translate hit records back into passage texts via their corpus index.
    texts = [sentences[match['corpus_id']] for match in query_hits]
    return texts[cuts[0]:cuts[1]] if cuts else texts

  return [_texts_for(query_hits) for query_hits in ranked]


In [None]:
def zip_hits_scores(hits, scores, stdev):
  """Flatten per-query hit lists, tagging each hit with its query's values.

  Args:
    hits: One list of retrieved passages per query.
    scores: Target score of each query, indexed like `hits`.
    stdev: Standard error of each query, indexed like `hits`.

  Returns:
    A flat list of (passage, query score, query standard error) tuples in
    query order, then hit order.
  """
  return [
      (passage, scores[query_idx], stdev[query_idx])
      for query_idx, query_hits in enumerate(hits)
      for passage in query_hits
  ]


In [None]:
def filter_on_stdev(sentences, predictions, scores, stdev):
  """Keep the predictions that lie close enough to their reference score.

  A prediction is kept when its absolute distance to the reference score at
  the same position is strictly smaller than the standard error at that
  position.

  Args:
    sentences: Passages, aligned with `predictions`.
    predictions: Predicted score per passage.
    scores: Reference score per passage.
    stdev: Allowed deviation per passage.

  Returns:
    A tuple (kept sentences, kept predictions), both in input order.
  """
  kept = [
      pos for pos, predicted in enumerate(predictions)
      if abs(predicted - scores[pos]) < stdev[pos]
  ]
  sents_filtered = [sentences[pos] for pos in kept]
  pred_filtered = [predictions[pos] for pos in kept]
  return sents_filtered, pred_filtered

In [None]:
def generate_augmented_data(fold_dir, model_dir, out_dir, n_samples=5, kfolds=(0, 1, 2, 3, 4, 5)):
  """Pseudo-label external passages that resemble the training excerpts.

  For each fold, the training split is read, the `n_samples` most similar
  passages per excerpt are retrieved from the external corpora, and the
  fold's model predicts a score for every retrieved passage. A passage is
  kept only if its prediction differs from the target of the excerpt it was
  retrieved for by less than that excerpt's standard error.

  The kept passages of all processed folds are written to
  '<out_dir>/predicted.csv' with the columns `excerpt` and `target`. The file
  is rewritten after every fold with the cumulative result, so earlier folds
  are not overwritten and partial results survive an interrupted run.

  Args:
    fold_dir: Directory containing 'train_fold_<k>.csv' files with the
      columns `excerpt`, `target` and `standard_error`.
    model_dir: Directory containing 'model_fold_<k>/best' checkpoints.
    out_dir: Directory that receives 'predicted.csv'; created if missing.
    n_samples: Number of similar passages retrieved per training excerpt.
    kfolds: Fold indices to process.
  """
  corpora = ['simplewiki', 'cb_corpus', 'wiki_snippets', 'onestop', 'asb', 'kaggle_scraped', 'bookcorpus_02']
  out_path = os.path.join(out_dir, 'predicted.csv')
  fold_frames = []
  for fold in kfolds:
    torch.cuda.empty_cache()
    train_fold = pd.read_csv(os.path.join(fold_dir, 'train_fold_' + str(fold) + '.csv'))
    queries = [str(t) for t in train_fold.excerpt.values]
    scores = [float(t) for t in train_fold.target.values]
    stdev = [float(t) for t in train_fold.standard_error.values]
    hits = search_similar_passages(queries, corpora, n_samples)
    # Every retrieved passage inherits the target and standard error of the
    # excerpt it was retrieved for.
    zipped = zip_hits_scores(hits, scores, stdev)
    sentences = [t[0] for t in zipped]
    scores = [t[1] for t in zipped]
    stdev = [t[2] for t in zipped]
    torch.cuda.empty_cache()
    predictions = predict_fast(model_dir + '/model_fold_' + str(fold) + '/best', sentences)
    print(len(predictions))

    sents_filtered, preds_filtered = filter_on_stdev(sentences, predictions, scores, stdev)
    fold_frames.append(pd.DataFrame.from_dict({'excerpt': sents_filtered, 'target': preds_filtered}))

    # Persist the cumulative result; with a single fold this is the same file
    # content as writing that fold's frame directly.
    os.makedirs(out_dir, exist_ok=True)
    pd.concat(fold_frames, ignore_index=True).to_csv(out_path)

In [None]:
def predict_fast(model_name=None, data=None, init_model=None, tokenizer=None, num_labels=1, is_multilabel=False, output_logits=False, use_softmax=False):
  """Run a sequence-classification model over `data` in batches of 32.

  Args:
    model_name: Checkpoint name or path. When given, tokenizer, config and
      model are loaded from it and `init_model` / `tokenizer` are ignored.
    data: Sliceable collection of input texts.
    init_model: Already-instantiated model, used when `model_name` is not
      given.
    tokenizer: Tokenizer matching `init_model`, used when `model_name` is not
      given.
    num_labels: Number of output labels passed to the model config.
    is_multilabel: When true (and `output_logits` is false) the index of the
      highest-scoring label is returned for each input instead of the scores.
    output_logits: Keeps the raw scores even when `is_multilabel` is true.
    use_softmax: Apply a softmax over the label axis before post-processing.

  Returns:
    A list with exactly one entry per input: a float when the model has a
    single output, a list of floats when it has several, or an int label
    index in the argmax case.

  Raises:
    ValueError: If neither `model_name` nor both `init_model` and `tokenizer`
      are provided.
  """
  if not model_name and (init_model is None or tokenizer is None):
    raise ValueError('Provide either model_name or both init_model and tokenizer.')

  # Fall back to the CPU so the function also works on machines without a GPU.
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
  if model_name:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
  else:
    model = init_model
  model.to(device)
  model.eval()

  y_pred = []
  for batch in tqdm(chunks(data, 32)):
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)
    model_inputs = {
        'input_ids': inputs['input_ids'].to(device),
        'attention_mask': inputs['attention_mask'].to(device)
    }
    with torch.no_grad():
      outputs = model(**model_inputs)

    # assumes outputs[0] holds the logits with shape (batch, num_labels) -- TODO confirm for custom init_model
    batch_scores = outputs[0]
    if use_softmax:
      batch_scores = nn.functional.softmax(batch_scores, dim=-1)
    batch_scores = batch_scores.detach().cpu().numpy()

    if is_multilabel and not output_logits:
      batch_pred = np.argmax(batch_scores, axis=-1)
    elif batch_scores.ndim > 1 and batch_scores.shape[-1] == 1:
      # Drop only the label axis. A blanket squeeze() would also drop the
      # batch axis of a one-item batch and break the extend() below.
      batch_pred = batch_scores.squeeze(-1)
    else:
      batch_pred = batch_scores
    y_pred.extend(batch_pred.tolist())

  return y_pred

In [None]:
def chunks(lst, n):
    """Lazily split `lst` into consecutive slices of at most `n` items.

    The final slice is shorter when `len(lst)` is not a multiple of `n`.
    """
    starts = range(0, len(lst), n)
    yield from (lst[begin:begin + n] for begin in starts)

# Labeling

In [None]:
# Cross-validation splits ('train_fold_<k>.csv') of the competition data.
fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
# Per-fold checkpoints used for pseudo-labeling; reuses the constant defined
# in the Constants section instead of repeating the path.
model_dir = PSEUDO_LABEL_MODEL_PATH
# Directory only: generate_augmented_data appends '/predicted.csv' itself, so
# including the file name here would produce '.../predicted.csv/predicted.csv'.
out_dir = os.path.join(BASE_PATH, 'data/training/predicted')

In [None]:
# Only fold 0 is processed here: passages are retrieved for its training
# split and labeled with the checkpoint under 'model_fold_0/best'.
generate_augmented_data(fold_dir=fold_dir, model_dir=model_dir, out_dir=out_dir, kfolds=[0])

Starting to search within 2161598 text fragments...


369it [00:56,  6.55it/s]


11805
