In [1]:
!pip install sentence-transformers pylatexenc

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
     |████████████████████████████████| 79 kB 975 kB/s            
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting pylatexenc
  Downloading pylatexenc-2.10.tar.gz (162 kB)
     |████████████████████████████████| 162 kB 1.8 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: sentence-transformers, pylatexenc
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.0-py3-none-any.whl size=120748 sha256=e60feb8d81ebce36bd992a985e995c1ef990d69fd9deab81fa126a6c02e899ae
  Stored in directory: /root/.cache/pip/wheels/83/c0/df/b6873ab7aac3f2465aa9144b6b4c41c4391cfecc027c8b07e7
  Building wheel for pylatexenc (setup.py) ... [?25l- \ done
[?25h  Created wheel for pylatexenc: filename=pylatexenc-2.10-

In [2]:
import numpy as np
from numpy.random import default_rng
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers.datasets import DenoisingAutoEncoderDataset
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
from sentence_transformers import SentenceTransformer, models
from sentence_transformers.losses import DenoisingAutoEncoderLoss
from pylatexenc.latex2text import LatexNodes2Text

In [3]:

def split_data(data, train_size=6000, optim_size=6000, seed=49):
    """Assign every row of ``data`` to a shuffled 'train' / 'val' / 'test' label.

    Args:
        data: any sized collection; only ``len(data)`` is used.
        train_size: number of rows labelled 'train'.
        optim_size: number of rows labelled 'val'.
        seed: seed for the NumPy generator that shuffles the labels
            (the default reproduces the original hard-coded split).

    Returns:
        A 1-D NumPy string array of length ``len(data)``; rows not assigned
        to 'train' or 'val' are labelled 'test'.

    Raises:
        ValueError: if a size is negative or ``data`` has fewer rows than
            ``train_size + optim_size``.
    """
    if train_size < 0 or optim_size < 0:
        raise ValueError('train_size and optim_size must be non-negative')
    test_size = len(data) - train_size - optim_size
    if test_size < 0:
        # Without this check the label array would be longer than the data
        # and fail later with a confusing length-mismatch error.
        raise ValueError(
            f'data has {len(data)} rows but train_size + optim_size = {train_size + optim_size}'
        )
    rng = np.random.default_rng(seed)
    labels = np.array(['train'] * train_size + ['val'] * optim_size + ['test'] * test_size)
    rng.shuffle(labels)
    return labels

# One seed for every RNG used below, so sampling and training are repeatable.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

tqdm.pandas()

# Load the corpus, keep a fixed random subset and label each row as train / val / test.
docs_df = pd.read_csv('../input/task-2-data/to_use.csv')
df = docs_df.sample(25000, random_state=SEED)
df['type_set'] = split_data(df)

# Sanity check: number of rows per partition.
np.unique(df['type_set'], return_counts=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


(array(['test', 'train', 'val'], dtype=object), array([13000,  6000,  6000]))

In [4]:
import re

def get_sentences(texts, val=False):
    """Break each text into sentence-like pieces and collect the longer ones.

    A text is split at every period (optionally followed by whitespace);
    pieces of 10 characters or fewer are discarded.

    Args:
        texts: iterable of strings.
        val: when True, every text is processed. When False, processing stops
            after the text that pushes the running count above 100,000 pieces.

    Returns:
        List of the retained pieces, in input order.
    """
    boundary = re.compile(r'\.\s?\n?')
    collected = []
    kept_total = 0
    for abstract in texts:
        kept = list(filter(lambda piece: len(piece) > 10, boundary.split(abstract)))
        collected += kept
        if not val:
            kept_total += len(kept)
            if kept_total > 100_000:
                # Sentence transformers recommends 10-100K sentences for training
                print('Exceeded 100k')
                break
    return collected

# Independent copies of the train and validation partitions.
df_train = df.loc[df['type_set'].eq('train')].copy()
df_eval = df.loc[df['type_set'].eq('val')].copy()

# Raw abstracts of each partition as plain Python lists.
texts_train = df_train['abstract'].tolist()
texts_eval = df_eval['abstract'].tolist()

# Only the training call is subject to the sentence cap inside get_sentences.
sentences_eval = get_sentences(texts_eval, val=True)
sentences_train = get_sentences(texts_train)

In [5]:
# Show the first training sentence as produced by get_sentences (not yet cleaned).
sentences_train[0]

'  The nanoscale structure of molecular assemblies plays a major role in many\n($\\mu$)-biological mechanisms'

In [6]:
# Hook tqdm into pandas; the same call already appears in an earlier cell.
# NOTE(review): nothing visible in this notebook uses the pandas progress methods — confirm it is still needed.
tqdm.pandas()


def clean_abstract(txt, stp_wrds):
    """Normalise one sentence for training.

    Steps: lowercase, convert LaTeX markup to plain text, tokenize,
    lemmatize, strip backslashes from each token, then drop stop words
    and punctuation.

    Args:
        txt: the raw sentence.
        stp_wrds: collection of tokens to discard (list, set, ...).

    Returns:
        The surviving tokens joined by single spaces.
    """
    txt = txt.lower()
    try:
        txt = LatexNodes2Text().latex_to_text(txt)
    except Exception:
        # Best effort: if the LaTeX cannot be converted, keep the lowercased raw text.
        # (A bare `except:` here would also swallow KeyboardInterrupt.)
        pass

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in word_tokenize(txt, language="english")]

    # Set lookup instead of scanning the stop-word list once per token.
    stop_set = stp_wrds if isinstance(stp_wrds, (set, frozenset)) else set(stp_wrds)

    kept = []
    for token in tokens:
        # str.replace returns a new string; the result must be re-bound,
        # otherwise backslashes from unconverted LaTeX stay in the token.
        token = token.replace('\\', '')
        # `string.punctuation` is a str, so this is a substring test; it also
        # rejects a token that became empty after stripping backslashes.
        if token not in stop_set and token not in string.punctuation:
            kept.append(token)
    return ' '.join(kept)


# NLTK's English stop words plus corpus-specific filler tokens.
stop_words = nltk.corpus.stopwords.words("english")
stop_words.extend(
    ['we', 'paper', 'new', 'article', "''", "``", "”", 'et', 'al', 'study', 'state', 'of', 'the', 'art']
)

# Clean both sentence sets, with a progress bar for each.
cleaned_sentences_train = [clean_abstract(sentence, stop_words) for sentence in tqdm(sentences_train)]
cleaned_sentences_eval = [clean_abstract(sentence, stop_words) for sentence in tqdm(sentences_eval)]

# Show the first cleaned training sentence.
cleaned_sentences_train[0]

  0%|          | 0/41059 [00:00<?, ?it/s]

  0%|          | 0/40814 [00:00<?, ?it/s]

'nanoscale structure molecular assembly play major role many μ -biological mechanism'

In [7]:
def train_model(sentences_train, model_name):
    """Fine-tune ``model_name`` with the denoising auto-encoder loss and save it.

    The encoder is a transformer followed by CLS pooling; the decoder is
    loaded from the same checkpoint with weights tied to the encoder.

    Args:
        sentences_train: list of training sentences (strings).
        model_name: model id or path understood by ``models.Transformer``.

    Side effects:
        Prints progress messages and writes the trained model to
        ``output/tsdae-<last component of model_name>``.
    """
    # load data
    print('Loading data')
    train_data = DenoisingAutoEncoderDataset(sentences_train)
    loader = DataLoader(train_data, batch_size=8, shuffle=True, drop_last=True)

    # initialize model
    print('Initializing model')
    transformer = models.Transformer(model_name)
    pooling = models.Pooling(transformer.get_word_embedding_dimension(), 'cls')
    model = SentenceTransformer(modules=[transformer, pooling])

    # initialize loss
    print('Initializing loss')
    train_loss = DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)

    # start training
    print('Start training')
    epochs = 5
    if model_name == 'sentence-transformers/all-distilroberta-v1':
        # This checkpoint is trained for a single epoch only.
        epochs = 1
    model.fit(
        train_objectives=[(loader, train_loss)],
        epochs=epochs,
        weight_decay=0,
        scheduler='constantlr',
        optimizer_params={'lr': 3e-5},
        show_progress_bar=True
    )

    # Use the last path component so ids without an organisation prefix
    # (no '/') and nested local paths both yield a valid directory name.
    short_name = model_name.rstrip('/').split('/')[-1]
    model.save('output/tsdae-' + short_name)

In [8]:
import warnings
# Blanket filter: hides every Python warning raised after this cell runs.
warnings.filterwarnings("ignore")

In [9]:
# Checkpoints to fine-tune, one after another.
model_names = [
    'johngiorgi/declutr-base',
    'google/electra-base-discriminator',
    'sentence-transformers/all-distilroberta-v1',
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
]

# The progress bar wraps the list so its description can name the current model.
lst_models = tqdm(model_names)
for mdl in lst_models:
    lst_models.set_description(f"Processing {mdl}")
    train_model(cleaned_sentences_train, mdl)

  0%|          | 0/4 [00:00<?, ?it/s]

Loading data
Initializing model


Downloading:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at johngiorgi/declutr-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Initializing loss


Some weights of RobertaForCausalLM were not initialized from the model checkpoint at johngiorgi/declutr-base and are newly initialized: ['roberta.encoder.layer.11.crossattention.self.key.weight', 'roberta.encoder.layer.0.crossattention.self.query.weight', 'roberta.encoder.layer.10.crossattention.self.query.bias', 'roberta.encoder.layer.4.crossattention.output.dense.bias', 'roberta.encoder.layer.7.crossattention.output.LayerNorm.bias', 'roberta.encoder.layer.7.crossattention.self.key.bias', 'roberta.encoder.layer.2.crossattention.output.LayerNorm.bias', 'roberta.encoder.layer.1.crossattention.self.value.weight', 'roberta.encoder.layer.3.crossattention.output.LayerNorm.weight', 'roberta.encoder.layer.10.crossattention.output.LayerNorm.bias', 'roberta.encoder.layer.0.crossattention.self.key.bias', 'roberta.encoder.layer.5.crossattention.self.key.bias', 'roberta.encoder.layer.10.crossattention.output.LayerNorm.weight', 'roberta.encoder.layer.2.crossattention.self.key.bias', 'roberta.encode

Start training


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Loading data
Initializing model


Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Initializing loss


Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForCausalLM: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForCausalLM were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['generator_predictions.LayerNorm.weight', 'electra.encoder.layer.2.cr

Start training


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Loading data
Initializing model


Downloading:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Initializing loss


Some weights of RobertaForCausalLM were not initialized from the model checkpoint at sentence-transformers/all-distilroberta-v1 and are newly initialized: ['encoder.layer.3.crossattention.output.dense.weight', 'encoder.layer.0.crossattention.self.value.bias', 'encoder.layer.0.crossattention.output.dense.weight', 'encoder.layer.0.crossattention.self.query.weight', 'encoder.layer.2.crossattention.self.value.bias', 'encoder.layer.5.crossattention.self.query.bias', 'encoder.layer.1.crossattention.output.LayerNorm.weight', 'encoder.layer.3.crossattention.output.LayerNorm.bias', 'encoder.layer.0.crossattention.output.LayerNorm.weight', 'encoder.layer.5.crossattention.self.key.bias', 'encoder.layer.4.crossattention.self.value.bias', 'encoder.layer.4.crossattention.self.key.bias', 'encoder.layer.1.crossattention.output.dense.bias', 'encoder.layer.4.crossattention.self.query.weight', 'encoder.layer.4.crossattention.output.LayerNorm.bias', 'encoder.layer.3.crossattention.self.key.weight', 'encod

Start training


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Loading data
Initializing model


Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/220k [00:00<?, ?B/s]

Initializing loss


Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['bert.encoder.layer.8.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.7.crossattention.self.key.weight', 'bert.encoder.layer.1.crossattention.self.query.bias'

Start training


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5132 [00:00<?, ?it/s]