# Word vectors (FastText) for Baseline

#### Create spaCy models from the word vectors

Run these commands after the cells below have written the `*.fasttext.w2v.txt` files.

```bash
python -m spacy init-model en output/cord19_docrel/spacy/en_cord19_fasttext_300d --vectors-loc output/cord19_docrel/cord19.fasttext.w2v.txt
python -m spacy init-model en output/acl_docrel/spacy/en_acl_fasttext_300d --vectors-loc output/acl_docrel/acl.fasttext.w2v.txt
```


In [1]:
import gensim
import json
import os
import requests
import pickle
import pandas as pd
import logging
from pathlib import Path
from tqdm import tqdm_notebook as tqdm
from smart_open import open
from nlp import load_dataset
import nlp
import acl.utils
from trainer_cli import ExperimentArguments



## CORD19

In [22]:
# Directory that receives the CORD-19 token file, fastText model and exported vectors.
data_dir = Path('./output/cord19_docrel')

# Dataset script, cache location and column names used for the CORD-19 corpus.
experiment_args = ExperimentArguments(
    nlp_dataset='./datasets/cord19_docrel/cord19_docrel.py',
    nlp_cache_dir='./data/nlp_cache',
    doc_id_col='doi',
    doc_a_col='from_doi',
    doc_b_col='to_doi',
    cv_fold=1,
)

# Load the 'docs' configuration and split of the dataset.
docs_ds = load_dataset(
    experiment_args.nlp_dataset,
    name='docs',
    cache_dir=experiment_args.nlp_cache_dir,
    split=nlp.Split('docs'),
)

In [23]:
# Tokenize every document and write one line of space-separated tokens per
# document to tokens.txt, which is the fastText training input used below.
tokens_count = 0
# fastText reads its input as UTF-8, so set the encoding explicitly instead of
# relying on the platform's default locale encoding.
with open(data_dir / 'tokens.txt', 'w', encoding='utf-8') as f:
    for idx, doc in docs_ds.data.to_pandas().iterrows():
        text = acl.utils.get_text_from_doc(doc)
        # Only tokens of 2 to 15 characters are kept.
        tokens = gensim.utils.simple_preprocess(text, min_len=2, max_len=15)
        # Each token is followed by a single space, including the last one on
        # the line; a document without tokens yields an empty line.
        f.write(''.join(token + ' ' for token in tokens) + '\n')
        tokens_count += len(tokens)
print(f'Total tokens: {tokens_count:,}')


Total tokens: 16,181,414


In [26]:
import fasttext

# Train a skip-gram fastText model on the token file.
# Values in [brackets] are the fastText defaults.
model = fasttext.train_unsupervised(
    str(data_dir / 'tokens.txt'),
    model='skipgram',
    lr=0.05,    # learning rate [0.05]
    dim=300,    # size of word vectors [100]
    ws=5,       # size of the context window [5]
    epoch=5,    # number of epochs [5]
    thread=4,   # number of threads [number of cpus]
)

In [27]:
# Persist the trained model as a fastText binary inside the output directory.
model.save_model(str(data_dir.joinpath('cord19.fasttext.bin')))

In [28]:
from gensim.models.wrappers import FastText

# Re-load the fastText binary with gensim and export its word vectors in
# word2vec format; the resulting file is the --vectors-loc input of the
# `spacy init-model` command.
ft_model = FastText.load_fasttext_format(str(data_dir / 'cord19.fasttext.bin'))
# Pass a plain string path, consistent with the other fastText/gensim calls in
# this notebook; not every gensim/smart_open version accepts pathlib.Path here.
ft_model.wv.save_word2vec_format(str(data_dir / 'cord19.fasttext.w2v.txt'))

In [None]:
# Drop the CORD-19 objects from the kernel namespace before the ACL section
# re-creates variables with the same names.
del ft_model, model, docs_ds, experiment_args, data_dir

## ACL

In [2]:
# Directory that receives the ACL token file, fastText model and exported vectors.
data_dir = Path('./output/acl_docrel')

# Dataset script, cache location and column names used for the ACL corpus.
experiment_args = ExperimentArguments(
    nlp_dataset='./datasets/acl_docrel/acl_docrel.py',
    nlp_cache_dir='./data/nlp_cache',
    doc_id_col='s2_id',
    doc_a_col='from_s2_id',
    doc_b_col='to_s2_id',
    cv_fold=1,
)

# Load the 'docs' configuration and split of the dataset.
docs_ds = load_dataset(
    experiment_args.nlp_dataset,
    name='docs',
    cache_dir=experiment_args.nlp_cache_dir,
    split=nlp.Split('docs'),
)

Downloading and preparing dataset acl_docrel/docs (download: Unknown size, generated: Unknown size, total: Unknown size) to ./data/nlp_cache/acl_docrel/docs/0.1.0...


HBox(children=(IntProgress(value=0, description='Downloading', max=312525939, style=ProgressStyle(description_…




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Dataset acl_docrel downloaded and prepared to ./data/nlp_cache/acl_docrel/docs/0.1.0. Subsequent calls will reuse this data.


In [3]:
# Tokenize every document and write one line of space-separated tokens per
# document to tokens.txt, which is the fastText training input used below.
tokens_count = 0
# fastText reads its input as UTF-8, so set the encoding explicitly instead of
# relying on the platform's default locale encoding.
with open(data_dir / 'tokens.txt', 'w', encoding='utf-8') as f:
    for idx, doc in docs_ds.data.to_pandas().iterrows():
        text = acl.utils.get_text_from_doc(doc)
        # Only tokens of 2 to 15 characters are kept.
        tokens = gensim.utils.simple_preprocess(text, min_len=2, max_len=15)
        # Each token is followed by a single space, including the last one on
        # the line; a document without tokens yields an empty line.
        f.write(''.join(token + ' ' for token in tokens) + '\n')
        tokens_count += len(tokens)

# Count observed on the recorded run: 2,194,010
print(f'Total tokens: {tokens_count:,}')

Total tokens: 2,194,010


In [4]:
import fasttext

# Train a skip-gram fastText model on the token file.
# Values in [brackets] are the fastText defaults.
model = fasttext.train_unsupervised(
    str(data_dir / 'tokens.txt'),
    model='skipgram',
    lr=0.05,    # learning rate [0.05]
    dim=300,    # size of word vectors [100]
    ws=5,       # size of the context window [5]
    epoch=5,    # number of epochs [5]
    thread=4,   # number of threads [number of cpus]
)

In [5]:
# Persist the trained model as a fastText binary inside the output directory.
model.save_model(str(data_dir.joinpath('acl.fasttext.bin')))

In [6]:
from gensim.models.wrappers import FastText

# Re-load the fastText binary with gensim and export its word vectors in
# word2vec format; the resulting file is the --vectors-loc input of the
# `spacy init-model` command.
ft_model = FastText.load_fasttext_format(str(data_dir / 'acl.fasttext.bin'))
# Pass a plain string path, consistent with the other fastText/gensim calls in
# this notebook; not every gensim/smart_open version accepts pathlib.Path here.
ft_model.wv.save_word2vec_format(str(data_dir / 'acl.fasttext.w2v.txt'))