# Semantic Document Relations

<a href="https://colab.research.google.com/github/malteos/semantic-document-relations/blob/master/demo-wikidocrel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>









## Use GPU
Google Colab allows you to use a Tesla T4 GPU without any charge:

1. On the main menu, click Runtime and select **Change runtime type**. Set "GPU" as the hardware accelerator.
2. Click Runtime again and select **Runtime > Run All** (Watch out: the "Wikipedia articles" cell requires user input). You can also run the cells manually with Shift-ENTER.

In [1]:
!git clone https://github.com/malteos/semantic-document-relations.git

fatal: destination path 'semantic-document-relations' already exists and is not an empty directory.


In [2]:
cd semantic-document-relations

/content/semantic-document-relations


In [3]:
# Install dependencies
!pip install -r requirements.colab.txt



In [4]:
from models.transformers import JointBERT
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from wiki.utils import get_text_from_wikipedia
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import BertModel, BertTokenizer
from IPython.core.display import display, HTML
import pandas as pd
import pickle
import os
from experiments import Experiment
from wiki.data_helpers import JointBERTWikiDataHelper


In [0]:
if not torch.cuda.is_available():
  raise ValueError('CUDA is not available. Please enable GPU support under: Runtime > Change runtime')

In [6]:
# Download and extract pretrained model
!wget https://github.com/malteos/semantic-document-relations/releases/download/1.0/model_wiki.bert_base__joint__seq512.tar.gz
!tar -xzf model_wiki.bert_base__joint__seq512.tar.gz

--2020-01-21 15:32:46--  https://github.com/malteos/semantic-document-relations/releases/download/1.0/model_wiki.bert_base__joint__seq512.tar.gz
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/232886385/843aea00-35f3-11ea-8036-c70620456fb9?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200121%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200121T153246Z&X-Amz-Expires=300&X-Amz-Signature=a26d14e2f528be8c371e9d9b3c08e8e607504408e5c616e60ce11c2e9ff4066f&X-Amz-SignedHeaders=host&actor_id=0&response-content-disposition=attachment%3B%20filename%3Dmodel_wiki.bert_base__joint__seq512.tar.gz&response-content-type=application%2Foctet-stream [following]
--2020-01-21 15:32:46--  https://github-production-release-asset-2e65be.s3.amazonaws.com/232886385/843aea00-35f3-11ea-8036-c706

In [0]:
# Load experimental settings
exp_dir = '4fold_results/1/wiki.bert_base__joint__seq512'

with open(os.path.join(exp_dir, 'experiment.pickle'), 'rb') as f:
  _exp = pickle.load(f)

le = LabelEncoder()
le.fit_transform(_exp['data_helper_params']['labels'] + ['none'])

# Init model
model = JointBERT(bert_model_path='bert-base-cased', bert_cls=BertModel, prob='sigmoid', labels_count=_exp['model_params']['labels_count'])
model.load_state_dict(torch.load(os.path.join(exp_dir, 'model_weights')))

#model = BertModel.from_pretrained('bert-base-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [0]:
# Wikipedia articles
seed_title = 'Albert Einstein' #@param {type:"string"}
target_title = 'ETH Zurich' #@param {type:"string"}

In [0]:
# Retrieve article text from Wikipedia API
seed_text = get_text_from_wikipedia(seed_title)
target_text = get_text_from_wikipedia(target_title)

In [0]:
# Convert to model input
seed_ids = tokenizer.convert_tokens_to_ids((tokenizer.tokenize(seed_title) + tokenizer.tokenize(seed_text))[:250])
target_ids = tokenizer.convert_tokens_to_ids((tokenizer.tokenize(target_title) + tokenizer.tokenize(target_text))[:250])

pair_ids = [torch.tensor(tokenizer.build_inputs_with_special_tokens(seed_ids, target_ids))]
token_type_ids = [torch.tensor(tokenizer.create_token_type_ids_from_sequences(seed_ids, target_ids))]

pair_ids = pad_sequence(pair_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=1)

masks = torch.tensor([[float(i > 0) for i in ii] for ii in pair_ids])


In [0]:
# Prediction
with torch.no_grad():
  ys_pred = model(pair_ids, masks, token_type_ids).numpy()
  label_pred = le.inverse_transform([ys_pred.argmax()])[0]

In [30]:
display(HTML(f'<h1>Predicted relation of <i>{seed_title}</i> and <i>{target_title}</i> is <u>{label_pred}</u></h1>'))

In [31]:
# Probabilites for other classes
df = pd.DataFrame(dict(label=le.classes_.tolist(), probability=ys_pred.flatten().round(3).tolist()))
df

Unnamed: 0,label,probability
0,country_of_citizenship,0.0
1,different_from,0.261
2,educated_at,0.0
3,employer,0.0
4,facet_of,0.089
5,has_effect,0.081
6,has_quality,0.048
7,none,0.0
8,opposite_of,0.416
9,symptoms,0.0
