In [None]:
!pip install transformers



In [None]:
from transformers import LukeTokenizer, LukeModel, LukeForEntityPairClassification

model = LukeModel.from_pretrained("studio-ousia/luke-base")
tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")

# Example 1: Computing the contextualized entity representation corresponding to the entity mention

text = "Speaker_10: And now when we insert it into the LM, apparently the one with the least number of LMS be improved in the Calgary and I think that is expected as we're just using them as random unigrams with probability of 1 within scaling one."

# Entity spans are character offsets with an exclusive end, i.e.
# text[start:end] must be exactly the mention.
entity_mentions = ["we", "LM", "LMS", "Calgary"]
entity_spans = [(25, 27), (47, 49), (95, 98), (118, 125)]
# Fail fast if the offsets ever drift out of sync with the text.
assert [text[start:end] for start, end in entity_spans] == entity_mentions

inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
outputs = model(**inputs)
word_last_hidden_state = outputs.last_hidden_state
entity_last_hidden_state = outputs.entity_last_hidden_state

# Example 2: Inputting Wikipedia entities to obtain enriched contextualized representations

# Entity titles paired one-to-one with the spans above.
# NOTE(review): titles that are not in the tokenizer's entity vocabulary are
# presumably mapped to the unknown-entity id - confirm these four are real titles.
entities = [
    "we",
    "LM",
    "LMS",
    "Calgary"
]
inputs = tokenizer(text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
outputs = model(**inputs)
word_last_hidden_state = outputs.last_hidden_state
entity_last_hidden_state = outputs.entity_last_hidden_state

# Example 3: Classifying the relationship between two entities using LukeForEntityPairClassification head model

model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")

# Pair classification takes exactly two spans: here "we" and "LM".
entity_spans = [(25, 27), (47, 49)]
assert [text[start:end] for start, end in entity_spans] == ["we", "LM"]

inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
predicted_class_idx = int(logits[0].argmax())
print("Predicted class:", model.config.id2label[predicted_class_idx])


Some weights of the model checkpoint at studio-ousia/luke-base were not used when initializing LukeModel: ['lm_head.layer_norm.bias', 'entity_predictions.transform.LayerNorm.bias', 'entity_predictions.transform.LayerNorm.weight', 'entity_predictions.transform.dense.weight', 'lm_head.dense.bias', 'entity_predictions.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'entity_predictions.transform.dense.bias']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at studio-ousia/luke-large-finetuned-tacred were not used when initializin

Predicted class: no_relation


In [None]:
from transformers import LukeTokenizer, LukeModel

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
model = LukeModel.from_pretrained("studio-ousia/luke-base")

# Compute the contextualized entity representation corresponding to the entity mentions "we", "LM", "LMS" and "Calgary"

text = "Speaker_10: And now when we insert it into the LM, apparently the one with the least number of LMS be improved in the Calgary and I think that is expected as we're just using them as random unigrams with probability of 1 within scaling one."

# Entity spans are character offsets with an exclusive end, i.e.
# text[start:end] must be exactly the mention.
entity_mentions = ["we", "LM", "LMS", "Calgary"]
entity_spans = [
    (25, 27),
    (47, 49),
    (95, 98),
    (118, 125),
]
# Fail fast if the offsets ever drift out of sync with the text.
assert [text[start:end] for start, end in entity_spans] == entity_mentions

encoding = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
outputs = model(**encoding)
word_last_hidden_state = outputs.last_hidden_state
entity_last_hidden_state = outputs.entity_last_hidden_state

# Input Wikipedia entities to obtain enriched contextualized representations of word tokens

# Entity titles paired one-to-one with the spans above (same text, same spans).
# NOTE(review): titles that are not in the tokenizer's entity vocabulary are
# presumably mapped to the unknown-entity id - confirm these four are real titles.
entities = [
    "we",
    "LM",
    "LMS",
    "Calgary"
]

encoding = tokenizer(
    text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt"
)
outputs = model(**encoding)
word_last_hidden_state = outputs.last_hidden_state
entity_last_hidden_state = outputs.entity_last_hidden_state

Some weights of the model checkpoint at studio-ousia/luke-base were not used when initializing LukeModel: ['lm_head.layer_norm.bias', 'entity_predictions.transform.LayerNorm.bias', 'entity_predictions.transform.LayerNorm.weight', 'entity_predictions.transform.dense.weight', 'lm_head.dense.bias', 'entity_predictions.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'entity_predictions.transform.dense.bias']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from transformers import LukeTokenizer, LukeForEntityClassification

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")
model = LukeForEntityClassification.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")

text = "Speaker_10: And now when we insert it into the LM, apparently the one with the least number of LMS be improved in the Calgary and I think that is expected as we're just using them as random unigrams with probability of 1 within scaling one."

# Entity typing classifies a single mention: here "Calgary".
# Spans are character offsets with an exclusive end, so text[start:end] is the mention.
entity_spans = [(118, 125)]
assert [text[start:end] for start, end in entity_spans] == ["Calgary"]

inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
# Report the highest-scoring label for the mention.
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])

Some weights of the model checkpoint at studio-ousia/luke-large-finetuned-open-entity were not used when initializing LukeForEntityClassification: ['luke.embeddings.position_ids']
- This IS expected if you are initializing LukeForEntityClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeForEntityClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predicted class: object


In [None]:
from transformers import LukeTokenizer, LukeForEntityPairClassification

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")

text = "Speaker_10: And now when we insert it into the LM, apparently the one with the least number of LMS be improved in the Calgary and I think that is expected as we're just using them as random unigrams with probability of 1 within scaling one."

# Relation classification takes exactly two mentions: here "LMS" and "Calgary".
# Spans are character offsets with an exclusive end, so text[start:end] is the mention.
entity_spans = [
    (95, 98),
    (118, 125),
]
assert [text[start:end] for start, end in entity_spans] == ["LMS", "Calgary"]

inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
# Report the highest-scoring relation label for the pair.
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])

Some weights of the model checkpoint at studio-ousia/luke-large-finetuned-tacred were not used when initializing LukeForEntityPairClassification: ['luke.embeddings.position_ids']
- This IS expected if you are initializing LukeForEntityPairClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeForEntityPairClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predicted class: no_relation


In [None]:
from transformers import LukeTokenizer, LukeForEntitySpanClassification

tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

text = "Speaker_10: And now when we insert it into the LM, apparently the one with the least number of LMS be improved in the Calgary and I think that is expected as we're just using them as random unigrams with probability of 1 within scaling one."

# Use spaCy to find the word boundaries of this text, so the candidate
# spans always line up with real words instead of hand-typed offsets.
import spacy
from spacy.pipeline import merge_entities
nlp = spacy.load("en_core_web_sm")  # or any other model
# spaCy v2 takes the component callable; v3+ only accepts the registered name.
if int(spacy.__version__.split(".")[0]) < 3:
    nlp.add_pipe(merge_entities)
else:
    nlp.add_pipe("merge_entities")
doc = nlp(text)
print([token.text for token in doc])

# Character offsets of every token: start is inclusive, end is exclusive.
word_start_positions = [token.idx for token in doc]
word_end_positions = [token.idx + len(token.text) for token in doc]

# Enumerate candidate spans that start and end on word boundaries. Capping the
# width keeps the number of candidates linear in the sentence length rather
# than quadratic; raise the cap if longer entities are expected.
MAX_SPAN_WORDS = 5
entity_spans = [
    (start_pos, end_pos)
    for i, start_pos in enumerate(word_start_positions)
    for end_pos in word_end_positions[i : i + MAX_SPAN_WORDS]
]

inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
# Drop only the batch dimension so a single candidate span still yields a list.
predicted_class_indices = logits.argmax(-1).squeeze(0).tolist()
for span, predicted_class_idx in zip(entity_spans, predicted_class_indices):
    # Spans predicted as class 0 are skipped, so only non-zero labels are printed.
    if predicted_class_idx != 0:
        print(text[span[0] : span[1]], model.config.id2label[predicted_class_idx])

Some weights of the model checkpoint at studio-ousia/luke-large-finetuned-conll-2003 were not used when initializing LukeForEntitySpanClassification: ['luke.embeddings.position_ids']
- This IS expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['Speaker_10', ':', 'And', 'now', 'when', 'we', 'insert', 'it', 'into', 'the', 'LM', ',', 'apparently', 'the', 'one', 'with', 'the', 'least', 'number', 'of', 'LMS', 'be', 'improved', 'in', 'the', 'Calgary', 'and', 'I', 'think', 'that', 'is', 'expected', 'as', 'we', "'re", 'just', 'using', 'them', 'as', 'random', 'unigrams', 'with', 'probability', 'of', '1', 'within', 'scaling', 'one', '.']
