# Idea
The main idea is to create a new embedding for the words fed to the encoder. To do this, we fine-tune a summarization transformer to learn the dictionary ("Duden"-style) definition of each word in our corpus. We then use the output of that model's encoder as the new embedding on which the sentiment-analysis network is trained.

In [1]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# Checkpoint identifier shared by the tokenizer and the model below.
model_name = 'google/pegasus-xsum'
# Load the tokenizer and the conditional-generation model for that checkpoint.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

In [2]:
# Build the vocabulary: every entry of the NLTK 'words' corpus.
import nltk
nltk.download('words')  # makes sure the corpus is available locally before it is read
from nltk.corpus import words
english_words = words.words()

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Tobias\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
# Show how many words the vocabulary contains.
print(len(english_words))

236736


In [4]:
# PyDictionary client used to look up a definition for each word.
# NOTE(review): the notebook calls these "duden" definitions, but the lookups
# recorded in the cell output below go to wordnetweb.princeton.edu — confirm
# which dictionary source is actually intended.
from PyDictionary import PyDictionary
dictionary=PyDictionary()

In [None]:
# Accumulator mapping word -> definition, filled by the lookup loop.
# Presumably kept in its own cell so that re-running the loop does not
# discard results that were already fetched — TODO confirm.
english_words_with_definitions = {}

In [None]:
# Fill english_words_with_definitions (word -> definition).
# Words that already have an entry are skipped, and a word is only stored
# when the lookup returns something other than None.
for word in english_words:
    if word in english_words_with_definitions:
        continue
    definition = dictionary.meaning(word)
    if definition is None:
        continue
    english_words_with_definitions[word] = definition

Error: The Following Error occured: list index out of range
Error: The Following Error occured: HTTPConnectionPool(host='wordnetweb.princeton.edu', port=80): Max retries exceeded with url: /perl/webwn?s=aam (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001B9DF160430>: Failed to establish a new connection: [WinError 10060] Ein Verbindungsversuch ist fehlgeschlagen, da die Gegenstelle nach einer bestimmten Zeitspanne nicht richtig reagiert hat, oder die hergestellte Verbindung war fehlerhaft, da der verbundene Host nicht reagiert hat'))
Error: The Following Error occured: list index out of range
Error: The Following Error occured: HTTPConnectionPool(host='wordnetweb.princeton.edu', port=80): Max retries exceeded with url: /perl/webwn?s=Aaronic (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001B9DF160400>: Failed to establish a new connection: [WinError 10060] Ein Verbindungsversuch ist fehlgeschlagen, da die Gegenstelle

KeyboardInterrupt: 

In [7]:
# Inspect the n-th (word, definition) pair; the number of collected entries
# is printed both before and after it.
n = 10
print(len(english_words_with_definitions))
print(list(english_words_with_definitions.items())[n])  # IndexError if there are n or fewer entries
print(len(english_words_with_definitions))

366
('abacinate', {'Verb': ["blind by holding a red-hot metal plate before someone's eyes"]})
366


In [10]:
# Show the last (word, definition) pair of the dictionary, i.e. the final
# element of its items() sequence.
print(list(english_words_with_definitions.items())[-1])

('carbo', ['Carbohydrate'])


In [6]:
# Persist the collected definitions as JSON so the lookups need not be repeated.
import json

with open('english_words_with_definitions.json', 'w') as f:
    f.write(json.dumps(english_words_with_definitions))

In [None]:
# Restore the word -> definition mapping from the JSON file written earlier.
with open('english_words_with_definitions.json', 'r') as f:
    english_words_with_definitions = json.loads(f.read())

In [None]:
# Fine-tune the summarizer so that, given a word as input, it generates that
# word's dictionary definition. Runs on CUDA when available, otherwise on CPU.
import torch
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

model.to(device)
model.train()


def _flatten_definitions(definitions):
    """Return the definition strings of one dictionary entry as a flat list.

    Entries in ``english_words_with_definitions`` appear in two shapes: a
    mapping from part of speech to a list of definitions, or a plain list of
    definitions. Iterating the mapping directly would yield the
    part-of-speech keys ('Noun', 'Verb', ...) instead of the definition texts.
    """
    if isinstance(definitions, str):
        return [definitions]
    if isinstance(definitions, dict):
        return [
            text
            for senses in definitions.values()
            for text in _flatten_definitions(senses)
        ]
    return list(definitions)


# One training example per (word, definition) pair.
training_pairs = [
    (word, definition)
    for word, definitions in english_words_with_definitions.items()
    for definition in _flatten_definitions(definitions)
]

# torch.optim.AdamW replaces the AdamW class formerly exported by
# transformers (deprecated there and dropped from recent releases).
# eps and weight_decay are set to that class's defaults so the optimisation
# behaves as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Linear decay over exactly as many steps as the loop below performs:
# the scheduler is stepped once per (word, definition) pair, not once per word.
from transformers import get_linear_schedule_with_warmup
num_epochs = 1
num_training_steps = num_epochs * len(training_pairs)
num_warmup_steps = 0
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

# train the model
for epoch in range(num_epochs):
    for word, definition in tqdm(training_pairs):
        # Source sequence: the word itself. The tensors must live on the same
        # device as the model.
        inputs = tokenizer([word], return_tensors='pt', padding=True, truncation=True).to(device)
        # Target sequence: the definition. Using the input ids as labels would
        # only teach the model to copy its input.
        labels = tokenizer([definition], return_tensors='pt', padding=True, truncation=True).input_ids.to(device)
        outputs = model(**inputs, labels=labels)
        loss = outputs[0]  # with labels supplied, the first output is the loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

In [None]:
# Sanity check: draw n random words from the corpus and print the definition
# the model generates for each of them.
import random
n = 2
vocabulary = list(english_words_with_definitions.keys())
# Never ask for more samples than there are words (random.sample would raise).
random_words = random.sample(vocabulary, min(n, len(vocabulary)))

# Switch off dropout for inference; the training cell leaves the model in
# train mode.
model.eval()
for word in random_words:
    # The input tensors have to be on the same device as the model.
    inputs = tokenizer([word], return_tensors='pt', padding=True, truncation=True).to(model.device)
    outputs = model.generate(**inputs)
    print(f'word: {word}')
    # skip_special_tokens drops padding / end-of-sequence markers from the text.
    print(f'definition: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')
    print('------------------------------------')

In [2]:
# Keep a handle on the encoder half of the model; its output is what the
# notebook intends to use as the new embedding.
encoder = model.get_encoder()

In [3]:
# Feed one example sentence through the encoder and print the shape of the
# resulting hidden states.
import torch

# Inference only: disable dropout so the hidden states are deterministic.
encoder.eval()
# The ids must be on the same device as the encoder weights.
input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt").input_ids.to(encoder.device)
# No gradients are needed to read out embeddings.
with torch.no_grad():
    outputs = encoder(input_ids)
print(outputs.last_hidden_state.shape)

torch.Size([1, 7, 1024])


In [None]:
# Create a fresh transformer model from the same checkpoint.
# Note: this rebinds `tokenizer` and `model`; `encoder` still refers to the
# encoder obtained from the previous model.
from transformers import AutoModel, AutoTokenizer
model_name = 'google/pegasus-xsum'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Put the new model on the device of the encoder whose embeddings it will share.
model.to(encoder.device)

# Make the new model use the token-embedding table of `encoder`.
# Pegasus modules expose that table as `embed_tokens`; they have no
# `embeddings.word_embeddings` attribute, so the table is installed through
# the model's set_input_embeddings() hook instead.
# NOTE(review): this shares the embedding *weights* of the encoder, not the
# encoder's contextual output — confirm this is the intended "new embedding".
model.resize_token_embeddings(len(tokenizer))
model.set_input_embeddings(encoder.embed_tokens)