In [4]:
#Imports

from collections import Counter

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import time
import tqdm
import json
import re
import pandas as pd

In [5]:
# importing the module
import json
  
# Opening JSON file
f_train = open('json_datasets/train.json', 'r')
raw_train = json.load(f_train)
f_train.close()

f_test = open('json_datasets/test.json', 'r')
raw_test = json.load(f_test)
f_test.close()

In [6]:
def removeQuotes(lst):
    ret = []
    for s in lst:
        if s != '``' and s != "''":
            ret.append(s)
    return ret

def camelCaseSplit(identifier):
    matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    d = [m.group(0) for m in matches]
    new_d = []
    for token in d:
        token = token.replace('(', '')
        token = token.replace(')', '')
        token_split = token.split('_')
        for t in token_split:
            #new_d.append(t.lower())
            new_d.append(t)
    return new_d

In [7]:
def g2tPreprocess(raw):
    df = []
    for item in raw:
        graph = 'g2t:'
        for relation in item['relations']:
            graph += ' <H> ' + ' '.join(removeQuotes(relation[0])) + ' <R> '
            graph += ' '.join(camelCaseSplit(relation[1])) + ' <T> '
            graph += ' '.join(removeQuotes(relation[2]))

        ents = [' '.join(removeQuotes(entity)) for entity in item['entities']]
        text = item['text']
        for i in range(len(ents)):
            text = text.replace('<ENT_'+str(i)+'>', ents[i])
        sample = [graph, text]
        df.append(sample)
    return pd.DataFrame(df, columns=['source_text', 'target_text'])

In [34]:
for graph in raw_train:
    if len(graph['relations']) > 3:
        print(g2tPreprocess([graph]).values)
        break

[['g2t: <H> Aarhus Airport <R> location <T> Tirstrup <H> Tirstrup <R> country <T> Denmark <H> Denmark <R> capital <T> Copenhagen <H> Tirstrup <R> is Part Of <T> Central Denmark Region'
  'Aarhus Airport is located in Tirstrup , part of the Central Denmark Region of Denmark which has the capital city of Copenhagen .']]


In [27]:
for graph in raw_train:
    l = len(graph['relations'])
    
    if l > 2:
        print(graph)
        print(g2tPreprocess([graph]))
        break
train_df = g2tPreprocess(raw_train)
test_df = g2tPreprocess(raw_test)
print(test_df.source_text[15])

{'relations': [[['Aarhus', 'Airport'], 'location', ['Tirstrup']], [['Tirstrup'], 'country', ['Denmark']], [['Denmark'], 'language', ['Danish', 'language']]], 'text': '<ENT_2> is located in <ENT_3> , <ENT_0> where the <ENT_1> is spoken .', 'entities': [['Denmark'], ['Danish', 'language'], ['Aarhus', 'Airport'], ['Tirstrup']]}
                                         source_text  \
0  g2t: <H> Aarhus Airport <R> location <T> Tirst...   

                                         target_text  
0  Aarhus Airport is located in Tirstrup , Denmar...  
g2t: <H> Al-Taqaddum Air Base <R> runway Length <T> 3684.0


In [6]:
pip install --upgrade simplet5

Collecting simplet5
  Downloading simplet5-0.1.3.tar.gz (7.2 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.1 MB/s 
Collecting transformers==4.10.0
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 68.4 MB/s 
[?25hCollecting pytorch-lightning==1.4.5
  Downloading pytorch_lightning-1.4.5-py3-none-any.whl (919 kB)
[K     |████████████████████████████████| 919 kB 68.2 MB/s 
Collecting PyYAML>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 93.1 MB/s 
Collecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 48.9 MB/s 
[?25hCollecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2021.11.

In [7]:
# import
from simplet5 import SimpleT5

# instantiate
model = SimpleT5()

# load (supports t5, mt5, byT5 models)
model.from_pretrained("t5","t5-base")

# train
model.train(train_df=train_df, # pandas dataframe with 2 columns: source_text & target_text
            eval_df=test_df, # pandas dataframe with 2 columns: source_text & target_text
            source_max_token_len = 512, 
            target_max_token_len = 128,
            batch_size = 8,
            max_epochs = 5,
            use_gpu = True,
            outputdir = "outputs",
            early_stopping_patience_epochs = 0,
            precision = 32
            )



Global seed set to 42


Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Global seed set to 42
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [8]:

# load trained T5 model
model.load_model("t5","outputs/simplet5-epoch-4-train-loss-0.3465", use_gpu=True)

In [38]:
f_dev = open('json_datasets/dev.json', 'r')
raw_dev = json.load(f_dev)
f_dev.close()

In [10]:
pip install "git+https://github.com/salaniz/pycocoevalcap.git"

Collecting git+https://github.com/salaniz/pycocoevalcap.git
  Cloning https://github.com/salaniz/pycocoevalcap.git to /tmp/pip-req-build-zds5y0el
  Running command git clone -q https://github.com/salaniz/pycocoevalcap.git /tmp/pip-req-build-zds5y0el
Building wheels for collected packages: pycocoevalcap
  Building wheel for pycocoevalcap (setup.py) ... [?25l[?25hdone
  Created wheel for pycocoevalcap: filename=pycocoevalcap-1.2-py3-none-any.whl size=104312215 sha256=4cc8ec30fb183efcc04aee46289bf02d3106bb29ad341f27150bfc40430c5294
  Stored in directory: /tmp/pip-ephem-wheel-cache-ljwg8jbe/wheels/6f/c9/51/e266f0496048c16686e133d8e33644d692931a356bfb372aae
Successfully built pycocoevalcap
Installing collected packages: pycocoevalcap
Successfully installed pycocoevalcap-1.2


In [39]:
# from pycocoevalcap.bleu.bleu import Bleu
bleu = Bleu(4)

dev_df = g2tPreprocess(raw_dev)
# print(dev_df)

hyp = [model.predict(dev_df['source_text'][i])[0] for i in range(len(dev_df))]
# ret = bleu.compute_score(dev_df['target_text'], hyp)
print(hyp[:10])

# print(dev_df['source_text'][50])
# print(model.predict(dev_df['source_text'][50]))

NameError: name 'Bleu' is not defined

In [40]:
from nltk.translate.bleu_score import sentence_bleu

def g2t_metrics(predict,truth):
    BLEU_1 = 0
    BLEU_2 = 0
    BLEU_3 = 0
    BLEU_4 = 0

    for pred, tth in zip(predict, truth):
        b1_1, b2_1, b3_1, b4_1 = BLEU_score(tth, pred)
        BLEU_1 += b1_1
        BLEU_2 += b2_1
        BLEU_3 += b3_1
        BLEU_4 += b4_1

    BLEU_1 = BLEU_1 / len(truth)
    BLEU_2 = BLEU_2 / len(truth)
    BLEU_3 = BLEU_3 / len(truth)
    BLEU_4 = BLEU_4 / len(truth)
    
    print('BLEU_1: %s'%(BLEU_1))
    print('BLEU_2: %s'%(BLEU_2))
    print('BLEU_3: %s'%(BLEU_3))
    print('BLEU_4: %s'%(BLEU_4))
    
    return BLEU_1, BLEU_2, BLEU_3, BLEU_4
    
def BLEU_score(truth, predict):
    truth = truth.split(' ')
    truth = [truth]
    predict = predict.split(' ')
    BLEU_1 = sentence_bleu(truth, predict, weights=(1, 0, 0, 0))
    BLEU_2 = sentence_bleu(truth, predict, weights=(0.5, 0.5, 0, 0))
    BLEU_3 = sentence_bleu(truth, predict, weights=(0.33, 0.33, 0.33, 0))
    BLEU_4 = sentence_bleu(truth, predict, weights=(0.25, 0.25, 0.25, 0.25))  
    return BLEU_1, BLEU_2, BLEU_3, BLEU_4

In [41]:
g2t_metrics(hyp, dev_df['target_text'])

NameError: name 'hyp' is not defined

In [48]:
f_dev = open('json_datasets/dev.json', 'r')
raw_dev = json.load(f_dev)
f_dev.close()
dev_df = g2tPreprocess(raw_dev)
# print(dev_df)
# ret = bleu.compute_score(dev_df['target_text'], hyp)

In [69]:
def g2tPreprocessNoText(raw):
    df = []
    graphs = []
    entities = []
    raw_ents = []
    for item in raw:
        graph = 'g2t:'
        for relation in item['relations']:
            graph += ' <H> ' + ' '.join(removeQuotes(relation[0])) + ' <R> '
            graph += ' '.join(camelCaseSplit(relation[1])) + ' <T> '
            graph += ' '.join(removeQuotes(relation[2]))

        ents = [' '.join(removeQuotes(entity)) for entity in item['entities']]
        graphs.append(graph)
        entities.append(ents)
        raw_ents.append(item['entities'])
    return graphs, entities, raw_ents



In [75]:
#g2tPreprocessNoText(raw_train)

In [72]:
def single_g2t(graph, ents, raw_ents, g2t_model):
    predText = g2t_model.predict(graph)
    for i in range(len(ents)):
        if ents[i] in text:
            predText.replace(ents[i], "<ENT_" + str(i) + ">")
        else:
            print("WARNING: ENTITY " + ents[i] + " NOT FOUND IN PREDICTED TEXT")
    return {'text' : predText, 'entities' : raw_ents}

In [73]:
# input: batch of graphs (list of dicts with relations and entities)
# output: predicted texts with original entities taken out (list of dicts with text and entities)
def predict_g2t(graphs, g2t_model):
    pGraphs, ents, raw_ents = g2tPreprocessNoText(graphs) # processed graphs, entities
    print(pGraphs)
    print(ents)
    hyps = [single_g2t(graphs[i], ents[i], raw_ents[i], g2t_model) for i in range(len(graphs))]
    # ret = bleu.compute_score(dev_df['target_text'], hyp)
    #print(hyp[:10])
    return hyps

model = None # need to train
predict_g2t(raw_train[1:3], model)


['g2t: <H> Aarhus Airport <R> city Served <T> Aarhus , Denmark', 'g2t: <H> Aarhus Airport <R> city Served <T> Aarhus']
[['Aarhus , Denmark', 'Aarhus Airport'], ['Aarhus', 'Aarhus Airport']]


AttributeError: 'NoneType' object has no attribute 'predict'