In [1]:
import pandas as pd

In [None]:
# Number of abstracts taken from the start of the CSV; edit here to change the sample size.
MAX_ABSTRACTS = 40000

df = pd.read_csv('arxiv.csv')
# Raw values of the 'abstracts' column for the first MAX_ABSTRACTS rows.
abstracts = df['abstracts'][:MAX_ABSTRACTS].values

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
import string
import unicodedata
import re
import matplotlib.pyplot as plt

In [None]:
#def remove_numbers(str):
#    return re.sub(r"\d*|\d*\.\d+|\d*\,\d+", '', str)

def unicodeToAscii(s):
    """Strip combining marks (accents) from ``s``.

    The string is decomposed with NFD normalisation and every character whose
    Unicode category is 'Mn' (non-spacing mark) is dropped. Characters that
    have no decomposition are left as they are, so the result is not
    guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def clean_text(text):
    """Normalise typographic quotes in ``text`` and strip accents.

    Curly double quotes become '"', the right curly single quote becomes "'",
    and the result is passed through unicodeToAscii.
    """
    straight_double = re.sub(r"”|“", '"', text)
    straight_quotes = straight_double.replace("’", "'")
    return unicodeToAscii(straight_quotes)


def build_structure(input):
    """Group tokens into punctuation-delimited segments.

    Parameters
    ----------
    input : iterable of str
        Tokens in reading order (as produced by a word tokenizer).

    Returns
    -------
    tuple
        A chain of nested 3-tuples ``(text, separator, rest)``. ``text`` is the
        space-joined run of tokens before a separator, ``separator`` is the
        punctuation token followed by a space (``None`` for the final
        segment), and ``rest`` is the next link (``None`` at the end).
        ``text`` may be the empty string, e.g. for consecutive separators.
    """
    # Built once per call instead of once per token; a frozenset also makes the
    # membership test constant-time.
    separators = frozenset(string.punctuation) | {"''", "``", "’"}

    segments = []  # (text, separator) pairs in reading order
    words = []     # tokens collected since the last separator
    for token in input:
        if token == "'s":
            # Glue the possessive onto the preceding word; if there is none,
            # keep it as a word of its own.
            if words:
                words[-1] += token
            else:
                words.append(token)
        elif token in separators:
            segments.append((' '.join(words), token + ' '))
            words = []
        else:
            words.append(token)
    # Whatever is left after the last separator forms the final segment.
    segments.append((' '.join(words), None))

    # Link the segments back to front so the first segment ends up outermost.
    chain = None
    for text, separator in reversed(segments):
        chain = (text, separator, chain)
    return chain


def structure_to_sentences(structure):
    """Flatten a ``(text, separator, rest)`` chain into its non-empty texts.

    Walks the nested tuples from the outermost link inwards and returns the
    ``text`` entries in order, skipping empty ones. Separators are ignored.
    A ``None`` structure yields an empty list.
    """
    sentences = []
    node = structure
    while node is not None:
        text, _separator, node = node
        if text:
            sentences.append(text)
    return sentences

def transf(s):
    """Split raw text ``s`` into punctuation-delimited fragments.

    Pipeline: clean_text -> word_tokenize -> build_structure ->
    structure_to_sentences. Returns a list of strings.
    """
    tokens = word_tokenize(clean_text(s))
    structure = build_structure(tokens)
    return structure_to_sentences(structure)

In [None]:
# Flat list of every fragment produced by transf, in abstract order.
abstract_sentences = []
for abstract in abstracts:
    abstract_sentences.extend(transf(abstract))
len(abstract_sentences)

In [None]:
# One row per distinct fragment, keeping only fragments that contain more than
# one space-separated word.
df = (
    pd.DataFrame({'sentence': abstract_sentences})
    .drop_duplicates(subset=['sentence'])
    .loc[lambda frame: frame['sentence'].apply(lambda x: len(x.split(' '))) > 1]
)
len(df)

In [None]:
# Character budget handed to max_x_chars below.
SEQUENCE_LENGTH = 200
def max_x_chars(max_len):
    """Build a function that truncates a sentence to whole words.

    The returned function keeps the longest leading run of space-separated
    words whose combined length is at most ``max_len``. Only the characters of
    the words are counted; the separating spaces are not.

    Parameters
    ----------
    max_len : int
        Maximum total number of word characters to keep.

    Returns
    -------
    callable
        ``truncate_sentence(str) -> str``. It returns the empty string when
        even the first word is longer than ``max_len``.
    """
    def truncate_sentence(str):
        words = str.split(" ")
        kept = 0   # number of leading words that fit in the budget
        total = 0  # running sum of the lengths of those words
        for word in words:
            total += len(word)
            if total > max_len:
                # The running sum never decreases, so no later word can fit.
                break
            kept += 1
        return ' '.join(words[:kept])
    return truncate_sentence

truncate = max_x_chars(SEQUENCE_LENGTH)

In [None]:
# Keep only the rows whose 'sentence' cell is exactly a str; anything else
# (e.g. NaN) is dropped.
is_plain_str = df['sentence'].apply(lambda o: type(o) is str)
df = df[is_plain_str]
len(df)

In [None]:
def _spell_out(text, word_sep):
    """Space-separate the characters of every word in ``text``.

    Words are the pieces obtained by splitting on a single space; the spelled
    out words are joined back together with ``word_sep``.
    """
    return word_sep.join(' '.join(word) for word in text.split(' '))


# assign() returns a new frame instead of writing columns into the filtered
# frame from the previous cells, which avoids pandas' SettingWithCopyWarning.
# All three columns are computed from the current (unspaced) 'sentence' values.
df = df.assign(
    # untouched copy of the sentence text
    original=df['sentence'],
    # expected output: 'hello world' becomes 'h e l l o ## w o r l d'
    expected=df['sentence'].apply(lambda x: _spell_out(x, ' ## ')),
    # model input: 'hello world' becomes 'h e l l o w o r l d'
    sentence=df['sentence'].apply(lambda x: _spell_out(x, ' ')),
)
print(len(df))
df.head()

In [None]:
def list_to_file(list, filename):
    """Write every element of ``list`` to ``filename``, one per line.

    Each element is formatted with an f-string and followed by a newline, so
    non-string elements are written via their ``str()`` form. An existing file
    is overwritten.

    The file is written as UTF-8 regardless of the platform default, so text
    containing non-ASCII characters is encoded the same way on every machine.
    """
    with open(filename, 'w', encoding='utf-8') as fp:
        for el in list:
            fp.write(f"{el}\n")


In [None]:
def split_to_files(split, name):
    """Write a sequence of (source, target) pairs to two parallel text files.

    Parameters
    ----------
    split : iterable of 2-tuples
        ``(source, target)`` pairs.
    name : str
        File-name prefix; the sources go to ``./{name}-source.txt`` and the
        targets to ``./{name}-target.txt``, one entry per line.
    """
    # Materialise once: the pairs are traversed twice, which would leave the
    # target file empty if `split` were a one-shot iterator.
    pairs = list(split)
    list_to_file([s for s, _ in pairs], f"./{name}-source.txt")
    list_to_file([t for _, t in pairs], f"./{name}-target.txt")

In [None]:
# Map each split name to its list of (model input, expected output) pairs.
splits = {
    'abstract': list(zip(df['sentence'], df['expected'])),
}
# One source/target file pair is written per split.
for split_name, pairs in splits.items():
    split_to_files(pairs, split_name)

In [None]:
# Run the `onmt_translate` CLI with final_model.pt on abstract-source.txt; -output
# points at abstract-translation.txt, which the evaluation cells read back.
!onmt_translate -model final_model.pt -src abstract-source.txt -output abstract-translation.txt

## Evaluation

In [10]:
def _read_lines(path):
    """Return the lines of the text file at ``path`` without trailing newlines."""
    # The context manager closes the handle even if reading fails.
    with open(path, "r") as fh:
        # Iterating the handle yields no phantom empty entry after the final
        # newline, and keeps the last line when the file has no trailing newline.
        return [line.rstrip("\n") for line in fh]


# Ground-truth lines and model predictions, expected to be aligned line by line.
test_l = _read_lines("abstract-target.txt")
predicted_l = _read_lines("abstract-translation.txt")

In [11]:
def _to_plain_text(line):
    """Drop the single spaces between characters, then turn '##' into a space."""
    return line.replace(' ','').replace('##',' ')


# Keep only the pairs whose ground truth contains at least one '##' word boundary.
pair_l = [
    (pred, test)
    for pred, test in zip(predicted_l, test_l)
    if '##' in test
]
predicted_l = [_to_plain_text(pred) for pred, _ in pair_l]
test_l = [_to_plain_text(test) for _, test in pair_l]

In [12]:
def precision(pred,gt):
    """Share of predicted words that also occur in the ground truth.

    Both strings are split on single spaces. The numerator is the number of
    distinct words common to both; the denominator is the number of words in
    ``pred``, duplicates included.
    """
    pred_words = pred.split(' ')
    shared = set(pred_words) & set(gt.split(' '))
    return len(shared) / len(pred_words)
# Mean per-sentence precision over all aligned pairs, as a percentage.
precision_total = sum(precision(pred, test) for pred, test in zip(predicted_l, test_l))
P = (precision_total / len(test_l)) * 100

def recall(pred,gt):
    """Share of ground-truth words that also occur in the prediction.

    Both strings are split on single spaces. The numerator is the number of
    distinct words common to both; the denominator is the number of words in
    ``gt``, duplicates included.
    """
    gt_words = gt.split(' ')
    shared = set(pred.split(' ')) & set(gt_words)
    return len(shared) / len(gt_words)
    
# Mean per-sentence recall over all aligned pairs, as a percentage.
recall_total = sum(recall(pred, test) for pred, test in zip(predicted_l, test_l))
R = (recall_total / len(test_l)) * 100

In [13]:
# Harmonic mean of P and R (F1). Defined as 0 when either score is 0, where
# the reciprocal form would otherwise divide by zero.
F1 = 2 / ((1 / P) + (1 / R)) if P and R else 0.0
P, R, F1

(91.09577509093276, 91.82453869385112, 91.45870517722169)

In [14]:
from jiwer import wer
import numpy as np
# Word error rate per aligned pair; wer() receives the ground truth first and
# the prediction second.
wers = [
    wer(reference, hypothesis)
    for hypothesis, reference in zip(predicted_l, test_l)
]
# Unweighted mean of the per-sentence error rates.
sum(wers) / len(predicted_l)

0.07911392351871394

In [7]:
# 'sl' is the number of space-separated words in each ground-truth sentence;
# 'wer' is the matching per-sentence word error rate.
sentence_lengths = [len(x.split(' ')) for x in test_l]
df = pd.DataFrame({'sl': sentence_lengths, 'wer': wers})
#df.head()

In [9]:
import os

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('plots', exist_ok=True)
df.to_csv('plots/abstracts.csv')