## SciTLDR Data Analysis

In [31]:
import json
import os
import torch
from transformers import MBartTokenizer


In [163]:
# Directories containing the original SciTLDR data, one per input variant:
# full text (ft), abstract + introduction + conclusion (aic) and abstract only (ao).
# The paths are relative, so they resolve against the current working directory.
language = "English"
ft_text_path = os.path.join(language, "jsonl", "FullText")
aic_text_path = os.path.join(language, "jsonl", "AIC")
ao_text_path = os.path.join(language, "jsonl", "Abstracts")

# Token-count thresholds, passed as `border` to number_too_long_elements /
# too_long_elements below; an input counts as "too long" when its length is >= the value.
# Presumably these are the maximum input lengths of mBART, ZmBART and LED — TODO confirm.
max_mbart = 1024
max_zmbart = 2048
max_led = 16384

In [164]:
def load_json_data(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        list: The parsed JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle even when parsing fails; the
    # previous version left the file open.
    with open(file_path, "r", encoding="utf-8") as file:
        # Iterate lazily instead of buffering with readlines(), and skip
        # whitespace-only lines (e.g. a trailing blank line), which are not
        # JSON documents and would otherwise raise.
        return [json.loads(line) for line in file if line.strip()]

def compute_number_of_tldrs(data):
    """Return the total number of entries found under "target" across all records.

    Args:
        data: Iterable of mappings, each with a sized "target" value.

    Returns:
        int: Sum of ``len(record["target"])`` over ``data`` (0 for empty input).
    """
    return sum(len(record["target"]) for record in data)

In [165]:
# Tokenization functions

# Module-level tokenizer shared by tokenize_abstract below.
# NOTE(review): from_pretrained presumably fetches 'facebook/mbart-large-cc25'
# from the Hugging Face hub or a local cache — confirm for offline runs.
tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')

def tokenize_abstract(array):
    """Return the tokenized length of the concatenation of ``array``.

    The strings in ``array`` are joined with no separator, passed to the
    module-level ``tokenizer`` and the size of dimension 1 of the resulting
    "input_ids" tensor is returned.

    Args:
        array: Iterable of strings.

    Returns:
        int: Size of the second dimension of ``input_ids`` (presumably the
        token count including any special tokens the tokenizer adds —
        TODO confirm).
    """
    joined = "".join(array)
    input_ids = tokenizer(joined, return_tensors="pt")["input_ids"]
    return input_ids.shape[1]

def tokenize_data(data):
    """Return the tokenized length of every record's "source" field.

    Args:
        data: Iterable of mappings with a "source" value accepted by
            ``tokenize_abstract``.

    Returns:
        list: One ``tokenize_abstract(record["source"])`` result per record,
        in input order.
    """
    return [tokenize_abstract(record["source"]) for record in data]

def too_long_elements(data, border):
    """Return the values of ``data`` that are greater than or equal to ``border``.

    Args:
        data: Iterable of comparable values (token lengths in this notebook).
        border: Threshold; values equal to it are included.

    Returns:
        list: Matching values in their original order.
    """
    return [length for length in data if length >= border]

def number_too_long_elements(data, border):
    """Count the values of ``data`` that are greater than or equal to ``border``.

    Args:
        data: Iterable of comparable values (token lengths in this notebook).
        border: Threshold; values equal to it are counted.

    Returns:
        int: Number of matching values.
    """
    return sum(1 for length in data if length >= border)
        

In [166]:
def extract_avg_words_source(data):
    """Return the mean number of words per record in the "source" field.

    Each string in ``record["source"]`` is split on single spaces and the
    piece counts are summed per record; the result is the mean of these
    per-record totals.

    Args:
        data: Sized, non-empty collection of mappings whose "source" value
            is an iterable of strings.

    Returns:
        float: Mean per-record word count.

    Raises:
        ZeroDivisionError: If ``data`` is empty.
    """
    per_record_totals = [
        sum(len(sentence.split(" ")) for sentence in record["source"])
        for record in data
    ]
    return sum(per_record_totals) / len(per_record_totals)

def extract_avg_words_target(data):
    """Return the mean number of words per individual "target" string.

    Every string in every record's "target" value is split on single spaces;
    the result is the mean piece count over all those strings (not per record).

    Args:
        data: Iterable of mappings whose "target" value is an iterable of
            strings; at least one target string must exist overall.

    Returns:
        float: Mean word count per target string.

    Raises:
        ZeroDivisionError: If there are no target strings at all.
    """
    word_counts = [
        len(summary.split(" "))
        for record in data
        for summary in record["target"]
    ]
    return sum(word_counts) / len(word_counts)

### Abstract-Only Data

In [167]:
# Read the abstract-only data: one list of parsed records per split.
ao_train = load_json_data(os.path.join(ao_text_path, "train.jsonl"))
ao_valid = load_json_data(os.path.join(ao_text_path, "valid.jsonl"))
ao_test = load_json_data(os.path.join(ao_text_path, "test.jsonl"))
# All splits concatenated in train, valid, test order.
ao_data = [*ao_train, *ao_valid, *ao_test]

In [168]:
# Compute the number of TLDRs (total entries under "target") per split
# and for the combined abstract-only data.
ao_train_tldrs = compute_number_of_tldrs(ao_train)
ao_valid_tldrs = compute_number_of_tldrs(ao_valid)
ao_test_tldrs = compute_number_of_tldrs(ao_test)
ao_data_tldrs = compute_number_of_tldrs(ao_data)

In [169]:
# Tokenize every record of the combined abstract-only data, then compute the
# mean tokenized length and how many inputs are >= each threshold.
ao_tokenized_sizes = tokenize_data(ao_data)
ao_mean_length = sum(ao_tokenized_sizes) / len(ao_tokenized_sizes)
ao_too_long_mbart = number_too_long_elements(ao_tokenized_sizes, max_mbart)
ao_too_long_zmbart = number_too_long_elements(ao_tokenized_sizes, max_zmbart)
ao_too_long_led = number_too_long_elements(ao_tokenized_sizes, max_led)

In [170]:
# Standard measures: record and TLDR counts per split, followed by the token
# statistics computed over the combined abstract-only data.
print(f"AO-TRAIN     - Number of elements: {len(ao_train)} | Number of TLDRs: {ao_train_tldrs}")
print(f"AO-VALID     - Number of elements:  {len(ao_valid)} | Number of TLDRs: {ao_valid_tldrs}")
print(f"AO-TEST      - Number of elements:  {len(ao_test)} | Number of TLDRs: {ao_test_tldrs}")
print(f"AO-COMPLETE  - Number of elements: {len(ao_data)} | Number of TLDRs: {ao_data_tldrs}")
print(f"Mean length of tokenized abstracts: {ao_mean_length}")
print("Number Tokens too long for model inputs:")
print(f"mBART  (token length>={max_mbart}):  {ao_too_long_mbart}")
print(f"ZmBART (token length>={max_zmbart}):  {ao_too_long_zmbart}")
print(f"LED    (token length>={max_led}): {ao_too_long_led}")

AO-TRAIN     - Number of elements: 1992 | Number of TLDRs: 1992
AO-VALID     - Number of elements:  619 | Number of TLDRs: 1453
AO-TEST      - Number of elements:  618 | Number of TLDRs: 1967
AO-COMPLETE  - Number of elements: 3229 | Number of TLDRs: 5412
Mean length of tokenized abstracts: 248.94673273459276
Number Tokens too long for model inputs:
mBART  (token length>=1024):  0
ZmBART (token length>=2048):  0
LED    (token length>=16384): 0


In [171]:
# Repeat the token-length statistics for the abstract-only test split only.
# NOTE: this reassigns ao_tokenized_sizes, ao_mean_length and ao_too_long_*;
# after this cell they hold test-split values, not the combined-data ones.
ao_tokenized_sizes = tokenize_data(ao_test)
ao_mean_length = sum(ao_tokenized_sizes) / len(ao_tokenized_sizes)
ao_too_long_mbart = number_too_long_elements(ao_tokenized_sizes, max_mbart)
ao_too_long_zmbart = number_too_long_elements(ao_tokenized_sizes, max_zmbart)
ao_too_long_led = number_too_long_elements(ao_tokenized_sizes, max_led)
print(f"Mean length of tokenized abstracts: {ao_mean_length}")
print("Number Tokens too long for model inputs:")
print(f"mBART  (token length>={max_mbart}):  {ao_too_long_mbart}")
print(f"ZmBART (token length>={max_zmbart}):  {ao_too_long_zmbart}")
print(f"LED    (token length>={max_led}): {ao_too_long_led}")

Mean length of tokenized abstracts: 252.2766990291262
Number Tokens too long for model inputs:
mBART  (token length>=1024):  0
ZmBART (token length>=2048):  0
LED    (token length>=16384): 0


In [172]:
# Word-level statistics for the abstract-only test split: mean words per
# source document, mean words per target string, and their ratio.
ao_source_words = extract_avg_words_source(ao_test)
ao_target_words = extract_avg_words_target(ao_test)
# NOTE: comp_ratio is a shared name that the AIC and full-text cells reassign.
comp_ratio = ao_source_words / ao_target_words
print(f"Number average words in document AO: {ao_source_words}")
print(f"Number average words in summary AO:  {ao_target_words}")
print(f"Compression ratio AO:                {comp_ratio}")

Number average words in document AO: 163.0242718446602
Number average words in summary AO:  18.91103202846975
Compression ratio AO:                8.62059096506389


### Abstract, Introduction, and Conclusion (AIC) Data

In [173]:
# Read the abstract + introduction + conclusion (AIC) data: one list of
# parsed records per split.
aic_train = load_json_data(os.path.join(aic_text_path, "train.jsonl"))
aic_valid = load_json_data(os.path.join(aic_text_path, "valid.jsonl"))
aic_test = load_json_data(os.path.join(aic_text_path, "test.jsonl"))
# All splits concatenated in train, valid, test order.
aic_data = [*aic_train, *aic_valid, *aic_test]

In [174]:
# Compute the number of TLDRs (total entries under "target") per split
# and for the combined AIC data.
aic_train_tldrs = compute_number_of_tldrs(aic_train)
aic_valid_tldrs = compute_number_of_tldrs(aic_valid)
aic_test_tldrs = compute_number_of_tldrs(aic_test)
aic_data_tldrs = compute_number_of_tldrs(aic_data)

In [175]:
# Tokenize every record of the combined AIC data, then compute the mean
# tokenized length and how many inputs are >= each threshold.
# Only lengths are measured here; nothing is passed to a model, so a tokenizer
# warning about over-long sequences does not affect these numbers.
aic_tokenized_sizes = tokenize_data(aic_data)
aic_mean_length = sum(aic_tokenized_sizes) / len(aic_tokenized_sizes)
aic_too_long_mbart = number_too_long_elements(aic_tokenized_sizes, max_mbart)
aic_too_long_zmbart = number_too_long_elements(aic_tokenized_sizes, max_zmbart)
aic_too_long_led = number_too_long_elements(aic_tokenized_sizes, max_led)

Token indices sequence length is longer than the specified maximum sequence length for this model (1585 > 1024). Running this sequence through the model will result in indexing errors


In [176]:
# Standard measures: record and TLDR counts per split, followed by the token
# statistics computed over the combined AIC data.
# NOTE(review): the "tokenized abstracts" label below is carried over from the
# abstract-only section; the value is the mean tokenized length of AIC inputs.
print(f"AIC-TRAIN     - Number of elements: {len(aic_train)} | Number of TLDRs: {aic_train_tldrs}")
print(f"AIC-VALID     - Number of elements:  {len(aic_valid)} | Number of TLDRs: {aic_valid_tldrs}")
print(f"AIC-TEST      - Number of elements:  {len(aic_test)} | Number of TLDRs: {aic_test_tldrs}")
print(f"AIC-COMPLETE  - Number of elements: {len(aic_data)} | Number of TLDRs: {aic_data_tldrs}")
print(f"Mean length of tokenized abstracts: {aic_mean_length}")
print("Number Tokens too long for model inputs:")
print(f"mBART  (token length>={max_mbart}):  {aic_too_long_mbart}")
print(f"ZmBART (token length>={max_zmbart}):  {aic_too_long_zmbart}")
print(f"LED    (token length>={max_led}): {aic_too_long_led}")

AIC-TRAIN     - Number of elements: 1992 | Number of TLDRs: 1992
AIC-VALID     - Number of elements:  619 | Number of TLDRs: 1453
AIC-TEST      - Number of elements:  618 | Number of TLDRs: 1967
AIC-COMPLETE  - Number of elements: 3229 | Number of TLDRs: 5412
Mean length of tokenized abstracts: 1528.5385568287395
Number Tokens too long for model inputs:
mBART  (token length>=1024):  2573
ZmBART (token length>=2048):  601
LED    (token length>=16384): 0


In [177]:
# Repeat the token-length statistics for the AIC test split only.
# NOTE: this reassigns aic_tokenized_sizes, aic_mean_length and aic_too_long_*;
# after this cell they hold test-split values, not the combined-data ones.
aic_tokenized_sizes = tokenize_data(aic_test)
aic_mean_length = sum(aic_tokenized_sizes) / len(aic_tokenized_sizes)
aic_too_long_mbart = number_too_long_elements(aic_tokenized_sizes, max_mbart)
aic_too_long_zmbart = number_too_long_elements(aic_tokenized_sizes, max_zmbart)
aic_too_long_led = number_too_long_elements(aic_tokenized_sizes, max_led)
# NOTE(review): "tokenized abstracts" is a carried-over label; these are AIC inputs.
print(f"Mean length of tokenized abstracts: {aic_mean_length}")
print("Number Tokens too long for model inputs:")
print(f"mBART  (token length>={max_mbart}):  {aic_too_long_mbart}")
print(f"ZmBART (token length>={max_zmbart}):  {aic_too_long_zmbart}")
print(f"LED    (token length>={max_led}): {aic_too_long_led}")

Mean length of tokenized abstracts: 1571.0291262135922
Number Tokens too long for model inputs:
mBART  (token length>=1024):  522
ZmBART (token length>=2048):  125
LED    (token length>=16384): 0


In [178]:
# Word-level statistics for the AIC test split: mean words per source
# document, mean words per target string, and their ratio.
aic_source_words = extract_avg_words_source(aic_test)
aic_target_words = extract_avg_words_target(aic_test)
# NOTE: comp_ratio is a shared name; this overwrites the abstract-only value.
comp_ratio = aic_source_words / aic_target_words
print(f"Number average words in document AIC: {aic_source_words}")
print(f"Number average words in summary AIC:  {aic_target_words}")
print(f"Compression ratio AIC:                {comp_ratio}")

Number average words in document AIC: 1023.6731391585761
Number average words in summary AIC:  18.91103202846975
Compression ratio AIC:                54.13100340676701


### Full Text Data

In [179]:
# Read the full-text data: one list of parsed records per split.
ft_train = load_json_data(os.path.join(ft_text_path, "train.jsonl"))
ft_valid = load_json_data(os.path.join(ft_text_path, "valid.jsonl"))
ft_test = load_json_data(os.path.join(ft_text_path, "test.jsonl"))
# All splits concatenated in train, valid, test order.
ft_data = [*ft_train, *ft_valid, *ft_test]

In [180]:
# Compute the number of TLDRs (total entries under "target") per split
# and for the combined full-text data.
ft_train_tldrs = compute_number_of_tldrs(ft_train)
ft_valid_tldrs = compute_number_of_tldrs(ft_valid)
ft_test_tldrs = compute_number_of_tldrs(ft_test)
ft_data_tldrs = compute_number_of_tldrs(ft_data)

In [181]:
# Tokenize every record of the combined full-text data, then compute the mean
# tokenized length and how many inputs are >= each threshold.
ft_tokenized_sizes = tokenize_data(ft_data)
ft_mean_length = sum(ft_tokenized_sizes) / len(ft_tokenized_sizes)
ft_too_long_mbart = number_too_long_elements(ft_tokenized_sizes, max_mbart)
ft_too_long_zmbart = number_too_long_elements(ft_tokenized_sizes, max_zmbart)
ft_too_long_led = number_too_long_elements(ft_tokenized_sizes, max_led)

In [182]:
# Standard measures: record and TLDR counts per split, followed by the token
# statistics computed over the combined full-text data.
# NOTE(review): the "tokenized abstracts" label below is carried over from the
# abstract-only section; the value is the mean tokenized length of full texts.
print(f"FT-TRAIN     - Number of elements: {len(ft_train)} | Number of TLDRs: {ft_train_tldrs}")
print(f"FT-VALID     - Number of elements:  {len(ft_valid)} | Number of TLDRs: {ft_valid_tldrs}")
print(f"FT-TEST      - Number of elements:  {len(ft_test)} | Number of TLDRs: {ft_test_tldrs}")
print(f"FT-COMPLETE  - Number of elements: {len(ft_data)} | Number of TLDRs: {ft_data_tldrs}")
print(f"Mean length of tokenized abstracts: {ft_mean_length}")
print("Number Tokens too long for model inputs:")
print(f"mBART  (token length>={max_mbart}):  {ft_too_long_mbart}")
print(f"ZmBART (token length>={max_zmbart}):  {ft_too_long_zmbart}")
print(f"LED    (token length>={max_led}): {ft_too_long_led}")

FT-TRAIN     - Number of elements: 1992 | Number of TLDRs: 1992
FT-VALID     - Number of elements:  619 | Number of TLDRs: 1453
FT-TEST      - Number of elements:  618 | Number of TLDRs: 1967
FT-COMPLETE  - Number of elements: 3229 | Number of TLDRs: 5412
Mean length of tokenized abstracts: 7602.2313409724375
Number Tokens too long for model inputs:
mBART  (token length>=1024):  3124
ZmBART (token length>=2048):  3083
LED    (token length>=16384): 42


In [183]:
# Repeat the token-length statistics for the full-text test split only.
# NOTE: this reassigns ft_tokenized_sizes, ft_mean_length and ft_too_long_*;
# later cells that read ft_tokenized_sizes therefore see test-split values.
ft_tokenized_sizes = tokenize_data(ft_test)
ft_mean_length = sum(ft_tokenized_sizes) / len(ft_tokenized_sizes)
ft_too_long_mbart = number_too_long_elements(ft_tokenized_sizes, max_mbart)
ft_too_long_zmbart = number_too_long_elements(ft_tokenized_sizes, max_zmbart)
ft_too_long_led = number_too_long_elements(ft_tokenized_sizes, max_led)
# NOTE(review): "tokenized abstracts" is a carried-over label; these are full texts.
print(f"Mean length of tokenized abstracts: {ft_mean_length}")
print("Number Tokens too long for model inputs:")
print(f"mBART  (token length>={max_mbart}):  {ft_too_long_mbart}")
print(f"ZmBART (token length>={max_zmbart}):  {ft_too_long_zmbart}")
print(f"LED    (token length>={max_led}): {ft_too_long_led}")

Mean length of tokenized abstracts: 7867.3236245954695
Number Tokens too long for model inputs:
mBART  (token length>=1024):  604
ZmBART (token length>=2048):  603
LED    (token length>=16384): 7


In [184]:
# Word-level statistics for the full-text test split: mean words per source
# document, mean words per target string, and their ratio.
ft_source_words = extract_avg_words_source(ft_test)
ft_target_words = extract_avg_words_target(ft_test)
# NOTE: comp_ratio is a shared name; this overwrites the AIC value.
comp_ratio = ft_source_words / ft_target_words
print(f"Number average words in document FT: {ft_source_words}")
print(f"Number average words in summary FT:  {ft_target_words}")
print(f"Compression ratio FT:                {comp_ratio}")

Number average words in document FT: 5153.211974110032
Number average words in summary FT:  18.91103202846975
Compression ratio FT:                272.49765990307094


In [185]:
# Display, for every input at or above the LED threshold, by how many tokens it
# exceeds the threshold, plus the mean of those excess amounts.
# NOTE: ft_tokenized_sizes was last assigned from tokenize_data(ft_test), so
# these figures describe the full-text test split only.
too_long_led = too_long_elements(ft_tokenized_sizes, max_led)
if too_long_led:
    # Mean excess over max_led: (sum of lengths - n * max_led) / n.
    mean_too_long_led = (sum(too_long_led) - (len(too_long_led)*max_led)) / len(too_long_led)
    print(f"Mean length of too long inputs for LED: {mean_too_long_led}")
    for ele in too_long_led:
        print(f"Tokens over #{max_led}: {ele-max_led}")
else:
    # Guard: with no over-long inputs the mean above would divide by zero.
    mean_too_long_led = 0.0
    print(f"No inputs reach the LED threshold of {max_led} tokens")


Mean length of too long inputs for LED: 3225.5714285714284
Tokens over #16384: 746
Tokens over #16384: 3419
Tokens over #16384: 181
Tokens over #16384: 601
Tokens over #16384: 2066
Tokens over #16384: 14661
Tokens over #16384: 905
