### Generate CSV files out of a text file containing TLDR translations and the original SciTLDR English data jsonl files

In [1]:
# Imports

import os
import json
import pandas as pd

In [3]:
# Functions to read in the data files

def read_jsonl_files(file_name):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_name: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list holding one decoded JSON value per non-blank line, in file
        order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle even when a line fails to parse.
    with open(file_name, "r", encoding="utf-8") as file:
        # Whitespace-only lines (e.g. a trailing blank line) carry no record
        # and would otherwise make json.loads raise.
        return [json.loads(line) for line in file if line.strip()]

def read_text_file(file_name):
    """Read a text file and return its lines without line terminators.

    Args:
        file_name: Path of the UTF-8 encoded text file to read.

    Returns:
        A list with one string per line of the file, in file order, with
        every ``"\\n"`` and ``"\\r"`` character removed. Blank lines are kept
        as empty strings so that line positions stay unchanged.
    """
    # The context manager guarantees the handle is closed after reading.
    with open(file_name, "r", encoding="utf-8") as file:
        return [line.replace("\n", "").replace("\r", "") for line in file]

# Helper method to transform list of strings to a single string

def list_to_string(listele):
    """Concatenate the strings in *listele* into one string.

    Every ``"\\n"`` and ``"\\r"`` character is removed from each element, and
    the elements are joined back to back with no separator between them.
    An empty input yields an empty string.
    """
    cleaned = (piece.replace("\n", "").replace("\r", "") for piece in listele)
    return "".join(cleaned)

In [4]:
# Create csv only out of jsonl (for English)

def create_csv(json_data, output_file):
    """Write one CSV row per (record, target) pair of *json_data*.

    Args:
        json_data: Sequence of records; each record is indexed with the keys
            ``"source"``, ``"source_labels"``, ``"rouge_scores"``,
            ``"paper_id"``, ``"target"`` and, optionally, ``"title"``.
        output_file: Path of the CSV file to write (UTF-8, no index column).

    Each element of a record's ``"target"`` produces its own row; the other
    fields are repeated on every row of that record. The ``"source"`` list is
    flattened to a single string with ``list_to_string``. When a record has
    no ``"title"``, a notice is printed for each of its rows and the row is
    left without a title value.
    """
    rows = []
    for i, record in enumerate(json_data):
        for target in record["target"]:
            row = {
                "source": list_to_string(record["source"]),
                "source_labels": record["source_labels"],
                "rouge_scores": record["rouge_scores"],
                "paper_id": record["paper_id"],
                "target": target,
            }
            # Only a missing key is expected here; anything else should
            # surface instead of being silently swallowed.
            try:
                row["title"] = record["title"]
            except KeyError:
                print(f"No title found for #{i}")
            rows.append(row)
    pd.DataFrame(rows).to_csv(output_file, index=False, encoding="utf-8")
    
def create_sep_files(json_data, output_file):
    """Write parallel source and target text files from *json_data*.

    Args:
        json_data: Sequence of records, each indexed with the keys
            ``"source"`` and ``"target"``.
        output_file: Path prefix; the function writes
            ``{output_file}-source.txt`` and ``{output_file}-target.txt``.

    For every element of a record's ``"target"``, one line is written to each
    file: the flattened source (via ``list_to_string``) and that target. Line
    *n* of the source file therefore corresponds to line *n* of the target
    file.
    """
    sources = []
    targets = []
    for record in json_data:
        for target in record["target"]:
            sources.append(list_to_string(record["source"]))
            targets.append(target)
    # Context managers make sure both files are flushed and closed on return.
    with open(f"{output_file}-source.txt", "w", encoding="utf-8") as source_file:
        source_file.writelines(text + "\n" for text in sources)
    with open(f"{output_file}-target.txt", "w", encoding="utf-8") as target_file:
        target_file.writelines(text + "\n" for text in targets)
    

In [5]:
# Data mapping and csv generation

def generate(json_data, translations_data, output_file):
    """Write a CSV that pairs the English records with translated targets.

    Args:
        json_data: Sequence of records, each indexed with the keys
            ``"source"``, ``"source_labels"``, ``"paper_id"``, ``"target"``
            and, optionally, ``"title"``.
        translations_data: Sequence of translated targets, one per
            (record, target) pair of *json_data*, in the same order.
        output_file: Path of the CSV file to write (UTF-8, no index column).

    Returns:
        None.

    Raises:
        AssertionError: If the number of (record, target) pairs differs from
            ``len(translations_data)``.

    One row is built per element of each record's ``"target"``; the original
    target text is not copied, only counted. ``"rouge_scores"`` is written as
    an empty list on every row. The n-th row receives the n-th translation as
    its ``"target"``.
    """
    rows = []
    for i, record in enumerate(json_data):
        for _ in record["target"]:
            row = {
                "source": list_to_string(record["source"]),
                "source_labels": record["source_labels"],
                "rouge_scores": [],
                "paper_id": record["paper_id"],
            }
            # Only a missing key is expected here; anything else should
            # surface instead of being silently swallowed.
            try:
                row["title"] = record["title"]
            except KeyError:
                print(f"No title found for #{i}")
            rows.append(row)
    # Raised explicitly (rather than via `assert`) so the check still runs
    # under `python -O`; a mismatch would otherwise produce misaligned rows.
    if len(rows) != len(translations_data):
        raise AssertionError(
            f"PROBLEM: UNEQUAL LENGTH {len(rows)} != {len(translations_data)}"
        )
    for row, translation in zip(rows, translations_data):
        row["target"] = translation
    pd.DataFrame(rows).to_csv(output_file, index=False, encoding="utf-8")
    return None

def generate_sep_files(json_data, translations_data, output_file):
    """Write parallel source and translated-target text files.

    Args:
        json_data: Sequence of records, each indexed with the keys
            ``"source"`` and ``"target"``.
        translations_data: Sequence of translated target strings.
        output_file: Path prefix; the function writes
            ``{output_file}-source.txt`` and ``{output_file}-target.txt``.

    The source file gets the flattened source (via ``list_to_string``) once
    per element of each record's ``"target"``. The target file gets every
    entry of *translations_data*, one per line. No length check is done here,
    so the two files only line up when the counts already match.
    """
    sources = []
    for record in json_data:
        for _ in record["target"]:
            sources.append(list_to_string(record["source"]))
    # Context managers make sure both files are flushed and closed on return.
    with open(f"{output_file}-source.txt", "w", encoding="utf-8") as source_file:
        source_file.writelines(text + "\n" for text in sources)
    with open(f"{output_file}-target.txt", "w", encoding="utf-8") as target_file:
        target_file.writelines(text + "\n" for text in translations_data)
    

### Generate English files

In [None]:
## Generate English files
# versions = ["Abstracts", "AIC", "FullText"]
# files = ["train.jsonl", "test.jsonl", "dev.jsonl"]

# for version in versions:
#     for file in files:
#         data = read_jsonl_files(os.path.join("English", "jsonl", version, file))
#         create_csv(data, f"English/{file}_en_{version}.csv")
#         file_name = file.replace(".jsonl", "")
#         create_sep_files(data, f"English/{file_name}_en_{version}")
        

### Generate Language specific files

In [7]:

# Dataset variants; each one names a sub-directory of English/jsonl/.
versions = ["Abstracts", "AIC", "FullText"]
# Split names of the English jsonl files.
files_en = ["train", "test", "dev"]
# Split names of the translated text files; matched with files_en by position.
files_tgt = ["train", "test", "valid"]

        

In [None]:
## Generate French specific files
for version in versions:
    # Each target split is matched with the English split at the same index.
    for k, tgt_split in enumerate(files_tgt):
        english_path = os.path.join("English", "jsonl", version, f"{files_en[k]}.jsonl")
        translation_path = os.path.join("French", f"{tgt_split}-fr.txt")
        data = read_jsonl_files(english_path)
        translations = read_text_file(translation_path)
        # NOTE(review): the separate source/target text files are written
        # under French/csv/ here, while the other language cells in this
        # notebook use <Language>/text/ -- confirm which location is intended.
        output_stem = f"French/csv/{version}/{tgt_split}-fr"
        generate(data, translations, f"{output_stem}.csv")
        generate_sep_files(data, translations, output_stem)

In [None]:
## Generate Italian specific files
for version in versions:
    # Each target split is matched with the English split at the same index.
    for k, tgt_split in enumerate(files_tgt):
        english_path = os.path.join("English", "jsonl", version, f"{files_en[k]}.jsonl")
        translation_path = os.path.join("Italian", f"{tgt_split}-it.txt")
        data = read_jsonl_files(english_path)
        translations = read_text_file(translation_path)
        # The CSV and the separate source/target text files go to sibling
        # csv/ and text/ directories.
        csv_path = f"Italian/csv/{version}/{tgt_split}-it.csv"
        text_stem = f"Italian/text/{version}/{tgt_split}-it"
        generate(data, translations, csv_path)
        generate_sep_files(data, translations, text_stem)

In [None]:
## Generate Spanish specific files
for version in versions:
    # Each target split is matched with the English split at the same index.
    for k, tgt_split in enumerate(files_tgt):
        english_path = os.path.join("English", "jsonl", version, f"{files_en[k]}.jsonl")
        translation_path = os.path.join("Spanish", f"{tgt_split}-es.txt")
        data = read_jsonl_files(english_path)
        translations = read_text_file(translation_path)
        # The CSV and the separate source/target text files go to sibling
        # csv/ and text/ directories.
        csv_path = f"Spanish/csv/{version}/{tgt_split}-es.csv"
        text_stem = f"Spanish/text/{version}/{tgt_split}-es"
        generate(data, translations, csv_path)
        generate_sep_files(data, translations, text_stem)

In [None]:
## Generate Japanese specific files
for version in versions:
    # Each target split is matched with the English split at the same index.
    for k, tgt_split in enumerate(files_tgt):
        english_path = os.path.join("English", "jsonl", version, f"{files_en[k]}.jsonl")
        translation_path = os.path.join("Japanese", f"{tgt_split}-ja.txt")
        data = read_jsonl_files(english_path)
        translations = read_text_file(translation_path)
        # The CSV and the separate source/target text files go to sibling
        # csv/ and text/ directories.
        csv_path = f"Japanese/csv/{version}/{tgt_split}-ja.csv"
        text_stem = f"Japanese/text/{version}/{tgt_split}-ja"
        generate(data, translations, csv_path)
        generate_sep_files(data, translations, text_stem)

In [None]:
## Generate German specific files
for version in versions:
    # Each target split is matched with the English split at the same index.
    for k, tgt_split in enumerate(files_tgt):
        english_path = os.path.join("English", "jsonl", version, f"{files_en[k]}.jsonl")
        translation_path = os.path.join("German", f"{tgt_split}-de.txt")
        data = read_jsonl_files(english_path)
        translations = read_text_file(translation_path)
        # The CSV and the separate source/target text files go to sibling
        # csv/ and text/ directories.
        csv_path = f"German/csv/{version}/{tgt_split}-de.csv"
        text_stem = f"German/text/{version}/{tgt_split}-de"
        generate(data, translations, csv_path)
        generate_sep_files(data, translations, text_stem)

In [9]:
## Generate Russian specific files
for version in versions:
    # Each target split is matched with the English split at the same index.
    for k, tgt_split in enumerate(files_tgt):
        english_path = os.path.join("English", "jsonl", version, f"{files_en[k]}.jsonl")
        translation_path = os.path.join("Russian", f"{tgt_split}-ru.txt")
        data = read_jsonl_files(english_path)
        translations = read_text_file(translation_path)
        # The CSV and the separate source/target text files go to sibling
        # csv/ and text/ directories.
        csv_path = f"Russian/csv/{version}/{tgt_split}-ru.csv"
        text_stem = f"Russian/text/{version}/{tgt_split}-ru"
        generate(data, translations, csv_path)
        generate_sep_files(data, translations, text_stem)

No title found for #0
No title found for #1
No title found for #2
No title found for #3
No title found for #4
No title found for #5
No title found for #6
No title found for #7
No title found for #8
No title found for #9
No title found for #10
No title found for #11
No title found for #12
No title found for #13
No title found for #14
No title found for #15
No title found for #16
No title found for #17
No title found for #18
No title found for #19
No title found for #20
No title found for #21
No title found for #22
No title found for #23
No title found for #24
No title found for #25
No title found for #26
No title found for #27
No title found for #28
No title found for #29
No title found for #30
No title found for #31
No title found for #32
No title found for #33
No title found for #34
No title found for #35
No title found for #36
No title found for #37
No title found for #38
No title found for #39
No title found for #40
No title found for #41
No title found for #42
No title found for #4