# Concatenate backups

In [1]:
import pandas as pd
import itertools

## Settings

In [2]:
# Identifier of the run whose backups are loaded; backup file names start with it.
RUN_ID = "e3da0078"

# Directory prefix of the backup files. It is joined to the file name by plain
# string concatenation, so the trailing slash is required.
BACKUP_PATH = "./backups/"
# Name of the CSV column that is read back as the DataFrame index.
# NOTE(review): "Unnamed: 0" is presumably the header of an index column that
# was written without a name -- confirm against the code that wrote the backups.
INDEX_COL_NAME = "Unnamed: 0"

## Read data

In [3]:
def get_backup_path(
    chunk_id: int,
    lang_src: str,
    lang_dst: str,
    backup_dir_path: str = BACKUP_PATH,
    run_id: str = RUN_ID,
) -> str:
    """Return the path of the backup file for one translated chunk.

    The file name has the form
    ``{run_id}_translate_from_{lang_src}_to_{lang_dst}_{chunk_id}`` and is
    appended to ``backup_dir_path`` as-is: no path separator and no file
    extension are added.

    Args:
        chunk_id: Number of the chunk; becomes the file-name suffix.
        lang_src: Source language, exactly as it is spelled in the file name.
        lang_dst: Destination language, exactly as it is spelled in the file name.
        backup_dir_path: Directory prefix. It must already end with a path
            separator because it is concatenated without one.
        run_id: Run identifier used as the file-name prefix.

    Returns:
        The backup file path as a string.
    """
    # Build the name from the `run_id` argument (not the module-level RUN_ID)
    # so that callers can address the backups of a different run.
    file_name = f"{run_id}_translate_from_{lang_src}_to_{lang_dst}_{chunk_id}"
    return f"{backup_dir_path}{file_name}"

In [4]:
def load_data_from_backups(
    lang_src: str,
    lang_dst: str,
    verbose: bool = True,
) -> pd.DataFrame:
    """Load the paraphrase dataset by concatenating its backed-up chunks.

    Chunk files are read in order of chunk id, starting at 0, until the first
    file that does not exist. Each chunk is read as CSV with the
    ``INDEX_COL_NAME`` column as its index.

    Args:
        lang_src: Source language used in the backup file names.
        lang_dst: Destination language used in the backup file names.
        verbose: If true, print a one-line summary of what was loaded.

    Returns:
        All chunks concatenated row-wise in chunk order, or an empty DataFrame
        when not even the first chunk exists.
    """
    chunks = []
    for chunk_id in itertools.count():
        path_curr = get_backup_path(chunk_id, lang_src, lang_dst)
        try:
            chunks.append(pd.read_csv(path_curr, index_col=INDEX_COL_NAME))
        except FileNotFoundError:
            # The first missing chunk file marks the end of the backup.
            break

    # Concatenate once instead of growing the frame inside the loop: repeated
    # pd.concat re-copies all previously loaded rows for every new chunk.
    # pd.concat raises on an empty list, hence the explicit fallback.
    df_final = pd.concat(chunks, axis=0) if chunks else pd.DataFrame()

    if verbose:
        if chunks:
            print(f"Backup successfully loaded for {lang_dst}: {df_final.shape[0]:,} rows.")
        else:
            # path_curr is the path of chunk 0 here, the first file looked for.
            print(f"No backup found for {lang_dst}: {path_curr} does not exist.")
    return df_final

In [5]:
# English -> target language: one frame per destination language.
df_paraphrase_de = load_data_from_backups(lang_src="English", lang_dst="German")
df_paraphrase_sl = load_data_from_backups(lang_src="English", lang_dst="Slovenian")
df_paraphrase_cs = load_data_from_backups(lang_src="English", lang_dst="Czech")
# Reverse direction: German -> English.
df_paraphrase_en = load_data_from_backups(lang_src="German", lang_dst="English")

Backup successfully loaded for German: 76,879 rows.
Backup successfully loaded for Slovenian: 76,879 rows.
Backup successfully loaded for Czech: 76,879 rows.
Backup successfully loaded for English: 76,879 rows.


In [6]:
# Preview 10 randomly drawn rows. No random_state is passed, so the selection
# changes from run to run.
df_paraphrase_en.sample(10)

Unnamed: 0,Original,Parahprase
37736,Myths and Facts about the EU budget,Myths and facts about the EU budget
23740,Goal: Train your memory and remove all the cards,Goal: Train your memory and remove all cards
7576,"And if our words say God is amazing, but our l...","And when our words say God is amazing, but our..."
41364,"P101 – If medical advice is needed, have produ...","P101 – If medical advice is required, provide ..."
73216,eHouse Home Automation – Visualization and gra...,Home Automation eHouse – Visualization and gra...
47702,Software Development for Control Panels rooms ...,Software Development for Control Panels Room f...
61444,They made them up. So we are sold on something...,So we are sold on something with facts that ca...
4254,A list of 34 strategic initiatives on which th...,a list of 34 strategic initiatives to be imple...
21531,Flowers for Mumbai uses the most modern techno...,Flowers for Mumbai uses the most modern techno...
70988,"You can change the colors, width and style on ...","You can change the colors, width and style to ..."


## Save data

In [7]:
def save_data(data: pd.DataFrame, path: str):
    """Write ``data`` to ``path`` as CSV, unless it has no rows.

    The index is not written. A one-line status message is printed in both
    cases: the row count and path after saving, or a note that the path was
    skipped when the frame is empty.
    """
    row_count = len(data)

    # Nothing to write: report it and leave the file system untouched.
    if row_count <= 0:
        print(f"Skipping for {path}")
        return

    data.to_csv(path, index=False)
    print(f"Saving {row_count:,} rows @ {path}")

In [8]:
# Write each loaded frame to its own CSV file in the working directory.
save_data(data=df_paraphrase_de, path="./paraphrases_de.csv")
save_data(data=df_paraphrase_sl, path="./paraphrases_sl.csv")
save_data(data=df_paraphrase_cs, path="./paraphrases_cs.csv")
save_data(data=df_paraphrase_en, path="./paraphrases_en.csv")

Saving 76,879 rows @ ./paraphrases_de.csv
Saving 76,879 rows @ ./paraphrases_sl.csv
Saving 76,879 rows @ ./paraphrases_cs.csv
Saving 76,879 rows @ ./paraphrases_en.csv
