In [1]:
import argparse
import json
import os

import pandas as pd

def read_data(path, save_path, test_frac=0.05, random_state=42):
    """
    Split the sentences of a JSON file into train and test dataframes and save both as CSV.

    The test set is a random sample of the rows that have a value in all of the
    columns 'de', 'ch_be', 'ch_gr' and 'ch_vs'. Every row whose `id` was sampled
    is excluded from the training data, so a test sentence is not part of the
    training data for any dialect.

    Args:
        path: Path to a UTF-8 JSON file that `pd.DataFrame` can turn into a table
            containing an `id` column and the four language columns listed above.
        save_path: Folder in which `df_test.csv` and `df_train.csv` are written.
            The folder is not created here.
        test_frac: Fraction of the fully populated rows used as test set
            (default 0.05, i.e. 5 %).
        random_state: Seed passed to `DataFrame.sample` so the split is reproducible.

    Returns:
        Tuple `(df_train, df_test)`. `df_train` keeps the index of the loaded
        dataframe; `df_test` has a fresh 0..n-1 index.
    """
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    df = pd.DataFrame(data)

    # Report how many rows carry a value in each column, and the total row count.
    print(f"Non-null values: {df.notna().sum()}")
    print(f"Total lenth of the dataframe: {len(df)}")

    # Only sentences available in all four considered languages/dialects can be test sentences.
    required_columns = ['de', 'ch_be', 'ch_gr', 'ch_vs']
    has_all_languages = df[required_columns].notna().all(axis=1)
    df_filtered = df[has_all_languages].reset_index(drop=True)

    # Randomly sample `test_frac` (5 % by default) of the fully populated rows.
    df_test = df_filtered.sample(frac=test_frac, random_state=random_state)

    # Exclude the sampled sentences by matching on the `id` COLUMN. Passing the ids
    # to `df.drop` would interpret them as index labels, which is only correct when
    # `id` happens to coincide with the default RangeIndex.
    df_train = df[~df['id'].isin(df_test['id'])]

    df_test = df_test.reset_index(drop=True)

    # Save the train and test data in the specified folder.
    df_test.to_csv(os.path.join(save_path, "df_test.csv"))
    df_train.to_csv(os.path.join(save_path, "df_train.csv"))
    print("train and test data saved.")

    return df_train, df_test

In [2]:
# The project folder can be stored in Google Drive and accessed from Google Colab.
# Mounting the drive exposes its contents under /content/drive.
from google.colab import drive
# Mount Google Drive at /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Path to the project folder on the mounted drive; it is expected to contain
# the sub-folders `code`, `data` and `models`.
path_project = "/content/drive/MyDrive/anlp_project/NLLB-200/ANLP_SUBMISSION"

# Raw JSON input file and the folder that receives df_train.csv / df_test.csv.
raw_data_path = f"{path_project}/data/raw_data/sentences_ch_de_transcribed.json"
save_path = f"{path_project}/data/"

# Build the train/test split and write both CSV files to `save_path`.
df_train, df_test = read_data(raw_data_path, save_path)

Null values: id                11213
de                11213
ch_sg              2752
ch_be              2700
ch_gr             10475
ch_zh              4065
ch_vs              2753
ch_bs              2713
ch_ag              2748
ch_lu              2715
thema             11213
code_switching      351
dtype: int64
Total lenth of the dataframe: 11213
train and test data saved.
