In [None]:
# Polars import
# Import the data
def load_line_labelling():
    """Load all labelled line-level reports into one polars DataFrame.

    Walks ``paths.DATA_PATH_LABELLED`` recursively and reads every file whose
    name ends with "rev.csv" and does not contain "mri". The research id
    ("rid") is the part of the file name before the first underscore.

    Files that cannot be stacked onto the frame collected so far are reported
    on stdout and skipped (best effort).

    Returns:
        pl.DataFrame: The columns "text", "class" and "rid" of all matching
        files stacked on top of each other; an empty frame if nothing matched.
    """

    df = pl.DataFrame()

    for root, _dirs, files in os.walk(paths.DATA_PATH_LABELLED):
        for file in files:
            # Only labelled ("rev.csv"), non-MRI reports are loaded
            if not file.endswith("rev.csv") or "mri" in file:
                continue

            # Get the research id from filename
            rid = file.split("_")[0]

            # Keep the two columns of interest and tag every row with the rid
            _df = pl.read_csv(os.path.join(root, file)).select(
                pl.col("text"),
                pl.col("class"),
                pl.lit(rid).alias("rid"),
            )

            # Append the dataframe to the main dataframe
            try:
                df = df.vstack(_df)
            except Exception as exc:
                # Skip the offending file, but say why it was skipped.
                # `Exception` (not a bare except) lets Ctrl-C still interrupt.
                print("Error with file: ", file)
                print("Reason: ", repr(exc))
                print("df head: ", df.head(5))
                print("_df head: ", _df.head(5))
    return df

def clean_line_text(df: pl.DataFrame):
    """Clean the "text" column of the frame from load_line_labelling.

    Each non-null text value is cleaned by:
    1) removing every "·" character,
    2) collapsing runs of two or more spaces into a single space,
    3) removing whitespace from the beginning and end of the text.

    Rows whose text is null are dropped. Rows whose text is (or becomes) an
    empty string are kept.

    NOTE: a pandas function with the same name is defined further down in
    this file; whichever definition is executed last is the one in effect.

    Args:
        df (pl.DataFrame): Input dataframe with a string column "text".

    Returns:
        pl.DataFrame: Copy of the input with the cleaned "text" column.
    """
    import re

    multi_space = re.compile(r" {2,}")

    def _clean(s: str) -> str:
        # Remove the bullet first: stripping before this step would leave the
        # whitespace that followed the bullet at the start of the line.
        s = s.replace("·", "")
        # A regex collapses runs of any length; a single str.replace pass
        # would still leave a double space in a run of three or more.
        s = multi_space.sub(" ", s)
        return s.strip()

    df = df.with_columns(
        pl.col("text").map_elements(_clean, return_dtype=pl.Utf8)
    ).filter(pl.col("text").is_not_null())

    return df

def clean_line_class(df: pl.DataFrame):
    """Clean the labels in "class" and derive the aggregated label "class_agg".

    1) Removes whitespace from beginning and end of the label
    2) Corrects known spelling mistakes
    3) Replaces labels that are not part of the original approach with null
       (the rows themselves are kept)
    4) Creates a new column "class_agg" with the aggregated classes of the
       original approach; labels without an aggregation rule are copied as is

    No label encoding is performed in this function.

    Args:
        df (pl.DataFrame): Input dataframe with a string column "class".

    Returns:
        pl.DataFrame: Input frame with cleaned "class" and new "class_agg".
    """

    # Class mapping spelling mistakes
    class_mapping_spelling = {
        'memds': 'medms',
    }

    # Classes of original approach abbreviation.
    # NOTE(review): "to" was added here because the aggregation mapping below
    # has a rule for it ('to' -> 'to_tr'); without it every "to" label was
    # nulled before the rule could apply. Confirm "to" is a valid class.
    classes_orig = frozenset([
        "dm", "do", "cu", "his", "sym", "so", "mr", "labr", "labo",
        "medms", "medo", "tr", "to", "head", "unk",
    ])

    # Class mapping of original approach
    class_mapping_agg = {
        'his': 'his_sym_cu',
        'sym': 'his_sym_cu',
        'cu': 'his_sym_cu',
        'labr': 'labr_labo',
        'labo': 'labr_labo',
        'to': 'to_tr',
        'tr': 'to_tr',
        'medo': 'medo_unk_do_so',
        'unk': 'medo_unk_do_so',
        'do': 'medo_unk_do_so',
        'so': 'medo_unk_do_so',
    }

    def _clean_label(s):
        # Strip, fix spelling, then null out anything that is not a known class
        s = s.strip()
        s = class_mapping_spelling.get(s, s)
        return s if s in classes_orig else None

    # Cleaning the class column
    df = df.with_columns(
        pl.col("class").map_elements(_clean_label, return_dtype=pl.Utf8)
    )

    # Creating a new column with the aggregated classes (needs the cleaned
    # "class" column, hence a separate with_columns call)
    df = df.with_columns(
        pl.col("class")
        .map_elements(lambda s: class_mapping_agg.get(s, s), return_dtype=pl.Utf8)
        .alias("class_agg"),
    )

    return df

# Cleaning Text
def clean_line_text(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the "text" column of the frame from load_line_labelling.

    Missing values in all columns are first replaced by empty strings.
    Each text value is then cleaned by:
    1) removing every "·" character,
    2) collapsing runs of two or more spaces into a single space,
    3) removing whitespace from the beginning and end of the text.

    No rows are removed; empty texts stay in the frame as "".

    Args:
        df (pd.DataFrame): Input dataframe with a string column "text".

    Returns:
        pd.DataFrame: New dataframe with the cleaned "text" column; the input
        is not modified.
    """

    filled = df.fillna("")

    text = (
        filled["text"]
        # Remove the bullet first: stripping before this step would leave the
        # whitespace that followed the bullet at the start of the line.
        .str.replace("·", "", regex=False)
        # A regex collapses runs of any length; a literal "  " -> " " pass
        # would still leave a double space in a run of three or more.
        .str.replace(r" {2,}", " ", regex=True)
        .str.strip()
    )

    return filled.assign(text=text)

In [None]:
# Two-stage split into train, validation and test.
# Stage 1: hold out 20% of all rows as the test set.
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    df["text"],
    df["class_agg"],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
# Stage 2: take 20% of the remaining rows as the validation set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts,
    train_val_labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# Split the feature columns and the aggregated label into a train+validation
# part and a test part. train_test_split returns a (train, test) pair for
# every array passed in, i.e. four objects for two inputs, so all four have
# to be unpacked.
train_val_data, test_data, train_val_labels, test_labels = train_test_split(
    df.drop("class_agg"),
    df["class_agg"],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# Wrap every split in a Hugging Face Dataset with a "text" and a "label" column
train_dataset, val_dataset, test_dataset = (
    Dataset.from_dict({"text": texts, "label": labels})
    for texts, labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

# Bundle the three splits under their conventional names
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

# Convert the "label" column to a ClassLabel feature
dataset = dataset.class_encode_column("label")

# Persist the encoded dataset below the preprocessed-data directory
dataset.save_to_disk(
    os.path.join(paths.DATA_PATH_PREPROCESSED, "line_labelling_clean_dataset")
)