# Line Label Token Preprocessing

This notebook is used to create the dataset for the line-label task using the token-level approach. Be sure to first run:

1. notebooks/preprocessing_old_project
2. notebooks/01_classifying

In [None]:
import sys
import os
sys.path.append(os.getcwd()+"/../..")

from src import paths
from src.utils import prepare_pd_dataset_for_lineclass

import pandas as pd

from datasets import Dataset, DatasetDict, concatenate_datasets

This is a preprocessing step to label all the observations in seantis_kisim.csv, the dataset containing the reports in line-split format.

In [None]:
# Build the unlabeled "all" frame with the columns rid, text and class_agg.
# Each row of `text` holds a single line of a report.
# class_agg is left entirely null here; it will be filled by the lineclass model.

df_all = (
    pd.read_csv(paths.DATA_PATH_PREPROCESSED/"midatams/seantis_kisim.csv")
    .loc[:, ["research_id", "text"]]
    .rename(columns={"research_id": "rid"})
    .assign(class_agg=None)
)
df_all

In [None]:
# Load the cleaned line-label splits from disk.
df_train = pd.read_csv(paths.DATA_PATH_PREPROCESSED/"line-label/line-label_clean_train.csv")
df_val = pd.read_csv(paths.DATA_PATH_PREPROCESSED/"line-label/line-label_clean_val.csv")
df_test = pd.read_csv(paths.DATA_PATH_PREPROCESSED/"line-label/line-label_clean_test.csv")

# Line-level datasets: the three labeled splits plus the unlabeled "all" frame.
dataset = DatasetDict(
    {
        split: Dataset.from_pandas(frame)
        for split, frame in (
            ("train", df_train),
            ("val", df_val),
            ("test", df_test),
            ("all", df_all),
        )
    }
)

# Run every frame through prepare_pd_dataset_for_lineclass, in the same
# train/val/test/all order (presumably yielding the token-level layout —
# see src.utils to confirm).
df_token_train, df_token_val, df_token_test, df_token_all = [
    prepare_pd_dataset_for_lineclass(frame)
    for frame in (df_train, df_val, df_test, df_all)
]

dataset_token = DatasetDict(
    {
        split: Dataset.from_pandas(frame)
        for split, frame in (
            ("train", df_token_train),
            ("val", df_token_val),
            ("test", df_token_test),
            ("all", df_token_all),
        )
    }
)

# Move the first 8 validation examples into the training split;
# the validation split keeps everything from index 8 onwards.
val_head = dataset_token["val"].select(range(8))
val_tail = dataset_token["val"].select(range(8, len(dataset_token["val"])))
dataset_token["train"] = concatenate_datasets([dataset_token["train"], val_head])
dataset_token["val"] = val_tail

# Persist the token-level datasets to disk.
dataset_token.save_to_disk(paths.DATA_PATH_PREPROCESSED/"line-label/line-label_for_token_classification")