# Classifying text lines

A classifier is trained to classify single text lines of a report. For example, a line can be classified as containing a diagnosis (`dm`) or the history (`his`) of a patient. This task was used as a preprocessing step for later steps of structured information extraction, so that only lines classified as containing a diagnosis are fed to a downstream classifier that extracts the exact diagnosis. This step might be unnecessary with modern transformers that can handle longer text inputs.

The files containing the necessary information are inside the `data/raw/labelling` directory. It contains manually labelled reports from different sessions.

The original classes per label are:

| category    | subcategory       | abbreviation |
|-------------|-------------------|--------------|
| diagnosis   | MS diagnosis      | dm           |
|             | other             | do           |
| current state     |              | cu           |
| history     |                   | his          |
| symptoms    | MS related        | sym          |
|             | other             | so           |
| MRI | results                  | mr           |
| lab | results                  | labr         |
|             | other             | labo         |
| medication  | MS related        | medms        |
|             | other             | medo         |
| test, treatment        | results | tr           |
| header      |                   | head         |
| unknown     |                   | unk          |


In [None]:
import sys
import os
sys.path.append(os.getcwd()+"/../..")
from src import paths
import polars as pl
import numpy as np

In [None]:
# Import the data
def load_line_labelling():
    """Load all manually labelled report lines into a single dataframe.

    Walks paths.DATA_PATH_LABELLED recursively (the different "imported_time"
    directories) and reads every file whose name ends with "rev.csv" and does
    not contain "mri". From each file the columns "text" and "class" are kept
    and a "rid" column is added, holding the research id taken from the part
    of the filename before the first "_".

    Files whose rows cannot be stacked onto the rows collected so far are
    reported on stdout and skipped.

    Returns:
        pl.DataFrame: Columns "text", "class" and "rid"; an empty dataframe
        without columns if no matching file was found.
    """

    df = pl.DataFrame()

    for root, _dirs, files in os.walk(paths.DATA_PATH_LABELLED):
        for file in files:
            # Only reviewed label files are of interest; MRI files are excluded
            if not file.endswith("rev.csv") or "mri" in file:
                continue

            # Get the research id from filename
            rid = file.split("_")[0]

            # Create a dataframe from the csv file and add the rid to it
            _df = pl.read_csv(os.path.join(root, file)).select(
                pl.col("text").alias("text"),
                pl.col("class").alias("class"),
                pl.lit(rid).alias("rid"),
            )

            # Append the dataframe to the main dataframe
            try:
                df = df.vstack(_df)
            except Exception as err:
                # Best effort: report the offending file (and why it failed)
                # and carry on with the remaining files. Catching Exception
                # instead of using a bare except keeps Ctrl-C working.
                print("Error with file: ", file)
                print("Error: ", err)
                print("df head: ", df.head(5))
                print("_df head: ", _df.head(5))
    return df

In [None]:
def clean_line_text(df: pl.DataFrame):
    """Normalise the "text" column of the dataframe from load_line_labelling.

    Each text value is cleaned in this order:
    1) every "·" bullet character is removed,
    2) runs of two or more spaces are collapsed into a single space,
    3) whitespace is removed from the beginning and end of the text.

    No rows are dropped: a line that consists only of bullets and/or
    whitespace ends up as an empty string.

    Args:
        df (pl.DataFrame): Input dataframe with a string column "text".

    Returns:
        pl.DataFrame: The input dataframe with "text" replaced by its cleaned
        version; all other columns are passed through unchanged.
    """

    def _clean(s: str) -> str:
        # Remove bullets first so that the whitespace they leave behind is
        # handled by the steps below (e.g. "· text" -> " text" -> "text").
        s = s.replace("·", "")
        # A single replace("  ", " ") only halves a run of spaces once, so
        # repeat until no double space is left.
        while "  " in s:
            s = s.replace("  ", " ")
        return s.strip()

    # One pass over the column; the explicit return dtype avoids dtype
    # inference on the Python function's output.
    return df.with_columns(
        pl.col("text").map_elements(_clean, return_dtype=pl.Utf8),
    )

def clean_line_class(df: pl.DataFrame):
    """Add the aggregated label column "class_agg" derived from "class".

    The value of "class" goes through four steps, in this order:
    1) whitespace is removed from the beginning and end of the label,
    2) known spelling mistakes are corrected,
    3) labels that are not part of the original approach become None,
    4) the remaining labels are mapped to their aggregated class; labels
       without an aggregated class (e.g. "dm") are kept as they are.

    The "class" column itself is left untouched; only "class_agg" is added.

    Args:
        df (pl.DataFrame): Input dataframe with a string column "class".

    Returns:
        pl.DataFrame: The input dataframe with the extra column "class_agg".
    """

    # Misspelled label -> intended label
    spelling_fixes = {'memds': 'medms'}

    # Abbreviations of the classes of the original approach
    known_labels = [
        "dm", "do", "cu", "his", "sym", "so", "mr",
        "labr", "labo", "medms", "medo", "tr", "head", "unk",
    ]

    # Aggregated class -> labels that are merged into it.
    # NOTE: 'to' is not one of the known labels above, so it is already set
    # to None in step 3 and never reaches this mapping.
    merged_groups = {
        'his_sym_cu': ('his', 'sym', 'cu'),
        'labr_labo': ('labr', 'labo'),
        'to_tr': ('to', 'tr'),
        'medo_unk_do_so': ('medo', 'unk', 'do', 'so'),
    }
    label_to_group = {
        label: group
        for group, labels in merged_groups.items()
        for label in labels
    }

    def _trim(label):
        return label.strip()

    def _fix_spelling(label):
        return spelling_fixes.get(label, label)

    def _keep_known(label):
        if label in known_labels:
            return label
        return None

    def _aggregate(label):
        return label_to_group.get(label, label)

    # Chain the steps as separate element-wise passes over the column
    agg_expr = pl.col("class")
    for step in (_trim, _fix_spelling, _keep_known, _aggregate):
        agg_expr = agg_expr.map_elements(step)

    return df.with_columns(agg_expr.alias("class_agg"))


In [None]:
# Run the pipeline: load the labelled lines, then clean the text and the labels
df = clean_line_class(clean_line_text(load_line_labelling()))

# Put rid in front, followed by the text and the two label columns
df = df.select(["rid", "text", "class", "class_agg"])

In [None]:
# Preview the first rows of the cleaned dataframe
display(df.head(5))

# Write the cleaned dataframe to the preprocessed data directory as CSV
out_file = os.path.join(paths.DATA_PATH_PREPROCESSED, "line_labelling_clean.csv")
df.write_csv(out_file)