In [None]:
import pandas as pd
import json
# import pathlib

from experiment.api import label_studio
from experiment.utils import transformation

In [None]:
# Paths used throughout the notebook: the annotation export that is read
# below and the small JSON "db" file that is written below. Both live in
# <project root>/data/output.
ANNOTATIONS_PATH, DB_PATH = (
    transformation.get_project_root() / "data" / "output" / file_name
    for file_name in ("annotations.json", "db.json")
)

# Bring the Label Studio server up in case it is currently stopped
# (the original note says it is hosted on Heroku).
label_studio.start_label_studio()

# Fetch the annotations from the Label Studio server.
label_studio.download_annotations()

In [None]:
# Read the annotation export (a JSON list of task entries) from disk.
# The encoding is spelled out so that non-ASCII report text is decoded the
# same way on every platform instead of depending on the locale default.
with open(ANNOTATIONS_PATH, "r", encoding="utf-8") as file:
    json_data = json.load(file)

# One flat record per annotation result: task -> annotations -> result items.
flattened_data_list = []

for entry in json_data:
    annotations = entry.get("annotations", [])
    # Fall back to an empty dict (not a list): the task fields below are read
    # with `.get`, which a list does not provide.
    data = entry.get("data", {})
    for annotation in annotations:
        for res in annotation.get("result", []):
            value = res.get("value", {})
            flattened_data_list.append(
                {
                    # identifiers of the annotation and the kind of result
                    "annotation_id": annotation["id"],
                    "type": res["type"],
                    # payload of the individual result item
                    "text": value.get("text"),
                    "labels": value.get("labels"),
                    "choices": value.get("choices"),
                    # fields copied from the task the annotation belongs to
                    "full_text": data.get("text"),
                    "study_no": data.get("study_no"),
                    "patient_no": data.get("patient_no"),
                    "report_date": data.get("report_date"),
                    "report_count": data.get("report_count"),
                }
            )

# Convert to a Pandas DataFrame
df = pd.DataFrame(flattened_data_list)

# Record which patients already have at least one annotation result.
# The list is built from the flat records rather than from `df` so that
#   * an export without any results yields an empty list, not a KeyError,
#   * the order is deterministic (first occurrence), unlike `set`,
#   * missing patient numbers are skipped instead of being written as
#     null/NaN (NaN is not valid JSON),
#   * values keep the exact type they had in the export.
annotated_patients = list(
    dict.fromkeys(
        record["patient_no"]
        for record in flattened_data_list
        if record["patient_no"] is not None
    )
)

# A separate name is used for the payload so `json_data` keeps referring to
# the loaded export.
with open(DB_PATH, "w", encoding="utf-8") as outfile:
    json.dump({"annotated": annotated_patients}, outfile)

In [None]:
# Split the flattened results by type. The "labels" rows supply the `text`
# values that are joined below; the "choices" rows supply the `choices`
# value and the task-level fields.
df_labels = df.loc[df["type"] == "labels"]
df_choices = df.loc[df["type"] == "choices"]

# Per annotation: concatenate all labelled text pieces with a space, then
# attach the matching "choices" row(s). Both sides have a `text` column, so
# after the merge the joined label text is named `text_x`.
df_labels_choices = (
    df_labels.groupby("annotation_id")
    .agg({"text": " ".join})
    .reset_index()
    .merge(df_choices, on="annotation_id", how="left")
)[
    [
        "annotation_id",
        "patient_no",
        "report_date",
        "full_text",
        "text_x",
        "choices",
    ]
].rename(
    columns={"text_x": "relevant_text", "choices": "classifications"}
)

# Stringify the classifications (e.g. a one-element list becomes
# "['Emergency']") so they can be searched and replaced as text below.
df_labels_choices["classifications"] = df_labels_choices["classifications"].astype(str)

# Define a mapping of string values to integer values
string_to_integer_mapping = {
    "['Emergency']": 0,
    "['Normal']": 1,
    "['Non Emergency [Doctor]']": 2,
    "['Non Emergency [No Doctor]']": 3,
}

# Drop every row whose classification mentions "Exclude" and encode the
# remaining classifications as integers. `assign` builds a new frame, so the
# filtered slice is never written to in place (avoids SettingWithCopyWarning).
# Values that are not in the mapping are left unchanged by `replace`.
is_excluded = df_labels_choices["classifications"].str.contains("Exclude", na=False)
df_final = df_labels_choices.loc[~is_excluded].assign(
    classifications=lambda frame: frame["classifications"].replace(
        string_to_integer_mapping
    )
)

In [None]:
# Peek at the first remaining row. Positional access is needed here: the
# "Exclude" filter keeps the original index labels, so label 0 may no longer
# exist and `[0]` would raise a KeyError.
df_final["relevant_text"].iloc[0]

In [None]:
# Add the number of whitespace-separated words of each relevant_text.
# `assign` returns a new frame, so the filtered frame is not modified in
# place (no SettingWithCopyWarning) and re-running this cell is harmless.
df_final = df_final.assign(
    word_count=df_final["relevant_text"].apply(lambda text: len(str(text).split()))
)

# Sort by word count, descending (longest texts first), and show the top 5.
df_final_sorted = df_final.sort_values(["word_count"], ascending=False)
df_final_sorted.head(5)

In [None]:
# Persist the sorted, cleaned annotations as CSV in <project root>/data/output.
clean_annotations_path = (
    transformation.get_project_root() / "data" / "output" / "clean_annotations.csv"
)
df_final_sorted.to_csv(clean_annotations_path)

In [None]:
# Stop the Label Studio instance again; this is the counterpart of the
# label_studio.start_label_studio() call made earlier in this notebook.
label_studio.stop_label_studio()