In [8]:
import pandas as pd
import json
# import pathlib

from experiment.api import label_studio
from experiment.utils import transformation

In [9]:
# Locations of the pipeline's output files, all under <project root>/data/output.
_output_dir = transformation.get_project_root() / "data" / "output"

# Annotation export that is read back and flattened later in this notebook.
ANNOTATIONS_PATH = _output_dir / "annotations.json"
# Small JSON file that receives the list of already-annotated patient numbers.
DB_PATH = _output_dir / "db.json"

In [10]:
# Start the Heroku-hosted Label Studio server if it is currently stopped.
# NOTE(review): the captured cell output ("Scaling dynos... web at 1:Basic")
# suggests this scales the web dyno up to 1; the helper itself lives in
# experiment.api.label_studio and is not visible here — confirm there.
label_studio.start_label_studio()

Scaling dynos... done, now running [32mweb[39m at 1:Basic


In [11]:
# Download the current annotations from the Label Studio server.
# NOTE(review): a later cell reads ANNOTATIONS_PATH, so this helper presumably
# writes the export to that file — confirm in experiment.api.label_studio.
label_studio.download_annotations()

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  141k  100  141k    0     0  77317      0  0:00:01  0:00:01 --:--:-- 77524


In [12]:
# Read the annotation export from disk.
# The encoding is pinned to UTF-8 because the reports contain non-ASCII
# (Turkish) text; without it open() falls back to the platform's locale
# encoding, which can mis-decode or fail on such characters.
with open(ANNOTATIONS_PATH, "r", encoding="utf-8") as file:
    json_data = json.load(file)

In [13]:
# Flatten the nested annotation export into one flat record per annotation
# result, so that it can be loaded into a DataFrame.
flattened_data_list = []

for entry in json_data:
    # Task-level fields. The fallback is a dict (not a list) so the .get()
    # lookups below still work when an entry has no "data" key.
    data = entry.get("data", {})
    for annotation in entry.get("annotations", []):
        for res in annotation.get("result", []):
            value = res.get("value", {})
            # Fields that exist in the export but are deliberately not kept
            # (previously present here as commented-out keys): entry "id",
            # annotation "completed_by", value "start"/"end", and result
            # "origin"/"to_name"/"from_name".
            flattened_data_list.append(
                {
                    "annotation_id": annotation["id"],
                    "type": res["type"],
                    "text": value.get("text", None),
                    "labels": value.get("labels", None),
                    "choices": value.get("choices", None),
                    "full_text": data.get("text", None),
                    "study_no": data.get("study_no", None),
                    "patient_no": data.get("patient_no", None),
                    "report_date": data.get("report_date", None),
                    "report_count": data.get("report_count", None),
                }
            )

# Convert to a Pandas DataFrame
df = pd.DataFrame(flattened_data_list)

# Record which patients already have annotations.
# - A separate name is used for the payload so `json_data` (the loaded export)
#   is not overwritten; overwriting it made this cell fail when re-run.
# - drop_duplicates() keeps first-seen order, so the file content is
#   deterministic (a set() has no guaranteed order).
annotated_db = {"annotated": df["patient_no"].drop_duplicates().to_list()}
with open(DB_PATH, "w", encoding="utf-8") as outfile:
    json.dump(annotated_db, outfile)

In [14]:
# Split the flattened rows by result type: "labels" rows carry the highlighted
# text spans, "choices" rows carry the selected classification.
df_labels = df.loc[df.type == "labels"]
df_choices = df.loc[df.type == "choices"]

# For each annotation, join all highlighted spans into a single string, then
# attach the choice row of the same annotation. Both sides of the merge have a
# "text" column, so the joined label text comes out as "text_x".
df_labels_choices = (
    df_labels.groupby("annotation_id")
    .agg({"text": " ".join})
    .reset_index()
    .merge(df_choices, on="annotation_id", how="left")
)[
    [
        "annotation_id",
        # "study_no",
        "patient_no",
        "report_date",
        # "report_count",
        "full_text",
        "text_x",
        "choices",
    ]
].rename(
    columns={"text_x": "relevant_text", "choices": "classifications"}
)

# Stringify the classifications so they can be matched against the string
# keys of the mapping below and searched with .str.contains().
df_labels_choices["classifications"] = df_labels_choices["classifications"].astype(str)

# Drop every annotation whose classification mentions "Exclude".
# .copy() makes df_final an independent frame, so the column assignment below
# does not write into a slice of df_labels_choices (SettingWithCopyWarning).
df_final = df_labels_choices.loc[
    ~df_labels_choices["classifications"].str.contains("Exclude", na=False)
].copy()

# Mapping from the stringified single-choice lists to integer class ids.
# Values that are not a key of this mapping are left unchanged by replace().
string_to_integer_mapping = {
    "['Emergency']": 0,
    "['Normal']": 1,
    "['Non Emergency [Doctor]']": 2,
    "['Non Emergency [No Doctor]']": 3,
}

# Replace string values with integer values
df_final["classifications"] = df_final["classifications"].replace(
    string_to_integer_mapping
)

In [15]:
# Show the highlighted text of the first remaining annotation.
# Positional access (.iloc) is used because df_final keeps the row labels of
# the unfiltered frame; a label lookup with [0] raises KeyError whenever the
# row labelled 0 was removed by the "Exclude" filter.
df_final["relevant_text"].iloc[0]

'CPA ve prepontin sisternler geniş izlenmektedir. Serebellar folyalar belirgindir. 3. ventrikül ve lateral ventriküller geniş olarak izlenmektedir. Serebral sulkus ve sisternalar genişlemiştir. Periventriküler alanlarda lökoriazis ile uyumlu görünüm mevcuttur.'

In [16]:
# Add the number of whitespace-separated words in the highlighted text.
# assign() returns a new frame that is rebound to df_final, instead of writing
# a column into the filtered frame in place; this avoids a
# SettingWithCopyWarning and keeps the cell safe to re-run.
df_final = df_final.assign(
    word_count=df_final["relevant_text"].apply(lambda x: len(str(x).split()))
)

# Sort by word count in descending order (longest texts first) and preview.
df_final_sorted = df_final.sort_values(["word_count"], ascending=False)
df_final_sorted.head(5)

Unnamed: 0,annotation_id,patient_no,report_date,full_text,relevant_text,classifications,word_count
18,36,2004355719,2021-06-11,Kontrastsız Beyin BT Kafa kaidesi ve kalvaria...,Kontrastsız Beyin BT Kafa kaidesi ve kalvarial...,1,78
4,19,2008696775,2020-10-13,KONTRASTSIZ BEYİN BT İnfratentorial kesitler...,Sol serebellar hemisferde sekel ??? ensefaloma...,3,60
12,28,2006088545,2021-06-07,Kontrastsız Beyin BT tetkiki Kafa kaidesi v...,Periventriküler derin beyaz cevherde kronik i...,3,57
47,74,2009366291,2021-06-23,Beyin BT ve orbita BT tetkiklerinde Posterio...,Sağda orbita tavanınd frontal sinüs tabanında ...,0,54
37,61,2009021658,2021-06-22,Beyin BT tetkiki Kafa kaidesi normal sınırl...,Sağ orbita üst duvarında fraktüre ait görünüm ...,0,53


In [17]:
# Write the sorted, cleaned annotations to <project root>/data/output as CSV.
clean_annotations_path = (
    transformation.get_project_root() / "data" / "output" / "clean_annotations.csv"
)
df_final_sorted.to_csv(clean_annotations_path)

In [18]:
# Stop the Label Studio instance now that the annotations have been exported.
# NOTE(review): the captured cell output ("Scaling dynos... web at 0:Basic")
# suggests this scales the Heroku web dyno down to 0 — confirm in
# experiment.api.label_studio.
label_studio.stop_label_studio()

Scaling dynos... done, now running [32mweb[39m at 0:Basic


In [19]:
# item_counts = collections.Counter(clean_annotations["classifications"].to_list())

# # 0. Emergency
# # 1. Normal
# # 2. Non Emergency [Doctor]
# # 3. Non Emergency [No Doctor]
# for item, count in item_counts.items():
#     print(f"Item {item} occurs {count} times in the list.")

# # manual test the word lemmatizer
# import simplemma
# word = "hemisferde"
# simplemma.lemmatize(word, lang="tr").lower()