In [140]:
import pandas as pd
import pathlib
import re


In [141]:
source_data = pathlib.Path("../data") / "BT_BEYIN_KSIZ.xls"

In [142]:
df_reports = pd.read_excel(source_data)
df_reports

Unnamed: 0,HASTA_NO,PROTOKOL_NO,AD_SOYAD,ACIKLAMA
0,2004001412,23012646,SADİ İYEM,RAPOR TARİHİ: 26/09/2022 FİLM NO: 12796199\n...
1,2004001562,22394628,İLKNUR ERTENER,RAPOR TARİHİ: 15.03.2022 FİLM NO: 12523533\...
2,2004001709,22296380,VEHBİYE AKBALIK,RAPOR TARİHİ:15/02/2022 FİLM NO:12493333\n\n...
3,2004002163,23963976,HAVVA AYDIN,RAPOR TARİHİ: 25/05/2023 FİLM NO:\n\nKontras...
4,2004002609,21367900,ZEYNEP YEŞİLOVA,RAPOR TARİHİ: 09.08.2021 FİLM NO: 11338850...
...,...,...,...,...
32127,2009595227,24165121,BERİL SARGIN,RAPOR TARİHİ:20/07/2023 FİLM NO:\n\nKontrast...
32128,2009595436,24167884,MUHAMMED DABUL,RAPOR TARİHİ: 21/07/2023 TETKİK NO:...
32129,2009595507,24168883,MUHAMMED MUSTAFA KANDEMİR,RAPOR TARİHİ: 21/07/2023 TETKİK NO:...
32130,2009595568,24169841,KEREM KABA,RAPOR TARİHİ : 21/07/2023 FİLM NO : 1333488...


### Extract Information

In [143]:
film_no_pattern = r"\b\d{6,}\b"
rapor_tarihi_pattern= r"\b\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}\b"

In [144]:
def pattern_find_and_remove(text: str, pattern: str) -> tuple[str]:
    try:
        extracted_pattern = re.findall(pattern, text)[0]
    except IndexError:
        extracted_pattern = ""

    return extracted_pattern, text.replace(extracted_pattern, "").strip()

In [145]:
def replace_patterns_from_text(
    text: str, patterns: list[str], replace_with: str = ""
) -> str:
    for pattern in patterns:
        text = text.replace(pattern, replace_with)

    return text.strip()

In [146]:
def apply_transformations(text) -> tuple[str]:
    text = replace_patterns_from_text(
        text,
        [
            "RAPOR TARİHİ",
            "FİLM NO",
            "TETKİK NO",
            "ÇEKİM TARİHİ",
            "ÇEKİM  TARİHİ",
            "TETKİK TARİHİ",
            " :",
            ": ",
            " : ",
        ],
    )
    text = replace_patterns_from_text(
        text, ["RAPOR TARİHİ", "FİLM NO", "TETKİK NO", " :", ": ", " : "]
    )
    text = replace_patterns_from_text(text, ["\n", "*"], " ")
    text = replace_patterns_from_text(text, [":", ";"], " ")

    film_no, text = pattern_find_and_remove(
        text,
        film_no_pattern,
    )
    rapor_tarihi, text = pattern_find_and_remove(text, rapor_tarihi_pattern)

    return text.strip(), rapor_tarihi, film_no

In [147]:
result = df_reports['ACIKLAMA'].apply(apply_transformations)

In [148]:
df_reports[["text", "report_date", "study_no"]] = pd.DataFrame(
    result.tolist(), index=result.index
)

In [149]:
df_reports = df_reports.rename(
    columns={
        "HASTA_NO": "patient_no",
        "PROTOKOL_NO": "protocol_no",
    }
).drop(["ACIKLAMA", "AD_SOYAD"], axis=1)

In [150]:
df_reports['report_date'] = pd.to_datetime(df_reports['report_date'], errors='coerce')

  df_reports['report_date'] = pd.to_datetime(df_reports['report_date'], errors='coerce')


In [151]:
df_report_count = df_reports.groupby("patient_no").agg(report_count=pd.NamedAgg(column="patient_no", aggfunc="count")).reset_index()

In [152]:
# parse dates & order by date & drop duplicates
df_reports.merge(df_report_count, on='patient_no', how='left').sort_values(["report_count", "report_date"], ascending=[False, True]).drop_duplicates(subset=["patient_no"], keep="first")

Unnamed: 0,patient_no,protocol_no,text,report_date,study_no,report_count
5420,2005183286,22741231,Kafa tabanı ve verteks arasından elde olunan k...,2022-06-20,,27
6122,2006000036,22219950,BEYİN BT Hastanın filmi 09/01/2022 tarihli ...,2022-02-01,12471911,26
16908,2008509284,22344550,Kontrastsız beyin BT tetkiki İnfratentoria...,2022-02-08,12506525,20
25,2004004697,22023602,Kontrastsız beyin BT tetkiki İnfratentor...,2021-12-27,,19
16219,2008451292,22591624,Beyin BT tetkiki 28/04/2022 Hastanın filmi b...,2022-05-24,12612882,18
...,...,...,...,...,...,...
32044,2009591303,24110240,Kafa kaidesi ve kalvarial kemik yapılar normal...,NaT,,1
32046,2009591427,24111990,Kafa kaidesi ve kalvarial kemik yapılar normal...,NaT,,1
32047,2009591428,24111994,Sağ temporal fossada ekstraaksiyal 2.7x0.7cm ...,NaT,,1
32053,2009591912,24118791,Beyin BT Kafa tabanı ve verteks arasından el...,NaT,13316817,1


In [153]:
# check empty dates
# (df_reports['report_date'].values == '').sum() 

In [154]:
df_reports = df_reports.sort_values(by="patient_no")

In [155]:
df_reports.to_csv("../data/output.csv")