In [64]:
import pandas as pd
import pathlib
import re


In [65]:
# Location of the raw Excel export, relative to the working directory.
source_data = pathlib.Path("data", "BT_BEYIN_KSIZ.xls")

In [66]:
# Load the workbook with pandas' default read_excel options and display it.
df_reports = pd.read_excel(io=source_data)
df_reports

Unnamed: 0,HASTA_NO,PROTOKOL_NO,AD_SOYAD,ACIKLAMA
0,2004001412,23012646,SADİ İYEM,RAPOR TARİHİ: 26/09/2022 FİLM NO: 12796199\n...
1,2004001562,22394628,İLKNUR ERTENER,RAPOR TARİHİ: 15.03.2022 FİLM NO: 12523533\...
2,2004001709,22296380,VEHBİYE AKBALIK,RAPOR TARİHİ:15/02/2022 FİLM NO:12493333\n\n...
3,2004002163,23963976,HAVVA AYDIN,RAPOR TARİHİ: 25/05/2023 FİLM NO:\n\nKontras...
4,2004002609,21367900,ZEYNEP YEŞİLOVA,RAPOR TARİHİ: 09.08.2021 FİLM NO: 11338850...
...,...,...,...,...
32127,2009595227,24165121,BERİL SARGIN,RAPOR TARİHİ:20/07/2023 FİLM NO:\n\nKontrast...
32128,2009595436,24167884,MUHAMMED DABUL,RAPOR TARİHİ: 21/07/2023 TETKİK NO:...
32129,2009595507,24168883,MUHAMMED MUSTAFA KANDEMİR,RAPOR TARİHİ: 21/07/2023 TETKİK NO:...
32130,2009595568,24169841,KEREM KABA,RAPOR TARİHİ : 21/07/2023 FİLM NO : 1333488...


### Extract Information

In [67]:
# A standalone run of six or more digits.
film_no_pattern = r"\b\d{6,}\b"
# Day, month and year written as 1-2, 1-2 and 2-4 digits, each pair separated
# by "-", "/" or ".".
rapor_tarihi_pattern = r"\b\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}\b"

In [68]:
def pattern_find_and_remove(text: str, pattern: str) -> tuple[str, str]:
    """Extract the first match of ``pattern`` from ``text`` and cut it out.

    Args:
        text: String to search.
        pattern: Regular expression in ``re`` syntax. Capture groups are
            allowed; the whole match is what gets returned.

    Returns:
        A ``(match, remainder)`` pair: the text of the first match (``""``
        when nothing matches) and ``text`` with that single occurrence
        removed and surrounding whitespace stripped.
    """
    match = re.search(pattern, text)
    if match is None:
        return "", text.strip()

    # Remove by position rather than with str.replace: replace() would also
    # delete every other occurrence of the same characters, including ones
    # embedded in longer tokens that the pattern itself did not match.
    remainder = text[: match.start()] + text[match.end() :]
    return match.group(0), remainder.strip()

In [69]:
def replace_patterns_from_text(
    text: str, patterns: list[str], replace_with: str = ""
) -> str:
    """Substitute every literal in ``patterns`` and trim the result.

    The entries are plain substrings (not regular expressions) and are
    applied one after another in list order, so a later entry sees the
    text as already modified by the earlier ones.

    Args:
        text: Input string.
        patterns: Substrings to look for.
        replace_with: Replacement inserted for each occurrence.

    Returns:
        The substituted text with leading and trailing whitespace removed.
    """
    cleaned = text
    for needle in patterns:
        if needle in cleaned:
            cleaned = cleaned.replace(needle, replace_with)
    return cleaned.strip()

In [70]:
def apply_transformations(text) -> tuple[str, str, str]:
    """Split a raw report string into body text, report date and film number.

    The header labels and separator characters are replaced by a space,
    then the first match of ``film_no_pattern`` and the first match of
    ``rapor_tarihi_pattern`` are pulled out of the text.

    Args:
        text: Raw report string.

    Returns:
        ``(text, rapor_tarihi, film_no)``: the remaining text, the extracted
        date and the extracted film number. A value is ``""`` when its
        pattern does not match.
    """
    header_labels = [
        "RAPOR TARİHİ",
        "FİLM NO",
        "TETKİK NO",
        "ÇEKİM TARİHİ",
        "ÇEKİM  TARİHİ",
        "TETKİK TARİHİ",
    ]
    # Substitute a space rather than nothing so the neighbours of a removed
    # label or separator stay apart: with an empty replacement
    # "RAPOR TARİHİ:15/02/2022 FİLM NO:12493333" collapses to
    # "15/02/202212493333" and neither pattern matches correctly.
    text = replace_patterns_from_text(text, header_labels, " ")
    text = replace_patterns_from_text(text, ["\n", "*", ":", ";"], " ")

    film_no, text = pattern_find_and_remove(text, film_no_pattern)
    rapor_tarihi, text = pattern_find_and_remove(text, rapor_tarihi_pattern)

    return text.strip(), rapor_tarihi, film_no

In [71]:
# Run the extraction on every report; each element of `result` is the tuple
# returned by apply_transformations.
result = df_reports["ACIKLAMA"].apply(apply_transformations)

In [72]:
# Spread the per-report tuples over three new columns, keeping the row index
# of `result` so the values line up with df_reports.
df_reports[["text", "report_date", "study_no"]] = pd.DataFrame(
    data=result.to_list(),
    index=result.index,
)

In [73]:
# Drop the raw report text and the patient name columns, and give the
# identifier columns English names.
df_reports = df_reports.drop(columns=["ACIKLAMA", "AD_SOYAD"]).rename(
    columns={"HASTA_NO": "patient_no", "PROTOKOL_NO": "protocol_no"}
)

In [74]:
# Count the report rows of each patient; one output row per patient_no.
df_report_count = (
    df_reports.groupby("patient_no")
    .agg(report_count=("patient_no", "count"))
    .reset_index()
)

In [75]:
# Preview only (the result is not assigned): attach each patient's report
# count, show the patients with the most reports first and drop rows that are
# exact duplicates.
# TODO: parse dates & order by date
(
    df_reports.merge(df_report_count, on="patient_no", how="left")
    .sort_values("report_count", ascending=False)
    .drop_duplicates()
)

Unnamed: 0,patient_no,protocol_no,text,report_date,study_no,report_count
5407,2005183286,21302877,BEYİN BT Kafa tabanı ve verteks arasından el...,24.06.2021,12217711,27
5425,2005183286,22950902,Kafa tabanı ve verteks arasından elde olunan k...,20.09.2022,,27
5423,2005183286,24058415,BEYİN BT Sol oksipital kemikte geçirilmiş ope...,21/06/2023,13288413,27
5422,2005183286,22759758,Beyin BT Kafa kaidesi normal sınırlardadır...,14/07/2022,,27
5421,2005183286,22742324,Beyin BT tetkiki Postop süreçte elde olun...,01/07/2022,,27
...,...,...,...,...,...,...
13413,2008237265,23327345,Kontrastsız beyin BT Posterior fossada beyin ...,16/12/2022,12945600,1
13412,2008237255,23410349,Beyin BT Posterior fossada beyin sapı kemik a...,03/01/2023,12983554,1
13411,2008237223,22217542,Kafa tabanı ve verteks arasından elde olunan k...,28/01/2022,,1
13410,2008237202,22320269,BEYİN BT Kafa tabanı ve verteks arasından el...,02/02/2022,,1


In [76]:
# Check for empty dates (disabled): count the reports whose report_date is "".
# (df_reports['report_date'].values == '').sum()

In [77]:
# Order the rows by patient number.
df_reports = df_reports.sort_values("patient_no")

In [78]:
# Write the table, index included, to data/output.csv.
df_reports.to_csv(pathlib.Path("data") / "output.csv")