In [14]:
import pandas as pd
import pathlib
import re


In [15]:
# Location of the exported report workbook, relative to the working directory.
source_data = pathlib.Path("data", "BT_BEYIN_KSIZ.xls")

In [16]:
# Load the report workbook into a DataFrame and show it as the cell output.
# NOTE(review): the rendered frame includes the AD_SOYAD column, which appears
# to hold personal names - consider clearing this output before sharing.
df_reports = pd.read_excel(io=source_data)
df_reports

Unnamed: 0,HASTA_NO,PROTOKOL_NO,AD_SOYAD,ACIKLAMA
0,2004001412,23012646,SADİ İYEM,RAPOR TARİHİ: 26/09/2022 FİLM NO: 12796199\n...
1,2004001562,22394628,İLKNUR ERTENER,RAPOR TARİHİ: 15.03.2022 FİLM NO: 12523533\...
2,2004001709,22296380,VEHBİYE AKBALIK,RAPOR TARİHİ:15/02/2022 FİLM NO:12493333\n\n...
3,2004002163,23963976,HAVVA AYDIN,RAPOR TARİHİ: 25/05/2023 FİLM NO:\n\nKontras...
4,2004002609,21367900,ZEYNEP YEŞİLOVA,RAPOR TARİHİ: 09.08.2021 FİLM NO: 11338850...
...,...,...,...,...
32127,2009595227,24165121,BERİL SARGIN,RAPOR TARİHİ:20/07/2023 FİLM NO:\n\nKontrast...
32128,2009595436,24167884,MUHAMMED DABUL,RAPOR TARİHİ: 21/07/2023 TETKİK NO:...
32129,2009595507,24168883,MUHAMMED MUSTAFA KANDEMİR,RAPOR TARİHİ: 21/07/2023 TETKİK NO:...
32130,2009595568,24169841,KEREM KABA,RAPOR TARİHİ : 21/07/2023 FİLM NO : 1333488...


### Extract Information

In [18]:
# A standalone run of six or more digits; used below to pick out the film number.
film_no_pattern = r"\b\d{6,}\b"
# Three digit groups (1-2, 1-2 and 2-4 digits) separated by "-", "/" or ".";
# used below to pick out the report date.
rapor_tarihi_pattern = r"\b\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}\b"

In [19]:
def pattern_find_and_remove(text: str, pattern: str) -> tuple[str, str]:
    """Extract the first match of ``pattern`` from ``text`` and strip it out.

    Args:
        text: The string to search.
        pattern: A regular expression, in ``re`` syntax.

    Returns:
        A ``(extracted, remaining_text)`` pair. ``extracted`` is the first
        match (or ``""`` when nothing matches, in which case ``text`` is
        returned unchanged). ``remaining_text`` is ``text`` with every regex
        match equal to ``extracted`` removed.

    Raises:
        TypeError: If ``text`` is not a string (raised by ``re``).
    """
    regex = re.compile(pattern)
    matches = regex.findall(text)
    if not matches:
        return "", text

    extracted_pattern = matches[0]
    if regex.groups or not extracted_pattern:
        # With capturing groups ``findall`` yields group contents rather than
        # whole matches, so keep the literal removal for those patterns
        # (an empty extraction makes this a no-op).
        return extracted_pattern, text.replace(extracted_pattern, "")

    # Remove only genuine regex matches. A plain ``str.replace`` would also
    # cut the extracted value out of longer tokens the pattern never matched
    # (e.g. extracting "123456" would turn "91234567" into "97").
    remaining_text = regex.sub(
        lambda match: "" if match.group(0) == extracted_pattern else match.group(0),
        text,
    )
    return extracted_pattern, remaining_text

In [20]:
def replace_patterns_from_text(
    text: str, patterns: list[str], replace_with: str = ""
) -> str:
    """Substitute each literal in ``patterns`` and trim the result.

    The entries of ``patterns`` are treated as plain substrings (not regular
    expressions) and are applied one after another in list order, so a later
    entry sees the text as already modified by the earlier ones.

    Args:
        text: The string to clean.
        patterns: Substrings to replace, in the order they should be applied.
        replace_with: Replacement for every occurrence; defaults to ``""``.

    Returns:
        The substituted text with leading and trailing whitespace stripped.
    """
    cleaned = text
    for literal in patterns:
        cleaned = cleaned.replace(literal, replace_with)
    return cleaned.strip()

In [21]:
def apply_transformations(text) -> tuple[str, str, str]:
    """Split one raw report string into clean text, report date and film number.

    The film number is the first match of ``film_no_pattern`` and the report
    date the first match of ``rapor_tarihi_pattern``; both are removed from
    the text, followed by the header labels, newlines and ``:``/``;``.

    NOTE(review): the film number is taken from the first 6+ digit run found
    anywhere in ``text``, so a report with an empty "FİLM NO:" header could
    pick up an unrelated number from its body - confirm this is acceptable.

    Args:
        text: The raw report text, or a missing value (``None``/NaN).

    Returns:
        A ``(clean_text, report_date, film_no)`` tuple of strings. Fields that
        cannot be found are ``""``; a missing ``text`` yields three empty
        strings.
    """
    # A missing value (e.g. an empty spreadsheet cell) has nothing to parse
    # and would otherwise raise a TypeError inside ``re``.
    if pd.isna(text):
        return "", "", ""

    # Film number first, then the date, each removed from the running text.
    film_no, text = pattern_find_and_remove(text, film_no_pattern)
    rapor_tarihi, text = pattern_find_and_remove(text, rapor_tarihi_pattern)

    header_tokens = ["RAPOR TARİHİ", "FİLM NO", "TETKİK NO", " :", ": ", " : "]
    # Two passes on purpose: removing one token can join its neighbours into
    # another token (e.g. "FİLM : NO" -> "FİLM NO") that only the second pass
    # catches.
    for _ in range(2):
        text = replace_patterns_from_text(text, header_tokens)

    # Flatten to one line, then turn the remaining ":" / ";" into full stops.
    text = replace_patterns_from_text(text, ["\n"], " ")
    text = replace_patterns_from_text(text, [":", ";"], ".")

    return text, rapor_tarihi, film_no

In [22]:
# Run the extraction on every report text; each element of `result` is the
# tuple returned by apply_transformations for that row.
result = df_reports["ACIKLAMA"].apply(apply_transformations)

In [23]:
# Spread the per-row tuples over three new columns, keeping the original row
# index so the values line up with their source rows.
df_reports[["clean_report", "report_date", "study_no"]] = pd.DataFrame(
    data=result.tolist(), index=result.index
)

In [24]:
# Drop the raw report text and the name column, then give the identifier
# columns English names.
# NOTE(review): this overwrites `df_reports`, so re-running the cell fails
# because the dropped columns are no longer present.
df_reports = df_reports.drop(columns=["ACIKLAMA", "AD_SOYAD"]).rename(
    columns={"HASTA_NO": "patient_no", "PROTOKOL_NO": "protocol_no"}
)

In [25]:
# Display the cleaned frame ordered by patient number (the sorted copy is not stored).
df_reports.sort_values("patient_no")

Unnamed: 0,patient_no,protocol_no,clean_report,report_date,study_no
0,2004001412,23012646,Kafa tabanı ve verteks arasından kontrastsız e...,26/09/2022,12796199
1,2004001562,22394628,Beyin BT. Kafa tabanı ve verteks arasından el...,15.03.2022,12523533
2,2004001709,22296380,Kontrastsız beyin BT tetkiki. Kafa kaide...,15/02/2022,12493333
3,2004002163,23963976,Kontrastlı beyin BT İnfratentorial yapıl...,25/05/2023,
4,2004002609,21367900,BEYİN BT. Kafa tabanı ve verteks arasından el...,09.08.2021,11338850
...,...,...,...,...,...
32127,2009595227,24165121,Kontrastsız beyin BT Kafa kaidesi ve ka...,20/07/2023,
32128,2009595436,24167884,BEYİN BT Kafa tabanı ve verteks arasından eld...,21/07/2023,13333104
32129,2009595507,24168883,BEYİN + ORBİTA BT Kafa tabanı ve verteks a...,21/07/2023,
32130,2009595568,24169841,Kafa tabanı ve verteks arasından elde olunan k...,21/07/2023,13334883
