In [2]:
import os, re, pandas as pd
from sklearn.model_selection import train_test_split

DATA_DIR = "../data/raw"

usecols = ["subject_id","hadm_id","charttime","storetime","text"]
discharge = pd.read_csv(os.path.join(DATA_DIR, "discharge.csv"), usecols=usecols)
radiology = pd.read_csv(os.path.join(DATA_DIR, "radiology.csv"), usecols=usecols)
discharge.head()

Unnamed: 0,subject_id,hadm_id,charttime,storetime,text
0,10000032,22595853,2180-05-07 00:00:00,2180-05-09 15:26:00,\nName: ___ Unit No: _...
1,10000032,22841357,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...
2,10000032,29079034,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...
3,10000032,25742920,2180-08-07 00:00:00,2180-08-10 05:43:00,\nName: ___ Unit No: _...
4,10000084,23052089,2160-11-25 00:00:00,2160-11-25 15:09:00,\nName: ___ Unit No: __...


In [3]:
radiology.head()

Unnamed: 0,subject_id,hadm_id,charttime,storetime,text
0,10000032,22595853.0,2180-05-06 21:19:00,2180-05-06 23:32:00,EXAMINATION: CHEST (PA AND LAT)\n\nINDICATION...
1,10000032,22595853.0,2180-05-06 23:00:00,2180-05-06 23:26:00,EXAMINATION: LIVER OR GALLBLADDER US (SINGLE ...
2,10000032,22595853.0,2180-05-07 09:55:00,2180-05-07 11:15:00,"INDICATION: ___ HCV cirrhosis c/b ascites, hi..."
3,10000032,,2180-06-03 12:46:00,2180-06-03 14:01:00,EXAMINATION: Ultrasound-guided paracentesis.\...
4,10000032,,2180-07-08 13:18:00,2180-07-08 14:15:00,EXAMINATION: Paracentesis\n\nINDICATION: ___...


In [4]:

def clean_text(s: str) -> str:
    s = str(s)
    s = re.sub(r"_+", " ", s)                # de-id underscores
    s = re.sub(r"\[[^\]]*\]", " ", s)        # [** de-id **] se houver
    s = re.sub(r"\s+", " ", s).strip()
    return s

discharge["text"] = discharge["text"].map(clean_text)
radiology["text"]  = radiology["text"].map(clean_text)

In [5]:
corpus = pd.concat([discharge["text"], radiology["text"]], ignore_index=True).dropna().drop_duplicates()

In [6]:
train_texts, val_test_texts = train_test_split(corpus, test_size=0.1, random_state=42, shuffle=True)
val_texts, test_texts = train_test_split(corpus, test_size=0.5, random_state=42, shuffle=True)
len(train_texts), len(val_texts)

(2366449, 1314694)

In [7]:
len(test_texts)

1314694

In [11]:
train_texts.head()

1057940    EXAMINATION: FINGER(S),2+VIEWS RIGHT INDICATIO...
361040     HISTORY: male with altered mental status and a...
2233363    HISTORY: Right patellar fracture. Evaluate for...
2099697    OBSTETRIC ULTRASOUND ON HISTORY: Recent miscar...
1458884    RADIOGRAPHS OF THE LEFT HIP AND PELVIS HISTORY...
Name: text, dtype: object

In [10]:
for data, file in zip((train_texts, val_texts, test_texts), ("train", "val", "test")):
    data.to_csv(f"../data/processed/{file}.csv")