In [1]:
import pandas as pd

# Function to read the patient CSV file
def read_patient(path="PATIENTS.csv"):
    """Load the patient table from a CSV file.

    Parameters
    ----------
    path : str or path-like, default "PATIENTS.csv"
        Location of the CSV file. The default keeps the original behaviour of
        reading ``PATIENTS.csv`` from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.
    """
    return pd.read_csv(path)

# Function to preprocess the patient dataframe
def preprocess_patient(df):
    """Return an independent copy of ``df`` holding only SUBJECT_ID, DOB and DOD.

    The columns appear in that order. A ``KeyError`` is raised by pandas if
    any of them is absent from ``df``.
    """
    keep = ['SUBJECT_ID', 'DOB', 'DOD']
    subset = df.loc[:, keep]
    # Copy so that later column assignments never write back into ``df``.
    return subset.copy()

# Function to read the visit CSV file
def read_visit(path='ADMISSIONS.csv'):
    """Load the visit (admissions) table from a CSV file.

    Parameters
    ----------
    path : str or path-like, default 'ADMISSIONS.csv'
        Location of the CSV file. The default keeps the original behaviour of
        reading ``ADMISSIONS.csv`` from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.
    """
    return pd.read_csv(path)

# Function to preprocess the visit dataframe
def preprocess_visit(df):
    """Return an independent copy of ``df`` holding only the visit columns.

    Kept columns, in order: SUBJECT_ID, HADM_ID, ADMITTIME, DISCHTIME,
    DEATHTIME. A ``KeyError`` is raised by pandas if any of them is absent.
    """
    keep = ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME']
    subset = df.loc[:, keep]
    # Copy so that later column assignments never write back into ``df``.
    return subset.copy()

# Function to read the diagnosis CSV file
def read_diagnosis(path="DIAGNOSES_ICD.csv"):
    """Load the diagnosis table from a CSV file.

    Parameters
    ----------
    path : str or path-like, default "DIAGNOSES_ICD.csv"
        Location of the CSV file. The default keeps the original behaviour of
        reading ``DIAGNOSES_ICD.csv`` from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.
    """
    return pd.read_csv(path)

# Function to preprocess the diagnosis dataframe
def preprocess_diagnosis(df):
    """Return an independent copy of ``df`` holding only the diagnosis columns.

    Kept columns, in order: SUBJECT_ID, HADM_ID, ICD9_CODE. A ``KeyError`` is
    raised by pandas if any of them is absent.
    """
    keep = ['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']
    subset = df.loc[:, keep]
    # Copy so that later column assignments never write back into ``df``.
    return subset.copy()

# Function to read the medication CSV file
def read_medication(path="PRESCRIPTIONS.csv"):
    """Load the medication (prescriptions) table from a CSV file.

    Parameters
    ----------
    path : str or path-like, default "PRESCRIPTIONS.csv"
        Location of the CSV file. The default keeps the original behaviour of
        reading ``PRESCRIPTIONS.csv`` from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv``.
    """
    # low_memory=False makes pandas read the file in one pass instead of in
    # chunks, so each column's dtype is inferred from all of its values.
    return pd.read_csv(path, low_memory=False)

# Function to preprocess the medication dataframe
def preprocess_medication(df):
    """Return an independent copy of ``df`` holding only the medication columns.

    Kept columns, in order: SUBJECT_ID, HADM_ID, STARTDATE, ENDDATE,
    DRUG_TYPE, DRUG. A ``KeyError`` is raised by pandas if any of them is
    absent.
    """
    keep = ['SUBJECT_ID', 'HADM_ID', 'STARTDATE', 'ENDDATE', 'DRUG_TYPE', 'DRUG']
    subset = df.loc[:, keep]
    # Copy so that later column assignments never write back into ``df``.
    return subset.copy()


In [55]:
# Stage 1: read every raw table from disk. The unmodified frames stay
# available under the *_raw names.
patient_raw = read_patient()
visit_raw = read_visit()
diagnosis_raw = read_diagnosis()
medication_raw = read_medication()

# Stage 2: narrow each raw table to the columns the analysis uses. Each
# preprocess_* call returns an independent copy of its input.
patient_df = preprocess_patient(patient_raw)
visit_df = preprocess_visit(visit_raw)
diagnosis_df = preprocess_diagnosis(diagnosis_raw)
medication_df = preprocess_medication(medication_raw)


In [62]:
# Add Age
# Reduce both timestamps to plain calendar dates (datetime.date objects):
# visits are counted and grouped per admission *date* further down.
# NOTE: this overwrites the columns of patient_df / visit_df in place.
patient_df['DOB'] = pd.to_datetime(patient_df['DOB']).dt.date
visit_df['ADMITTIME'] = pd.to_datetime(visit_df['ADMITTIME']).dt.date

# One row per (admission, diagnosis code). Both merges use the default inner
# join, so admissions without a matching patient or diagnosis row are dropped.
df = visit_df.merge(patient_df, on='SUBJECT_ID')
df = df.merge(diagnosis_df, on=['SUBJECT_ID', 'HADM_ID'])

# Age in completed years on the admission date. Calendar arithmetic is used
# instead of days/365: the latter ignores leap days and credits an extra year
# to admissions that fall shortly before a birthday. The tuple comparison
# subtracts one year when the birthday has not yet occurred in the admission
# year. Assumes DOB and ADMITTIME are never missing -- TODO confirm.
df['AGE'] = [
    admit.year - dob.year - ((admit.month, admit.day) < (dob.month, dob.day))
    for admit, dob in zip(df['ADMITTIME'], df['DOB'])
]

# Number of distinct admission dates per patient.
subject_id_counts = df.groupby('SUBJECT_ID')['ADMITTIME'].nunique()
# Keep only patients with at least two distinct admission dates.
valid_subject_ids = subject_id_counts[subject_id_counts > 1].index
df = df[df['SUBJECT_ID'].isin(valid_subject_ids)].copy()

# Group by date: one row per (patient, admission date) carrying the list of
# ICD-9 codes recorded that day, a parallel list of ages (one entry per code)
# and the patient's date of birth.
diagnoses = df.sort_values(by=['SUBJECT_ID', 'ADMITTIME'])
diagnoses_grouped = (
    diagnoses
    .groupby(['SUBJECT_ID', 'ADMITTIME'])
    .agg({'ICD9_CODE': list, 'AGE': list, 'DOB': 'first'})
    .reset_index()
)
diagnoses_grouped.columns = ['SUBJECT_ID', 'ADMITTIME', 'ICD9_CODE', 'AGE', 'DOB']

