In [1]:
print(_IMPORTS)

import datetime as dt
from collections import defaultdict, Counter, namedtuple
from itertools import product, combinations, permutations
import json
import os
import pickle

from IPython.display import HTML, display, set_matplotlib_formats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sqlalchemy import create_engine
from tqdm.auto import tqdm

# INITIALIZING
# pictures
sns.set()
plt.style.use('seaborn-bright')
plt.rcParams['figure.figsize'] = 10, 6
# pandas
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 100)
tqdm.pandas()

# Autoreload
get_ipython().magic(u"%reload_ext autoreload")
get_ipython().magic(u"%autoreload 2")
get_ipython().magic(u"%load_ext line_profiler")



In [2]:
import re

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import *

from sklearn.model_selection import train_test_split

### Sampling on patients

In [4]:
PATH = "../data/raw/"
PATH_PROCESSED = "../data/processed/"
PATH_DATASETS = "../data/train/"

In [5]:
SAMPLE_SIZE = 10000
RANDOM_SEED = 1

In [6]:
patients = pd.read_csv(os.path.join(PATH, "PATIENTS.csv.gz"))
patients.EXPIRE_FLAG.value_counts()

0    30761
1    15759
Name: EXPIRE_FLAG, dtype: int64

In [7]:
#ensuring every patient is unique
print(f"{patients.SUBJECT_ID.nunique()} unique patients in {len(patients)} rows")
#sampling random patients
patients_sample = patients.sample(n=1000, random_state=RANDOM_SEED)

46520 unique patients in 46520 rows


In [8]:
SAMPLE_IDS = set(patients_sample.SUBJECT_ID)

with open(os.path.join(PATH_PROCESSED, "SAMPLE_IDS.json"), 'w') as f:
    json.dump({'ids': list(SAMPLE_IDS)}, f)
    
with open(os.path.join(PATH_PROCESSED, "SAMPLE_IDS.json"), 'r') as f:
    SAMPLE_IDS = set(json.load(f)['ids'])

In [9]:
patients_sample = patients[patients.SUBJECT_ID.isin(SAMPLE_IDS)]

In [10]:
patients_sample.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
10,244,261,M,2025-08-04 00:00:00,2102-06-29 00:00:00,2102-06-29 00:00:00,2102-06-29 00:00:00,1
85,691,734,F,2102-05-28 00:00:00,,,,0
113,719,763,M,2148-08-05 00:00:00,,,,0
115,721,765,F,2050-05-27 00:00:00,2134-07-15 00:00:00,2134-07-15 00:00:00,2134-07-15 00:00:00,1
196,802,852,M,2108-05-05 00:00:00,,,,0


In [11]:
DECEASED_TO_DATE = patients_sample[patients_sample.EXPIRE_FLAG == 1]\
    .set_index('SUBJECT_ID').DOD.map(lambda x: pd.to_datetime(x).date()).to_dict()

In [12]:
def get_data_for_sample(sample_ids: set, 
                        file_name: str, 
                        chunksize: int = 10_000) -> pd.DataFrame:
    """Get the data only relevant for the sample."""
    full_path = os.path.join(PATH, file_name)
    iterator = pd.read_csv(full_path, iterator=True, chunksize=chunksize)
    return pd.concat([chunk[chunk.SUBJECT_ID.isin(SAMPLE_IDS)] for chunk in tqdm(iterator)])

In [13]:
### Incomplete: Using AWS glue, for now I will do stuff locally
# import boto3
# Creating the low level functional client
# client = boto3.client(
#     'glue',
#     aws_access_key_id = '',
#     aws_secret_access_key = '',
#     region_name = 'us-east-1'
# )
# clientResponse = client.get_table(DatabaseName="mimiciii",Name="admissions")

### Build features

All features in feature_prepocessed form are features with columns ['SUBJECT_ID', 'FEATURE_NAME', 'DATE', 'VALUE], which can be later used for any of the aggregations we'd like.

In [14]:
admissions = get_data_for_sample(SAMPLE_IDS, "ADMISSIONS.csv.gz")

0it [00:00, ?it/s]

In [15]:
admissions['ADMITTIME'] = pd.to_datetime(admissions.ADMITTIME).dt.date

In [16]:
diagnoses = get_data_for_sample(SAMPLE_IDS, "DIAGNOSES_ICD.csv.gz")

0it [00:00, ?it/s]

In [17]:
lab_results = get_data_for_sample(SAMPLE_IDS, "LABEVENTS.csv.gz", chunksize=100_000)

0it [00:00, ?it/s]

In [18]:
meds = get_data_for_sample(SAMPLE_IDS, "PRESCRIPTIONS.csv.gz")

0it [00:00, ?it/s]

#### Diagnoses

In [19]:
diagnoses['ICD9_CODE'] = "ICD9_" + diagnoses['ICD9_CODE']

In [20]:
adm_cols = ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME']
diagnoses = diagnoses.merge(admissions[adm_cols], on=['SUBJECT_ID', 'HADM_ID'])

In [21]:
dropper = ['ROW_ID', 'SEQ_NUM', 'HADM_ID']
renamer = {'ICD9_CODE': 'FEATURE_NAME', 'ADMITTIME': 'DATE'}
diag_preprocessed = diagnoses.drop(columns=dropper).rename(columns=renamer)
diag_preprocessed['VALUE'] = 1

In [22]:
diag_preprocessed.head()

Unnamed: 0,SUBJECT_ID,FEATURE_NAME,DATE,VALUE
0,138,ICD9_1890,2134-03-01,1
1,138,ICD9_4254,2134-03-01,1
2,138,ICD9_4280,2134-03-01,1
3,138,ICD9_V4502,2134-03-01,1
4,138,ICD9_V5861,2134-03-01,1


#### Labs

In [23]:
lab_results['DATE'] = pd.to_datetime(lab_results['CHARTTIME']).dt.date

In [24]:
lab_results['FEATURE_NAME'] = "LAB_" + lab_results['ITEMID'].astype(str)

In [25]:
dropper = ['ROW_ID', 'HADM_ID', 'VALUE', 'VALUEUOM', 'FLAG', 'ITEMID', 'CHARTTIME']
lab_preprocessed = lab_results.drop(columns=dropper)

In [26]:
lab_preprocessed.head()

Unnamed: 0,SUBJECT_ID,VALUENUM,DATE,FEATURE_NAME
70116,138,30.0,2134-01-06,LAB_50804
70117,138,1.12,2134-01-06,LAB_50808
70118,138,35.0,2134-01-06,LAB_50818
70119,138,7.52,2134-01-06,LAB_50820
70120,138,131.0,2134-01-06,LAB_50821


#### Meds

In [27]:
meds = meds[meds.ENDDATE.notna()]

In [28]:
meds['DATE'] = pd.to_datetime(meds['ENDDATE']).dt.date

In [29]:
def find_mean_dose(dose: str) -> float:
    if pd.isnull(dose):
        return 0
    try:
        cleaned = re.sub(r'[A-Za-z,>< ]', '', dose)
        parts = cleaned.split('-')
        return np.array(parts).astype(float).mean()
    except:
        print(dose)

In [30]:
meds['VALUE'] = meds['DOSE_VAL_RX'].map(find_mean_dose)

In [31]:
meds['FEATURE_NAME'] = "MED_" + meds['GSN'].astype(str)

In [32]:
dropper = [col for col in meds.columns if col not in {'SUBJECT_ID', 'DATE', 'FEATURE_NAME', 'VALUE'}]
meds_preprocessed = meds.drop(columns=dropper).rename(columns=renamer)

In [33]:
meds_preprocessed.head()

Unnamed: 0,SUBJECT_ID,DATE,VALUE,FEATURE_NAME
9807,138,2134-01-11,487.5,MED_004481
9808,138,2134-01-05,50.0,MED_001210
9809,138,2134-01-05,1.5,MED_008920
9810,138,2134-01-06,1.0,MED_014198
9811,138,2134-01-06,20.0,MED_008205


#### Notes

Here we can preprocess notes. Later the same things can be done using Spark.

In [34]:
def get_notes(sample_ids: set = None,
              note_path: str = "NOTEEVENTS.csv.gz",
              chunksize: int = 10_000) -> pd.DataFrame:
    """Get all notes or only those relevant for the sample."""
    if sample_ids is None:
        return pd.read_csv(os.path.join(PATH, note_path))
    return get_data_for_sample(sample_ids, note_path, chunksize)

In [35]:
# notes_sample = get_notes(SAMPLE_IDS)
# notes_sample.to_csv(os.path.join(PATH_PROCESSED, 'SAMPLE_NOTES.csv'), index=False)

notes = pd.read_csv(os.path.join(PATH_PROCESSED, 'SAMPLE_NOTES.csv'))

In [36]:
notes.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,223,5350,169684.0,2143-04-30,,,Discharge summary,Report,,,Admission Date: [**2143-4-25**] Discharge...
1,224,5350,169684.0,2143-04-30,,,Discharge summary,Report,,,Admission Date: [**2143-4-25**] Discharge...
2,340,59977,127266.0,2143-11-23,,,Discharge summary,Report,,,Admission Date: [**2143-11-17**] ...
3,474,1675,185767.0,2155-07-08,,,Discharge summary,Report,,,Admission Date: [**2155-7-4**] Discharg...
4,1036,55873,137057.0,2193-09-16,,,Discharge summary,Report,,,Admission Date: [**2193-8-29**] ...


In [37]:
notes['DATE'] = pd.to_datetime(notes['CHARTDATE']).dt.date

In [38]:
def clean_text(note: str):
    cleaned = re.sub(r'[^\w]', ' ', note).replace("_", " ")
    removed_spaces = re.sub(' +', ' ', cleaned)
    lower = removed_spaces.lower()
    return lower

In [39]:
notes['CLEAN_TEXT'] = notes['TEXT'].map(clean_text)

In [40]:
notes.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,DATE,CLEAN_TEXT
0,223,5350,169684.0,2143-04-30,,,Discharge summary,Report,,,Admission Date: [**2143-4-25**] Discharge...,2143-04-30,admission date 2143 4 25 discharge date 2143 4...
1,224,5350,169684.0,2143-04-30,,,Discharge summary,Report,,,Admission Date: [**2143-4-25**] Discharge...,2143-04-30,admission date 2143 4 25 discharge date 2143 4...
2,340,59977,127266.0,2143-11-23,,,Discharge summary,Report,,,Admission Date: [**2143-11-17**] ...,2143-11-23,admission date 2143 11 17 discharge date 2143 ...
3,474,1675,185767.0,2155-07-08,,,Discharge summary,Report,,,Admission Date: [**2155-7-4**] Discharg...,2155-07-08,admission date 2155 7 4 discharge date 2155 7 ...
4,1036,55873,137057.0,2193-09-16,,,Discharge summary,Report,,,Admission Date: [**2193-8-29**] ...,2193-09-16,admission date 2193 8 29 discharge date 2193 9...


### Feature construction

We need to take into account only the events that happened during the observation window. The end of observation window is N days before death for deceased patients and date of last event for alive patients. We can have several sets of events (e.g. labs, diags, meds), so we need to choose the latest date out of those.

In [41]:
OBSERVATION_WINDOW = 2000
PREDICTION_WINDOW = 50

In [42]:
def define_train_period(*feature_sets, 
                        obs_w: int = OBSERVATION_WINDOW, 
                        pred_w: int = PREDICTION_WINDOW) -> (dict, dict):
    """Create SUBJECT_ID -> earliest_date and SUBJECT_ID -> last date dicts."""
    cols = ['SUBJECT_ID', 'DATE']
    all_feats = pd.concat([feats[cols] for feats in feature_sets])
    last_date_base = all_feats.groupby('SUBJECT_ID').DATE.max().to_dict()
    last_date = {subj_id: date
                 for subj_id, date in last_date_base.items()
                 if subj_id not in DECEASED_TO_DATE}
    subtracted_pred_w = {subj_id: date - dt.timedelta(days=pred_w)
                         for subj_id, date in DECEASED_TO_DATE.items()}
    last_date.update(subtracted_pred_w)
    earliest_date = {subj_id: date - dt.timedelta(days=obs_w)
                     for subj_id, date in last_date.items()}
    return earliest_date, last_date

In [43]:
def clean_up_feature_sets(*feature_sets, earliest_date: dict, last_date: dict) -> list:
    """Leave only features from inside the observation window."""
    results = []
    for feats in feature_sets:
        results.append(feats[(feats.DATE < feats.SUBJECT_ID.map(last_date))
                             & (feats.DATE >= feats.SUBJECT_ID.map(earliest_date))])
    return results

In [44]:
use_feature_sets = [diag_preprocessed, lab_preprocessed, meds_preprocessed]

In [45]:
earliest_date, last_date = define_train_period(*use_feature_sets)

In [46]:
diag, lab, med = clean_up_feature_sets(*use_feature_sets, earliest_date=earliest_date, last_date=last_date)

#### Feat calculations

We are going to do a train test split based on patients to validate our model. We will only use those features that appear in the train set. Also, we will only use features that are shared between many patients (we will define "many" manually for each of the feature sets).  

This way we will lose some patients who don't have "popular" features, but that's fine since our goal is to compare similar patients, not to train the best model.

In [47]:
TRAIN_SIZE = 0.8

In [48]:
train_ids, test_ids = train_test_split(list(SAMPLE_IDS), train_size=TRAIN_SIZE, random_state=RANDOM_SEED)

In [49]:
def build_feats(df: pd.DataFrame, agg: list, train_ids: list = None, low_thresh: int = None) -> pd.DataFrame:
    """Build feature aggregations for patient.
    
    Args:
        agg: list of aggregations to use
        train_ids: if not empty, only features that exist in the train set 
            will be used
        low_thresh: if not empty, only features that more than low_thresh
            patients have will be used
    """
    cols_to_use = ['SUBJECT_ID', 'FEATURE_NAME']
    print(f"Total feats: {df.FEATURE_NAME.nunique()}")
    
    if train_ids is not None:
        train_df = df[df.SUBJECT_ID.isin(train_ids)]
        train_feats = set(train_df.FEATURE_NAME)
        df = df[df.FEATURE_NAME.isin(train_feats)]
        print(f"Feats after leaving only train: {len(train_feats)}")
        
    if low_thresh is not None:
        deduplicated = df.drop_duplicates(cols_to_use)
        count = Counter(deduplicated.FEATURE_NAME)
        features_to_leave = set(feat for feat, cnt in count.items() if cnt > low_thresh)
        df = df[df.FEATURE_NAME.isin(features_to_leave)]
        print(f"Feats after removing rare: {len(features_to_leave)}")
    
    grouped = df.groupby(cols_to_use).agg(agg)
    return grouped

In [50]:
def pivot_aggregation(df: pd.DataFrame, fill_value = None, use_sparse: bool = True) -> pd.DataFrame:
    """Make sparse pivoted table with SUBJECT_ID as index."""
    pivoted = df.unstack()
    if fill_value is not None:
        pivoted = pivoted.fillna(fill_value)
    
    if use_sparse:
        pivoted = pivoted.astype(pd.SparseDtype("float", fill_value))
    
    pivoted.columns = [f"{col[-1]}_{col[1]}" for col in pivoted.columns]
    return pivoted

In [51]:
diag_built = build_feats(diag, agg=[lambda x: x.sum() > 0], train_ids=train_ids, low_thresh=30)

Total feats: 1821
Feats after leaving only train: 1643
Feats after removing rare: 52


In [52]:
diag_final = pivot_aggregation(diag_built, fill_value=0)

In [53]:
diag_final.head()

Unnamed: 0_level_0,ICD9_0389_<lambda>,ICD9_2449_<lambda>,ICD9_25000_<lambda>,ICD9_2720_<lambda>,ICD9_2724_<lambda>,ICD9_2749_<lambda>,ICD9_2761_<lambda>,ICD9_2762_<lambda>,ICD9_2767_<lambda>,ICD9_2851_<lambda>,ICD9_2859_<lambda>,ICD9_2875_<lambda>,ICD9_3051_<lambda>,ICD9_311_<lambda>,ICD9_4019_<lambda>,ICD9_40390_<lambda>,ICD9_41071_<lambda>,ICD9_412_<lambda>,ICD9_41401_<lambda>,ICD9_4168_<lambda>,ICD9_4240_<lambda>,ICD9_4241_<lambda>,ICD9_42731_<lambda>,ICD9_42789_<lambda>,ICD9_4280_<lambda>,ICD9_4589_<lambda>,ICD9_486_<lambda>,ICD9_496_<lambda>,ICD9_5070_<lambda>,ICD9_5119_<lambda>,ICD9_5180_<lambda>,ICD9_51881_<lambda>,ICD9_53081_<lambda>,ICD9_5845_<lambda>,ICD9_5849_<lambda>,ICD9_5859_<lambda>,ICD9_5990_<lambda>,ICD9_769_<lambda>,ICD9_7742_<lambda>,ICD9_7793_<lambda>,ICD9_78039_<lambda>,ICD9_99592_<lambda>,ICD9_9971_<lambda>,ICD9_V053_<lambda>,ICD9_V1582_<lambda>,ICD9_V290_<lambda>,ICD9_V3000_<lambda>,ICD9_V3001_<lambda>,ICD9_V4581_<lambda>,ICD9_V502_<lambda>,ICD9_V5861_<lambda>,ICD9_V5867_<lambda>
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1
138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
211,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
261,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
labs_built = build_feats(lab, agg=['mean', 'max', 'min'], train_ids=train_ids, low_thresh=50)

Total feats: 543
Feats after leaving only train: 535
Feats after removing rare: 164


In [55]:
labs_final = pivot_aggregation(labs_built, fill_value=0)

In [56]:
labs_final.head()

Unnamed: 0_level_0,LAB_50800_mean,LAB_50801_mean,LAB_50802_mean,LAB_50803_mean,LAB_50804_mean,LAB_50806_mean,LAB_50808_mean,LAB_50809_mean,LAB_50810_mean,LAB_50811_mean,LAB_50812_mean,LAB_50813_mean,LAB_50815_mean,LAB_50816_mean,LAB_50817_mean,LAB_50818_mean,LAB_50819_mean,LAB_50820_mean,LAB_50821_mean,LAB_50822_mean,LAB_50823_mean,LAB_50824_mean,LAB_50825_mean,LAB_50826_mean,LAB_50827_mean,LAB_50828_mean,LAB_50852_mean,LAB_50856_mean,LAB_50861_mean,LAB_50862_mean,LAB_50863_mean,LAB_50867_mean,LAB_50868_mean,LAB_50878_mean,LAB_50879_mean,LAB_50880_mean,LAB_50882_mean,LAB_50883_mean,LAB_50884_mean,LAB_50885_mean,LAB_50887_mean,LAB_50889_mean,LAB_50893_mean,LAB_50902_mean,LAB_50903_mean,LAB_50904_mean,LAB_50905_mean,LAB_50907_mean,LAB_50908_mean,LAB_50909_mean,LAB_50910_mean,LAB_50911_mean,LAB_50912_mean,LAB_50917_mean,LAB_50919_mean,LAB_50920_mean,LAB_50922_mean,LAB_50924_mean,LAB_50925_mean,LAB_50930_mean,LAB_50931_mean,LAB_50933_mean,LAB_50935_mean,LAB_50952_mean,LAB_50953_mean,LAB_50954_mean,LAB_50955_mean,LAB_50956_mean,LAB_50960_mean,LAB_50963_mean,LAB_50964_mean,LAB_50967_mean,LAB_50970_mean,LAB_50971_mean,LAB_50976_mean,...,LAB_51074_min,LAB_51075_min,LAB_51078_min,LAB_51079_min,LAB_51082_min,LAB_51087_min,LAB_51090_min,LAB_51092_min,LAB_51093_min,LAB_51097_min,LAB_51099_min,LAB_51100_min,LAB_51102_min,LAB_51103_min,LAB_51104_min,LAB_51137_min,LAB_51143_min,LAB_51144_min,LAB_51146_min,LAB_51151_min,LAB_51200_min,LAB_51214_min,LAB_51221_min,LAB_51222_min,LAB_51233_min,LAB_51237_min,LAB_51244_min,LAB_51246_min,LAB_51248_min,LAB_51249_min,LAB_51250_min,LAB_51251_min,LAB_51252_min,LAB_51254_min,LAB_51255_min,LAB_51256_min,LAB_51257_min,LAB_51260_min,LAB_51265_min,LAB_51266_min,LAB_51267_min,LAB_51268_min,LAB_51274_min,LAB_51275_min,LAB_51277_min,LAB_51279_min,LAB_51283_min,LAB_51287_min,LAB_51288_min,LAB_51294_min,LAB_51296_min,LAB_51301_min,LAB_51462_min,LAB_51463_min,LAB_51464_min,LAB_51466_min,LAB_51476_min,LAB_51478_min,LAB_51479_min,LAB_51482_min,LAB_51484_min,LAB_51486_min,LAB_51487_min,LAB_51491_min,LAB_51492_min,LAB_51493_min,LAB_51498_min,LAB_51501_min,LAB_51506_min,LAB_51508_min,LAB_51512_min,LAB_51514_min,LAB_51516_min,LAB_51519_min,LAB_51523_min
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1
138,0.0,0.0,0.807692,0.0,27.076923,104.666667,1.134286,136.1875,34.636364,11.572727,0.0,1.45,0.0,46.666667,97.888889,44.230769,0.0,7.381071,195.0,4.447059,0.0,136.25,0.0,0.0,0.0,0.0,0.0,0.0,110.4,3.516667,83.777778,116.333333,10.65625,54.0,0.0,0.0,27.46875,0.1,0.2,0.377778,0.0,0.0,8.577273,102.9375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,115.53125,0.0,36.0,0.0,0.0,410.0,0.0,85.666667,1.94,0.0,0.0,0.0,3.331818,4.162857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,1.1,323.0,26.8,9.2,0.0,1.0,19.3,0.0,28.7,31.7,85.0,0.0,0.0,5.0,0.0,71.4,0.0,0.0,81.0,0.0,0.0,0.0,12.2,29.5,12.8,3.02,0.0,0.0,0.0,0.0,0.0,6.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,1.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.5,0.0,0.0,0.0,22.0,0.3,7.6,7.9,0.0,0.0,0.0,109.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.35,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,0.0,42.8,15.1,0.0,0.0,57.0,0.0,36.3,35.4,103.0,0.0,0.0,5.0,0.0,32.0,2.0,0.0,353.0,0.0,0.0,0.0,0.0,0.0,17.3,4.16,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,7.9,8.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.0,0.0,48.4,16.5,0.0,0.0,56.0,0.0,40.0,34.2,117.0,0.0,0.0,9.0,0.0,28.0,4.0,0.0,240.0,0.0,0.0,0.0,0.0,0.0,15.7,4.14,0.0,0.0,0.0,0.0,0.0,10.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
211,0.0,0.0,-0.475,0.0,25.1,0.0,1.132,130.913043,29.285714,9.742857,0.0,1.525,4.5,40.0,94.321429,40.225,0.0,7.388537,122.175,4.072727,0.0,138.333333,36.5,0.0,0.0,0.0,7.1,0.0,23.75,3.25,73.25,25.0,12.62963,47.75,0.0,0.0,26.607143,0.2,0.0,1.0,0.0,0.0,8.15,104.857143,5.2,49.0,173.0,253.0,0.8,42.3,1260.285714,8.0,1.12069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,126.555556,0.0,0.0,0.0,0.0,0.0,0.0,21.5,2.19375,0.0,0.0,0.0,3.847619,4.048387,0.0,...,0.0,0.0,0.0,0.0,164.0,0.0,0.0,0.0,408.0,0.0,0.0,0.0,0.0,0.0,634.0,0.0,0.0,18.0,0.0,0.0,0.0,135.0,22.2,7.8,0.0,1.0,1.0,0.0,29.3,32.7,89.0,0.0,0.0,2.0,0.0,65.5,0.0,0.0,135.0,0.0,0.0,0.0,12.5,24.5,13.2,2.34,0.0,0.0,0.0,0.0,0.0,5.4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,1.011,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
261,0.0,0.0,3.4,0.0,27.8,99.0,1.05,243.666667,0.0,0.0,0.0,1.8,0.0,8.0,95.5,34.2,0.0,7.502,67.0,4.15,0.0,129.0,0.0,22.0,0.0,0.0,0.0,0.0,10.5,2.7,182.0,0.0,11.526316,17.0,0.0,0.0,29.947368,0.0,0.0,0.5,0.0,0.0,7.633333,98.210526,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.778947,1.1,0.0,0.0,0.0,0.0,0.0,0.0,177.380952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.87,0.0,0.0,0.0,3.95,4.081818,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,389.0,0.0,0.0,0.0,0.0,0.0,216.0,0.0,0.0,0.0,0.2,0.0,1.0,0.0,27.7,9.2,0.0,1.4,10.4,0.0,28.1,31.5,87.0,0.0,0.0,3.3,0.0,85.2,0.0,0.0,206.0,0.0,0.0,0.0,14.1,31.3,15.6,3.16,0.0,0.0,52.0,0.0,0.0,5.8,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,5.0,30.0,0.0,1.019,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0


In [57]:
meds_built = build_feats(med, agg=['mean', 'count'], train_ids=train_ids, low_thresh=50)

Total feats: 2050
Feats after leaving only train: 1952
Feats after removing rare: 120


In [58]:
meds_final = pivot_aggregation(meds_built, fill_value=0)

In [59]:
meds_final.head()

Unnamed: 0_level_0,MED_000283_mean,MED_001187_mean,MED_001209_mean,MED_001210_mean,MED_001248_mean,MED_001255_mean,MED_001262_mean,MED_001285_mean,MED_001356_mean,MED_001417_mean,MED_001723_mean,MED_001972_mean,MED_001989_mean,MED_002006_mean,MED_002689_mean,MED_002944_mean,MED_002947_mean,MED_003009_mean,MED_003017_mean,MED_003026_mean,MED_003067_mean,MED_003753_mean,MED_003757_mean,MED_003779_mean,MED_004070_mean,MED_004072_mean,MED_004103_mean,MED_004222_mean,MED_004225_mean,MED_004376_mean,MED_004380_mean,MED_004478_mean,MED_004489_mean,MED_004758_mean,MED_004886_mean,MED_005039_mean,MED_005066_mean,MED_005068_mean,MED_005132_mean,MED_005229_mean,MED_006522_mean,MED_006532_mean,MED_006549_mean,MED_006562_mean,MED_008205_mean,MED_008208_mean,MED_009066_mean,MED_009588_mean,MED_011672_mean,MED_011673_mean,MED_011677_mean,MED_015361_mean,MED_016546_mean,MED_016796_mean,MED_016995_mean,MED_019187_mean,MED_019808_mean,MED_019964_mean,MED_021700_mean,MED_021732_mean,MED_022346_mean,MED_027413_mean,MED_027462_mean,MED_041384_mean,MED_041660_mean,MED_043952_mean,MED_045309_mean,MED_047635_mean,MED_048548_mean,MED_050631_mean,MED_057959_mean,MED_061716_mean,MED_063951_mean,MED_11673.0_mean,MED_1187.0_mean,...,MED_008208_count,MED_009066_count,MED_009588_count,MED_011672_count,MED_011673_count,MED_011677_count,MED_015361_count,MED_016546_count,MED_016796_count,MED_016995_count,MED_019187_count,MED_019808_count,MED_019964_count,MED_021700_count,MED_021732_count,MED_022346_count,MED_027413_count,MED_027462_count,MED_041384_count,MED_041660_count,MED_043952_count,MED_045309_count,MED_047635_count,MED_048548_count,MED_050631_count,MED_057959_count,MED_061716_count,MED_063951_count,MED_11673.0_count,MED_1187.0_count,MED_1210.0_count,MED_1248.0_count,MED_1255.0_count,MED_1262.0_count,MED_1285.0_count,MED_1356.0_count,MED_1417.0_count,MED_16546.0_count,MED_16796.0_count,MED_16995.0_count,MED_1723.0_count,MED_1972.0_count,MED_19808.0_count,MED_1989.0_count,MED_19964.0_count,MED_2006.0_count,MED_22346.0_count,MED_27413.0_count,MED_27462.0_count,MED_283.0_count,MED_2944.0_count,MED_2947.0_count,MED_3009.0_count,MED_3017.0_count,MED_3753.0_count,MED_4070.0_count,MED_41384.0_count,MED_4222.0_count,MED_4380.0_count,MED_43952.0_count,MED_4478.0_count,MED_4489.0_count,MED_45309.0_count,MED_47635.0_count,MED_5039.0_count,MED_50631.0_count,MED_5068.0_count,MED_5132.0_count,MED_5229.0_count,MED_57959.0_count,MED_61716.0_count,MED_63951.0_count,MED_6549.0_count,MED_8205.0_count,MED_nan_count
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1
138,0.0,0.0,0.0,75.0,40.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1000.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0,0.75,0.0,0.0,0.0,0.0,1.75,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,40.0,40.0,0.0,12.5,0.0,0.0,0.0,150.0,1000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,13.0,1.0,0.0,0.0,0.0,2.0,2.0,0.0,4.0,0.0,3.0,8.0,5.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,1.0,3.0,3.0,2.0,0.0,4.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,2.0,14.0
167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,1000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,5.0,8.0,0.0,3.0,0.0,2.0,3.0,0.0,1.0,3.0,2.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,2.0,1.0,4.0,1.0,1.0,1.0,5.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,19.0,11.0
352,0.0,1000.0,1000.0,750.0,0.0,0.0,0.0,15.0,2.0,2.0,0.0,700.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,325.0,81.0,0.0,487.5,0.0,0.0,0.0,0.0,60.0,0.0,0.0,25000.0,0.0,5000.0,5.0,30.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,1000.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0
536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Model Training

We will use random forest to automatically incorporate feature interrelations into our model.

In [60]:
from sklearn.ensemble import RandomForestClassifier

In [61]:
feats_to_train_on = [diag_final, meds_final, labs_final]

In [62]:
df_final = pd.concat(feats_to_train_on, axis=1).fillna(0)

In [63]:
target = pd.Series(df_final.index.isin(DECEASED_TO_DATE), index=df_final.index, name='target')

In [64]:
target.value_counts()

False    669
True     180
Name: target, dtype: int64

In [65]:
# pd.concat([df_final, target], axis=1).to_csv(os.path.join(PATH_DATASETS, 'diag_med_lab.csv'))

In [66]:
train_loc = df_final.index.isin(train_ids)

In [67]:
def train_and_predict(df: pd.DataFrame, target: pd.Series, train_loc: pd.Series, classifier) -> np.array:
    classifier.fit(df_final[train_loc], target[train_loc])
    pred = classifier.predict_proba(df_final[~train_loc])[:, 1]
    return pred

In [68]:
cl = RandomForestClassifier(random_state=RANDOM_SEED)

In [69]:
pred = train_and_predict(df_final, target, train_loc, cl)

In [70]:
roc_auc_score(target[~train_loc], pred)

0.8265490728177295

In [71]:
pd.Series(cl.feature_importances_, index=df_final.columns).sort_values(ascending=False).iloc[:10]

LAB_51006_mean     0.014228
LAB_51006_max      0.012731
LAB_51277_max      0.011634
LAB_51006_min      0.010673
LAB_50912_max      0.010099
LAB_50917_min      0.009474
LAB_50912_mean     0.008964
MED_1248.0_mean    0.008191
LAB_50912_min      0.007255
LAB_51301_min      0.007155
dtype: float64

### Add note TF-IDF

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [73]:
last_note = clean_up_feature_sets(notes, earliest_date=earliest_date, last_date=last_date)[0]

In [74]:
cols = ['SUBJECT_ID', 'DATE', 'CLEAN_TEXT']
last_note = last_note.sort_values(by=cols, ascending=False).drop_duplicates('SUBJECT_ID')[cols]

In [75]:
last_note = last_note[last_note.SUBJECT_ID.isin(SAMPLE_IDS)]

In [76]:
last_note

Unnamed: 0,SUBJECT_ID,DATE,CLEAN_TEXT
24200,99283,2125-12-12,2125 12 12 9 30 am chest pa lat clip clip num...
24346,98564,2124-05-02,2124 5 2 8 48 am carot cereb hospital1 clip c...
9148,98402,2111-03-08,title sicu hpi 55f s p fall down the stairs la...
22035,98318,2145-06-08,2145 6 8 4 30 am chest portable ap clip clip ...
20891,98051,2142-04-12,2142 4 12 1 34 pm pelvis ap inlet outlet in o...
...,...,...,...
111,261,2102-01-12,admission date 2101 12 27 discharge date 2102 ...
18346,211,2197-12-27,2197 12 27 2 02 pm chest pa lat clip clip num...
34512,180,2193-12-15,npn agree with coworker initials namepattern4 ...
34682,167,2163-04-12,social work met with mother at the bedside yes...


In [77]:
vectorizer = TfidfVectorizer(max_features=200)
tf_idf = vectorizer.fit_transform(last_note.CLEAN_TEXT)


In [78]:
cols = [f"TFIDF_{feat}" for feat in vectorizer.get_feature_names()]
tf_idf_feats = pd.DataFrame.sparse.from_spmatrix(tf_idf, columns=cols, index=last_note.SUBJECT_ID)

#### Training

In [79]:
# making sure no new rows are added
feats_to_train_on = [diag_final, meds_final, labs_final]
df_final = pd.concat(feats_to_train_on, axis=1).fillna(0)
tf_idf_feats = tf_idf_feats[tf_idf_feats.index.isin(df_final.index)]

In [80]:
feats_to_train_on = [diag_final, meds_final, labs_final, tf_idf_feats]

In [81]:
df_final = pd.concat(feats_to_train_on, axis=1).fillna(0)

In [82]:
target = pd.Series(df_final.index.isin(DECEASED_TO_DATE), index=df_final.index, name='target')

In [83]:
target.value_counts()

False    669
True     180
Name: target, dtype: int64

In [84]:
# pd.concat([df_final, target], axis=1).to_csv(os.path.join(PATH_DATASETS, 'diag_med_lab_tf_idf.csv'))

In [85]:
cl = RandomForestClassifier(random_state=RANDOM_SEED)

In [86]:
train_loc = df_final.index.isin(train_ids)

In [87]:
pred = train_and_predict(df_final, target, train_loc, cl)

In [88]:
roc_auc_score(target[~train_loc], pred)

0.862053369516056

In [89]:
pd.Series(cl.feature_importances_, index=df_final.columns).sort_values(ascending=False).iloc[:10]

LAB_51006_mean     0.016210
LAB_51006_max      0.011862
MED_1248.0_mean    0.009590
LAB_50912_mean     0.009552
TFIDF_discharge    0.008184
LAB_50912_max      0.007902
LAB_50924_max      0.007853
LAB_50917_min      0.006461
LAB_50917_max      0.006417
LAB_50924_mean     0.006410
dtype: float64

Better results, mostly from getting patient discharge information from notes.

### Add transformer embeddings

In [90]:
last_note

Unnamed: 0,SUBJECT_ID,DATE,CLEAN_TEXT
24200,99283,2125-12-12,2125 12 12 9 30 am chest pa lat clip clip num...
24346,98564,2124-05-02,2124 5 2 8 48 am carot cereb hospital1 clip c...
9148,98402,2111-03-08,title sicu hpi 55f s p fall down the stairs la...
22035,98318,2145-06-08,2145 6 8 4 30 am chest portable ap clip clip ...
20891,98051,2142-04-12,2142 4 12 1 34 pm pelvis ap inlet outlet in o...
...,...,...,...
111,261,2102-01-12,admission date 2101 12 27 discharge date 2102 ...
18346,211,2197-12-27,2197 12 27 2 02 pm chest pa lat clip clip num...
34512,180,2193-12-15,npn agree with coworker initials namepattern4 ...
34682,167,2163-04-12,social work met with mother at the bedside yes...


In [91]:
last_note = clean_up_feature_sets(notes, earliest_date=earliest_date, last_date=last_date)[0]

In [92]:
cols = ['SUBJECT_ID', 'DATE', 'TEXT']
last_note = last_note.sort_values(by=cols, ascending=False).drop_duplicates('SUBJECT_ID')[cols]

In [93]:
last_note = last_note[last_note.SUBJECT_ID.isin(SAMPLE_IDS)]

In [94]:
def prepare_text_for_tokenizer(text: str):
    cleaned = ('. ').join(text.splitlines())
    removed_symbols = re.sub('[\[\]\*\_#:?!]+', ' ', cleaned)
    removed_spaces = re.sub(' +', ' ', removed_symbols)
    removed_dots = re.sub('\. \.| \.', '.', removed_spaces)
    removed_duplicated_dots = re.sub('\.+', '.', removed_dots)
    return removed_duplicated_dots

In [95]:
prepare_text_for_tokenizer(last_note['TEXT'].iloc[0])

' 2125-12-12 9 30 AM. CHEST (PA & LAT) Clip Clip Number (Radiology) 33276. Reason To better characterise LLL shadowing. Admitting Diagnosis STROKE;TELEMETRY;TRANSIENT ISCHEMIC ATTACK. Hospital 2 MEDICAL CONDITION. 54 year old woman with R lateral medullary syndrome and LLL shadowing. resolved. REASON FOR THIS EXAMINATION. To better characterise LLL shadowing. FINAL REPORT. HISTORY To assess for left lower lobe shadowing. FINDINGS In comparison with the study of 12-8 , the patient has taken a much. better inspiration. Continued enlargement of the cardiac silhouette, though. no evidence of vascular congestion or pleural effusion. The lungs are. essentially clear. '

In [96]:
last_note['TO_TOK'] = last_note.TEXT.map(prepare_text_for_tokenizer)

In [111]:
last_note = last_note.reset_index(drop=True)

In [112]:
last_note.head()

Unnamed: 0,SUBJECT_ID,DATE,TEXT,TO_TOK
0,99283,2125-12-12,[**2125-12-12**] 9:30 AM\n CHEST (PA & LAT) ...,2125-12-12 9 30 AM. CHEST (PA & LAT) Clip Cli...
1,98564,2124-05-02,[**2124-5-2**] 8:48 AM\n CAROT/CEREB [**Hospit...,2124-5-2 8 48 AM. CAROT/CEREB Hospital1 Clip ...
2,98402,2111-03-08,[**2111-3-8**] 10:27 AM\n CT HEAD W/O CONTRAST...,2111-3-8 10 27 AM. CT HEAD W/O CONTRAST Clip ...
3,98318,2145-06-08,[**2145-6-8**] 4:30 AM\n CHEST (PORTABLE AP) ...,2145-6-8 4 30 AM. CHEST (PORTABLE AP) Clip Cl...
4,98051,2142-04-12,"[**2142-4-12**] 1:34 PM\n PELVIS (AP, INLET & ...","2142-4-12 1 34 PM. PELVIS (AP, INLET & OUTLET..."


In [98]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig

In [99]:
import torch

In [100]:
tokenizer = AutoTokenizer.from_pretrained("deepset/covid_bert_base")

In [101]:
config = AutoConfig.from_pretrained('deepset/covid_bert_base', output_hidden_states=True, output_attentions=True)

In [113]:
model = AutoModelForMaskedLM.from_pretrained("deepset/covid_bert_base", config=config)
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [114]:
def get_vector_for_text(text):
    """This is ugly and slow."""
    encoding = tokenizer(text, 
                         add_special_tokens=True, 
                         truncation=True, 
                         padding="max_length", 
                         return_attention_mask=True, 
                         return_tensors="pt")
    with torch.no_grad():
        outputs = model(**encoding)
        hs = outputs.hidden_states
        token_embeddings = torch.stack(hs, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)
        token_vecs = hs[-2][0]
        text_embedding = torch.mean(token_vecs, dim=0)
        return text_embedding

In [115]:
%%time
sentence = get_vector_for_text(last_note.TO_TOK.iloc[0])

CPU times: user 2.51 s, sys: 139 ms, total: 2.65 s
Wall time: 676 ms


In [116]:
torch.save(sentence, f"../data/embeddings/99283.pt")

In [117]:
for row_num, row in tqdm(last_note.iloc[0:].iterrows()):
    text = row['TO_TOK']
    subj_id = row['SUBJECT_ID']
    embedding = get_vector_for_text(text)
    torch.save(embedding, f"../data/embeddings/{subj_id}.pt")

0it [00:00, ?it/s]

In [119]:
from glob import glob

In [125]:
subj_ids = []
embeds = []
for file in tqdm(glob('../data/embeddings/*')):
    name = file.split('/')[-1]
    subj_id = int(name.split('.')[0])
    embedding = torch.load(file)
    subj_ids.append(subj_id)
    embeds.append(np.array(embedding))

  0%|          | 0/833 [00:00<?, ?it/s]

In [127]:
embed_df = pd.DataFrame(embeds, index=subj_ids)

In [128]:
embed_df.columns = [f"EMBED_{i}" for i in embed_df.columns]

In [129]:
embed_df.head()

Unnamed: 0,EMBED_0,EMBED_1,EMBED_2,EMBED_3,EMBED_4,EMBED_5,EMBED_6,EMBED_7,EMBED_8,EMBED_9,EMBED_10,EMBED_11,EMBED_12,EMBED_13,EMBED_14,EMBED_15,EMBED_16,EMBED_17,EMBED_18,EMBED_19,EMBED_20,EMBED_21,EMBED_22,EMBED_23,EMBED_24,EMBED_25,EMBED_26,EMBED_27,EMBED_28,EMBED_29,EMBED_30,EMBED_31,EMBED_32,EMBED_33,EMBED_34,EMBED_35,EMBED_36,EMBED_37,EMBED_38,EMBED_39,EMBED_40,EMBED_41,EMBED_42,EMBED_43,EMBED_44,EMBED_45,EMBED_46,EMBED_47,EMBED_48,EMBED_49,EMBED_50,EMBED_51,EMBED_52,EMBED_53,EMBED_54,EMBED_55,EMBED_56,EMBED_57,EMBED_58,EMBED_59,EMBED_60,EMBED_61,EMBED_62,EMBED_63,EMBED_64,EMBED_65,EMBED_66,EMBED_67,EMBED_68,EMBED_69,EMBED_70,EMBED_71,EMBED_72,EMBED_73,EMBED_74,...,EMBED_693,EMBED_694,EMBED_695,EMBED_696,EMBED_697,EMBED_698,EMBED_699,EMBED_700,EMBED_701,EMBED_702,EMBED_703,EMBED_704,EMBED_705,EMBED_706,EMBED_707,EMBED_708,EMBED_709,EMBED_710,EMBED_711,EMBED_712,EMBED_713,EMBED_714,EMBED_715,EMBED_716,EMBED_717,EMBED_718,EMBED_719,EMBED_720,EMBED_721,EMBED_722,EMBED_723,EMBED_724,EMBED_725,EMBED_726,EMBED_727,EMBED_728,EMBED_729,EMBED_730,EMBED_731,EMBED_732,EMBED_733,EMBED_734,EMBED_735,EMBED_736,EMBED_737,EMBED_738,EMBED_739,EMBED_740,EMBED_741,EMBED_742,EMBED_743,EMBED_744,EMBED_745,EMBED_746,EMBED_747,EMBED_748,EMBED_749,EMBED_750,EMBED_751,EMBED_752,EMBED_753,EMBED_754,EMBED_755,EMBED_756,EMBED_757,EMBED_758,EMBED_759,EMBED_760,EMBED_761,EMBED_762,EMBED_763,EMBED_764,EMBED_765,EMBED_766,EMBED_767
5285,-0.798802,0.130756,0.671988,-0.503266,0.321561,0.167486,-0.138109,0.283182,0.142994,-0.086918,0.140263,-0.11794,-0.134491,-0.162193,-0.184334,0.658178,0.637654,-0.340463,-0.512746,0.33817,0.990492,-0.499885,0.328712,0.459036,0.205726,0.165479,-0.164664,-0.676376,-0.046614,0.270833,0.700922,-0.117788,0.29381,-0.341717,0.631551,-0.430869,0.051414,-0.237179,0.380903,0.184331,-0.295622,-0.296313,0.011057,-0.000363,-0.522852,-0.027841,0.378444,-0.294858,-0.329169,-0.059924,-0.602925,-0.385992,0.456208,-0.223731,-0.057904,0.305034,-0.073592,0.166305,-0.382342,-0.245806,0.132887,-0.10945,-0.022951,0.221922,-0.16622,0.096496,0.056278,0.160964,-0.704472,0.355464,0.107311,0.002045,0.788721,0.161504,0.436051,...,-0.121475,-0.185049,0.282081,-0.030035,-0.410694,-0.204711,-0.309379,0.954139,-0.491521,-0.27713,0.209304,-0.029853,-0.255413,1.266172,0.111401,-0.1166,0.157362,-0.695698,-0.132504,0.333457,0.081255,-0.201507,0.728113,-0.056593,-0.100532,-0.303419,0.222861,0.131474,0.369915,-0.008144,0.067196,-0.37312,-0.313959,-0.502534,0.126407,-0.704903,-0.896134,0.15031,0.02064,-0.054072,-0.42849,0.013663,-0.165275,0.029552,-0.315281,0.033213,0.616471,0.111539,-0.015011,0.33324,-0.135848,-0.186889,0.49566,0.19471,0.88384,-0.085824,0.129288,-0.236622,0.412517,-0.154571,-0.272965,-0.093266,-0.716633,-0.475912,-0.737194,-0.524004,-0.109473,0.133466,-0.297956,0.068109,-0.111506,-0.82445,0.068347,0.461592,-0.238279
98051,-0.280688,0.80274,0.54837,-0.282611,-0.028171,0.093071,-0.152611,-0.296869,0.203632,-0.411534,-0.262536,0.046728,-0.139504,-0.321624,0.19894,0.517119,0.097881,-0.324706,-0.091824,0.148899,0.415854,-0.000608,0.032371,0.404749,-0.18033,0.190517,-0.256167,-0.115635,-0.362978,-0.021065,0.484093,-0.030303,0.181302,-0.141996,0.148368,0.034773,-0.032207,-0.273553,0.175443,0.057039,0.067191,-0.204902,-0.011195,-0.037448,-0.826448,-0.058914,0.166248,-0.000723,-0.521033,0.145015,-0.508619,-0.177843,0.515539,-0.12297,-0.487692,0.30274,-0.399523,-0.294083,-0.015211,-0.459082,0.34645,-0.168052,0.055544,0.361819,-0.61194,0.151686,-0.075463,0.279492,-0.188566,0.60538,-0.13925,0.204843,0.707199,0.401628,0.041722,...,-0.489388,0.061825,-0.214151,0.075408,-0.356072,-0.273681,-0.178658,0.09687,-0.390266,0.035978,-0.027068,-0.300614,-0.005389,0.683946,0.088988,-0.256731,0.487871,0.075371,-0.03795,0.301867,-0.305417,0.311921,0.88669,-0.252098,-0.307613,0.09233,0.817284,0.106059,0.724567,0.312704,0.606344,-0.194923,-0.255452,-0.280414,-0.087071,-0.54828,-0.94683,0.028157,0.300649,-0.26886,0.438057,-0.527234,0.052847,0.506063,-0.229447,0.102302,0.461137,0.119273,0.287434,0.016432,-0.473277,-0.037758,0.225377,0.132918,-0.011153,-0.402879,0.096427,0.188822,0.288018,0.255821,-0.402288,-0.481334,-0.393742,0.014213,-0.47647,0.0916,0.112254,0.040658,-0.675892,0.126579,0.088465,-0.711551,0.274893,-0.122412,-0.127703
15172,-0.414744,0.021008,0.524008,-0.399622,0.513473,0.07585,-0.094331,0.160585,0.187512,-0.48238,-0.04716,-0.088703,-0.299282,-0.43688,-0.110216,0.615391,0.535529,-0.038922,-0.394013,0.3975,0.642235,-0.337195,0.51869,0.311151,0.152609,-0.351956,-0.076573,-0.132275,0.289071,-0.020222,0.24203,0.371205,0.365996,-0.226079,0.358797,0.009828,-0.075149,0.163555,-0.064947,-0.128361,-0.184762,-0.196806,-0.30074,-0.434965,-0.733609,0.078693,0.380415,-0.106653,-0.302421,0.278402,0.199108,-0.496028,0.260483,-0.180037,0.10538,-0.035527,-0.18105,-0.037626,-0.043648,-0.376816,0.214007,0.030185,-0.461027,0.055144,-0.365699,0.182524,0.167663,0.301716,-0.253953,0.485669,0.072951,0.208921,0.525274,0.162466,-0.023446,...,-0.003362,-0.185176,0.038473,0.118952,-0.295013,-0.695434,-0.215306,0.884778,-0.508358,-0.070256,0.012543,-0.185443,-0.307983,0.843137,-0.047293,-0.182796,-0.424801,-0.388191,-0.008918,0.368762,-0.0219,-0.113036,0.509355,-0.55889,0.101244,-0.253927,0.362122,0.101458,0.43264,0.105878,0.03785,-0.315225,-0.291166,-0.394223,-0.029524,-0.696403,-0.82824,-0.031419,0.341811,-0.205195,-0.076328,-0.101417,0.127234,0.072907,-0.28681,-0.394943,0.360002,0.131855,-0.065004,0.202663,-0.033341,-0.161072,0.47582,-0.064704,0.461814,0.275205,-0.499793,0.221447,0.747472,0.482716,0.205197,-0.440048,-0.284466,-0.113503,-0.523377,-0.154417,0.091929,0.317044,0.011682,-0.085569,-0.196497,-0.756556,0.348,-0.443147,-0.224361
41257,-0.842999,0.031648,0.632068,-0.601519,0.434381,-0.064112,0.126074,0.293112,-0.046643,0.133917,0.132017,-0.05003,-0.166736,-0.123393,-0.164162,0.506986,0.82046,-0.221554,-0.622828,0.505789,1.012208,-0.541087,0.348817,0.232747,-0.011625,0.086966,-0.158313,-0.877268,-0.210244,0.066717,0.790384,0.068049,0.116933,-0.018498,0.758664,-0.29316,0.245151,-0.178581,0.349478,0.249381,-0.299569,-0.302833,-0.168717,-0.069499,-0.581013,0.002317,0.411438,-0.417711,-0.281043,-0.301367,-0.543026,-0.337432,0.669424,-0.345663,-0.079265,0.259159,-0.167023,0.260732,-0.35962,-0.216552,0.237773,-0.26064,0.059075,0.434136,-0.172505,0.118521,0.020008,0.274847,-0.704293,0.394395,-0.001161,-0.062747,0.658188,0.10929,0.355391,...,-0.210128,-0.277788,0.225926,-0.112506,-0.28873,-0.42532,-0.085733,0.5884,-0.552412,-0.163092,0.170172,-0.013826,-0.221776,1.168883,-0.100631,-0.241108,-0.056296,-0.731118,-0.162888,0.205745,0.039908,-0.232735,0.826579,-0.262552,-0.091319,-0.417739,0.298624,0.138679,0.283312,0.24093,0.348811,-0.322191,-0.306089,-0.672392,0.100485,-0.61501,-1.0992,0.197555,0.117936,-0.008405,-0.45659,-0.271509,-0.308355,0.106236,-0.593583,0.206155,0.554975,0.214126,-0.210225,0.249428,0.026533,-0.274545,0.462565,0.133483,0.972944,0.081681,-0.120025,-0.264368,0.139046,-0.179958,-0.170151,-0.015946,-0.807186,-0.402417,-0.867511,-0.724654,-0.194507,0.137298,-0.261988,0.158016,-0.146899,-0.78096,-0.03552,0.350281,-0.380913
67154,-0.447635,0.062718,0.376936,-0.305529,0.30912,0.242256,-0.171642,-0.203261,-0.148735,-0.529392,0.038897,0.037419,-0.023907,-0.539065,0.125974,0.722225,0.270403,-0.144585,-0.094342,0.187173,0.482168,-0.293688,0.373881,0.492459,-0.113738,-0.084038,-0.303313,-0.294956,-0.118955,-0.250179,0.316651,0.244253,0.160146,-0.294944,0.258566,0.224622,-0.19163,0.03269,0.287607,-0.201182,-0.082706,-0.108723,-0.122696,-0.188311,-0.655219,0.049095,0.236593,0.067479,-0.474038,0.262903,-0.405772,-0.380182,0.442027,-0.432599,-0.343862,0.143039,-0.282629,0.037388,-0.055119,-0.344272,0.55559,-0.220835,-0.210492,0.10629,-0.236861,-0.0712,0.179933,0.288357,-0.080485,0.544151,-0.005643,-0.054372,0.421583,0.032561,-0.000711,...,-0.072968,-0.008397,0.078561,0.147991,-0.321658,-0.476216,0.127434,0.443928,-0.450771,0.000149,0.155021,0.085699,-0.207759,0.474185,-0.085649,-0.291156,0.512919,-0.08263,-0.084523,0.213701,-0.127176,0.318632,0.695798,-0.17196,-0.020055,0.043892,0.681453,0.171592,0.5608,0.142832,0.378218,-0.302443,-0.644294,-0.591003,-0.039776,-0.644871,-1.050573,-0.11219,0.198408,-0.287369,-0.283677,-0.343019,0.106222,0.270027,-0.12302,-0.422115,0.409869,0.325658,-0.006445,0.105292,-0.203967,0.191552,0.328378,-0.082142,0.172267,-0.234295,-0.273755,0.09146,0.335056,0.528141,-0.166128,-0.596034,-0.312793,0.13268,-0.544444,-0.1293,0.178918,0.285913,-0.333962,0.119855,0.18145,-1.005967,0.17671,-0.487329,0.165298


#### Training with embeds

In [130]:
# making sure no new rows are added
feats_to_train_on = [diag_final, meds_final, labs_final]
df_final = pd.concat(feats_to_train_on, axis=1).fillna(0)
embed_df = embed_df[embed_df.index.isin(df_final.index)]

In [132]:
feats_to_train_on = [diag_final, meds_final, labs_final, embed_df]

In [133]:
df_final = pd.concat(feats_to_train_on, axis=1).fillna(0)

In [134]:
target = pd.Series(df_final.index.isin(DECEASED_TO_DATE), index=df_final.index, name='target')

In [135]:
target.value_counts()

False    669
True     180
Name: target, dtype: int64

In [136]:
# pd.concat([df_final, target], axis=1).to_csv(os.path.join(PATH_DATASETS, 'diag_med_lab_tf_idf.csv'))

In [137]:
cl = RandomForestClassifier(random_state=RANDOM_SEED)

In [138]:
train_loc = df_final.index.isin(train_ids)

In [139]:
pred = train_and_predict(df_final, target, train_loc, cl)

In [140]:
roc_auc_score(target[~train_loc], pred)

0.8504070556309362

In [141]:
pd.Series(cl.feature_importances_, index=df_final.columns).sort_values(ascending=False).iloc[:10]

LAB_51006_max      0.009665
LAB_50924_mean     0.008362
LAB_50912_max      0.007733
LAB_51006_mean     0.007523
MED_1248.0_mean    0.007517
LAB_50912_mean     0.007306
LAB_51006_min      0.006452
LAB_51256_max      0.005481
EMBED_594          0.004969
LAB_50910_mean     0.004846
dtype: float64