This the code for walking through the lesson examples for your reference.

[Solution](https://www.youtube.com/watch?v=ymL-vyz-RoU&t=106s&ab_channel=Udacity)

## Code for Building Synthetic Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
#build synthetic line level example
NUMBER_RECORDS = 100000
NUMBER_ENCOUNTERS = 7800
NUMBER_PATIENTS = 1000

In [3]:
# Create random list of code sets for diagnosis, procedure, medication, and lab codes
dx_code_list = ["dx_code_" + str(x) for x in np.arange(1,100000)]
procedure_code_list =["procedure_code_" + str(x) for x in np.arange(0,73000)]
medication_code_list = ["medication_code_" + str(x) for x in np.arange(0,10000)]
lab_code_list = ["lab_code_" + str(x) for x in np.arange(0,10000)]

In [4]:
patient_id_list = ["udacity_health_patient_id_" + str(x) for x in np.arange(1, NUMBER_PATIENTS +1)]
encounter_id_list = ["udacity_health_encounter_id_" + str(x) for x in np.arange(1, NUMBER_ENCOUNTERS +1)]

In [5]:
def random_value_selection(field_value_list, number_records):
    #build normal probability distribution 
    field_prob_dist = np.random.dirichlet(np.ones(len(field_value_list)), size=1)[0] 
    #build random value list for field
    field_random_values = np.random.choice(field_value_list, number_records, p=field_prob_dist)
    return field_random_values

In [6]:
#patient_values = random_value_selection(patient_id_list, NUMBER_RECORDS)
encounter_values = random_value_selection(encounter_id_list, NUMBER_RECORDS)

In [7]:
encounter_patient_mapping = dict(zip(encounter_id_list,   random_value_selection(patient_id_list, NUMBER_ENCOUNTERS)))
patient_values = [encounter_patient_mapping[x] for x in encounter_values]

In [8]:
dx_value_mapping = dict(zip(encounter_id_list, random_value_selection(dx_code_list, NUMBER_ENCOUNTERS) ))
dx_values = [dx_value_mapping[x] for x in encounter_values ]

In [9]:
procedure_values = random_value_selection(procedure_code_list, NUMBER_RECORDS)
medication_values = random_value_selection(medication_code_list, NUMBER_RECORDS)
lab_values = random_value_selection(lab_code_list, NUMBER_RECORDS)

In [10]:
triplet_prob_choice = np.random.choice([0, 1, 2], NUMBER_RECORDS, p= np.random.dirichlet(np.ones(3), size=1)[0] )
line_triplet_values = list(zip(procedure_values, medication_values, lab_values, triplet_prob_choice))
selected_procedure_values = [x[0] if x[3] == 0 else np.nan for x in line_triplet_values ]
selected_medication_values = [x[1] if x[3] == 1 else np.nan for x in line_triplet_values]
selected_lab_values = [x[2] if x[3] == 2 else np.nan for x in line_triplet_values]

In [11]:
#add label
patient_label_mapping = dict(zip( patient_id_list, np.random.choice([0, 1], NUMBER_PATIENTS, replace=True, 
                                                                    p=[0.88, 0.12]) ))
label_values = [patient_label_mapping[x] for x in patient_values]

In [12]:
line_df = pd.DataFrame({ "ENCOUNTER_ID": encounter_values,
                        "PATIENT_ID": patient_values,
                        "PRINCIPAL_DIAGNOSIS_CODE": dx_values,
                        "PROCEDURE_CODE": selected_procedure_values,
                        "MEDICATION_CODE": selected_medication_values,
                        "LAB_CODE": selected_lab_values,
                        "LABEL": label_values
                       })

In [13]:
#line_df.to_csv("./data/SYNTHETIC_EHR_DATASET.csv", index=False)

## 1. Converting Line to Encounter Representation

### Load Synthetic EHR Line Dataset

In [14]:
ehr_line_df = pd.read_csv("SYNTHETIC_EHR_DATASET.csv")

In [15]:
ehr_line_df.head()

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
0,udacity_health_encounter_id_1933,udacity_health_patient_id_817,dx_code_56522,procedure_code_20005,,,0
1,udacity_health_encounter_id_5664,udacity_health_patient_id_594,dx_code_39264,,medication_code_7471,,0
2,udacity_health_encounter_id_1946,udacity_health_patient_id_169,dx_code_83619,,,lab_code_5311,0
3,udacity_health_encounter_id_1528,udacity_health_patient_id_870,dx_code_97434,procedure_code_31111,,,0
4,udacity_health_encounter_id_7185,udacity_health_patient_id_722,dx_code_68924,,medication_code_2250,,0


In [16]:
ehr_line_df[ehr_line_df['ENCOUNTER_ID']=='udacity_health_encounter_id_100']

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
14286,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,,lab_code_4198,0
19091,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,medication_code_7982,,0
29530,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,,lab_code_6603,0
34583,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,,lab_code_332,0
62325,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,medication_code_2452,,0
64967,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,,lab_code_1431,0
94636,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,,medication_code_279,,0


In [17]:
#note that this is for illustrative purposes only and for practicing key skills, 
# the actual data representation and combinations of codes not indicative of real thing

### Convert Line to Encounter Representation

In [18]:
# grouping fields 
grouping_field_list = ['ENCOUNTER_ID', 'PATIENT_ID', 'PRINCIPAL_DIAGNOSIS_CODE']
non_grouped_field_list = [c for c in ehr_line_df.columns if c not in grouping_field_list]

In [19]:
encounter_df = ehr_line_df.groupby(grouping_field_list)[non_grouped_field_list].agg(lambda x: 
                                                        list([y for y in x if y is not np.nan ] ) ).reset_index()

In [20]:
encounter_df[0:5]

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
0,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,"[procedure_code_58552, procedure_code_39776, p...","[medication_code_2350, medication_code_8630, m...","[lab_code_8835, lab_code_9859, lab_code_9032, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,udacity_health_encounter_id_10,udacity_health_patient_id_188,dx_code_74047,[],"[medication_code_7789, medication_code_3560, m...",[],"[1, 1, 1, 1]"
2,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,[],"[medication_code_7982, medication_code_2452, m...","[lab_code_4198, lab_code_6603, lab_code_332, l...","[0, 0, 0, 0, 0, 0, 0]"
3,udacity_health_encounter_id_1000,udacity_health_patient_id_525,dx_code_61569,[],[medication_code_4036],[],[0]
4,udacity_health_encounter_id_1001,udacity_health_patient_id_950,dx_code_90172,[procedure_code_30555],"[medication_code_6755, medication_code_5045]",[lab_code_9112],"[0, 0, 0, 0]"


In [21]:
ehr_line_df[ehr_line_df['ENCOUNTER_ID']=='udacity_health_encounter_id_1']

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
3246,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,medication_code_2350,,0
7901,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,medication_code_8630,,0
11765,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,,lab_code_8835,0
11950,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,procedure_code_58552,,,0
16057,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,medication_code_4030,,0
17961,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,procedure_code_39776,,,0
24877,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,medication_code_431,,0
26235,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,medication_code_7435,,0
35490,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,,lab_code_9859,0
42855,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,,medication_code_4338,,0


In [22]:
encounter_df[encounter_df['ENCOUNTER_ID']=='udacity_health_encounter_id_1']

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
0,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,"[procedure_code_58552, procedure_code_39776, p...","[medication_code_2350, medication_code_8630, m...","[lab_code_8835, lab_code_9859, lab_code_9032, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## 2. Converting Encounter to Longitudinal Representation

In [23]:
encounter_df.head()

Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
0,udacity_health_encounter_id_1,udacity_health_patient_id_186,dx_code_15406,"[procedure_code_58552, procedure_code_39776, p...","[medication_code_2350, medication_code_8630, m...","[lab_code_8835, lab_code_9859, lab_code_9032, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,udacity_health_encounter_id_10,udacity_health_patient_id_188,dx_code_74047,[],"[medication_code_7789, medication_code_3560, m...",[],"[1, 1, 1, 1]"
2,udacity_health_encounter_id_100,udacity_health_patient_id_585,dx_code_71465,[],"[medication_code_7982, medication_code_2452, m...","[lab_code_4198, lab_code_6603, lab_code_332, l...","[0, 0, 0, 0, 0, 0, 0]"
3,udacity_health_encounter_id_1000,udacity_health_patient_id_525,dx_code_61569,[],[medication_code_4036],[],[0]
4,udacity_health_encounter_id_1001,udacity_health_patient_id_950,dx_code_90172,[procedure_code_30555],"[medication_code_6755, medication_code_5045]",[lab_code_9112],"[0, 0, 0, 0]"


In [24]:
patient_grouping_field_list = ["PATIENT_ID"]
non_patient_agg_field_list = [c for c in encounter_df.columns if c not in patient_grouping_field_list]

In [25]:
long_df = encounter_df.groupby(patient_grouping_field_list)[non_patient_agg_field_list].agg(lambda x: 
                                                        list([y for y in x if y is not np.nan ] ) ).reset_index()

In [26]:
long_df.head()

Unnamed: 0,PATIENT_ID,ENCOUNTER_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
0,udacity_health_patient_id_1,"[udacity_health_encounter_id_1038, udacity_hea...","[dx_code_36196, dx_code_63471, dx_code_29114, ...","[[procedure_code_36285, procedure_code_21124, ...","[[medication_code_3772, medication_code_9214, ...","[[lab_code_3982, lab_code_306], [], [lab_code_...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
1,udacity_health_patient_id_10,"[udacity_health_encounter_id_1110, udacity_hea...","[dx_code_29609, dx_code_268, dx_code_26932, dx...","[[procedure_code_9379], [procedure_code_3052, ...","[[medication_code_7371, medication_code_2104, ...","[[lab_code_6457], [lab_code_2180, lab_code_693...","[[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, ..."
2,udacity_health_patient_id_100,"[udacity_health_encounter_id_1205, udacity_hea...","[dx_code_32095, dx_code_45376, dx_code_48998, ...","[[procedure_code_71055, procedure_code_29744, ...","[[medication_code_2399, medication_code_966, m...","[[lab_code_4928, lab_code_6524, lab_code_1713,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,udacity_health_patient_id_1000,"[udacity_health_encounter_id_1105, udacity_hea...","[dx_code_53764, dx_code_50924, dx_code_80218, ...","[[], [procedure_code_7870, procedure_code_2169...","[[medication_code_6580, medication_code_3007],...","[[], [lab_code_5468], [lab_code_7607], [lab_co...","[[1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1..."
4,udacity_health_patient_id_101,"[udacity_health_encounter_id_2058, udacity_hea...","[dx_code_13590, dx_code_29551]","[[procedure_code_45319, procedure_code_67294],...","[[medication_code_2532], [medication_code_4311...","[[lab_code_1206, lab_code_9967], []]","[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]"


In [27]:
example_patient_history = long_df[long_df['PATIENT_ID']=='udacity_health_patient_id_310']

In [28]:
example_patient_history

Unnamed: 0,PATIENT_ID,ENCOUNTER_ID,PRINCIPAL_DIAGNOSIS_CODE,PROCEDURE_CODE,MEDICATION_CODE,LAB_CODE,LABEL
211,udacity_health_patient_id_310,"[udacity_health_encounter_id_4647, udacity_hea...","[dx_code_74153, dx_code_95836, dx_code_39465, ...","[[procedure_code_40521, procedure_code_52188, ...","[[medication_code_7251, medication_code_2765, ...","[[lab_code_1372], [lab_code_1794], [lab_code_1...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [29]:
list(example_patient_history['ENCOUNTER_ID'].values)

[['udacity_health_encounter_id_4647',
  'udacity_health_encounter_id_5010',
  'udacity_health_encounter_id_551',
  'udacity_health_encounter_id_7210',
  'udacity_health_encounter_id_7331']]

In [30]:
list(example_patient_history['PRINCIPAL_DIAGNOSIS_CODE'].values)

[['dx_code_74153',
  'dx_code_95836',
  'dx_code_39465',
  'dx_code_66358',
  'dx_code_99743']]

In [31]:
list(example_patient_history['PROCEDURE_CODE'].values)

[[['procedure_code_40521',
   'procedure_code_52188',
   'procedure_code_57020',
   'procedure_code_11784',
   'procedure_code_50478'],
  ['procedure_code_12696',
   'procedure_code_2873',
   'procedure_code_28392',
   'procedure_code_1398'],
  ['procedure_code_49962'],
  [],
  ['procedure_code_31171']]]

## 3. How to Split Dataset at Patient Level

#### ***Objective:*** 
- Split dataset at patient level into train and test partitions
- Validate that split was done correctly

#### Dataset Splitting Tests
- Patient data in only one partition
- Total unique number of patients across all partitions = total number unique patients in original full dataset
- Total number of rows original dataset = sum of rows across splits

In [32]:
PATIENT_ID_FIELD = 'PATIENT_ID'
TEST_PERCENTAGE = 0.2

In [33]:
def split_dataset_patient_level(df, key, test_percentage=0.2):
    df = df.iloc[np.random.permutation(len(df))]
    unique_values = df[key].unique()
    total_values = len(unique_values)
    sample_size = round(total_values * (1 - test_percentage ))
    train = df[df[key].isin(unique_values[:sample_size])].reset_index(drop=True)
    test = df[df[key].isin(unique_values[sample_size:])].reset_index(drop=True)
    return train, test

In [34]:
train_df, test_df = split_dataset_patient_level(encounter_df, PATIENT_ID_FIELD, TEST_PERCENTAGE)

In [35]:
assert len(set(train_df[PATIENT_ID_FIELD].unique()).intersection(set(test_df[PATIENT_ID_FIELD].unique()))) == 0
print("Test passed for patient data in only one partition")

Test passed for patient data in only one partition


In [36]:
assert (train_df[PATIENT_ID_FIELD].nunique()  + test_df[PATIENT_ID_FIELD].nunique()) == encounter_df[PATIENT_ID_FIELD].nunique()
print("Test passed for number of unique patients being equal!")

Test passed for number of unique patients being equal!


In [37]:
assert len(train_df)  + len(test_df) == len(encounter_df)
print("Test passed for number of total rows equal!")

Test passed for number of total rows equal!


## 4. ETL with TF Dataset API and Pandas

NOTE: In some cases you may need to preprocess Pandas Dataframe to removed mixed types. In particular, remove null values and impute or remove rows (we will later impute with zero for numerical features).

In [38]:
import tensorflow as tf

In [39]:
swiss_dataset_path = "processed_swiss.csv"
swiss_df = pd.read_csv(swiss_dataset_path)
selected_col_list = ['age', 'thalach', 'cp', 'num_label']
subset_swiss_df = swiss_df[selected_col_list]

In [40]:
swiss_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num_label
0,32,1,1,95,0,?,0,127,0,.7,1,?,?,1
1,34,1,4,115,0,?,?,154,0,.2,1,?,?,1
2,35,1,4,?,0,?,0,130,1,?,?,?,7,3
3,36,1,4,110,0,?,0,125,1,1,2,?,6,1
4,38,0,4,105,0,?,0,166,0,2.8,1,?,?,2


In [41]:
subset_swiss_df.head()

Unnamed: 0,age,thalach,cp,num_label
0,32,127,1,1
1,34,154,4,1
2,35,130,4,3
3,36,125,4,1
4,38,166,4,2


In [42]:
#adapted from https://www.tensorflow.org/tutorials/structured_data/feature_columns
def df_to_dataset(df, predictor,  batch_size=32):
    df = df.copy()
    labels = df.pop(predictor)
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    ds = ds.shuffle(buffer_size=len(df))
    ds = ds.batch(batch_size)
    return ds

In [43]:
BATCH_SIZE = 64
PREDICTOR_FIELD = 'num_label'
sample_tf_ds = df_to_dataset(subset_swiss_df, PREDICTOR_FIELD, batch_size=BATCH_SIZE)

Metal device set to: Apple M1


2022-06-17 09:52:38.158207: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-06-17 09:52:38.158690: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [44]:
sample_feature_batch = next(iter(sample_tf_ds))[0]
sample_feature_batch

{'age': <tf.Tensor: shape=(64,), dtype=int64, numpy=
 array([62, 32, 51, 64, 63, 46, 61, 70, 51, 64, 42, 50, 55, 59, 65, 56, 65,
        59, 60, 73, 72, 58, 56, 43, 50, 53, 38, 66, 65, 56, 53, 57, 68, 51,
        56, 38, 60, 50, 47, 43, 50, 34, 41, 54, 56, 42, 38, 62, 60, 56, 62,
        65, 63, 55, 40, 51, 57, 47, 61, 56, 53, 74, 57, 62])>,
 'thalach': <tf.Tensor: shape=(64,), dtype=string, numpy=
 array([b'128', b'127', b'60', b'145', b'109', b'133', b'113', b'157',
        b'104', b'140', b'128', b'110', b'150', b'175', b'154', b'82',
        b'122', b'115', b'110', b'121', b'114', b'138', b'100', b'140',
        b'156', b'115', b'120', b'108', b'67', b'98', b'120', b'182',
        b'120', b'127', b'148', b'150', b'149', b'139', b'120', b'122',
        b'120', b'154', b'176', b'150', b'103', b'99', b'179', b'72',
        b'99', b'99', b'78', b'93', b'86', b'155', b'144', b'92', b'120',
        b'149', b'117', b'97', b'141', b'123', b'98', b'123'], dtype=object)>,
 'cp': <tf.Tensor: 

In [45]:
sample_label_batch = next(iter(sample_tf_ds))[1]
sample_label_batch

<tf.Tensor: shape=(64,), dtype=int64, numpy=
array([1, 2, 1, 1, 2, 1, 2, 3, 4, 2, 2, 1, 3, 3, 2, 3, 1, 2, 2, 2, 3, 0,
       1, 3, 2, 3, 2, 2, 1, 1, 2, 3, 3, 1, 3, 1, 1, 0, 1, 2, 3, 1, 1, 1,
       2, 1, 2, 2, 3, 3, 2, 1, 0, 2, 1, 2, 1, 1, 3, 3, 3, 2, 1, 2])>

## 5. Building Numerical Feature with TF Feature Column API

In [46]:
subset_swiss_df.head()

Unnamed: 0,age,thalach,cp,num_label
0,32,127,1,1
1,34,154,4,1
2,35,130,4,3
3,36,125,4,1
4,38,166,4,2


In [47]:
age_mean = subset_swiss_df['age'].describe()['mean']
age_std = subset_swiss_df['age'].describe()['std']
print("Mean age:{}\nStandard Deviation Age:{}".format(age_mean, age_std))

Mean age:55.31707317073171
Standard Deviation Age:9.032107639562039


In [48]:
import functools
def normalize_numeric_with_zscore(col, mean, std):
    return (col - mean)/std

def create_tf_numeric_feature(col, MEAN, STD,   default_value=0):
    normalizer = functools.partial(normalize_numeric_with_zscore, mean=MEAN, std=STD)
    return tf.feature_column.numeric_column(
    key=col, default_value = default_value, normalizer_fn=normalizer, dtype=tf.float64)

In [49]:
age_tf_feature = create_tf_numeric_feature('age', age_mean, age_std)

In [50]:
def demo(feature_column, example_batch):
    feature_layer = tf.keras.layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch))

In [51]:
print("Example continuous field:\n{}\n".format(age_tf_feature))
demo(age_tf_feature, sample_feature_batch)

Example continuous field:
NumericColumn(key='age', shape=(1,), default_value=(0,), dtype=tf.float64, normalizer_fn=functools.partial(<function normalize_numeric_with_zscore at 0x1690e94c0>, mean=55.31707317073171, std=9.032107639562039))

tf.Tensor(
[[ 0.7777778 ]
 [-2.5555556 ]
 [-0.44444445]
 [ 1.        ]
 [ 0.8888889 ]
 [-1.        ]
 [ 0.6666667 ]
 [ 1.6666666 ]
 [-0.44444445]
 [ 1.        ]
 [-1.4444444 ]
 [-0.5555556 ]
 [ 0.        ]
 [ 0.44444445]
 [ 1.1111112 ]
 [ 0.11111111]
 [ 1.1111112 ]
 [ 0.44444445]
 [ 0.5555556 ]
 [ 2.        ]
 [ 1.8888888 ]
 [ 0.33333334]
 [ 0.11111111]
 [-1.3333334 ]
 [-0.5555556 ]
 [-0.22222222]
 [-1.8888888 ]
 [ 1.2222222 ]
 [ 1.1111112 ]
 [ 0.11111111]
 [-0.22222222]
 [ 0.22222222]
 [ 1.4444444 ]
 [-0.44444445]
 [ 0.11111111]
 [-1.8888888 ]
 [ 0.5555556 ]
 [-0.5555556 ]
 [-0.8888889 ]
 [-1.3333334 ]
 [-0.5555556 ]
 [-2.3333333 ]
 [-1.5555556 ]
 [-0.11111111]
 [ 0.11111111]
 [-1.4444444 ]
 [-1.8888888 ]
 [ 0.7777778 ]
 [ 0.5555556 ]
 [ 0.11111111]


## 6. Building Categorical Features with TF Feature Column API

In [52]:
categorical_example_df = encounter_df[['ENCOUNTER_ID', 'PRINCIPAL_DIAGNOSIS_CODE', 'LABEL']] 

In [53]:
# for this task need to convert label from array to scalar value
categorical_example_df['LABEL'] = categorical_example_df['LABEL'].apply(lambda x: np.unique(x)[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categorical_example_df['LABEL'] = categorical_example_df['LABEL'].apply(lambda x: np.unique(x)[0])


In [54]:
categorical_example_df.head()

Unnamed: 0,ENCOUNTER_ID,PRINCIPAL_DIAGNOSIS_CODE,LABEL
0,udacity_health_encounter_id_1,dx_code_15406,0
1,udacity_health_encounter_id_10,dx_code_74047,1
2,udacity_health_encounter_id_100,dx_code_71465,0
3,udacity_health_encounter_id_1000,dx_code_61569,0
4,udacity_health_encounter_id_1001,dx_code_90172,0


### High Cardinality for Principal Diagnosis Code

In [55]:
categorical_example_df['PRINCIPAL_DIAGNOSIS_CODE'].nunique()

6752

### Generate Vocabulary File

In [56]:
#make vocab dir
import os
#os.mkdir("./vocab/")

In [57]:
# build vocab for categorical features
def write_vocabulary_file(vocab_list, field_name, default_value, vocab_dir='./vocab/'):
    output_file_path = os.path.join(vocab_dir, str(field_name) + "_vocab.txt")
    # put default value in first row as TF requires
    vocab_list = np.insert(vocab_list, 0, default_value, axis=0) 
    df = pd.DataFrame(vocab_list).to_csv(output_file_path, index=None, header=None)
    return output_file_path

def build_vocab_files(df, categorical_column_list, default_value='00'):
    vocab_files_list = []
    for c in categorical_column_list:
        v_file = write_vocabulary_file(df[c].unique(), c, default_value)
        vocab_files_list.append(v_file)
    return vocab_files_list

In [58]:
categorical_field_list = ["PRINCIPAL_DIAGNOSIS_CODE"]
vocab_files_list = build_vocab_files(categorical_example_df, categorical_field_list)

### Build TF Dataset from Pandas Dataframe

In [59]:
BATCH_SIZE = 64
PREDICTOR_FIELD = 'LABEL'
categorical_tf_ds = df_to_dataset(categorical_example_df, PREDICTOR_FIELD, batch_size=BATCH_SIZE)

### Use TF Feature Column API to read from vocab file

In [60]:
vocab_files_list[0]

'./vocab/PRINCIPAL_DIAGNOSIS_CODE_vocab.txt'

In [61]:
principal_diagnosis_vocab = tf.feature_column.categorical_column_with_vocabulary_file(
            key="PRINCIPAL_DIAGNOSIS_CODE", vocabulary_file = vocab_files_list[0], num_oov_buckets=1)

INFO:tensorflow:vocabulary_size = 6753 in PRINCIPAL_DIAGNOSIS_CODE is inferred from the number of elements in the vocabulary_file ./vocab/PRINCIPAL_DIAGNOSIS_CODE_vocab.txt.


### Create one-hot encoding  from vocab column feature function

In [62]:
one_hot_principal_diagnosis_feature = tf.feature_column.indicator_column(principal_diagnosis_vocab)

In [63]:
categorical_tf_ds_batch = next(iter(categorical_tf_ds))[0]

In [64]:
demo(one_hot_principal_diagnosis_feature, categorical_tf_ds_batch)

tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(64, 6754), dtype=float32)


[Tensorflow Feature Column API Categorical Features](https://www.tensorflow.org/api_docs/python/tf/feature_column/categorical_column_with_vocabulary_file)

[TensorFlow One-hot Encoding](https://www.tensorflow.org/api_docs/python/tf/one_hot)