In [4]:
import pandas as pd
import tensorflow as tf
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [5]:
# Names applied to the CSV's data columns, in file order.
col_list = [
    'case_month', 'res_state', 'state_fips_code', 'res_county', 'county_fips_code',
    'age_group', 'sex', 'race', 'ethnicity',
    'case_positive_specimen_interval', 'case_onset_interval',
    'process', 'exposure_yn', 'current_status', 'symptom_status',
    'hosp_yn', 'icu_yn', 'death_yn', 'underlying_conditions_yn',
]

# The two interval columns and the `death_yn` label are parsed as 64-bit integers;
# every other column is read as a string.
_integer_columns = {'case_positive_specimen_interval', 'case_onset_interval', 'death_yn'}
dtyp = {column: (np.int64 if column in _integer_columns else str) for column in col_list}

# NOTE(review): header=1 designates the file's *second* line as the header row (its
# labels are then replaced by `names`), so parsing starts on the third line. If the CSV
# has a single header line this silently drops the first record - confirm against the file.
cdc = pd.read_csv('cdcCleaned.csv', names=col_list, dtype=dtyp, header=1)

## Assigning training/testing variables


In [6]:
LABEL_COLUMN = 'death_yn'
TRAIN_SIZE = 75000  # leading rows used for training; everything after is the test set

# Detach the label from `cdc` exactly once. The guard keeps this cell re-runnable:
# on a repeated execution the column is already gone and the previously extracted
# `labels` series is reused instead of raising KeyError.
if LABEL_COLUMN in cdc.columns:
    labels = cdc.pop(LABEL_COLUMN)

# NOTE(review): the split is purely positional. If the file is ordered (e.g. by
# location or month) the test set may not be representative - consider shuffling first.
# Explicit copies so later edits to train/test can never write through to `cdc`.
train = cdc.iloc[:TRAIN_SIZE].copy()
test = cdc.iloc[TRAIN_SIZE:].copy()
y_train = labels.iloc[:TRAIN_SIZE]
y_test = labels.iloc[TRAIN_SIZE:]

# Features and labels must stay row-aligned for the input functions built later.
assert train.index.equals(y_train.index) and test.index.equals(y_test.index)

train

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,underlying_conditions_yn
1,2020-08,KS,20,CRAWFORD,20037,18 to 49 years,Male,White,Non-Hispanic/Latino,0,0,Laboratory reported,Yes,Laboratory-confirmed case,Symptomatic,No,No,No
2,2021-01,NV,32,CLARK,32003,18 to 49 years,Male,Black,Non-Hispanic/Latino,0,0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,Yes
3,2020-04,IA,19,WOODBURY,19193,50 to 64 years,Female,White,Hispanic/Latino,0,0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,Yes
4,2020-08,KS,20,CRAWFORD,20037,18 to 49 years,Male,White,Non-Hispanic/Latino,0,0,Clinical evaluation,Yes,Probable Case,Symptomatic,No,No,No
5,2021-09,OH,39,JEFFERSON,39081,18 to 49 years,Female,White,Non-Hispanic/Latino,0,0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74996,2020-07,NV,32,CLARK,32003,18 to 49 years,Female,White,Non-Hispanic/Latino,1,0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,Yes
74997,2020-07,NV,32,CLARK,32003,18 to 49 years,Female,White,Non-Hispanic/Latino,1,0,Routine surveillance,Yes,Laboratory-confirmed case,Symptomatic,No,No,Yes
74998,2020-07,NV,32,CLARK,32003,18 to 49 years,Female,White,Non-Hispanic/Latino,1,0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,Yes
74999,2020-07,NV,32,CLARK,32003,18 to 49 years,Female,White,Non-Hispanic/Latino,0,0,Routine surveillance,Yes,Laboratory-confirmed case,Symptomatic,No,No,Yes


## Feature columns

In [7]:
# String-valued columns, each encoded through a vocabulary lookup.
CATEGORICAL_COLUMNS = [
    'case_month', 'county_fips_code', 'state_fips_code', 'res_state', 'res_county',
    'sex', 'race', 'age_group', 'ethnicity', 'process', 'current_status',
    'symptom_status', 'hosp_yn', 'icu_yn', 'underlying_conditions_yn', 'exposure_yn',
]
# Integer-valued columns, fed to the model as float32 numeric features.
NUMERICAL_COLUMNS = ['case_positive_specimen_interval', 'case_onset_interval']

# One vocabulary column per categorical feature; the vocabulary is the set of
# distinct values observed in the training frame.
categorical_features = [
    tf.feature_column.categorical_column_with_vocabulary_list(name, train[name].unique())
    for name in CATEGORICAL_COLUMNS
]
numeric_features = [
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in NUMERICAL_COLUMNS
]

# Categorical columns first, numeric columns last.
feature_columns = categorical_features + numeric_features
print(feature_columns)

[VocabularyListCategoricalColumn(key='case_month', vocabulary_list=('2020-08', '2021-01', '2020-04', '2021-09', '2020-09', '2020-11', '2021-08', '2020-10', '2021-04', '2020-05', '2021-12', '2021-03', '2021-11', '2021-10', '2020-07', '2020-12', '2021-06', '2021-07', '2021-02', '2020-06', '2020-03', '2021-05'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='county_fips_code', vocabulary_list=('20037', '32003', '19193', '39081', '20045', '20057', '39113', '19155', '39107', '20175', '49005', '49039', '21067', '20113', '49047', '19183', '32007', '39165', '39147', '39061', '19187', '39077', '20015', '20161', '20103', '19181', '39155', '39173', '39069', '21035', '39039', '21195', '19149', '39079', '39097', '39131', '39023', '21199', '39141', '21231', '19127', '39045', '39041', '39175', '39001', '39033', '20009', '20149', '21217', '21019', '21151', '21193', '39161', '39031', '32031', '49041', '21013', '49011', '21107', '39151', '20173', '39095', '39

# Training

In [8]:
def make_input_fn(data_df, label_df, num_epochs=50, shuffle=True, batch_size=2048,
                  shuffle_buffer_size=None):
    """Build an input function that feeds (features, labels) to an estimator.

    Args:
        data_df: feature table; `dict(data_df)` must map column name -> column values.
        label_df: labels, one per row of `data_df`.
        num_epochs: how many times the batched dataset is repeated.
        shuffle: whether to shuffle the examples before batching.
        batch_size: number of examples per batch.
        shuffle_buffer_size: size of the shuffle buffer. Defaults to the number of
            labels, i.e. the whole dataset, which gives a uniform shuffle. A buffer
            smaller than the dataset only mixes nearby rows, which matters when the
            source rows are ordered.

    Returns:
        A zero-argument callable that builds and returns the `tf.data.Dataset`.
    """
    def input_function():
        # Pair every feature column with the labels, one element per row.
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            buffer_size = shuffle_buffer_size
            if buffer_size is None:
                # shuffle() requires a positive buffer, hence the floor of 1.
                buffer_size = max(1, len(label_df))
            ds = ds.shuffle(buffer_size)
        # Split into batches of `batch_size`, then repeat for `num_epochs` passes.
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds
    return input_function

# Input functions: the estimator calls these itself to obtain its datasets.
# Training uses the defaults; evaluation makes a single ordered pass.
train_input_fn = make_input_fn(data_df=train, label_df=y_train)
eval_input_fn = make_input_fn(data_df=test, label_df=y_test, shuffle=False, num_epochs=1)

# Fit a linear classifier on the training set, then score it on the held-out set.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)
linear_est.train(input_fn=train_input_fn)
result = linear_est.evaluate(input_fn=eval_input_fn)

# Drop the training log from the cell output and show only the metrics dict.
clear_output()
print(result)

{'accuracy': 0.9761583, 'accuracy_baseline': 0.9751438, 'auc': 0.9377605, 'auc_precision_recall': 0.36730337, 'average_loss': 0.07126875, 'label/mean': 0.024856275, 'loss': 0.07074752, 'precision': 0.57894737, 'prediction/mean': 0.017571812, 'recall': 0.14965986, 'global_step': 1850}


# Testing

In [9]:
SAMPLE_POSITION = 3500  # position, within the test set, of the example to inspect

# predict() returns a generator; materialise it so individual predictions can be indexed.
result = list(linear_est.predict(eval_input_fn))
clear_output()
print(result[SAMPLE_POSITION])
# Predictions follow the row order of `test` (eval_input_fn does not shuffle), so the
# true label has to be fetched by position as well. A label-based `.loc` lookup depends
# on the values of the frame's index, which need not match row positions.
print(y_test.iloc[SAMPLE_POSITION])

{'logits': array([-6.886643], dtype=float32), 'logistic': array([0.00102029], dtype=float32), 'probabilities': array([0.9989797 , 0.00102029], dtype=float32), 'class_ids': array([0]), 'classes': array([b'0'], dtype=object), 'all_class_ids': array([0, 1], dtype=int32), 'all_classes': array([b'0', b'1'], dtype=object)}
0
