# TensorFlow Model

In [1]:
from os import path
from random import sample

import tensorflow as tf

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd


In [2]:
# Plot defaults for the whole notebook: 8x6 inch figures, no grid lines.
mpl.rcParams.update({
    'figure.figsize': (8, 6),
    'axes.grid': False,
})

### Load, split, and normalize the training data

In [3]:
def load_raw_evalmetrics_df():
    """Read the pickled raw evaluation-metrics frame from the working directory.

    Returns:
        Whatever object ``raw_evalmetrics_df.pkl`` unpickles to.
    """
    pickle_path = path.join('.', 'raw_evalmetrics_df.pkl')
    return pd.read_pickle(pickle_path)

def prep_multivarate(df):
    """Add a ``density`` column and drop the columns no longer needed.

    ``density`` is ``latestTotalPopulation`` divided by ``LND110210``. The
    ``latestTotalPopulation``, ``fips`` and ``LND110210`` columns are removed
    from the result. The input frame is left untouched.

    Args:
        df: DataFrame holding the three columns named above.

    Returns:
        A new DataFrame with ``density`` added and the three columns removed.
    """
    unneeded = ['latestTotalPopulation', 'fips', 'LND110210']
    with_density = df.assign(density=df['latestTotalPopulation'] / df['LND110210'])
    return with_density.drop(columns=unneeded)

def split_sample(df, percent_train=.80, seed=None):
    """Split rows into train/validation frames by county, then fill NaNs with 0.

    Whole counties are assigned to one side or the other, so no county
    contributes rows to both frames.

    Args:
        df: DataFrame with a ``county`` column.
        percent_train: Fraction of distinct counties placed in the training
            frame; the count is truncated with ``int``.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` the module-level ``random.sample`` is
            used, so the global random state applies.

    Returns:
        ``(train_df, val_df)``, each with missing values replaced by 0.
    """
    from random import Random

    # random.sample() no longer accepts a set (deprecated in Python 3.9, an
    # error from 3.11). unique() gives a list in order of first appearance,
    # which also makes a seeded split repeatable across interpreter runs.
    counties = list(df.county.unique())
    n_train = int(len(counties) * percent_train)

    draw = sample if seed is None else Random(seed).sample
    train_county = draw(counties, n_train)

    # Vectorised membership test instead of a per-row scan of a Python list.
    train_mask = df.county.isin(train_county)
    # Every row's county is in `counties`, so validation is the complement.
    val_mask = ~train_mask
    return df[train_mask].fillna(0), df[val_mask].fillna(0)

def normalize_df(df, stats_df=None):
    """Z-score the numeric feature columns and replace remaining NaNs with 0.

    Args:
        df: DataFrame containing every column listed in ``cols`` below.
        stats_df: Optional DataFrame supplying the per-column mean and
            standard deviation used for scaling. Defaults to ``df`` itself.
            Pass the training frame here when normalizing validation data so
            both frames share one scale.

    Returns:
        A normalized copy of ``df``; the input frame is not modified.
    """
    cols = ['confirmed_cases','confirmed_deaths', 'confirmed_recoveries','hospitalIcuBeds','hospitalStaffedBeds','hospitalLicensedBeds', 'density']
    reference = df if stats_df is None else stats_df
    new_df = df.copy()
    for col in cols:
        ref = reference[col].astype('float')
        data_mean = ref.mean(axis=0)
        data_std = ref.std(axis=0)
        # A constant reference column has zero spread; dividing by it would
        # give NaN (0/0) or inf. Scaling by 1 leaves such a column centred.
        if data_std == 0:
            data_std = 1.0
        new_df[col] = (new_df[col].astype('float') - data_mean) / data_std
    return new_df.fillna(0)

def get_data():
    """Load, prepare, split and normalize the evaluation-metrics data.

    Both returned frames are scaled with the training frame's mean and
    standard deviation, so validation inputs are on the same scale as the
    training inputs without using validation statistics.

    Returns:
        ``(train_df, val_df)``, both normalized.
    """
    evalmetric_df = load_raw_evalmetrics_df()
    prepped_df = prep_multivarate(evalmetric_df)
    train_df, val_df = split_sample(prepped_df)
    return normalize_df(train_df), normalize_df(val_df, stats_df=train_df)



### Prepare the data for the RNN

In [19]:
# inspired by https://www.tensorflow.org/tutorials/structured_data/time_series


def multivariate_data(dataset, target_col, history_size=20, target_size=0,
                      feature_cols=None):
    """Build sliding-window samples and labels, one county at a time.

    For each county, every run of ``history_size`` consecutive rows becomes
    one sample. Its label is the ``target_col`` value ``target_size`` rows
    after the first row that follows the window. Windows never span two
    counties. Rows are used in the order they appear in ``dataset``; no
    sorting is applied here.

    Args:
        dataset: DataFrame with a ``county`` column.
        target_col: Name of the column the labels are read from.
        history_size: Number of rows in each input window.
        target_size: Extra offset, in rows, from the end of the window to the
            label row.
        feature_cols: Optional list of columns to include in each window.
            When ``None`` every column of ``dataset`` is included, which can
            yield an object-dtype array if non-numeric columns are present.

    Returns:
        ``(data, labels)`` as numpy arrays. ``data`` has one entry per window
        and ``labels`` holds the matching target values.
    """
    data = []
    labels = []

    # groupby(sort=False) visits counties in order of first appearance, so the
    # sample order is reproducible, and avoids re-filtering the whole frame
    # once per county.
    for j, (_, county_df) in enumerate(dataset.groupby('county', sort=False)):
        if (j + 1) % 100 == 0:
            # Lightweight progress marker: one dot per 100 counties.
            print('.', end='')
        target = county_df[target_col].values
        if feature_cols is None:
            features = county_df.values
        else:
            features = county_df[list(feature_cols)].values

        start_index = history_size
        end_index = len(features) - target_size
        for i in range(start_index, end_index):
            data.append(features[i - history_size:i])
            labels.append(target[i + target_size])

    print()
    return np.array(data), np.array(labels)



In [20]:
# Load the two frames, then window each into (samples, labels) arrays with
# 'confirmed_deaths' as the prediction target. Training is processed first.
train_df, val_df = get_data()
(train_data, train_labels), (val_data, val_labels) = (
    multivariate_data(frame, 'confirmed_deaths') for frame in (train_df, val_df)
)

......


In [17]:
# NOTE(review): no cell above this one assigns `data` or `labels` at notebook
# scope (they appear only as locals inside multivariate_data), so this line
# relies on leftover kernel state. It also overwrites the train_data /
# train_labels built by the preceding cell. TODO confirm whether this scratch
# cell should be removed.
train_data, train_labels = data, labels

In [None]:
# NOTE(review): this cell has no execution count and appears to follow the
# call shape of the TensorFlow time-series tutorial rather than this notebook.
# Each call below passes seven positional arguments plus a `single_step`
# keyword, which the `multivariate_data` defined in this notebook does not
# accept. `dataset` and `TRAIN_SPLIT` are also not defined in any visible
# cell. TODO confirm whether to adapt this cell or drop it.
past_history = 720
future_target = 72
STEP = 6

x_train_single, y_train_single = multivariate_data(dataset, dataset[:, 1], 0,
                                                   TRAIN_SPLIT, past_history,
                                                   future_target, STEP,
                                                   single_step=True)
x_val_single, y_val_single = multivariate_data(dataset, dataset[:, 1],
                                               TRAIN_SPLIT, None, past_history,
                                               future_target, STEP,
                                               single_step=True)

In [9]:

# Spot check: show the normalized training rows for a single county.
sds = train_df.loc[train_df['county'] == 'Pike_Kentucky_UnitedStates']
sds

Unnamed: 0,county,date,confirmed_cases,confirmed_deaths,confirmed_recoveries,hospitalIcuBeds,hospitalStaffedBeds,hospitalLicensedBeds,density
333648,Pike_Kentucky_UnitedStates,2020-01-01,-0.066651,-0.045146,-0.033007,0.64039,0.178098,0.217083,-0.157574
333649,Pike_Kentucky_UnitedStates,2020-01-02,-0.066651,-0.045146,-0.033007,0.64039,0.178098,0.217083,-0.157574
333650,Pike_Kentucky_UnitedStates,2020-01-03,-0.066651,-0.045146,-0.033007,0.64039,0.178098,0.217083,-0.157574
333651,Pike_Kentucky_UnitedStates,2020-01-04,-0.066651,-0.045146,-0.033007,0.64039,0.178098,0.217083,-0.157574
333652,Pike_Kentucky_UnitedStates,2020-01-05,-0.066651,-0.045146,-0.033007,0.64039,0.178098,0.217083,-0.157574
...,...,...,...,...,...,...,...,...,...
333787,Pike_Kentucky_UnitedStates,2020-05-19,-0.062216,-0.040878,-0.033007,0.64039,0.178098,0.217083,-0.157574
333788,Pike_Kentucky_UnitedStates,2020-05-20,-0.062216,-0.040878,-0.033007,0.64039,0.178098,0.217083,-0.157574
333789,Pike_Kentucky_UnitedStates,2020-05-21,-0.062216,-0.040878,-0.033007,0.64039,0.178098,0.217083,-0.157574
333790,Pike_Kentucky_UnitedStates,2020-05-22,-0.062216,-0.040878,-0.033007,0.64039,0.178098,0.217083,-0.157574


In [None]:
# NOTE(review): `normalized_train_df` is not defined in any visible cell, and
# the result of fillna(0) is only displayed, not assigned. TODO confirm whether
# this scratch cell is still needed.
normalized_train_df.fillna(0)

In [None]:
# Sanity check: z-score confirmed_cases by hand, then print the resulting
# mean and standard deviation, each rounded to 10 decimal places.
data = train_df['confirmed_cases'].astype('float')
data_mean, data_std = data.mean(axis=0), data.std(axis=0)
data = (data - data_mean) / data_std
print(round(data.mean(axis=0), 10), round(data.std(axis=0), 10), sep='\n')