In [3]:
# import statements cell
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
import sagemaker as sage
import numpy as np
import boto3
import io
import sagemaker.amazon.common as smac
import os
from sagemaker.amazon.amazon_estimator import get_image_uri
import mxnet as mx
from sagemaker.predictor import csv_serializer, json_deserializer

In [2]:
# download the county-level CSV from the NYTimes COVID-19 GitHub repo.
# The raw endpoint serves the file itself, so nothing depends on the markup of
# GitHub's rendered HTML page and quoted fields are parsed by a real CSV reader.
response = requests.get(
    'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv',
    timeout=30)
# fail loudly on an HTTP error instead of parsing an error page
response.raise_for_status()

# put downloaded data into pandas dataframe; every column is read as a string
# and empty cells stay '' so the filter and casts below see the same values
# the scraped version produced
raw_data = pd.read_csv(io.StringIO(response.text), dtype=str, keep_default_na=False)

# remove datapoints without FIPS code
trimmed_fips = raw_data[raw_data['fips'] != ''].copy(deep=True)
trimmed_fips['fips'] = trimmed_fips['fips'].astype(int)
trimmed_fips['cases'] = trimmed_fips['cases'].astype(int)
trimmed_fips['deaths'] = trimmed_fips['deaths'].astype(int)
# dates become Unix timestamps (seconds) so they can be used as a numeric feature
trimmed_fips['date'] = trimmed_fips['date'].map(lambda date: datetime.strptime(date, '%Y-%m-%d').timestamp())
# county and state names are text; the county is already identified by its FIPS code
trimmed_fips = trimmed_fips.drop(columns=['county', 'state'])

In [3]:
# upload trimmed data from USDA

def load_county_rows(path, fips_column='FIPS', **read_excel_kwargs):
    """Read one USDA spreadsheet (or sheet) and keep only its county rows.

    Rows whose FIPS code is a multiple of 1000 are dropped; these are the
    state-level and national aggregate rows the notebook filters out.

    path: spreadsheet file passed to pd.read_excel
    fips_column: name of the column holding the FIPS code
    read_excel_kwargs: forwarded to pd.read_excel (e.g. sheet_name)
    """
    table = pd.read_excel(path, **read_excel_kwargs)
    return table[table[fips_column] % 1000 != 0]


# the education file is the only one whose key column is named 'FIPS Code'
education = load_county_rows('Education.xls', fips_column='FIPS Code')
unemployment = load_county_rows('Unemployment.xls')
poverty = load_county_rows('PovertyEstimates.xls')
population = load_county_rows('PopulationEstimates.xls')

atlas_people = load_county_rows('RuralAtlasData20.xlsx', sheet_name='People')
atlas_jobs = load_county_rows('RuralAtlasData20.xlsx', sheet_name='Jobs')
# filtered under the name the join cells actually use; previously only the
# unused `atlas_county` was filtered and the joins received the raw sheet
atlas_county_classifications = load_county_rows(
    'RuralAtlasData20.xlsx', sheet_name='County Classifications')
# kept so existing references to the old name keep working
atlas_county = atlas_county_classifications
atlas_income = load_county_rows('RuralAtlasData20.xlsx', sheet_name='Income')
atlas_veterans = load_county_rows('RuralAtlasData20.xlsx', sheet_name='Veterans')

In [4]:
# join socioeconomic measures by county together with COVID-19 spread data

# every USDA table paired with the name of its FIPS key column, in join order
usda_tables = [
    (education, 'FIPS Code'),
    (unemployment, 'FIPS'),
    (poverty, 'FIPS'),
    (population, 'FIPS'),
    (atlas_people, 'FIPS'),
    (atlas_jobs, 'FIPS'),
    (atlas_county_classifications, 'FIPS'),
    (atlas_income, 'FIPS'),
    (atlas_veterans, 'FIPS'),
]

# Each table's key is renamed to 'fips' and merged with on='fips', so the key
# appears once in the result. Merging with right_on='FIPS' repeatedly instead
# makes pandas generate 'FIPS_x'/'FIPS_y' more than once, which newer pandas
# rejects with a MergeError (duplicate columns caused by suffixes).
joined = trimmed_fips
for usda_table, key_column in usda_tables:
    joined = pd.merge(
        joined, usda_table.rename(columns={key_column: 'fips'}), how='left', on='fips')

# no redundant key columns ('FIPS Code', 'FIPS_x', 'FIPS_y') are created, so
# there is nothing left to drop; this cell can be re-run safely because it
# always starts again from trimmed_fips
data_with_usda = joined

In [7]:
# we have NaNs, SageMaker doesn't like these
# impute by mean of column 
# from https://stackoverflow.com/questions/18689235/numpy-array-replace-nan-values-with-average-of-columns

def impute_by_means(a, col_means=None):
    """Replace every NaN in a 2-D float array with its column's mean, in place.

    a: 2-D float numpy array; it is modified in place and also returned.
    col_means: optional sequence with one fill value per column. When given,
        these values are used instead of the means of `a` (for example to fill
        test data with the training set's means). Defaults to the NaN-ignoring
        column means of `a`.

    A column with no valid value has no mean; it is filled with 0 so that the
    result never contains NaN.

    Returns `a`.
    """
    if col_means is None:
        import warnings
        with warnings.catch_warnings():
            # np.nanmean emits "Mean of empty slice" for all-NaN columns;
            # that case is handled explicitly just below
            warnings.simplefilter('ignore', category=RuntimeWarning)
            col_means = np.nanmean(a, axis=0)
    col_means = np.asarray(col_means)
    # fall back to 0 where no mean exists so no NaN is written back
    col_means = np.where(np.isnan(col_means), 0.0, col_means)
    inds = np.where(np.isnan(a))
    # inds[1] holds the column index of every NaN cell
    a[inds] = np.take(col_means, inds[1])
    return a

In [8]:
# features don't have cases or deaths

# fixed seed: without random_state every run produces a different split, so the
# arrays built below (and whatever is trained from them) are not reproducible
train, test = train_test_split(data_with_usda, random_state=42)

# labels: one float32 vector per target
train_labels_cases = train['cases'].to_numpy(dtype=np.float32)
train_labels_deaths = train['deaths'].to_numpy(dtype=np.float32)
# features: every remaining column as float32, NaNs replaced by column means
train_features = impute_by_means(train.drop(columns=['cases', 'deaths']).to_numpy(dtype=np.float32))

test_labels_cases = test['cases'].to_numpy(dtype=np.float32)
test_labels_deaths = test['deaths'].to_numpy(dtype=np.float32)
# NOTE(review): the test features are imputed with the test set's own column
# means, not the training means -- confirm this is intended
test_features = impute_by_means(test.drop(columns=['cases', 'deaths']).to_numpy(dtype=np.float32))

In [9]:
# IAM role ARN handed to both estimators below
role = 'arn:aws:iam::022575370123:role/service-role/AmazonSageMaker-ExecutionRole-20200402T213912'
# S3 bucket used as the session's default bucket and in every S3 location below
bucket = 'sagemaker-studio-uok86wzhfvl'
# key prefix shared by all S3 locations this notebook builds
prefix = 'covid-data'

In [10]:
# from https://www.bmc.com/blogs/aws-linear-learner/
# and https://www.xaxis.com/insights/blog/steps-to-train-a-machine-learning-model-with-amazon-sagemaker-first-look/

# SageMaker session bound to the project bucket; passed to both estimators
sess = sage.Session(default_bucket=bucket)

In [11]:
# object name shared by every dataset this notebook uploads
key = 'linearlearner'

# train/test locations of the dataset labelled with case counts
s3_train_data_cases, s3_test_data_cases = (
    location.format(bucket, prefix, key)
    for location in ('s3://{}/{}/train_cases/{}', 's3://{}/{}/test_cases/{}')
)

# train/test locations of the dataset labelled with death counts
s3_train_data_deaths, s3_test_data_deaths = (
    location.format(bucket, prefix, key)
    for location in ('s3://{}/{}/train_deaths/{}', 's3://{}/{}/test_deaths/{}')
)

In [12]:
# don't execute again unless data changed
# serialize the training features together with the case-count labels
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, train_features, train_labels_cases)
# rewind so the upload reads the buffer from its first byte
buf.seek(0)
# S3 keys always use '/', so join explicitly; os.path.join uses the local OS
# separator and could produce a key that does not match s3_train_data_cases
boto3.resource('s3').Bucket(bucket).Object('/'.join((prefix, 'train_cases', key))).upload_fileobj(buf)

In [13]:
# don't execute again unless data changed
# serialize the training features together with the death-count labels
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, train_features, train_labels_deaths)
# rewind so the upload reads the buffer from its first byte
buf.seek(0)
# S3 keys always use '/', so join explicitly; os.path.join uses the local OS
# separator and could produce a key that does not match s3_train_data_deaths
boto3.resource('s3').Bucket(bucket).Object('/'.join((prefix, 'train_deaths', key))).upload_fileobj(buf)

In [14]:
# don't execute again unless data changed
# serialize the test features together with the case-count labels
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, test_features, test_labels_cases)
# rewind so the upload reads the buffer from its first byte
buf.seek(0)
# S3 keys always use '/', so join explicitly; os.path.join uses the local OS
# separator and could produce a key that does not match s3_test_data_cases
boto3.resource('s3').Bucket(bucket).Object('/'.join((prefix, 'test_cases', key))).upload_fileobj(buf)

In [15]:
# don't execute again unless data changed
# serialize the test features together with the death-count labels
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, test_features, test_labels_deaths)
# rewind so the upload reads the buffer from its first byte
buf.seek(0)
# S3 keys always use '/', so join explicitly; os.path.join uses the local OS
# separator and could produce a key that does not match s3_test_data_deaths
boto3.resource('s3').Bucket(bucket).Object('/'.join((prefix, 'test_deaths', key))).upload_fileobj(buf)

In [16]:
# S3 locations handed to the two estimators as their output_path
output_loc_cases, output_loc_deaths = (
    location.format(bucket, prefix)
    for location in ('s3://{}/{}/output_cases', 's3://{}/{}/output_deaths')
)

In [17]:
# look up the linear-learner image in the region the SageMaker session itself
# uses, so the image region can never disagree with the region the jobs run in
# (previously the region was hard-coded to 'us-east-2')
container = get_image_uri(sess.boto_region_name, 'linear-learner')

In [32]:
# estimator for the case-count model: linear-learner container, a single
# ml.c4.xlarge training instance, output written under output_loc_cases
estimator_cases = sage.estimator.Estimator(container, role, train_instance_count=1,
                                         train_instance_type='ml.c4.xlarge', output_path=output_loc_cases, 
                                         sagemaker_session=sess)
# feature_dim is the width of the training feature matrix
estimator_cases.set_hyperparameters(feature_dim=train_features.shape[1], predictor_type='regressor')

In [19]:
# estimator for the death-count model: same container and instance settings as
# the cases estimator, output written under output_loc_deaths
estimator_deaths = sage.estimator.Estimator(container, role, train_instance_count=1,
                                         train_instance_type='ml.c4.xlarge', output_path=output_loc_deaths, 
                                         sagemaker_session=sess)
# feature_dim is the width of the training feature matrix
estimator_deaths.set_hyperparameters(feature_dim=train_features.shape[1], predictor_type='regressor')

In [33]:
# run training job for cases
# the uploaded case-count datasets are passed as the 'train' and 'test' channels;
# wait=False is passed, so progress is checked separately via the job status
# (presumably the call returns without blocking -- confirm against the SDK)

estimator_cases.fit({'train': s3_train_data_cases, 'test': s3_test_data_cases}, wait=False)

In [48]:
# run training job for deaths
# the uploaded death-count datasets are passed as the 'train' and 'test' channels;
# wait=False is passed, so progress is checked separately via the job status
# (presumably the call returns without blocking -- confirm against the SDK)

estimator_deaths.fit({'train': s3_train_data_deaths, 'test': s3_test_data_deaths}, wait=False)

In [49]:
# remember the name of the most recent training job of each estimator
training_job_name_cases = estimator_cases.latest_training_job.job_name
training_job_name_deaths = estimator_deaths.latest_training_job.job_name

In [75]:
def get_training_job_status(training_job_name: str):
    """Look up a SageMaker training job and report its status.

    Calls describe_training_job for `training_job_name`. When the reported
    status is 'Failed', the job's failure reason is printed.

    Returns a tuple (status, description): the 'TrainingJobStatus' value and
    the full describe_training_job response.
    """
    sagemaker_client = boto3.client('sagemaker')
    description = sagemaker_client.describe_training_job(TrainingJobName=training_job_name)
    status = description['TrainingJobStatus']
    if status == 'Failed':
        reason = description['FailureReason']
        print(f'Training failed with the following error: {reason}')
    return status, description

In [54]:
# check the latest deaths training job; the cell displays (status, full job description)
get_training_job_status(estimator_deaths.latest_training_job.job_name)

('Completed',
 {'TrainingJobName': 'linear-learner-2020-04-03-18-48-17-345',
  'TrainingJobArn': 'arn:aws:sagemaker:us-east-2:022575370123:training-job/linear-learner-2020-04-03-18-48-17-345',
  'ModelArtifacts': {'S3ModelArtifacts': 's3://sagemaker-studio-uok86wzhfvl/covid-data/output_deaths/linear-learner-2020-04-03-18-48-17-345/output/model.tar.gz'},
  'TrainingJobStatus': 'Completed',
  'SecondaryStatus': 'Completed',
  'HyperParameters': {'feature_dim': '502', 'predictor_type': 'regressor'},
  'AlgorithmSpecification': {'TrainingImage': '404615174143.dkr.ecr.us-east-2.amazonaws.com/linear-learner:1',
   'TrainingInputMode': 'File',
   'MetricDefinitions': [{'Name': 'test:dcg',
     'Regex': '#quality_metric: host=\\S+, test dcg <score>=(\\S+)'},
    {'Name': 'train:progress',
     'Regex': '#progress_metric: host=\\S+, completed (\\S+) %'},
    {'Name': 'test:binary_f_beta',
     'Regex': '#quality_metric: host=\\S+, test binary_f_\\S+ <score>=(\\S+)'},
    {'Name': 'train:objecti

In [39]:
# deploy the case-count model to the endpoint 'predict-covid19-cases' on one ml.c4.xlarge instance
# NOTE(review): wait=False is passed, so the endpoint may not be ready when this cell finishes -- confirm before predicting
linear_predictor_cases = estimator_cases.deploy(initial_instance_count=1, wait=False, 
                                   instance_type='ml.c4.xlarge', endpoint_name='predict-covid19-cases')

Using already existing model: linear-learner-2020-04-03-18-23-02-289


In [55]:
# deploy the death-count model to the endpoint 'predict-covid19-deaths' on one ml.c4.xlarge instance
# NOTE(review): wait=False is passed, so the endpoint may not be ready when this cell finishes -- confirm before predicting
linear_predictor_deaths = estimator_deaths.deploy(initial_instance_count=1, wait=False,
                                   instance_type='ml.c4.xlarge', endpoint_name='predict-covid19-deaths')

In [40]:
# the cases predictor sends its features as CSV text and parses the response as JSON
linear_predictor_cases.content_type = 'text/csv'
linear_predictor_cases.serializer = csv_serializer
linear_predictor_cases.deserializer = json_deserializer

In [71]:
# the deaths predictor sends its features as CSV text and parses the response as JSON
linear_predictor_deaths.content_type = 'text/csv'
linear_predictor_deaths.serializer = csv_serializer
linear_predictor_deaths.deserializer = json_deserializer

In [43]:
# join socioeconomic measures by county together

# Same tables, same order as the training join, so the columns of county_data
# line up with the training features that follow date and fips.
# The education table supplies the list of counties; its key is renamed so
# every join uses the same 'FIPS' column.
usda_county_tables = [
    education.rename(columns={'FIPS Code': 'FIPS'}),
    unemployment,
    poverty,
    population,
    atlas_people,
    atlas_jobs,
    atlas_county_classifications,
    atlas_income,
    atlas_veterans,
]

# Key every join on the education table's own FIPS column. Keying later joins
# on the 'FIPS' column brought in by the unemployment table leaves that key
# NaN for any county missing from it, which silently drops all later features
# for the county and gives it a NaN index.
joined_counties = usda_county_tables[0]
for usda_table in usda_county_tables[1:]:
    joined_counties = pd.merge(joined_counties, usda_table, how='left', on='FIPS')

# one row of features per county, looked up by FIPS code
# NOTE(review): unlike the training features, these rows are not imputed, so
# any NaN is sent to the endpoint as-is -- confirm this is intended
county_data = joined_counties.set_index('FIPS')

In [1]:
# a feature should have: date (Unix format), fips number, county data
def get_prediction(date, fips, label='cases'):
    """Request a prediction for one county on one date.

    The feature vector is [date, fips] followed by the county's row of
    county_data, all as float32.

    date: timestamp used as the first feature
    fips: county FIPS code; must be present in county_data's index
    label: 'cases' or 'deaths', selecting which deployed predictor is called

    Returns whatever the selected predictor's predict() returns.
    Raises ValueError if label is neither 'cases' nor 'deaths'.
    """
    request_header = np.array([date, fips], dtype=np.float32)
    county_features = county_data.loc[fips].to_numpy(np.float32)
    payload = np.concatenate((request_header, county_features))
    if label not in ('cases', 'deaths'):
        raise ValueError('label must be either cases or deaths')
    predictor = linear_predictor_cases if label == 'cases' else linear_predictor_deaths
    return predictor.predict(payload)

In [2]:
# predict cases for FIPS 39153 at the current time
# (needs `datetime` from the import cell; run that cell first after a kernel restart)
get_prediction(datetime.now().timestamp(), 39153, label='cases')

NameError: name 'datetime' is not defined