In [1]:
import numpy as np
import os
import git
import numpy as np
import pandas as pd
np.set_printoptions(precision=5)

In [2]:
def get_processed_df(file_name='nyt_us_counties.csv'):
    repo = git.Repo("./", search_parent_directories=True)
    homedir = repo.working_dir
    datadir = f"{homedir}/data/us/covid/"
    df = pd.read_csv(datadir + file_name)
    df['date_processed'] = pd.to_datetime(df['date'].values)
    # Convert YYYY-MM-DD date format into integer number of days since the first day in the data set
    df['date_processed'] = (df['date_processed'] - df['date_processed'].min()) / np.timedelta64(1, 'D')
    # Special correction for the NYT data set
    df.loc[df['county'] == 'New York City', 'fips'] = 36061.
    return df


def process_date(date_str, df):
    return (pd.to_datetime(date_str) - pd.to_datetime(df['date'].values).min()) / np.timedelta64(1, 'D')


def get_region_data(df, county_fips, proc_date=None, key='deaths'):
    county_fips = float(county_fips)
    d = df.loc[df['fips'] == county_fips]
    if proc_date is not None:
        vals = d.loc[d['date_processed'] == proc_date][key].values
        if len(vals) == 0:
            return 0.0
        return vals[0]
    return d


def all_output_dates():
    ret = ['2020-04-%02d' % x for x in range(1, 31)]
    ret += ['2020-05-%02d' % x for x in range(1, 32)]
    ret += ['2020-06-%02d' % x for x in range(1, 31)]
    return ret


def all_fips_in_df(df):
    # Get a sorted list of all FIPS string codes in a dataframe
    return sorted(list(set(['%d' % x for x in df['fips'].values if not np.isnan(x)])))


def all_output_fips(sample_out_file):
    # Get a sorted list of all FIPS codes in the sample output file
    repo = git.Repo("./", search_parent_directories=True)
    homedir = repo.working_dir
    datafile = f"{homedir}/" + sample_out_file
    all_data = np.genfromtxt(datafile, delimiter=',', dtype='str')
    all_fips = set([x.split('-')[-1] for x in all_data[1:, 0]])
    return sorted(list(all_fips)), all_data[1:, 0]


def fill_missing_dates(t, y):
    # If a time series is missing days, fill those missing days with a copy of the most recent value
    ret_t = np.arange(np.min(t), np.max(t) + 1)
    ret_y = np.zeros(len(ret_t))
    dat_ind = 0
    for ret_ind in range(len(ret_t)):
        if ret_t[ret_ind] in t:
            ret_y[ret_ind] = y[dat_ind]
            dat_ind += 1
        else:
            ret_y[ret_ind] = ret_y[ret_ind - 1]
    return ret_t, ret_y

In [35]:
def score_all_predictions(pred_file, date, model_date, mse=False, key='cases', bin_cutoffs=[20, 1000]):
    true_data = get_processed_df('nyt_us_counties_daily.csv')
    cum_data = get_processed_df('nyt_us_counties.csv')
    
    proc_score_date = process_date(date, true_data)
    proc_model_date = process_date(model_date, true_data)
    
    raw_pred_data = np.genfromtxt(pred_file, delimiter=',', skip_header=1, dtype=np.str)
    #print(raw_pred_data)
    date_preds = np.array([row for row in raw_pred_data if date in row[0]])
    #print("break")
    
    print(date_preds)
    all_fips = np.array([row[0].split('-')[-1] for row in date_preds])
    
    all_preds = date_preds[:, 1:].astype(np.float)
    
    print(all_preds)
    true_data = np.array([get_region_data(true_data, fips, proc_date=proc_score_date, key=key) for fips in all_fips])
    cum_data = np.array([get_region_data(cum_data, fips, proc_date=proc_model_date, key=key) for fips in all_fips])
    return get_scores(all_fips, all_preds, true_data, cum_data, mse=mse, bin_cutoffs=bin_cutoffs)


def get_scores(all_fips, all_preds, true_data, cum_data, bin_cutoffs=[20, 1000], mse=False):
    tot_loss = 0
    bin_losses, bin_counts = np.zeros(len(bin_cutoffs) + 1), np.zeros(len(bin_cutoffs) + 1)
    for fips, preds, true_number, cum_number in zip(all_fips, all_preds, true_data, cum_data):
        if mse:
            loss = (preds[4] - true_number) ** 2
        else:
            loss = pinball_loss(preds, true_number)
        tot_loss += loss
        done = False
        for i, bc in enumerate(bin_cutoffs):
            if cum_number <= bc:
                bin_losses[i] += loss
                bin_counts[i] += 1
                done = True
                break
        if not done:
            bin_losses[-1] += loss
            bin_counts[-1] += 1

    return tot_loss / len(all_preds), bin_losses / bin_counts


def pinball_loss(preds, true_val, p_vals=np.arange(0.1, 1.0, 0.1)):
    loss = 0
    for pred, p in zip(preds, p_vals):
        delta = np.abs(true_val - pred)
        if pred < true_val:
            loss += p * delta
        else:
            loss += (1 - p) * delta
    return loss / len(p_vals)

In [37]:
pred_file = os.path.join(os.getcwd(), 'sample_submission.csv')
scores = score_all_predictions(pred_file, '2020-04-14', '2020-04-13', key='deaths')
scores_mse = score_all_predictions(pred_file, '2020-04-14', '2020-04-13', key='deaths', mse=True)
print(scores[0], scores_mse[0])

[['2020-04-01-10001' '0.00E+00' '0' ... '0' '0' '0']
 ['2020-04-01-10003' '3.56E+01' '0' ... '0' '0' '0']
 ['2020-04-01-10005' '1.27E+02' '0' ... '0' '0' '0']
 ...
 ['2020-06-30-9011' '0' '0' ... '0' '0' '0']
 ['2020-06-30-9013' '0' '0' ... '0' '0' '0']
 ['2020-06-30-9015' '0' '0' ... '0' '0' '0']]
break
[['2020-04-14-10001' '4.01E+03' '0' ... '0' '0' '0']
 ['2020-04-14-10003' '1.65E+04' '0' ... '0' '0' '0']
 ['2020-04-14-10005' '2.89E+03' '0' ... '0' '0' '0']
 ...
 ['2020-04-14-9011' '0' '0' ... '0' '0' '0']
 ['2020-04-14-9013' '0' '0' ... '0' '0' '0']
 ['2020-04-14-9015' '0' '0' ... '0' '0' '0']]
[[ 4010.     0.     0. ...     0.     0.     0.]
 [16500.     0.     0. ...     0.     0.     0.]
 [ 2890.     0.     0. ...     0.     0.     0.]
 ...
 [    0.     0.     0. ...     0.     0.     0.]
 [    0.     0.     0. ...     0.     0.     0.]
 [    0.     0.     0. ...     0.     0.     0.]]
[['2020-04-01-10001' '0.00E+00' '0' ... '0' '0' '0']
 ['2020-04-01-10003' '3.56E+01' '0' ... '

In [36]:
pred_file = os.path.join(os.getcwd(), 'test_linreg_daily.csv')
scores = score_all_predictions(pred_file, '2020-05-24', '2020-05-24', key='deaths')
scores_mse = score_all_predictions(pred_file, '2020-05-24', '2020-05-24', key='deaths', mse=True)
print(scores[0], scores_mse[0])

[['2020-05-11-0' '0' '0' ... '0.640775783' '0.961163674' '1.281551566']
 ['2020-05-12-0' '0' '0' ... '0.640775783' '0.961163674' '1.281551566']
 ['2020-05-13-0' '0' '0' ... '0.640775783' '0.961163674' '1.281551566']
 ...
 ['2020-05-22-56045' '0' '0' ... '0.640775783' '0.961163674'
  '1.281551566']
 ['2020-05-23-56045' '0' '0' ... '0.640775783' '0.961163674'
  '1.281551566']
 ['2020-05-24-56045' '0' '0' ... '0.640775783' '0.961163674'
  '1.281551566']]
break
[['2020-05-24-0' '0' '0' ... '0.640775783' '0.961163674' '1.281551566']
 ['2020-05-24-1001' '0' '0' ... '0.732601604' '1.052989495' '1.373377387']
 ['2020-05-24-1003' '0' '0' ... '0.855558391' '1.175946283' '1.496334174']
 ...
 ['2020-05-24-56041' '0' '0' ... '0.640775783' '0.961163674'
  '1.281551566']
 ['2020-05-24-56043' '0' '0' ... '0.640775783' '0.961163674'
  '1.281551566']
 ['2020-05-24-56045' '0' '0' ... '0.640775783' '0.961163674'
  '1.281551566']]
[[0.      0.      0.      ... 0.64078 0.96116 1.28155]
 [0.      0.      0. 

In [42]:
pred_file = os.path.join(os.getcwd(), 'test_quantreg.csv')
scores = score_all_predictions(pred_file, '2020-05-24', '2020-05-24', key='deaths')
scores_mse = score_all_predictions(pred_file, '2020-05-24', '2020-05-24', key='deaths', mse=True)
print(scores[0], scores_mse[0])

[['2020-05-11-0' '0.0' '0.0' ... '0.0' '0.0' '0.0']
 ['2020-05-12-0' '0.0' '0.0' ... '0.0' '0.0' '0.0']
 ['2020-05-13-0' '0.0' '0.0' ... '0.0' '0.0' '0.0']
 ...
 ['2020-05-22-56045' '0.0' '0.0' ... '0.0' '0.0' '0.0']
 ['2020-05-23-56045' '0.0' '0.0' ... '0.0' '0.0' '0.0']
 ['2020-05-24-56045' '0.0' '0.0' ... '0.0' '0.0' '0.0']]
break
[['2020-05-24-0' '0.0' '0.0' ... '0.0' '0.0' '0.0']
 ['2020-05-24-1001' '0.0' '0.0' ... '0.0' '0.0' '0.0']
 ['2020-05-24-1003' '0.0' '0.0' ... '0.0' '0.0' '0.0']
 ...
 ['2020-05-24-56041' '0.0' '0.0' ... '0.0' '0.0' '0.0']
 ['2020-05-24-56043' '0.0' '0.0' ... '0.0' '0.0' '0.0']
 ['2020-05-24-56045' '0.0' '0.0' ... '0.0' '0.0' '0.0']]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[['2020-05-11-0' '0.0' '0.0' ... '0.0' '0.0' '0.0']
 ['2020-05-12-0' '0.0' '0.0' ... '0.0' '0.0' '0.0']
 ['2020-05-13-0' '0.0' '0.0' ... '0.0' '0.0' '0.0']
 ...
 ['2020-05-