In [138]:
import os
import datetime
import random
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, Band
from bokeh.embed import components
from bokeh.models import Range1d
from bokeh.models import NumeralTickFormatter
from bokeh.models import Span


import numpy as np
import pandas as pd

import scipy
from scipy import stats

In [335]:
def resample(l):
    final = []
    for i in range(len(l)):
        final.append(random.choice(l))
    return final

def resample_percentile(deaths, state_dict, num_iter = 100):
    final = {}
    for i in range(num_iter):
        resamp_deaths = resample(deaths)
        for key in state_dict.keys():
            per = stats.percentileofscore(resamp_deaths, state_dict[key])
            if not final.get(key):
                final[key] = []
            final[key].append(per)
    return final

def resample_one_percentile(deaths, target_per, num_iter = 100):
    final = []
    for i in range(num_iter):
        resamp_deaths = resample(deaths)
        per = stats.percentileofscore(resamp_deaths, target_per)
        per = np.percentile(resamp_deaths, target_per)
        final.append(per)
    return final

def resample_mean(deaths, num_iter = 1000):
    means = []
    for i in range(num_iter):
        resamp_deaths = resample(deaths)
        means.append(np.mean(resamp_deaths))
    return means

def repeat_resample(sample_a, sample_b, num_iter = 1000):
    difference_in_means = []#keep track of the difference in heights for each experiment
    for i in range(num_iter):
        resample_a = resample(sample_a)
        resample_b = resample(sample_b)
        difference = np.mean(resample_a) - np.mean(resample_b)
        difference_in_means.append(difference)
    return difference_in_means


In [271]:
def get_percentile():
    final = {}
    df = get_data('/home/henry/Downloads/covid_deaths_by_state1.csv')
    states = df['state'].tolist()
    deaths = df['deaths_million'].tolist()
    state_dict = {}
    for counter, state in enumerate(states):
        state_dict[state] = deaths[counter]
    ranges = resample_percentile(deaths = deaths, state_dict = state_dict, num_iter = 100)
    for key in ranges.keys():
        r = stats.percentileofscore(deaths, state_dict[key])
        final[key] = ((np.percentile(ranges[key], 2.5), np.percentile(ranges[key], 97.5), r))
    return final



In [6]:
def get_data(path):
    df = pd.read_csv(path)
    return df


In [331]:
def plot_states(title = None):
    df = get_data('/home/henry/Downloads/covid_deaths_by_state1.csv')
    df = df[df['state'] != 'District of Columbia']
    state = df['state'].tolist()
    deaths = df['deaths_million'].tolist()
    means = resample_mean(deaths, num_iter = 1000)
    lower = np.percentile(means, 2.5)
    upper = np.percentile(means, 97.5)
    

    ca_index = state.index('California')
    fl_index = state.index('Florida')
    both = sorted(list(zip(state, deaths)), key = lambda x: x[1])
    p = figure(x_range=[x[0] for x in both], title = title, plot_width = 800)
    color = ['blue' for x in both]
    color[ca_index] = 'red'
    color[fl_index] = 'green'
    size = [3 for x in state]
    size[ca_index] = 6
    size[fl_index] = 6
    p.circle(x=[x[0] for x in both], y = [x[1] for x in both], 
             color = color, size = size)
    p.xaxis.major_label_orientation = "vertical"
    
    p.yaxis.axis_label = "Deaths/Million"
    print(np.mean(deaths), lower, upper)

    return p

    
show(plot_states(title = 'California (red) vs. Florida (green)'))


1505.9042490279264 1338.5460174690159 1663.849343342565


In [236]:
def make_percentile_plot():
    raise NotImplemented("do not use")
    percentile_dict, state_dict = get_percentile()
    data = []
    for key in percentile_dict.keys():
        data.append((key, percentile_dict[key][0], percentile_dict[key][1], percentile_dict[key][2]))
    data = sorted(data, key = lambda x: x[1])
    cats = [x[0] for x in data]
    lower = [x[1] for x in data]
    upper = [x[2] for x in data]
    p = figure(x_range=cats, title = 'States Percentile')
    p.circle(cats, y = [x[3] for x in data])

    p.segment(cats, lower, cats, upper, line_color="black")
    p.rect(cats, lower, 0.2, 0.01, line_color="black")
    p.rect(cats, upper, 0.2, 0.01, line_color="black")
    p.xaxis.major_label_orientation = "vertical"
    p.yaxis.axis_label = "Deaths/Million"

    return p



In [237]:
def ex_conf():
    cats = list("abcdef")
    p = figure(x_range=cats)
    p.segment(cats, [10 for x in cats], cats,[15 for x in cats], line_color="black")
    p.rect(cats, [10 for x in cats], 0.2, 0.01, line_color="black")
    p.rect(cats, [15 for x in cats], 0.2, 0.01, line_color="black")
    return p
#show(ex_conf())


In [252]:
def percentile_diff():
    df = get_data('/home/henry/Downloads/covid_deaths_by_state1.csv')
    states = df['state'].tolist()
    deaths = df['deaths_million'].tolist()
    state_dict = {}
    for counter, state in enumerate(states):
        state_dict[state] = deaths[counter]
    cal_per = stats.percentileofscore(deaths, state_dict['California'])
    calif_range = resample_one_percentile(deaths, target_per = cal_per, num_iter = 100)
    cal_lower, cal_upper = np.percentile(calif_range, 2.5), np.percentile(calif_range, 97.5)
    fl_per = stats.percentileofscore(deaths, state_dict['California'])
    fl_range = resample_one_percentile(deaths, target_per = fl_per, num_iter = 100)
    fl_lower, fl_upper = np.percentile(fl_range, 2.5), np.percentile(calif_range, 97.5)
    return cal_lower, cal_upper, fl_lower, fl_upper

percentile_diff()

(1192.5629444348037,
 1655.8462287278335,
 1211.7423916570247,
 1655.8462287278335)

In [281]:
def plot_percentile(title = None):
    df = get_data('/home/henry/Downloads/covid_deaths_by_state1.csv')
    df = df[df['state'] != 'District of Columbia']
    state = df['state'].tolist()
    deaths = df['deaths_million'].tolist()
    stats_ = percentile_diff()
    ca_index = state.index('California')
    fl_index = state.index('Florida')
    both = sorted(list(zip(state, deaths)), key = lambda x: x[1])
    p = figure(x_range=[x[0] for x in both], title = title )
    color = ['blue' for x in both]
    color[ca_index] = 'red'
    color[fl_index] = 'green'
    size = [3 for x in state]
    size[ca_index] = 6
    size[fl_index] = 6
    p.circle(x=[x[0] for x in both], y = [x[1] for x in both], 
             color = color, size = size)
    p.xaxis.major_label_orientation = "vertical"
    p.line(x=[x[0] for x in both], y = [stats_[0] for x in both], 
             color = 'black', legend_label = '95% confidence interval CA',
          alpha = .5)
    p.line(x=[x[0] for x in both], y = [stats_[1] for x in both], 
             color = 'black', alpha = .5)
    p.yaxis.axis_label = "Deaths/Million"

    return p

    
show(plot_percentile(title = 'California vs. Florida'))

In [323]:
def over_time(window = 7):
    df = pd.read_csv('data/states.csv')
    df['date'] = pd.to_datetime(df['date'])
    df_pop = pd.read_csv('data/states_population.csv')
    df_ca = df[df['state'] == 'California']
    df_fl = df[df['state'] == 'Florida']
    fl_pop = df_pop[df_pop['state'] == 'Florida']['population_2019'].tolist()[0]
    ca_pop = df_pop[df_pop['state'] == 'California']['population_2019'].tolist()[0]
    dates_fl = df_fl['date'].tolist()
    dates_ca = df_ca['date'].tolist()
    deaths_ca = df_ca['deaths'].rolling(window).mean().tolist()
    deaths_fl = df_fl['deaths'].rolling(window).mean().tolist()

    deaths_ca = [x/ca_pop * 1e6 for x in deaths_ca]
    deaths_fl = [x/fl_pop * 1e6 for x in deaths_fl]
    p = figure(title = 'CA vs FL', x_axis_type = 'datetime')
    p.line(x = dates_ca, y = deaths_ca, legend_label = 'CA')
    p.line(x= dates_fl, y = deaths_fl, legend_label = 'FL', color = 'green')
    return p


show(over_time())


In [448]:
import math
def over_time_diff(window = 7):
    df = pd.read_csv('data/states.csv')
    df['date'] = pd.to_datetime(df['date'])
    df_pop = pd.read_csv('data/states_population.csv')
    df_ca = df[df['state'] == 'California']
    df_fl = df[df['state'] == 'Florida']
    fl_pop = df_pop[df_pop['state'] == 'Florida']['population_2019'].tolist()[0]
    ca_pop = df_pop[df_pop['state'] == 'California']['population_2019'].tolist()[0]
    dates_fl = df_fl['date'].tolist()
    dates_ca = df_ca['date'].tolist()
    deaths_ca = df_ca['deaths'].rolling(window).mean().tolist()
    deaths_fl = df_fl['deaths'].rolling(window).mean().tolist()

    deaths_ca = [x/ca_pop * 1e6 for x in deaths_ca]
    deaths_fl = [x/fl_pop * 1e6 for x in deaths_fl]
    deaths_ca = [x for x in deaths_ca if not math.isnan(x)]
    deaths_fl = [x for x in deaths_fl if not math.isnan(x)]

    diff = repeat_resample(deaths_ca, deaths_fl, num_iter = 1000)
    lower, upper = np.percentile(diff, 2.5), np.percentile(diff, 97.5)
    print(lower, upper, len(dates_ca))
   
    def get_total_diff(d, pop, dates, window):
        return d/1e6 * pop * len(dates)
    print(get_total_diff(abs(upper), fl_pop, dates_fl, window))
    print(get_total_diff(abs(lower), fl_pop, dates_fl, window))
    print(get_total_diff(.52, fl_pop, dates_fl, window))
    print(sum(df_fl['deaths']))




over_time_diff(window = 14)


-1.0045762154523603 -0.14540519644089242 420
1199.2222339548762
8285.193120745394
4288.67452416
32712


In [450]:
50/2000 * 100

2.5