In [1]:
import os
import datetime
import random
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, Band
from bokeh.embed import components
from bokeh.models import Range1d

import numpy as np

In [2]:
output_notebook()

In [3]:
import math
import scipy.optimize as optim
import pandas as pd
import statsmodels.api as sm
from scipy.stats import ttest_ind

In [4]:
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
from bokeh.models import Arrow, NormalHead, OpenHead, VeeHead
from bokeh.models import Label
from bokeh.models import Span
from bokeh.embed import components


In [5]:
def get_data():
    """Load the per-state case data from ``data/states.csv``.

    Rows whose ``state`` is 'Guam' or 'Northern Mariana Islands' are
    dropped, the ``date`` column is parsed to datetimes, and the frame is
    returned sorted by ``date`` (original row index is kept).
    """
    raw = pd.read_csv('data/states.csv')
    not_guam = raw['state'] != 'Guam'
    not_mariana = raw['state'] != 'Northern Mariana Islands'
    kept = raw[not_guam & not_mariana]
    # assign() returns a new frame, so the filtered slice is never mutated.
    kept = kept.assign(date=pd.to_datetime(kept['date']))
    return kept.sort_values(by=['date'])


In [6]:

def get_pop_data():
    """Load ``data/seven_day_state.csv`` with ``date`` parsed, sorted by date.

    Unlike ``get_data`` no states/territories are filtered out here.
    """
    return (
        pd.read_csv('data/seven_day_state.csv')
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .sort_values(by=['date'])
    )

def get_pop():
    """Return a ``{state: population}`` dict built from ``get_pop_data()``.

    The (state, population) pairs are de-duplicated through a set first.
    If a state appears with more than one distinct population value, the
    one that ends up in the dict depends on set iteration order.
    """
    frame = get_pop_data()
    unique_pairs = set(zip(frame['state'], frame['population']))
    return dict(unique_pairs)


In [7]:
def get_rate(df, state, leng, key):
    """Compare the latest ``leng`` values of ``key`` with the ``leng`` before them.

    Rows for ``state`` are taken in the order they appear in ``df`` (no
    sorting happens here, so pass a frame that is already in date order).

    Returns a tuple ``(ratio, p_value)`` where ``ratio`` is
    mean(latest window) / mean(previous window) and ``p_value`` is the
    two-sample t-test p-value between the two windows, rounded to 2 places.

    Raises AssertionError when either window does not contain exactly
    ``leng`` values.
    """
    values = df.loc[df['state'] == state, key].tolist()
    latest = values[-leng:]
    earlier = values[-2 * leng:-leng]
    assert len(earlier) == leng
    assert len(latest) == leng
    test_result = ttest_ind(latest, earlier)
    ratio = np.mean(latest) / np.mean(earlier)
    return ratio, round(test_result.pvalue, 2)


In [8]:
def get_linear(y):
    """Fit an OLS line of ``y`` against its positions 0, 1, ..., len(y)-1.

    Returns ``(intercept, slope, p_value)`` where ``p_value`` is the
    p-value reported for the slope coefficient.
    """
    # One single-column row per observation: [(0,), (1,), ...].
    positions = [(i,) for i in range(len(y))]
    design = sm.add_constant(positions)
    fit = sm.OLS(y, design).fit()
    intercept = fit.params[0]
    slope = fit.params[1]
    return intercept, slope, fit.pvalues[1]


def get_state_linear(df, state, leng, key):
    """Run ``get_linear`` on the last ``leng`` values of ``key`` for ``state``.

    Raises AssertionError if that slice does not hold exactly ``leng``
    values. Returns whatever ``get_linear`` returns.
    """
    state_values = df.loc[df['state'] == state, key].tolist()
    tail = state_values[-leng:]
    assert len(tail) == leng
    return get_linear(tail)
# Example: linear fit over the last 14 'cases' values for Washington.
# The cell output is the (intercept, slope, p-value) tuple from get_linear.
get_state_linear(get_data(), 'Washington', 14, 'cases')


(604.5428571428573, 50.89450549450551, 0.03373896747789528)

In [9]:
def test_slope(df, state, leng, key, window = None):
    """Plot the last ``leng`` values of ``key`` for ``state`` with a fitted line.

    Parameters
    ----------
    df : DataFrame with at least ``state``, ``date`` and ``key`` columns.
    state : value of the ``state`` column to select.
    leng : number of trailing observations to fit and plot.
    key : name of the column to analyse.
    window : optional rolling-mean window applied to the column before the
        trailing slice is taken; falsy means no smoothing.

    Prints the p-value of the fitted slope and returns a bokeh figure with
    two lines: the observed values and the fitted values.
    """
    df_s = df[df['state'] == state].sort_values(by = ['date'])
    # Honour the requested column; it used to be hard-coded to 'cases'.
    series = df_s[key]
    if window:
        series = series.rolling(window).mean()
    y = series.tolist()[-1 * leng:]

    inter, slope, p_value = get_linear(y)
    print('p value is {p}'.format(p = p_value))
    dates = df_s['date'].tolist()[-1 * leng:]
    # Size the fitted line by the points actually available so it always
    # matches ``dates``/``y`` even when the state has fewer than ``leng`` rows.
    y_hat = [i * slope + inter for i in range(len(y))]
    fig = figure(x_axis_type = 'datetime')
    fig.line(x = dates, y = y)
    fig.line(x = dates, y = y_hat)
    return fig
# Example: unsmoothed 7-point fit for Colorado; the slope p-value is printed below.
show(test_slope(get_data(), 'Colorado', 7, 'cases', window = None))
    

p value is 0.12275842549223852


Metrics:
* whether the state is well below its peak
* whether the state has a low number of cases per million
* whether the state has a long-running declining slope (how should this be measured?)
* whether the slope is increasing but not statistically significant (open question)

In [10]:
def all_states_info(df, leng):
    """Return ``[(state, ratio), ...]`` for every distinct state in ``df``.

    ``ratio`` is the first element returned by ``get_rate`` for the
    'cases' column with window ``leng``; the p-value is discarded.
    The list order follows set iteration order and is not sorted.
    """
    distinct_states = set(df['state'].tolist())
    return [
        (name, get_rate(df, name, leng, 'cases')[0])
        for name in distinct_states
    ]

In [11]:
def make_bar(labels_neg, labels_pos, y_neg, y_pos, labels_neutral = None, y_neutral = None, plot_height = 450,
             plot_width = 450, title = None):
    """Build a categorical bar chart with up to three colour groups.

    ``labels_neg``/``y_neg`` are drawn green, ``labels_pos``/``y_pos`` red,
    and, when ``labels_neutral`` is non-empty, ``labels_neutral``/
    ``y_neutral`` yellow. Categories appear on the x axis in the order
    neg, neutral (if any), pos. Returns the bokeh figure.
    """
    has_neutral = bool(labels_neutral)
    if has_neutral:
        categories = labels_neg + labels_neutral + labels_pos
    else:
        categories = labels_neg + labels_pos

    fig = figure(x_range = categories, plot_height = plot_height,
                 plot_width = plot_width, title = title)

    # Shared geometry for every group of bars.
    bar_kwargs = dict(line_width = 5, width = .9)
    fig.vbar(x = labels_neg, top = y_neg, color = 'green', **bar_kwargs)
    if has_neutral:
        fig.vbar(x = labels_neutral, top = y_neutral, color = 'yellow', **bar_kwargs)
    fig.vbar(x = labels_pos, top = y_pos, color = 'red', **bar_kwargs)

    fig.xaxis.major_label_orientation = "vertical"
    fig.xgrid.grid_line_color = None
    return fig

In [12]:
def all_states(leng):
    """Bar chart of each state's case ratio for a ``leng``-day window.

    Ratios come from ``all_states_info(get_data(), leng)`` and are sorted
    ascending, then split into three groups handed to ``make_bar``:
    ratio <= 1 (neg), 1 < ratio <= 1.1 (neutral) and ratio > 1.1 (pos).
    """
    neutral_n = 1.1  # upper bound of the "neutral" band
    ranked = sorted(all_states_info(get_data(), leng), key = lambda pair: pair[1])

    falling = [pair for pair in ranked if pair[1] <= 1]
    steady = [pair for pair in ranked if 1 < pair[1] <= neutral_n]
    rising = [pair for pair in ranked if pair[1] > neutral_n]

    def names(pairs):
        return [pair[0] for pair in pairs]

    def ratios(pairs):
        return [pair[1] for pair in pairs]

    return make_bar(labels_neg = names(falling),
                    y_neg = ratios(falling),
                    labels_pos = names(rising),
                    y_pos = ratios(rising),
                    labels_neutral = names(steady),
                    y_neutral = ratios(steady),
                    plot_width = 800,
                    title = '{l} -day'.format(l = leng))
# Render the ratio chart for the 7-, 14- and 28-day windows, in that order.
for _window_days in (7, 14, 28):
    show(all_states(_window_days))




In [13]:
def make_bar_pop(labels, y, plot_height = 450,
             plot_width = 450, title = None, mask_dict = None):
    """Build a categorical bar chart with one bar per label.

    Parameters
    ----------
    labels : list of category names; also used as the x-axis range.
    y : bar heights, indexed in parallel with ``labels``.
    plot_height, plot_width, title : passed through to ``figure``.
    mask_dict : optional mapping keyed by label. A label whose entry is
        truthy is drawn blue; every other label is drawn red. ``None``
        (the default) behaves like an empty mapping.

    Returns the bokeh figure.
    """
    # None sentinel instead of a shared mutable ``{}`` default.
    if mask_dict is None:
        mask_dict = {}
    p = figure(x_range = labels, plot_height = plot_height,
               plot_width = plot_width, title = title)
    for position, label in enumerate(labels):
        color = 'blue' if mask_dict.get(label) else 'red'
        p.vbar(x=[label], top=y[position], line_width = 5, width = .5, color = color)
    p.xaxis.major_label_orientation = "vertical"
    p.xgrid.grid_line_color = None

    return p

In [14]:
def get_mask():
    """Return a list of ``(state, datetime)`` pairs, one per listed state.

    NOTE(review): presumably the date each state's mask requirement took
    effect — the source of these dates is not recorded here; confirm.
    """
    year_month_day = (
        ('Alabama', (2020, 7, 16)),
        ('Arizona', (2020, 6, 17)),
        ('California', (2020, 6, 18)),
        ('Connecticut', (2020, 4, 20)),
        ('Delaware', (2020, 4, 28)),
        ('Hawaii', (2020, 4, 20)),
        ('Illinois', (2020, 5, 1)),
        ('Kansas', (2020, 7, 3)),
        ('Kentucky', (2020, 7, 10)),
        ('Maine', (2020, 5, 1)),
        ('Maryland', (2020, 4, 18)),
        ('Massachusetts', (2020, 5, 6)),
        ('Michigan', (2020, 6, 18)),
        ('Nevada', (2020, 6, 24)),
        ('New Jersey', (2020, 4, 8)),
        ('New Mexico', (2020, 5, 16)),
        ('New York', (2020, 4, 17)),
        ('North Carolina', (2020, 6, 26)),
        ('Oregon', (2020, 7, 1)),
        ('Pennsylvania', (2020, 4, 19)),
        ('Rhode Island', (2020, 5, 18)),
        ('Texas', (2020, 7, 3)),
        ('Virginia', (2020, 5, 29)),
        ('Washington', (2020, 6, 26)),
        ('West Virginia', (2020, 7, 6)),
    )
    return [(state, datetime.datetime(*ymd)) for state, ymd in year_month_day]

In [15]:
def cases_with_pop(back = 7, per = 100000, as_of = None):
    """Mean recent cases per ``per`` people for each state, sorted ascending.

    Parameters
    ----------
    back : size of the look-back window in days; rows with
        ``date >= as_of - back days`` are averaged.
    per : population scale factor (default: per 100,000).
    as_of : reference datetime for the window. ``None`` (the default) uses
        the current wall-clock time, which is the historical behaviour.
        Pass a fixed value to get reproducible results from a static CSV.

    Returns a list of ``(state, rate)`` tuples sorted by ``rate``.
    "Puerto Rico" and "Virgin Islands" are skipped. A state with no rows
    inside the window gets a NaN rate. A state missing from ``get_pop()``
    raises KeyError.
    """
    df = get_data()
    pop = get_pop()
    if as_of is None:
        as_of = datetime.datetime.now()
    # Window filter is the same for every state, so apply it once.
    cutoff = as_of - datetime.timedelta(days = back)
    recent = df[df['date'] >= cutoff]
    skipped = ("Puerto Rico", "Virgin Islands")

    final = []
    # unique() keeps first-appearance order, so equal rates sort the same
    # way on every run (set iteration order is not stable across runs).
    for state in df['state'].unique():
        if state in skipped:
            continue
        state_cases = recent.loc[recent['state'] == state, 'cases']
        rate = np.mean(state_cases) / pop[state] * per
        final.append((state, rate))
    return sorted(final, key = lambda x: x[1])


In [16]:
# Per-capita case rates, sorted ascending; back = 14 keeps rows dated within
# the 14 days before the current time (see cases_with_pop).
case_with_pop = cases_with_pop(back = 14)
# Split the (state, rate) pairs into parallel lists for plotting.
labels = [x[0] for x in case_with_pop]
y = [x[1] for x in case_with_pop]
# Turn the (state, date) pairs from get_mask() into a lookup keyed by state.
masks = get_mask()
d = {}
for i in masks:
    d[i[0]] = i[1]
# States present in d are drawn blue by make_bar_pop, all others red.
p = make_bar_pop(labels, y, plot_height = 450, plot_width = 750, title = None, mask_dict = d)
show(p)
