In [None]:
import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt

from scipy.integrate import trapz
from scipy.stats import gaussian_kde

from pwass.spline import MonotoneQuadraticSplineBasis
from pwass.distributions import Distribution
from pwass.dimsensionality_reduction.geodesic_pca import GeodesicPCA
from pwass.dimsensionality_reduction.nested_pca import NestedPCA
from pwass.dimsensionality_reduction.projected_pca import ProjectedPCA

In [None]:
# Load the raw table from disk and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer="data/covid_us_new.csv")
df.head(n=10)

In [None]:
# One DataFrame per state, built without mutating any groupby sub-frame in
# place (in-place `.loc` assignment on those sub-frames can trigger pandas'
# SettingWithCopyWarning and makes the cell harder to reason about).
#
# Filters applied:
#   * states with a single row are discarded;
#   * optional: Puerto Rico is removed.
#
# Missing "COVID-19 Deaths" entries are replaced by 2.0.
# NOTE(review): 2.0 presumably stands in for counts that were suppressed in
# the source data -- confirm against the data documentation.
dfs_by_state = [
    state_df.assign(
        **{"COVID-19 Deaths": state_df["COVID-19 Deaths"].fillna(2.0)})
    for state_name, state_df in df.groupby("State")
    if len(state_df) > 1 and state_name != 'Puerto Rico'
]

In [None]:
# Spline basis shared by every distribution and by the PCA below.
# `nbasis` is forwarded as the first constructor argument (presumably the
# number of basis functions -- confirm in pwass.spline); the basis is built on
# a regular grid of 100 points covering [0, 1].
nbasis = 15
zero_one_grid = np.linspace(0.0, 1.0, num=100)
spline_basis = MonotoneQuadraticSplineBasis(nbasis, zero_one_grid)

In [None]:
from scipy.optimize import curve_fit

def gaussian(x, mean, amplitude, standard_deviation):
    """Evaluate an un-normalised bell curve at ``x``.

    Computes ``amplitude * exp(-((x - mean) / standard_deviation) ** 2)``.
    Note that the exponent carries no factor 1/2, so ``standard_deviation``
    is a width parameter rather than the sigma of a normal density.

    Parameters
    ----------
    x : scalar or array
        Point(s) at which to evaluate the curve.
    mean : scalar
        Location of the peak.
    amplitude : scalar
        Value of the curve at ``x == mean``.
    standard_deviation : scalar
        Width parameter dividing the offset from ``mean``.

    Returns
    -------
    Same shape as ``x``: the curve values.
    """
    scaled_offset = (x - mean) / standard_deviation
    return amplitude * np.exp(-(scaled_offset ** 2))

In [None]:
# create distributions
#
# For every state and for each sex, the selected death counts are spread as a
# piecewise-constant function over `xgrid`, normalised to integrate to one and
# wrapped into a `Distribution` expanded on `spline_basis`.  Every resulting
# density is also drawn (males in steelblue, females in orange) and the figure
# is saved to "covid_data.pdf".
state_names = [x.State.values[0] for x in dfs_by_state]

m_distribs = []
f_distribs = []
orig_m = []
orig_f = []
bins_center = np.array([0.5, 2.5, 9.5, 19.5, 29.5, 39.5, 49.5, 59.5, 69.5, 79.5, 89.5])

# [lower, upper] edges of each bin (presumably age ranges in years -- confirm
# against the data documentation).
bins = [[0, 1], [1, 5], [5, 15], [15, 25], [25, 35], [35, 45], [45, 55], [55, 65], [65, 75],
        [75, 85], [85, 95]]

# NOTE(review): the grid stops at 94 although the last bin extends to 95.
xgrid = np.linspace(0, 94, 101)
# Positions of the rows kept from each state's per-sex "COVID-19 Deaths" column;
# there must be exactly one kept row per entry of `bins`.
kept_groups = np.array([1, 3, 4, 5, 7, 9, 10, 12, 13, 14, 15])


def _piecewise_constant_pdf(counts, bins, grid):
    """Spread per-bin counts over `grid` and normalise to unit integral.

    Every grid point lying in ``[lower, upper]`` of bin ``i`` receives
    ``counts[i]``; a point sitting exactly on a shared edge ends up with the
    value of the later bin.  The result is divided by its trapezoidal
    integral over `grid`.

    NOTE(review): `trapz` comes from `scipy.integrate`, where it has been
    removed in recent SciPy releases in favour of `trapezoid` -- consider
    updating the import.
    """
    assert len(counts) == len(bins), "expected one count per bin"
    density = np.zeros_like(grid)
    for count, (lower, upper) in zip(counts, bins):
        density[(grid >= lower) & (grid <= upper)] = count
    return density / trapz(density, grid)


def _distribution_from_pdf(grid, pdf):
    """Build a `Distribution` on `spline_basis` from a density sampled on `grid`."""
    distrib = Distribution(wbasis=spline_basis)
    distrib.init_from_pdf(grid, pdf)
    distrib._invert_cdf()
    distrib.compute_spline_expansions()
    return distrib


for s in dfs_by_state:
    # Males are processed (and plotted) before females for every state, so the
    # order of `m_distribs` / `f_distribs` follows `dfs_by_state`.
    for sex, color, raw_counts, distribs in (
            ("Male", "steelblue", orig_m, m_distribs),
            ("Female", "orange", orig_f, f_distribs)):
        counts = s[s.Sex == sex]["COVID-19 Deaths"].values[kept_groups]
        raw_counts.append(counts)

        pdf = _piecewise_constant_pdf(counts, bins, xgrid)
        distribs.append(_distribution_from_pdf(xgrid, pdf))
        plt.plot(xgrid, pdf, color=color)

plt.savefig("covid_data.pdf")

In [None]:
# Stack males first, then females; downstream cells rely on this ordering
# when they split rows by sex.
alldistribs = [*m_distribs, *f_distribs]
orig_distribs = [*orig_m, *orig_f]

In [None]:
# Projected PCA on the shared spline basis.
# NOTE(review): compute_spline=False presumably tells the PCA to reuse the
# spline expansions already stored on each distribution -- confirm in pwass.
ppca = ProjectedPCA(spline_basis=spline_basis, compute_spline=False)

# Fit on all distributions; the second argument (3) is presumably the number
# of principal directions to keep -- the plots below use score columns 0..2.
ppca.fit(alldistribs, 3)
# Scores of every distribution, in the same order as `alldistribs`.
projs = ppca.transform(alldistribs)

In [None]:
# Display the scores returned by ppca.transform for visual inspection.
projs

In [None]:
%matplotlib inline
# Pairwise scatter plots of the first three PCA scores.
# Rows of `projs` follow `alldistribs`, i.e. all male distributions first and
# then all female ones, so the split point is the number of male
# distributions rather than a hard-coded state count.
n_male = len(m_distribs)

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 5))

# One panel per pair of score columns: (0, 1), (0, 2), (1, 2).
for panel, (col_x, col_y) in zip(axes, [(0, 1), (0, 2), (1, 2)]):
    panel.scatter(projs[:n_male, col_x], projs[:n_male, col_y], c="steelblue")
    panel.scatter(projs[n_male:, col_x], projs[n_male:, col_y], c="orange")
    panel.set_xlabel("score {0}".format(col_x + 1))
    panel.set_ylabel("score {0}".format(col_y + 1))

plt.tight_layout()

In [None]:
import seaborn as sns
from scipy.interpolate import UnivariateSpline

def invcdf_to_pdf(zero_one_grid, invcdf_eval, s=0.1, n_points=100):
    """Recover a density from a quantile function sampled on a grid.

    The CDF is obtained by fitting a smoothing spline through the points
    ``(invcdf_eval, zero_one_grid)`` -- i.e. the quantile function with its
    axes swapped -- and the density is the derivative of that spline.

    Parameters
    ----------
    zero_one_grid : array-like
        Probability levels at which the quantile function was evaluated.
    invcdf_eval : array-like
        Quantile-function values, one per entry of ``zero_one_grid``.
    s : float, default 0.1
        Smoothing factor forwarded to ``UnivariateSpline``.
    n_points : int, default 100
        Number of equally spaced points, between the smallest and largest
        quantile value, at which the density is returned.

    Returns
    -------
    new_grid : ndarray of shape (n_points,)
        Evaluation points.
    der : ndarray of shape (n_points,)
        Derivative of the fitted CDF at ``new_grid``.  Being the derivative
        of a smoothing spline it is not clipped and may dip below zero.
    """
    # Accept plain sequences as well as arrays (fancy indexing is used below).
    zero_one_grid = np.asarray(zero_one_grid)
    invcdf_eval = np.asarray(invcdf_eval)

    # Keep the first occurrence of each distinct quantile value; np.unique
    # returns them sorted by value, which gives the spline increasing abscissae.
    _, kept = np.unique(invcdf_eval, return_index=True)

    new_grid = np.linspace(np.min(invcdf_eval), np.max(invcdf_eval), n_points)
    cdf = UnivariateSpline(x=invcdf_eval[kept], y=zero_one_grid[kept], s=s)
    der = cdf.derivative()(new_grid)
    return new_grid, der

def plot_pc(pca, ind, pos_lambdas, neg_lambdas, pos_palette, neg_palette, ax, smooth_val):
    """Draw the densities obtained by moving along one principal direction.

    For each multiplier ``lam`` the point ``pca.bary + pca.project(lam *
    pca.eig_vecs[:, ind])`` is evaluated with the notebook-level
    ``spline_basis``, converted to a density with ``invcdf_to_pdf`` on the
    notebook-level ``zero_one_grid``, and plotted on ``ax``.

    Parameters
    ----------
    pca : object exposing ``bary``, ``eig_vecs`` and ``project``.
    ind : int
        Column of ``pca.eig_vecs`` to move along.
    pos_lambdas, neg_lambdas : sequences of multipliers; the positive family
        is drawn first, then the negative one.
    pos_palette, neg_palette : indexable colour collections; entry ``j``
        colours the curve of the ``j``-th multiplier of the matching family.
    ax : matplotlib axes receiving the curves.
    smooth_val : smoothing factor forwarded to ``invcdf_to_pdf``.
    """
    def _draw_family(lambdas, palette):
        for position, step in enumerate(lambdas):
            coeffs = pca.bary + pca.project(step * pca.eig_vecs[:, ind])
            quantile_eval = spline_basis.eval_spline(coeffs)
            grid, pdf = invcdf_to_pdf(zero_one_grid, quantile_eval, smooth_val)
            ax.plot(grid, pdf, color=palette[position])

    _draw_family(pos_lambdas, pos_palette)
    _draw_family(neg_lambdas, neg_palette)

In [None]:
%matplotlib inline

# Four-panel figure: first and second principal directions, score scatter,
# and the densities of three highlighted distributions.
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(10, 3.5))

pca = ppca
s_vals = [1.5, 2.5]  # only the first smoothing value is used below

# `alldistribs` holds one male distribution per state followed by one female
# distribution per state, so the state count is also the male/female split.
n_states = len(dfs_by_state)

# (largest positive multiplier, most negative multiplier) per direction.
for ind, (pos_end, neg_end) in enumerate([(3, -3), (2.5, -2)]):
    pos_lambdas = np.linspace(0.01, pos_end, 10)
    neg_lambdas = np.linspace(-0.01, neg_end, 10)
    pos_palette = sns.light_palette("red", n_colors=len(pos_lambdas))
    neg_palette = sns.light_palette("navy", n_colors=len(neg_lambdas))
    plot_pc(pca, ind, pos_lambdas, neg_lambdas, pos_palette, neg_palette,
            axes[ind], s_vals[0])
    axes[ind].set_ylim((0, 0.06))

# Indices into `alldistribs` / rows of `projs` of the highlighted distributions.
i1 = 72
i2 = 54
i3 = 50

lw = 2.5

axes[2].scatter(projs[:n_states, 0], projs[:n_states, 1], c="steelblue", s=30, alpha=0.7)
axes[2].scatter(projs[n_states:, 0], projs[n_states:, 1], c="orange", s=30, alpha=0.7)

highlighted = [(i1, "red"), (i2, "green"), (i3, "#9467bd")]

for idx, color in highlighted:
    # `idx % n_states` maps both the male (idx < n_states) and the female
    # (idx >= n_states) entry of a state to that state's name, without
    # relying on negative-index wrap-around.
    axes[3].plot(alldistribs[idx].pdf_grid, alldistribs[idx].pdf_eval,
                 label=state_names[idx % n_states], c=color, linewidth=lw)
# The labels above are only visible once a legend is drawn.
axes[3].legend()

# Highlighted points are drawn last so they sit on top of the background scatter.
for idx, color in highlighted:
    axes[2].scatter(projs[idx, 0], projs[idx, 1], c=color, s=70)

axes[0].set_title("First PD", fontsize=16)
axes[1].set_title("Second PD", fontsize=16)
axes[2].set_title("Scores", fontsize=16)

plt.tight_layout()
# plt.savefig("covid_pc.pdf")

In [None]:
# Reconstruction error

def reconstruction_error(true_distribs, pca):
    """Average distance between distributions and their PCA reconstruction.

    Each distribution is projected with ``pca.transform``, mapped back with
    ``pca.pt_from_proj`` and shifted by ``pca.bary``.  The distance between a
    distribution's ``quantile_coeffs`` and its reconstruction is the square
    root of ``pca.inner_prod(delta, delta)``.

    Parameters
    ----------
    true_distribs : sequence of objects exposing ``quantile_coeffs``.
    pca : fitted PCA object exposing ``transform``, ``pt_from_proj``,
        ``bary`` and ``inner_prod``.

    Returns
    -------
    The mean of the per-distribution distances.
    """
    scores = pca.transform(true_distribs)
    rebuilt = pca.pt_from_proj(scores) + pca.bary

    def _distance(position):
        gap = true_distribs[position].quantile_coeffs - rebuilt[position, :]
        return np.sqrt(pca.inner_prod(gap, gap))

    n_distribs = len(true_distribs)
    total = sum(_distance(position) for position in range(n_distribs))
    return total / n_distribs


def fit_and_compute_error(data, pca, dim):
    """Refit ``pca`` on ``data`` with ``dim`` and report its reconstruction error.

    ``pca`` is modified in place by ``pca.fit(data, dim)``.  The error, as
    computed by ``reconstruction_error`` on the same data, is printed together
    with ``dim`` and returned.
    """
    pca.fit(data, dim)
    rec_error = reconstruction_error(data, pca)
    message = "dim: {0}, error: {1}".format(dim, rec_error)
    print(message)
    return rec_error

In [None]:
# Reconstruction error as a function of the number of retained directions.
errs = []

# Dimension 0: every distribution is approximated by the barycenter alone.
# The distance must be the square root of the inner product, exactly as in
# `reconstruction_error`; otherwise this first entry would be a mean *squared*
# distance and not comparable with the entries appended below.
mean_err = 0
for d in alldistribs:
    delta = d.quantile_coeffs - ppca.bary
    mean_err += np.sqrt(ppca.inner_prod(delta, delta))

errs.append(mean_err / len(alldistribs))

# Dimensions 1..9: refit the PCA each time.  `ppca` is left fitted with the
# last dimension (9) when the loop ends.
for dim in range(1, 10):
    errs.append(fit_and_compute_error(alldistribs, ppca, dim))

In [None]:
# Reconstruction error (left axis) against cumulative explained variance
# (right axis) for 0..9 retained directions.
fig, ax = plt.subplots()

dims = range(10)
ax.plot(dims, errs, label="reconstruction error")
ax.set_xticks(dims)

# NOTE(review): the ratio below is a true "explained variance" only if
# `ppca.eig_vals` holds the full spectrum rather than just the fitted
# directions -- confirm in pwass.
explained = np.cumsum(ppca.eig_vals[:9]) / np.sum(ppca.eig_vals)

ax2 = ax.twinx()
ax2.plot(range(1, 10), explained,
         label="explained variance", color="orange")
ax2.set_ylim(0.5, 1.1)
fig.legend(fontsize=14)
# plt.savefig("covid_pca_rec.pdf")