In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import psycopg2
from datetime import datetime

In [2]:
# Load the COVID-19 case demographics broken down by age group.
covid_ages = pd.read_csv(filepath_or_buffer='data/case_demographics_age.csv')

# Preview the first rows. The percent and deaths columns are missing (NaN) for the
# earliest dates -> perhaps because there was nothing to compare against yet.
covid_ages.head()

Unnamed: 0,age_group,totalpositive,date,case_percent,deaths,deaths_percent,ca_percent
0,0-17,120,2020-04-02,,,,
1,18-49,5302,2020-04-02,,,,
2,50-64,2879,2020-04-02,,,,
3,65 and Older,2342,2020-04-02,,,,
4,Unknown,58,2020-04-02,,,,


In [3]:
covid_ages.info()
# 1235 rows in this run. `date` is read as object (string) dtype, and the
# case_percent / deaths / deaths_percent / ca_percent columns contain missing values.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age_group       1235 non-null   object 
 1   totalpositive   1235 non-null   int64  
 2   date            1235 non-null   object 
 3   case_percent    1135 non-null   float64
 4   deaths          1135 non-null   float64
 5   deaths_percent  1135 non-null   float64
 6   ca_percent      1125 non-null   float64
dtypes: float64(4), int64(1), object(2)
memory usage: 67.7+ KB


In [12]:
# List the distinct age-group labels. The output contains both '65 and Older' and
# '65+', and both 'Unknown' and 'Missing' -- presumably the same categories relabeled
# over time; TODO confirm and normalize before comparing across dates.
covid_ages.loc[:, 'age_group'].unique()

array(['0-17', '18-49', '50-64', '65 and Older', 'Unknown', '65+',
       'Missing'], dtype=object)

In [4]:
# Load the statewide case counts (one county and one date per row) and preview them.
covid_cases = pd.read_csv(filepath_or_buffer='data/statewide_cases.csv')
covid_cases.head()

Unnamed: 0,county,totalcountconfirmed,totalcountdeaths,newcountconfirmed,newcountdeaths,date
0,Santa Clara,151.0,6.0,151,6,2020-03-18
1,Santa Clara,183.0,8.0,32,2,2020-03-19
2,Santa Clara,246.0,8.0,63,0,2020-03-20
3,Santa Clara,269.0,10.0,23,2,2020-03-21
4,Santa Clara,284.0,13.0,15,3,2020-03-22


In [5]:
covid_cases.info()
# 15665 rows in this run. `date` is read as object (string) dtype;
# totalcountconfirmed and totalcountdeaths each have a few missing values.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15665 entries, 0 to 15664
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   county               15665 non-null  object 
 1   totalcountconfirmed  15662 non-null  float64
 2   totalcountdeaths     15663 non-null  float64
 3   newcountconfirmed    15665 non-null  int64  
 4   newcountdeaths       15665 non-null  int64  
 5   date                 15665 non-null  object 
dtypes: float64(2), int64(2), object(2)
memory usage: 734.4+ KB


Notes about what is observed from the data:

- Both datasets use the same date format, YYYY-MM-DD (stored as strings, with no time component), so we can potentially join on date
- covid_cases is broken down by county (rows are grouped by county and sorted by date), but covid_ages has no county column. We would need to sum totalcountconfirmed in covid_cases by date before joining and comparing the proportions; we will probably also need to aggregate covid_ages by date and age group.
- covid_ages contains 'Unknown' and 'Missing' age groups (and both '65 and Older' and '65+' labels), which will need to be handled

In [6]:
# Statewide confirmed-case total per date: add up every county's running total.
# Series.sum skips NaN by default, so the few missing county totals are ignored.
grouped_covid_cases = (
    covid_cases
    .groupby(by='date')['totalcountconfirmed']
    .sum()
)
grouped_covid_cases

date
2020-03-18        675.0
2020-03-19       1006.0
2020-03-20       1224.0
2020-03-21       1468.0
2020-03-22       1733.0
                ...    
2020-11-30    1225189.0
2020-12-01    1245948.0
2020-12-02    1264539.0
2020-12-03    1286557.0
2020-12-04    1311625.0
Name: totalcountconfirmed, Length: 262, dtype: float64

In [7]:
# Total positives per (date, age_group) pair. The result is a Series indexed by
# a (date, age_group) MultiIndex.
grouped_covid_ages = (
    covid_ages
    .groupby(by=['date', 'age_group'])['totalpositive']
    .sum()
)
grouped_covid_ages

date        age_group   
2020-04-02  0-17               120
            18-49             5302
            50-64             2879
            65 and Older      2342
            Unknown             58
                             ...  
2020-12-04  0-17            147807
            18-49           780365
            50-64           246393
            65+             135981
            Missing           1079
Name: totalpositive, Length: 1235, dtype: int64

In [8]:
# Both groupby results are Series whose grouping keys live in the index.
# Turn each into a DataFrame and move the keys ('date', plus 'age_group' for the
# age data) back into ordinary columns so they can be used as join columns.
# NOTE(review): the names are rebound in place, so running this cell a second
# time fails -- a DataFrame has no .to_frame().
grouped_covid_cases = grouped_covid_cases.to_frame().reset_index()
grouped_covid_ages = grouped_covid_ages.to_frame().reset_index()

In [9]:
# Display the age-group totals, now a DataFrame with date, age_group and
# totalpositive as regular columns.
grouped_covid_ages

Unnamed: 0,date,age_group,totalpositive
0,2020-04-02,0-17,120
1,2020-04-02,18-49,5302
2,2020-04-02,50-64,2879
3,2020-04-02,65 and Older,2342
4,2020-04-02,Unknown,58
...,...,...,...
1230,2020-12-04,0-17,147807
1231,2020-12-04,18-49,780365
1232,2020-12-04,50-64,246393
1233,2020-12-04,65+,135981


In [10]:
# Display the statewide totals, now a DataFrame with date and
# totalcountconfirmed as regular columns.
grouped_covid_cases

Unnamed: 0,date,totalcountconfirmed
0,2020-03-18,675.0
1,2020-03-19,1006.0
2,2020-03-20,1224.0
3,2020-03-21,1468.0
4,2020-03-22,1733.0
...,...,...
257,2020-11-30,1225189.0
258,2020-12-01,1245948.0
259,2020-12-02,1264539.0
260,2020-12-03,1286557.0


In [11]:
# Join the dataframes: attach the statewide confirmed total for each date to
# every age-group row.
# The join key and join type are spelled out instead of being implied by the one
# shared column name, so the join cannot silently change if either frame gains
# another common column. An inner join on 'date' keeps only dates present in both
# frames, so case dates before the first age-demographics date are dropped.
# validate='many_to_one' raises pandas.errors.MergeError if a date occurs more
# than once in grouped_covid_cases, which would otherwise duplicate age-group rows.
# The result is bound to a name so later cells can reuse it.
covid_ages_with_totals = pd.merge(
    grouped_covid_ages,
    grouped_covid_cases,
    on='date',
    how='inner',
    validate='many_to_one',
)
covid_ages_with_totals

Unnamed: 0,date,age_group,totalpositive,totalcountconfirmed
0,2020-04-02,0-17,120,10701.0
1,2020-04-02,18-49,5302,10701.0
2,2020-04-02,50-64,2879,10701.0
3,2020-04-02,65 and Older,2342,10701.0
4,2020-04-02,Unknown,58,10701.0
...,...,...,...,...
1230,2020-12-04,0-17,147807,1311625.0
1231,2020-12-04,18-49,780365,1311625.0
1232,2020-12-04,50-64,246393,1311625.0
1233,2020-12-04,65+,135981,1311625.0
