# INFO 607.  Tensorflow Project.  Exploratory Data Analysis
## May 10, 2020

In [123]:
import requests
import json
from time import sleep
import os
from os import path
from datetime import datetime, timedelta
from glob import glob
import re
import csv


import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import gamma
import numpy as np


### Helper Methods from the Quickstart

In [31]:
def read_data_json(typename, api, body):
    """
    read_data_json directly accesses the C3.ai COVID-19 Data Lake APIs using the requests library, 
    and returns the response as a JSON, raising an error if the call fails for any reason.
    ------
    typename: The type you want to access, i.e. 'OutbreakLocation', 'LineListRecord', 'BiblioEntry', etc.
    api: The API you want to access, either 'fetch' or 'evalmetrics'.
    body: The spec you want to pass. For examples, see the API documentation.
    """
    response = requests.post(
        "https://api.c3.ai/covid/api/1/" + typename + "/" + api, 
        json = body, 
        headers = {
            'Accept' : 'application/json', 
            'Content-Type' : 'application/json'
        }
    )
    response.raise_for_status()
    
    return response.json()

def fetch(typename, body, get_all = False, remove_meta = True):
    """
    fetch accesses the Data Lake using read_data_json, and converts the response into a Pandas dataframe. 
    fetch is used for all non-timeseries data in the Data Lake, and will call read_data as many times 
    as required to access all of the relevant data for a given typename and body.
    ------
    typename: The type you want to access, i.e. 'OutbreakLocation', 'LineListRecord', 'BiblioEntry', etc.
    body: The spec you want to pass. For examples, see the API documentation.
    get_all: If True, get all records and ignore any limit argument passed in the body. If False, use the limit argument passed in the body. The default is False.
    remove_meta: If True, remove metadata about each record. If False, include it. The default is True.
    """
    if get_all:
        has_more = True
        offset = 0
        limit = 2000
        df = pd.DataFrame()

        while has_more:
            body['spec'].update(limit = limit, offset = offset)
            response_json = read_data_json(typename, 'fetch', body)
            new_df = pd.json_normalize(response_json['objs'])
            df = df.append(new_df)
            has_more = response_json['hasMore']
            offset += limit
            
    else:
        response_json = read_data_json(typename, 'fetch', body)
        df = pd.json_normalize(response_json['objs'])
        
    if remove_meta:
        df = df.drop(columns = [c for c in df.columns if ('meta' in c) | ('version' in c)])
    
    return df
    
def evalmetrics(typename, body, remove_meta = True):
    """
    evalmetrics accesses the Data Lake using read_data_json, and converts the response into a Pandas dataframe.
    evalmetrics is used for all timeseries data in the Data Lake.
    ------
    typename: The type you want to access, i.e. 'OutbreakLocation', 'LineListRecord', 'BiblioEntry', etc.
    body: The spec you want to pass. For examples, see the API documentation.
    remove_meta: If True, remove metadata about each record. If False, include it. The default is True.
    """
    response_json = read_data_json(typename, 'evalmetrics', body)
    df = pd.json_normalize(response_json['result'])
    
    # get the useful data out
    df = df.apply(pd.Series.explode)
    if remove_meta:
        df = df.filter(regex = 'dates|data|missing')
    
    # only keep one date column
    date_cols = [col for col in df.columns if 'dates' in col]
    keep_cols =  date_cols[:1] + [col for col in df.columns if 'dates' not in col]
    df = df.filter(items = keep_cols).rename(columns = {date_cols[0] : "dates"})
    df["dates"] = pd.to_datetime(df["dates"])
    
    return df

#### Streamlined request for single item

In [32]:
def fetch_one(typename: str, body: dict, objs_only=True) -> dict:
    """
    Returns JSON output from single API call
    
    Args:
        typename: the C3.ai type name
        body: the body of the request
        objs_only: if True, remove the metadata and just returns the objects
        
    Returns:
        JSON response as dictionary
    
    """

    response = read_data_json(typename, 'fetch', body)
    if objs_only:
        for r in response['objs']:
            if 'meta' in r.keys():
                del(r['meta'])
                
        return response['objs']
    
    return response
    

### Load the location codes for the US into a Pandas DataFrame

In [35]:
def get_us_locations(file_name='C3-ai-Location-IDs.xlsx'): 
    """ Loads all US counties from C3 ai spreadsheet 
    
    Args:
        file_name: the name of the spreadsheet
        
    Returns:
        Pandas dataframe with the results
    
    """
                     
    locations = pd.read_excel(path.join('.', file_name), sheet_name='County IDs', header=2)
    us_locations = locations[locations.Country=='United States']
    
    return us_locations
    

Unnamed: 0,County id,County,State,Country,JHU,NYT
0,Autauga_Alabama_UnitedStates,Autauga,Alabama,United States,X,X
1,Baldwin_Alabama_UnitedStates,Baldwin,Alabama,United States,X,X
2,Barbour_Alabama_UnitedStates,Barbour,Alabama,United States,X,X
3,Bibb_Alabama_UnitedStates,Bibb,Alabama,United States,X,X
4,Blount_Alabama_UnitedStates,Blount,Alabama,United States,X,X
...,...,...,...,...,...,...
3244,Unassigned_Wisconsin_UnitedStates,Unassigned,Wisconsin,United States,X,
3245,Unassigned_Wyoming_UnitedStates,Unassigned,Wyoming,United States,X,
3246,Unassigned_Guam_UnitedStates,Unassigned,Guam,United States,X,X
3247,Unassigned_NorthernMarianaIslands_UnitedStates,Unassignned,Northern Mariana Islands,United States,X,X


### Get the basic population data for each of the 3429 counties

In [36]:
def make_outbreaklocation_body(county_id: str) -> dict:
    """ Forms the request body for a count for the outbreak location API 
    
    Args:
        count_id: the ID for the County
    
    Returns:
        The request body
    
    """
    return {
              "spec": {
                "filter": f"id == '{county_id}'"
              }
}

# fetch_one('outbreaklocation', make_outbreaklocation_body('Autauga_Alabama_UnitedStates'))




In [90]:
def load_population_data(file_name='counties.json'):
    """ Loads all population data for US counties and stores in a file called counties.json"""

    us_locations = get_us_locations()
    keep_going = True
    tries = 0
    while keep_going:
        try:
            with open(file_name) as file:
                county_data = json.load(file)
        except:    
            county_data = {}
        i = 0
        for county in us_locations['County id']:
            if county not in county_data.keys():
                try:
                    data = fetch_one('outbreaklocation',  make_outbreaklocation_body(county))
                    county_data[county] = data[0]
                    i += 1
                    if i % 100 == 0:
                        print(f'Saving: {i}')
                        with open(file_name, 'w') as file:
                            json.dump(county_data, file)
                except:
                    county_data[county] = None
                    print(f'Problem with {county}')
                sleep(1)
        with open('counties.json', 'w') as file:
            json.dump(county_data, file)
        if len(county_data) >= len(us_locations) or tries >= 5:
            keep_going = False
        else:
            tries += 1
        
def get_counties_df(file_name='counties.json'):
    with open(file_name) as file:
                county_data = json.load(file)
    
    df = pd.DataFrame.from_dict(county_data)
    
    data = [df[col] for col in cols]    
    
    # pivot
    return pd.DataFrame(data,columns=df.index, index=df.columns)
    
get_counties_df()    


Unnamed: 0,hospitalIcuBeds,hospitalStaffedBeds,hospitalLicensedBeds,latestTotalPopulation,location,fips,id,name,version,typeIdent
Autauga_Alabama_UnitedStates,6.0,55.0,85.0,55869.0,{'value': {'id': 'Autauga_Alabama_UnitedStates...,{'id': '01001'},Autauga_Alabama_UnitedStates,Autauga,3735560.0,EP_LOC
Baldwin_Alabama_UnitedStates,51.0,362.0,386.0,223234.0,{'value': {'id': 'Baldwin_Alabama_UnitedStates...,{'id': '01003'},Baldwin_Alabama_UnitedStates,Baldwin,3801096.0,EP_LOC
Barbour_Alabama_UnitedStates,5.0,30.0,74.0,24686.0,{'value': {'id': 'Barbour_Alabama_UnitedStates...,{'id': '01005'},Barbour_Alabama_UnitedStates,Barbour,3801096.0,EP_LOC
Bibb_Alabama_UnitedStates,4.0,25.0,35.0,22394.0,"{'value': {'id': 'Bibb_Alabama_UnitedStates'},...",{'id': '01007'},Bibb_Alabama_UnitedStates,Bibb,3997704.0,EP_LOC
Blount_Alabama_UnitedStates,6.0,25.0,25.0,57826.0,{'value': {'id': 'Blount_Alabama_UnitedStates'...,{'id': '01009'},Blount_Alabama_UnitedStates,Blount,3801096.0,EP_LOC
...,...,...,...,...,...,...,...,...,...,...
Unassigned_Wisconsin_UnitedStates,,,,,{'value': {'id': 'Unassigned_Wisconsin_UnitedS...,,Unassigned_Wisconsin_UnitedStates,Unassigned,3080195.0,EP_LOC
Unassigned_Wyoming_UnitedStates,,,,,{'value': {'id': 'Unassigned_Wyoming_UnitedSta...,,Unassigned_Wyoming_UnitedStates,Unassigned,3080201.0,EP_LOC
Unassigned_Guam_UnitedStates,,,,,,,Unassigned_Guam_UnitedStates,Unknown,1.0,EP_LOC
Unassigned_NorthernMarianaIslands_UnitedStates,,,,,{'value': {'id': 'Unassigned_NorthernMarianaIs...,,Unassigned_NorthernMarianaIslands_UnitedStates,Unknown,458755.0,EP_LOC


### Get the County Stats from the Census Bureau

In [110]:
def get_county_stats_df(file_name='county_stats.csv'):
    return pd.read_csv(path.join('.',file_name))

get_county_stats_df()

Unnamed: 0,fips,PST045212,PST040210,PST120212,POP010210,AGE135212,AGE295212,AGE775212,SEX255212,RHI125212,...,SBO415207,SBO015207,MAN450207,WTN220207,RTN130207,RTN131207,AFN120207,BPS030212,LND110210,POP060210
0,0,313914040,308747508,1.7,308745538,6.4,23.5,13.7,50.8,77.9,...,8.3,28.8,5319456312,4174286516,3917663456,12990,613795732,829658,3531905.43,87.4
1,1000,4822023,4779745,0.9,4779736,6.3,23.3,14.5,51.5,70.0,...,1.2,28.1,112858843,52252752,57344851,12364,6426342,13506,50645.33,94.4
2,1001,55514,54571,1.7,54571,6.5,26.0,13.0,51.3,78.5,...,0.7,31.7,0,0,598175,12003,88157,385,594.44,91.8
3,1003,190790,182265,4.7,182265,5.9,22.6,17.7,51.2,87.3,...,1.3,27.3,1410273,0,2966489,17166,436955,1184,1589.78,114.6
4,1005,27201,27457,-0.9,27457,5.6,21.2,15.2,46.3,50.5,...,0.0,27.0,0,0,188337,6334,0,2,884.88,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3190,56037,45267,43806,3.3,43806,7.9,27.0,8.8,47.8,94.5,...,3.8,27.2,0,437493,898189,22843,150439,132,10426.65,4.2
3191,56039,21675,21294,1.8,21294,6.1,19.4,11.2,47.7,95.3,...,3.3,25.3,0,0,515644,25688,327363,122,3995.38,5.3
3192,56041,21025,21118,-0.4,21118,7.8,29.3,9.9,49.4,96.0,...,2.2,15.9,0,159375,413983,20626,35497,38,2081.26,10.1
3193,56043,8464,8533,-0.8,8533,5.9,25.0,18.7,49.8,95.3,...,0.0,26.9,0,12128,98308,12596,10175,1,2238.55,3.8


### Get the Evalmetrics Data

In [40]:

def get_last_file_date(county: str) -> str:
    max_date = '2020-01-01'
    files = glob(path.join('.', 'data', f'{county}*.psv'))
    if not files:
        return max_date
    for file in files:
        match = re.search(r'(\d\d\d\d-\d\d-\d\d).psv', file)
        if match:
            max_date = max(max_date, match.group(1))
    return max_date
            
    
def download_evalmetrics_data():
    """ Downloads the evalmetrics data from the last download through current """
    
    today = datetime.now().strftime('%Y-%m-%d')

    # Get the list of counties
    counties_file = path.join('.', 'counties.json')
    with open(counties_file) as file:
        counties = json.load(file)

    # Iterate through counties saving the time series data
    for county, details in counties.items():
        print(county)

        # Skip if missing county details
        if not details:
            continue

        # Get the last date we processed
        last_date = get_last_file_date(county)

        if last_date == today:
            continue

        # Get the data for the county from the last date processed
        body = {"spec" : {
                            "ids" : [county],
                            "expressions": [ "JHU_ConfirmedCases", "JHU_ConfirmedDeaths", "JHU_ConfirmedRecoveries"], 
                            "start" : last_date,
                            "end" : today,
                            "interval" : "DAY",
                        }
                }

        try:
            df = evalmetrics("outbreaklocation", body)
            file_name = path.join('.', 'data', f'{county}-{last_date}-{today}.psv')
            df.to_csv(file_name, sep='|')
        except Exception as e:
            print(f'Error processing {county}: {e}')
        
        sleep(1)
  
    

In [176]:
def append_to_dataframe(files):
    name_pattern = re.compile('([\w_]+)')
    county_df = None

    for file in files:
        base = path.basename(file)
        try:
            county = name_pattern.match(base).group(1)
            records = []
            with open(file) as fd:
                reader = csv.reader(fd, delimiter='|')
                # advance past the header
                next(iter(reader))
                records = [[county, row[1], row[2], row[4], row[6]] for row in reader]
            temp_df = pd.DataFrame(records, columns=['county', 'date', 'confirmed_cases', 'confirmed_deaths', 'confirmed_recoveries'])
            if county_df is None:
                county_df = temp_df
            else:
                county_df = pd.concat([county_df, temp_df], ignore_index=True)
        except Exception as e:
            print(f'Failure on file {base} with error {e}')
        
    return county_df

def join_evalmetrics_to_county(evalmetrics_df, counties_df, county_stats_df):
    df = evalmetrics_df.merge(counties_df, how='left', left_on='county', right_index=True, sort=True)
    # df.fips = [int(fips['id']) for fips in df.fips]
    fips = []
    for f in df.fips:
        if isinstance(f, dict):
            fips.append(int(f['id']))
        else:
            fips.append(f)
    df.fips = fips
    df = df.merge(county_stats_df, how='left', left_on='fips', right_on='fips')
    return df


df = append_to_dataframe([path.join('.','data', file) for file in os.listdir(path.join('.', 'data'))])

# df = append_to_dataframe([path.join('.','data','Wayne_Ohio_UnitedStates-2020-01-01-2020-05-14.psv'),
#                      path.join('.','data','Faulkner_Arkansas_UnitedStates-2020-01-01-2020-05-14.psv')])

c_df = get_counties_df()    
cs_df = get_county_stats_df()

df = join_evalmetrics_to_county(df, c_df, cs_df)
df

Failure on file .ipynb_checkpoints with error 'NoneType' object has no attribute 'group'


Unnamed: 0,county,date,confirmed_cases,confirmed_deaths,confirmed_recoveries,hospitalIcuBeds,hospitalStaffedBeds,hospitalLicensedBeds,latestTotalPopulation,location,...,SBO415207,SBO015207,MAN450207,WTN220207,RTN130207,RTN131207,AFN120207,BPS030212,LND110210,POP060210
0,Abbeville_SouthCarolina_UnitedStates,2020-01-01,0.0,0.0,0.0,6.0,25.0,25.0,24527.0,{'value': {'id': 'Abbeville_SouthCarolina_Unit...,...,0.0,33.4,657498.0,0.0,71936.0,2841.0,10963.0,23.0,490.48,51.8
1,Abbeville_SouthCarolina_UnitedStates,2020-01-02,0.0,0.0,0.0,6.0,25.0,25.0,24527.0,{'value': {'id': 'Abbeville_SouthCarolina_Unit...,...,0.0,33.4,657498.0,0.0,71936.0,2841.0,10963.0,23.0,490.48,51.8
2,Abbeville_SouthCarolina_UnitedStates,2020-01-03,0.0,0.0,0.0,6.0,25.0,25.0,24527.0,{'value': {'id': 'Abbeville_SouthCarolina_Unit...,...,0.0,33.4,657498.0,0.0,71936.0,2841.0,10963.0,23.0,490.48,51.8
3,Abbeville_SouthCarolina_UnitedStates,2020-01-04,0.0,0.0,0.0,6.0,25.0,25.0,24527.0,{'value': {'id': 'Abbeville_SouthCarolina_Unit...,...,0.0,33.4,657498.0,0.0,71936.0,2841.0,10963.0,23.0,490.48,51.8
4,Abbeville_SouthCarolina_UnitedStates,2020-01-05,0.0,0.0,0.0,6.0,25.0,25.0,24527.0,{'value': {'id': 'Abbeville_SouthCarolina_Unit...,...,0.0,33.4,657498.0,0.0,71936.0,2841.0,10963.0,23.0,490.48,51.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444560,Ziebach_SouthDakota_UnitedStates,2020-05-12,1.0,0.0,0.0,1.0,8.0,8.0,2756.0,{'value': {'id': 'Ziebach_SouthDakota_UnitedSt...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1961.27,1.4
444561,Ziebach_SouthDakota_UnitedStates,2020-05-13,1.0,0.0,0.0,1.0,8.0,8.0,2756.0,{'value': {'id': 'Ziebach_SouthDakota_UnitedSt...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1961.27,1.4
444562,Ziebach_SouthDakota_UnitedStates,2020-05-14,1.0,0.0,0.0,1.0,8.0,8.0,2756.0,{'value': {'id': 'Ziebach_SouthDakota_UnitedSt...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1961.27,1.4
444563,Ziebach_SouthDakota_UnitedStates,2020-05-15,1.0,0.0,0.0,1.0,8.0,8.0,2756.0,{'value': {'id': 'Ziebach_SouthDakota_UnitedSt...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1961.27,1.4


In [139]:
df[500:600]

Unnamed: 0,county,date,confirmed_cases,confirmed_deaths,confirmed_recoveries,hospitalIcuBeds,hospitalStaffedBeds,hospitalLicensedBeds,latestTotalPopulation,location,...,SBO415207,SBO015207,MAN450207,WTN220207,RTN130207,RTN131207,AFN120207,BPS030212,LND110210,POP060210
500,Ada_Idaho_UnitedStates-2020-01-01-2020-05,2020-03-30,113.0,2.0,0.0,,,,,,...,,,,,,,,,,
501,Ada_Idaho_UnitedStates-2020-01-01-2020-05,2020-03-31,195.0,3.0,0.0,,,,,,...,,,,,,,,,,
502,Ada_Idaho_UnitedStates-2020-01-01-2020-05,2020-04-01,195.0,3.0,0.0,,,,,,...,,,,,,,,,,
503,Ada_Idaho_UnitedStates-2020-01-01-2020-05,2020-04-02,312.0,3.0,0.0,,,,,,...,,,,,,,,,,
504,Ada_Idaho_UnitedStates-2020-01-01-2020-05,2020-04-03,307.0,3.0,0.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,Adair_Iowa_UnitedStates-2020-01-01-2020-05,2020-02-17,0.0,0.0,0.0,,,,,,...,,,,,,,,,,
596,Adair_Iowa_UnitedStates-2020-01-01-2020-05,2020-02-18,0.0,0.0,0.0,,,,,,...,,,,,,,,,,
597,Adair_Iowa_UnitedStates-2020-01-01-2020-05,2020-02-19,0.0,0.0,0.0,,,,,,...,,,,,,,,,,
598,Adair_Iowa_UnitedStates-2020-01-01-2020-05,2020-02-20,0.0,0.0,0.0,,,,,,...,,,,,,,,,,


In [140]:
df.fips

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
444560   NaN
444561   NaN
444562   NaN
444563   NaN
444564   NaN
Name: fips, Length: 444565, dtype: float64

In [163]:
evalmetrics_df = append_to_dataframe([path.join('.','data','Wayne_Ohio_UnitedStates-2020-01-01-2020-05-14.psv'),
                     path.join('.','data','Faulkner_Arkansas_UnitedStates-2020-01-01-2020-05-14.psv')])

In [149]:
counties_df = get_counties_df()
counties_df

Unnamed: 0,hospitalIcuBeds,hospitalStaffedBeds,hospitalLicensedBeds,latestTotalPopulation,location,fips,id,name,version,typeIdent
Autauga_Alabama_UnitedStates,6.0,55.0,85.0,55869.0,{'value': {'id': 'Autauga_Alabama_UnitedStates...,{'id': '01001'},Autauga_Alabama_UnitedStates,Autauga,3735560.0,EP_LOC
Baldwin_Alabama_UnitedStates,51.0,362.0,386.0,223234.0,{'value': {'id': 'Baldwin_Alabama_UnitedStates...,{'id': '01003'},Baldwin_Alabama_UnitedStates,Baldwin,3801096.0,EP_LOC
Barbour_Alabama_UnitedStates,5.0,30.0,74.0,24686.0,{'value': {'id': 'Barbour_Alabama_UnitedStates...,{'id': '01005'},Barbour_Alabama_UnitedStates,Barbour,3801096.0,EP_LOC
Bibb_Alabama_UnitedStates,4.0,25.0,35.0,22394.0,"{'value': {'id': 'Bibb_Alabama_UnitedStates'},...",{'id': '01007'},Bibb_Alabama_UnitedStates,Bibb,3997704.0,EP_LOC
Blount_Alabama_UnitedStates,6.0,25.0,25.0,57826.0,{'value': {'id': 'Blount_Alabama_UnitedStates'...,{'id': '01009'},Blount_Alabama_UnitedStates,Blount,3801096.0,EP_LOC
...,...,...,...,...,...,...,...,...,...,...
Unassigned_Wisconsin_UnitedStates,,,,,{'value': {'id': 'Unassigned_Wisconsin_UnitedS...,,Unassigned_Wisconsin_UnitedStates,Unassigned,3080195.0,EP_LOC
Unassigned_Wyoming_UnitedStates,,,,,{'value': {'id': 'Unassigned_Wyoming_UnitedSta...,,Unassigned_Wyoming_UnitedStates,Unassigned,3080201.0,EP_LOC
Unassigned_Guam_UnitedStates,,,,,,,Unassigned_Guam_UnitedStates,Unknown,1.0,EP_LOC
Unassigned_NorthernMarianaIslands_UnitedStates,,,,,{'value': {'id': 'Unassigned_NorthernMarianaIs...,,Unassigned_NorthernMarianaIslands_UnitedStates,Unknown,458755.0,EP_LOC


In [170]:
df = evalmetrics_df.merge(counties_df, how='left', left_on='county', right_index=True, sort=True)
df

Unnamed: 0,county,date,confirmed_cases,confirmed_deaths,confirmed_recoveries,hospitalIcuBeds,hospitalStaffedBeds,hospitalLicensedBeds,latestTotalPopulation,location,fips,id,name,version,typeIdent
134,Faulkner_Arkansas_UnitedStates,2020-01-01,0.0,0.0,0.0,25.0,355.0,357.0,126007.0,{'value': {'id': 'Faulkner_Arkansas_UnitedStat...,{'id': '05045'},Faulkner_Arkansas_UnitedStates,Faulkner,3670024.0,EP_LOC
135,Faulkner_Arkansas_UnitedStates,2020-01-02,0.0,0.0,0.0,25.0,355.0,357.0,126007.0,{'value': {'id': 'Faulkner_Arkansas_UnitedStat...,{'id': '05045'},Faulkner_Arkansas_UnitedStates,Faulkner,3670024.0,EP_LOC
136,Faulkner_Arkansas_UnitedStates,2020-01-03,0.0,0.0,0.0,25.0,355.0,357.0,126007.0,{'value': {'id': 'Faulkner_Arkansas_UnitedStat...,{'id': '05045'},Faulkner_Arkansas_UnitedStates,Faulkner,3670024.0,EP_LOC
137,Faulkner_Arkansas_UnitedStates,2020-01-04,0.0,0.0,0.0,25.0,355.0,357.0,126007.0,{'value': {'id': 'Faulkner_Arkansas_UnitedStat...,{'id': '05045'},Faulkner_Arkansas_UnitedStates,Faulkner,3670024.0,EP_LOC
138,Faulkner_Arkansas_UnitedStates,2020-01-05,0.0,0.0,0.0,25.0,355.0,357.0,126007.0,{'value': {'id': 'Faulkner_Arkansas_UnitedStat...,{'id': '05045'},Faulkner_Arkansas_UnitedStates,Faulkner,3670024.0,EP_LOC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,Wayne_Ohio_UnitedStates,2020-05-09,188.0,42.0,0.0,18.0,129.0,184.0,115710.0,"{'value': {'id': 'Wayne_Ohio_UnitedStates'}, '...",{'id': '39169'},Wayne_Ohio_UnitedStates,Wayne,3932179.0,EP_LOC
130,Wayne_Ohio_UnitedStates,2020-05-10,186.0,42.0,0.0,18.0,129.0,184.0,115710.0,"{'value': {'id': 'Wayne_Ohio_UnitedStates'}, '...",{'id': '39169'},Wayne_Ohio_UnitedStates,Wayne,3932179.0,EP_LOC
131,Wayne_Ohio_UnitedStates,2020-05-11,186.0,42.0,0.0,18.0,129.0,184.0,115710.0,"{'value': {'id': 'Wayne_Ohio_UnitedStates'}, '...",{'id': '39169'},Wayne_Ohio_UnitedStates,Wayne,3932179.0,EP_LOC
132,Wayne_Ohio_UnitedStates,2020-05-12,190.0,47.0,0.0,18.0,129.0,184.0,115710.0,"{'value': {'id': 'Wayne_Ohio_UnitedStates'}, '...",{'id': '39169'},Wayne_Ohio_UnitedStates,Wayne,3932179.0,EP_LOC


In [155]:
counties = list(df.county)
counties

county = counties[0]

re.match('([\w_]+)',county).group(1)



'Faulkner_Arkansas_UnitedStates'

In [171]:
cs_df.fips

0           0
1        1000
2        1001
3        1003
4        1005
        ...  
3190    56037
3191    56039
3192    56041
3193    56043
3194    56045
Name: fips, Length: 3195, dtype: int64

In [172]:
fips = []
for f in df.fips:
    if isinstance(f, dict):
        fips.append(int(f['id']))
    else:
        fips.append(int(f))
df.fips = fips

In [173]:
df.fips

134     5045
135     5045
136     5045
137     5045
138     5045
       ...  
129    39169
130    39169
131    39169
132    39169
133    39169
Name: fips, Length: 268, dtype: int64

In [164]:
evalmetrics_df

Unnamed: 0,county,date,confirmed_cases,confirmed_deaths,confirmed_recoveries
0,Wayne_Ohio_UnitedStates,2020-01-01,0.0,0.0,0.0
1,Wayne_Ohio_UnitedStates,2020-01-02,0.0,0.0,0.0
2,Wayne_Ohio_UnitedStates,2020-01-03,0.0,0.0,0.0
3,Wayne_Ohio_UnitedStates,2020-01-04,0.0,0.0,0.0
4,Wayne_Ohio_UnitedStates,2020-01-05,0.0,0.0,0.0
...,...,...,...,...,...
263,Faulkner_Arkansas_UnitedStates,2020-05-09,81.0,2.0,0.0
264,Faulkner_Arkansas_UnitedStates,2020-05-10,82.0,2.0,0.0
265,Faulkner_Arkansas_UnitedStates,2020-05-11,82.0,2.0,0.0
266,Faulkner_Arkansas_UnitedStates,2020-05-12,82.0,2.0,0.0
