In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import requests
import json 
import matplotlib.pyplot as plt
import os

from scipy.ndimage import gaussian_filter1d

import us 

## World data

This is part of the code that gathers data in strains for world countries.

In [3]:
# Main source for the training data
DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
# Local file
DATA_FILE = 'csv_files/OxCGRT_latest.csv'

oxford_data = pd.read_csv(DATA_FILE, 
                 parse_dates=['Date'],
                 encoding="ISO-8859-1",
                 dtype={"RegionName": str,
                        "RegionCode": str},
                 error_bad_lines=False)


In [4]:
population_df = pd.read_csv("csv_files/population_unique.csv")

In [5]:
CSV_FOLDER = 'csv_files'
if not os.path.exists(CSV_FOLDER):
    os.makedirs(CSV_FOLDER)

In [6]:
today = datetime.today().strftime('%Y-%m-%d')

In [1]:
def get_strains_world(start_date = '2020-01-01', end_day = today):
    
    new_strains_df = pd.DataFrame()

    dates = pd.date_range(start=start_date,end=end_day)
    dates = pd.DataFrame({'Date':dates})
    
    new_strains_df = pd.DataFrame()
    n_days = len(dates)

    #Get lineages WORLD
    for location in oxford_data.CountryCode.unique():
        country_name = oxford_data.loc[oxford_data.CountryCode == location, 'CountryName']
        response = requests.get(f'https://api.outbreak.info/genomics/prevalence-by-location-all-lineages?location_id={location}&ndays={n_days}').text
        if(json.loads(response)["success"]):
            results =  json.loads(response)['results']
            results_df = pd.DataFrame.from_dict(results)
            location_strains_df = dates.copy()
            population = population_df.loc[population_df.CountryCode == location,'Population'].max()
            location_strains_df['CountryCode'] = location
            location_strains_df['CountryName'] = country_name.iloc[0]
            location_strains_df['Population'] = population
            for lineage in results_df.lineage.unique():
                df_lineage = results_df.loc[results_df.lineage == lineage]
                new_strain = pd.DataFrame()
                new_strain['Date'] = pd.to_datetime(df_lineage['date'])
                new_strain[f'prevalence_rolling_{lineage}'] = df_lineage['prevalence_rolling'].fillna(0)
                location_strains_df  = location_strains_df.merge(new_strain, how='left')
                #print(location_strains_df)
                location_strains_df[f'prevalence_rolling14_{lineage}'] = location_strains_df[f'prevalence_rolling_{lineage}'].fillna(0).rolling(14).mean()
                location_strains_df[f'prevalence_gaussian5_{lineage}'] = pd.Series(gaussian_filter1d(location_strains_df[f'prevalence_rolling14_{lineage}'].fillna(0),5))
                location_strains_df[f'prevalence_gaussian10_{lineage}'] = pd.Series(gaussian_filter1d(location_strains_df[f'prevalence_rolling14_{lineage}'].fillna(0),10))
            new_strains_df = new_strains_df.append(location_strains_df, ignore_index=True)
                    
    return new_strains_df


NameError: name 'today' is not defined

# States data

In [None]:
all_states = [x.name for x  in us.states.STATES]
all_states_abbr = [x.abbr for x  in us.states.STATES]

In [None]:
df_states = pd.DataFrame(columns = ["StateCode","StateName"])
df_states["StateCode"] = all_states_abbr
df_states["StateName"] = all_states

In [None]:
def get_strains_states(start_date = '2020-01-01', end_day = today):
    
    new_strains_df = pd.DataFrame()

    dates = pd.date_range(start=start_date,end=end_day)
    dates = pd.DataFrame({'Date':dates})
    
    new_strains_df = pd.DataFrame()
    n_days = len(dates)

    #Get lineages WORLD
    for location in all_states_abbr:
        state_name = df_states.loc[df_states.StateCode == location, 'StateName']
        response = requests.get(f'https://api.outbreak.info/genomics/prevalence-by-location-all-lineages?location_id=USA_{location}&ndays={n_days}').text
        if(json.loads(response)["success"]):
            #print(f'Reading data for {location}')
            results =  json.loads(response)['results']
            results_df = pd.DataFrame.from_dict(results)
            location_strains_df = dates.copy()
            location_strains_df['StateCode'] = location
            location_strains_df['StateName'] = state_name.iloc[0]
            for lineage in results_df.lineage.unique():
                df_lineage = results_df.loc[results_df.lineage == lineage]
                new_strain = pd.DataFrame()
                new_strain['Date'] = pd.to_datetime(df_lineage['date'])
                new_strain[f'prevalence_rolling_{lineage}'] = df_lineage['prevalence_rolling'].fillna(0)
                location_strains_df  = location_strains_df.merge(new_strain, how='left')
                #print(location_strains_df)
                location_strains_df[f'prevalence_rolling14_{lineage}'] = location_strains_df[f'prevalence_rolling_{lineage}'].fillna(0).rolling(14).mean()
                location_strains_df[f'prevalence_gaussian5_{lineage}'] = pd.Series(gaussian_filter1d(location_strains_df[f'prevalence_rolling14_{lineage}'].fillna(0),5))
                location_strains_df[f'prevalence_gaussian10_{lineage}'] = pd.Series(gaussian_filter1d(location_strains_df[f'prevalence_rolling14_{lineage}'].fillna(0),10))
            new_strains_df = new_strains_df.append(location_strains_df, ignore_index=True)
                   
    return new_strains_df
