In [32]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import seaborn as sns
#import pandas_gbp
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
from matplotlib.ticker import (AutoMinorLocator, MultipleLocator)

class Covid19:
    """Country-level COVID-19 time series built from BigQuery public data.

    On construction the class downloads the JHU CSSE confirmed / deaths /
    recovered tables and the Census Bureau mid-year population table, then
    exposes two data frames:

    * ``df_cumulative`` -- one row per (country, date) with cumulative
      ``confirmed``, ``deaths``, ``recovered``, ``active`` counts, the
      population columns and ``*_per1m`` rates.
    * ``df_daily`` -- the same totals plus ``New cases``, ``New deaths`` and
      ``New recovered`` day-over-day increments (see ``calculate_new``).
    """

    # Class-level defaults; every instance replaces them with its own frames.
    df_cumulative = pd.DataFrame()
    df_daily = pd.DataFrame()

    def __init__(self):
        """Fetch the source tables and populate ``df_cumulative``/``df_daily``.

        Requires working BigQuery credentials, because every ``fetch_*`` call
        runs a query.
        """
        # Not used by this class; kept so existing callers still find it.
        self.data = []
        df_confirmed = self.fetch_covid19_cases('confirmed_cases')
        df_deaths = self.fetch_covid19_cases('deaths')
        df_recovered = self.fetch_covid19_cases('recovered_cases')
        df_world_population = self.fetch_population()
        self.df_cumulative = self._build_cumulative(
            df_confirmed, df_deaths, df_recovered, df_world_population
        )
        self.calculate_new()

    def _build_cumulative(self, df_confirmed, df_deaths, df_recovered,
                          df_world_population):
        """Combine the wide per-metric tables into one long cumulative frame.

        Args:
            df_confirmed: Wide frame (``country`` + one column per date) of
                cumulative confirmed cases.
            df_deaths: Same layout, cumulative deaths.
            df_recovered: Same layout, cumulative recoveries.
            df_world_population: Frame indexed by country name that has a
                ``population`` column.

        Returns:
            A frame with one row per (country, date) after 2020-02-20 holding
            ``confirmed``, ``deaths``, ``recovered``, ``active``, the
            population columns and the four ``*_per1m`` rates.
        """
        # Preprocess: remove Canada from the recovered table as per
        # https://towardsdatascience.com/covid-19-data-processing-58aaa3663f6
        df_recovered = df_recovered[df_recovered['country'] != 'Canada']

        df_covid = self.covid_flip_datetocolumn(df_confirmed, 'confirmed').merge(
            right=self.covid_flip_datetocolumn(df_deaths, 'deaths'),
            how='left',
            on=['country', 'date']
        ).merge(
            right=self.covid_flip_datetocolumn(df_recovered, 'recovered'),
            how='left',
            on=['country', 'date']
        )

        # Remove ship data, see
        # https://towardsdatascience.com/covid-19-data-processing-58aaa3663f6
        ship_rows = (
            df_covid['country'].str.contains('Diamond Princess')
            | df_covid['country'].str.contains('MS Zaandam')
        )
        # Copy so that adding a column below writes to an independent frame
        # rather than to a slice of the merged one.
        df_covid = df_covid[~ship_rows].copy()

        df_covid['active'] = (
            df_covid['confirmed'] - df_covid['deaths'] - df_covid['recovered']
        )
        df_covid_popl = df_covid.join(df_world_population, how='outer', on='country')
        for var in ['confirmed', 'deaths', 'recovered', 'active']:
            df_covid_popl[var + '_per1m'] = (
                df_covid_popl[var] * 1000000 / df_covid_popl['population']
            )

        # The outer join keeps population-only countries; they carry no date
        # (and no case data), so drop them before parsing the dates.
        df_covid_popl = df_covid_popl.dropna(subset=['date'])
        df_covid_popl['date'] = pd.to_datetime(
            df_covid_popl['date'], format="%m/%d/%y"
        )
        # Keep only observations dated after 2020-02-20.
        df_covid_popl = df_covid_popl[
            df_covid_popl['date'] > pd.Timestamp('2020-02-20')
        ]
        return df_covid_popl

    def fetch_covid19_cases(self, table):
        """Download one JHU CSSE table and keep its country-level rows.

        Args:
            table: Table name inside ``bigquery-public-data.covid19_jhu_csse``
                (for example ``'confirmed_cases'``). It is interpolated
                directly into the SQL text, so pass trusted identifiers only.

        Returns:
            A wide frame whose remaining columns are ``country`` followed by
            the table's per-date columns. Only rows whose ``province_state``
            is missing (or literally ``'ALL'``) are kept.
        """
        # Construct a BigQuery client object.
        client = bigquery.Client()
        query = """
            SELECT *, 
            FROM `bigquery-public-data.covid19_jhu_csse.{0}`
        """.format(table)
        df_cases = client.query(query).to_dataframe()
        df_cases = df_cases.drop(columns=['latitude', 'longitude', 'location_geom'])
        # Rows without a province are treated as the whole-country rows.
        is_country_level = df_cases['province_state'].fillna('ALL') == 'ALL'
        # Non-inplace chain: avoids mutating a filtered slice.
        return (
            df_cases[is_country_level]
            .rename(columns={'country_region': 'country'})
            .drop(columns=['province_state'])
        )

    def fetch_population(self):
        """Download the current-year mid-year population per country.

        Returns:
            A frame indexed by ``country`` with ``country_code`` and
            ``population`` columns. 'United States' is renamed to 'US' so the
            index matches the country names used by the case tables.
        """
        client = bigquery.Client()
        query = """
            SELECT country_code, country_name as country, midyear_population as population 
            FROM `bigquery-public-data.census_bureau_international.midyear_population`
            where year = EXTRACT(year from (CURRENT_DATE()))
            order by midyear_population desc
        """
        df_world_population = client.query(query).to_dataframe()
        # Single .loc assignment instead of chained indexing, which may write
        # to a temporary copy and leave the frame unchanged.
        is_usa = df_world_population['country'] == 'United States'
        df_world_population.loc[is_usa, 'country'] = 'US'
        return df_world_population.set_index('country')

    def covid_flip_datetocolumn(self, df_covid, col_name):
        """Reshape a wide per-date table into long (country, date, value) rows.

        Args:
            df_covid: Frame with a ``country`` column; every other column is
                treated as a date column named like ``_1_22_20``.
            col_name: Name to give the resulting value column.

        Returns:
            A frame with columns ``country``, ``date`` (string such as
            ``'1/22/20'``) and ``col_name``.
        """
        # Select by name rather than position so the column order of the
        # input does not matter.
        date_columns = [col for col in df_covid.columns if col != 'country']
        df_long = df_covid.melt(
            id_vars=['country'],
            value_vars=date_columns,
            var_name='date',
            value_name=col_name
        )
        # '_1_22_20' -> '/1/22/20' -> '1/22/20'
        df_long['date'] = (
            df_long['date'].str.replace('_', '/', regex=False).str[1:]
        )
        return df_long

    def covid_ratio_by_country(self, col_name, source, dest):
        """Return the per-date ratio of one metric between two countries.

        Args:
            col_name: Column of ``df_cumulative`` to compare, e.g.
                ``'confirmed'``, ``'deaths'`` or ``'recovered'``.
            source: Country whose values form the numerator.
            dest: Country whose values form the denominator.

        Returns:
            A frame with columns ``date`` and ``ratio`` for the dates present
            for both countries. Division follows pandas semantics, so a zero
            denominator yields ``inf`` or ``NaN`` rather than raising.
        """
        cumulative = self.df_cumulative
        columns = [col_name, 'date']
        df_top = cumulative.loc[cumulative['country'] == source, columns]
        df_down = cumulative.loc[cumulative['country'] == dest, columns]
        # Both sides share col_name, so the merge suffixes them _x / _y.
        df_ratio = df_top.merge(df_down, on='date')
        df_ratio['ratio'] = df_ratio[col_name + '_x'] / df_ratio[col_name + '_y']
        return df_ratio[['date', 'ratio']]

    def calculate_new(self):
        """Derive day-over-day increments and store them in ``df_daily``.

        ``df_daily`` holds the per-(date, country) totals from
        ``df_cumulative`` plus integer ``New cases``, ``New deaths`` and
        ``New recovered`` columns. The first date of each country gets 0, and
        negative ``New cases`` values are replaced by 0.
        """
        total_cols = [
            'confirmed', 'deaths', 'recovered',
            'active', 'population', 'confirmed_per1m', 'deaths_per1m',
            'recovered_per1m',
        ]
        new_names = {
            'confirmed': 'New cases',
            'deaths': 'New deaths',
            'recovered': 'New recovered',
        }

        # Column selections use lists: selecting several groupby columns with
        # a bare tuple is rejected by current pandas.
        df_totals = self.df_cumulative.groupby(
            ['date', 'country'])[total_cols].sum().reset_index()

        per_country = df_totals.groupby(['country', 'date'])[list(new_names)].sum()
        # Diff within each country so the first date of a country is NaN
        # instead of a difference against the previous country's last row.
        df_new = (
            per_country.groupby(level='country').diff()
            .reset_index()
            .rename(columns=new_names)
        )

        # merging new values
        df_daily = pd.merge(df_totals, df_new, on=['country', 'date'])
        # filling na with 0
        df_daily = df_daily.fillna(0)
        # fixing data types
        cols = list(new_names.values())
        df_daily[cols] = df_daily[cols].astype('int')
        # Cumulative counts are sometimes revised downwards; report no
        # negative new cases.
        df_daily['New cases'] = df_daily['New cases'].clip(lower=0)
        self.df_daily = df_daily
