In [1]:
import pandas
import psycopg2
import numpy
import io
import time
import timeit
import datetime

In [2]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pandas.set_option('display.max_rows', 1000)

In [3]:
conn = psycopg2.connect(
    dbname='coveo',
    host='',
    user='public_loader',
    password='',
    application_name = 'steger loader_notebook'
)
C = conn.cursor()

In [4]:
schema = 'datahub_0'

In [5]:
common_comment = 'upgrade JHD 230223'

In [6]:
class TimeLogCommit:
    def __init__(self, task, table_name = None, commit = True, verbose=True):
        self.table_name = table_name
        self.task = task
        self.verbose = verbose
        self.commit = commit

    def __enter__(self):
        self.t0 = datetime.datetime.now()
        self.start = timeit.default_timer()

    def __exit__(self, exc_type, exc_value, traceback):
        self.took = (timeit.default_timer() - self.start)
        if self.table_name:
            C.execute(f"""
INSERT INTO {schema}.merge_log
VALUES ('{common_comment}', '{self.table_name}', '{self.task}', '{self.t0}', '{datetime.datetime.now()}');
""")
        if self.commit:
            conn.commit()
        if self.verbose:
            t = f'on {self.table_name} ' if self.table_name else ' '
            print(f'\n\033[38;5;208mCode block {self.task} {t}took:\t{self.took:.5f} seconds\033[0;0m')

In [7]:
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'

Data cleaning

In [8]:
with TimeLogCommit(task = 'retrieve and clean dataset', commit = False):
    df = pandas.read_csv(url)
    df.drop(columns = ['Province/State', 'Lat', 'Long'], inplace = True)
    c = pandas.read_sql(f"SELECT * FROM {schema}.country", con = conn)

    country_map = {
     'Burma': 'Myanmar/Burma',
     'Cabo Verde': 'Cape Verde',
    # 'Congo (Brazzaville)',
    # 'Congo (Kinshasa)',
     "Cote d'Ivoire": "Côte D’Ivoire",
    # 'Diamond Princess',
    # 'Holy See',
     'Korea, North': 'North Korea',
     'Korea, South': 'South Korea',
    # 'Kosovo',
    # 'MS Zaandam',
     'Russia': 'Russian Federation',
     'Saint Vincent and the Grenadines': 'Saint Vincent and The Grenadines',
     'Sao Tome and Principe': 'Sao Tomé and Príncipe',
    # 'Summer Olympics 2020',
    # 'Taiwan*',
     'Tanzania':'United Republic of Tanzania',
     'US': 'United States',
    # 'West Bank and Gaza',
    # 'Winter Olympics 2022'   
    }
    
    cols = list(df.columns)
    cols.remove('Country/Region')
    cols.append('id')

    X = pandas.merge(
        left = c, right = df,
        left_on = 'country_name', right_on = 'Country/Region',
        how = 'inner'
    )[cols]

    dfs = pandas.melt(
        X.groupby('id').sum().reset_index(), # in case there are more than one state, sum them up
        id_vars = 'id', var_name = 'date', value_name = "cases"
    ).rename(columns={'id': 'country_id'})

    year_week = lambda ts:f"{ts.isocalendar()[0]}_{ts.isocalendar()[1]:02d}"

    dfs['date'] = pandas.to_datetime(dfs['date'])
    dfs['year_week'] = dfs['date'].apply(year_week)

    dfs_pivot = dfs.groupby(['country_id', 'year_week'])[['cases']].max()
    dfs_pivot['diff_cases'] = dfs_pivot['cases'].diff().astype(pandas.Int32Dtype())

    dfs_pivot.reset_index(inplace=True)
    mask = dfs_pivot['country_id'] != dfs_pivot['country_id'].shift(1)
    dfs_pivot.loc[mask, 'diff_cases'] = dfs_pivot.loc[mask, 'cases']
    dfs_pivot[['year', 'week']] = dfs_pivot['year_week'].str.split('_', expand=True)
    dfs_pivot.drop(columns='year_week', inplace=True)





[38;5;208mCode block retrieve and clean dataset  took:	2.93347 seconds[0;0m


```python
country_db = set(c['country_name'].unique())
country_ds = set(df['Country/Region'].unique())
country_ds.difference(country_db)
```

In [9]:
data_before = pandas.read_sql(f"select * from {schema}.jhd_covid_country_weekly", con = conn)



In [10]:
with TimeLogCommit(task = 'rewrite', table_name = 'jhd_covid_country_weekly'):
    C.execute(f"TRUNCATE {schema}.jhd_covid_country_weekly;")

    pipe = io.StringIO()
    dfs_pivot[['country_id', 'year', 'week', 'diff_cases']].to_csv(pipe, sep = '\t', header = False, index = False)
    pipe.seek(0)
    C.copy_expert(f"COPY {schema}.jhd_covid_country_weekly FROM STDIN WITH (format csv, delimiter '\t')", pipe)
    pipe.close()


[38;5;208mCode block rewrite on jhd_covid_country_weekly took:	2.40953 seconds[0;0m


In [11]:
data_after = pandas.read_sql(f"select * from {schema}.jhd_covid_country_weekly", con = conn)



In [12]:
data_before.shape, data_after.shape

((28779, 4), (29322, 4))