## Imports

In [1]:
import glob
import math
import dateutil.parser as dparser
from datetime import datetime, date, timedelta
from io import StringIO
from decimal import Decimal

import pandas as pd
import requests
import fastparquet

## Create DataFrame from the latest daily report file

In [2]:
# Download yesterday's JHU CSSE daily report and load it into a DataFrame.
# The upstream files are named MM-DD-YYYY.csv.
yesterday = (date.today() - timedelta(days=1)).strftime('%m-%d-%Y')
r = requests.get(
        f'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/{yesterday}.csv',
        timeout=30)  # do not hang the notebook if the server never answers
# Fail loudly on 404/5xx (e.g. the report is not published yet) instead of
# feeding the error page to read_csv and getting a nonsense frame.
r.raise_for_status()
f = StringIO(r.text)

daily_reports_frame = pd.read_csv(f, sep=",")

### Change column names

In [3]:
# Upstream CSV header -> column name used by the rest of this notebook.
COLUMN_RENAMES = {
    'FIPS': 'fips',
    'Admin2': 'county_US',
    'Province_State': 'province_state',
    'Country_Region': 'country_region',
    'Last_Update': 'last_update',
    'Lat': 'latitude',
    'Long_': 'longitude',
    'Confirmed': 'confirmed',
    'Deaths': 'deaths',
    'Recovered': 'recovered',
    'Active': 'active',
    'Combined_Key': 'combined_key',
}

daily_reports_frame = daily_reports_frame.rename(columns=COLUMN_RENAMES)

## Dataset cleaning

In [4]:
def extract_date(date):
    """Return the date found in ``date`` formatted as ``YYYY-MM-DD``.

    The input is handed to dateutil's parser with ``fuzzy=True``; whatever
    exception the parser raises for unparseable input propagates to the caller.

    Note: inside this function the parameter name hides the ``date`` class
    imported from ``datetime``.
    """
    parsed = dparser.parse(date, fuzzy=True)
    # format() with a strftime spec delegates to datetime.strftime.
    return format(parsed, '%Y-%m-%d')

In [5]:
# Normalise timestamps to YYYY-MM-DD *before* filling gaps: extract_date
# cannot parse the 'Unknown' placeholder, so missing values are skipped here
# (na_action='ignore') and filled afterwards.
daily_reports_frame['last_update'] = daily_reports_frame['last_update'].map(
    extract_date, na_action='ignore')

# Defaults for missing values. Numeric columns get numeric defaults (not the
# strings '0' / '0.0') so they keep a numeric dtype instead of turning into
# mixed-type object columns, which matters for the Parquet export.
fill_defaults = {
    'county_US': 'Unknown',
    'combined_key': 'Unknown',
    'province_state': 'Unknown',
    'country_region': 'Unknown',
    'last_update': 'Unknown',
    'fips': 0,
    'active': 0,
    'confirmed': 0,
    'deaths': 0,
    'recovered': 0,
    'latitude': 0.0,
    'longitude': 0.0,
}
# Column-by-column so that a missing expected column still raises KeyError.
for column, default in fill_defaults.items():
    daily_reports_frame[column] = daily_reports_frame[column].fillna(default)

# Keep the re-indexed frame (previously the result was only displayed).
daily_reports_frame = daily_reports_frame.reset_index(drop=True)
daily_reports_frame

Unnamed: 0,fips,county_US,province_state,country_region,last_update,latitude,longitude,confirmed,deaths,recovered,active,combined_key
0,45001,Abbeville,South Carolina,US,2020-05-16,34.2233,-82.4617,34,0,0,34,"Abbeville, South Carolina, US"
1,22001,Acadia,Louisiana,US,2020-05-16,30.2951,-92.4142,189,11,0,178,"Acadia, Louisiana, US"
2,51001,Accomack,Virginia,US,2020-05-16,37.7671,-75.6323,648,9,0,639,"Accomack, Virginia, US"
3,16001,Ada,Idaho,US,2020-05-16,43.4527,-116.242,759,22,0,737,"Ada, Idaho, US"
4,19001,Adair,Iowa,US,2020-05-16,41.3308,-94.4711,4,0,0,4,"Adair, Iowa, US"
...,...,...,...,...,...,...,...,...,...,...,...,...
3295,0,Unknown,Unknown,West Bank and Gaza,2020-05-16,31.9522,35.2332,375,2,315,58,West Bank and Gaza
3296,0,Unknown,Unknown,Western Sahara,2020-05-16,24.2155,-12.8858,6,0,6,0,Western Sahara
3297,0,Unknown,Unknown,Yemen,2020-05-16,15.5527,48.5164,106,15,1,90,Yemen
3298,0,Unknown,Unknown,Zambia,2020-05-16,-13.1339,27.8493,654,7,124,523,Zambia


In [72]:
# Persist the cleaned report as Parquet (gzip codec, DataFrame index not written).
daily_reports_frame.to_parquet(
    '/tmp/latest_report.parquet.gz',
    index=False,
    compression='gzip',
)

In [6]:
# Also write the cleaned report as plain CSV, without the DataFrame index.
daily_reports_frame.to_csv(
    '/tmp/latest_report.csv',
    index=False,
)