In [1]:
import sys
sys.path.append('../koalas/')

In [2]:
from koalas import DataFrame

In [3]:
def get_covid_data():
    """Load ``covid.csv`` from the working directory into a ``DataFrame``.

    The first CSV row supplies the field names and every following row
    becomes a data row. Each cell is converted individually: text that
    parses as a number becomes a ``float``, the literal ``'NA'`` becomes
    ``0.``, and anything else (including an empty cell) is kept as the
    original string.

    Returns:
        DataFrame: constructed as ``DataFrame(fields=<header row>, rows=<converted rows>)``.

    Raises:
        OSError: if ``covid.csv`` cannot be opened.
        IndexError: if the file contains no rows at all (not even a header).
    """
    import csv

    def to_number(v):
        # csv.reader yields strings, so float() can only fail with ValueError;
        # TypeError is kept as a guard for non-string cells.
        try:
            return float(v)
        except (TypeError, ValueError):
            return 0. if v == 'NA' else v

    # 'utf-8-sig' removes a leading byte-order mark when one is present and is
    # a no-op otherwise, so the first header no longer has to be sliced blindly
    # (which ate a real character for BOM-less files). newline='' is what the
    # csv module requires for correct handling of embedded line breaks. The
    # `with` block guarantees the file handle is closed.
    with open('covid.csv', 'r', encoding='utf-8-sig', newline='') as csv_file:
        lines = list(csv.reader(csv_file))

    headers, rows = lines[0], lines[1:]
    rows = [[to_number(value) for value in row] for row in rows]
    return DataFrame(fields=headers, rows=rows)

In [4]:
# Load the full CSV once; later cells build on `df`.
df = get_covid_data()
# Preview a small slice from the middle of the data (the output below shows
# three rows); as the cell's last expression it is displayed, not assigned.
df[1000:1003]

country country_code year_week level    region region_name new_cases tests_done population testing_rate positivity_rate testing_data_source
------- ------------ --------- -----    ------ ----------- --------- ---------- ---------- ------------ --------------- -------------------
Czechia CZ           2020-W16  national CZ     Czechia     755.0     44165.0    10516707.0 419.9508458  1.709498472     TESSy COVID-19     
Czechia CZ           2020-W17  national CZ     Czechia     658.0     46583.0    10516707.0 442.9428337  1.412532469     TESSy COVID-19     
Czechia CZ           2020-W18  national CZ     Czechia     379.0     43476.0    10516707.0 413.3993654  0.871745331     TESSy COVID-19     

In [5]:
def _blank_to_zero(new_cases):
    """Return the case count unchanged, or 0.0 when it is falsy (e.g. an empty cell)."""
    return new_cases if new_cases else 0.


def _per_100k(new_cases, population):
    """Express a case count per 100,000 inhabitants, rounded to two decimals."""
    return round(100_000 * new_cases / population, 2)


def _largest(new_cases):
    """Return the largest value among a group's normalised case counts."""
    return max(new_cases)


def _country_at_peak(countries, new_cases, most_new_cases):
    """Return the first country whose normalised count equals the group's maximum."""
    matches = [
        country
        for country, count in zip(countries, new_cases)
        if count == most_new_cases
    ]
    return matches[0]


# Per reporting period: the highest population-normalised weekly case count and
# the country that reported it. NOTE(review): this reads `.apply(new_field, fn,
# *input_fields)` as deriving a new field and `.group('Period')` as collecting
# the other fields per period -- inferred from the call shapes here; confirm
# against the koalas DataFrame implementation.
summary = (
    df
    .apply('New Cases', _blank_to_zero, 'new_cases')
    .rename('country', 'Country')
    .rename('year_week', 'Period')
    .rename('population', 'Population')
    .select('Country', 'New Cases', 'Period', 'Population')
    .apply('New Cases (Norm.)', _per_100k, 'New Cases', 'Population')
    .group('Period')
    .apply('Most New Cases (Norm.)', _largest, 'New Cases (Norm.)')
    .apply(
        'Country with Most New Cases (Norm.)',
        _country_at_peak,
        'Country',
        'New Cases (Norm.)',
        'Most New Cases (Norm.)',
    )
    .select('Period', 'Most New Cases (Norm.)', 'Country with Most New Cases (Norm.)')
    [:20]  # keep only the first 20 periods for display
)
summary

Period   Most New Cases (Norm.) Country with Most New Cases (Norm.)
------   ---------------------- -----------------------------------
2020-W01 0.17                   Denmark                            
2020-W02 0.2                    Denmark                            
2020-W03 0.14                   Denmark                            
2020-W04 0.26                   Denmark                            
2020-W05 0.22                   Denmark                            
2020-W06 3.99                   Iceland                            
2020-W07 0.24                   Denmark                            
2020-W08 0.29                   Italy                              
2020-W09 3.35                   Italy                              
2020-W10 14.13                  Italy                              
2020-W11 35.76                  Italy                              
2020-W12 122.53                 Iceland                            
2020-W13 418.23                 Estonia         