# Exploring the git history of MoH Covid-19 data

This notebook explores the git history of the https://github.com/UoA-eResearch/nz-covid19-data-auto repository. The goal is to reconstruct how the case counts for each District Health Board (DHB) changed over time by reading the git history of the `cases_by_DHB.csv` file.

In [2]:
# Change into the nz-covid19-data-auto directory (given relative to the current
# working directory) so the git commands and relative file paths used in the
# cells below resolve inside that repository.
%cd ../nz-covid19-data-auto

/home/kaimahi/covid-19/nz-covid19-data-auto


In [150]:
# Import libraries
import os
import pygit2
import subprocess
import pandas as pd
from shutil import copyfile
import matplotlib.pyplot as plt

%matplotlib inline

In [37]:
# Extract the commit history for the cases by DHB data
! git log --pretty="%H|%cd|%B" cases_by_DHB.csv | grep -vE '^$' > cases_by_DHB.githistory

In [130]:
# Load the pipe-delimited "hash|date|message" dump; the file has no header
# row, so the column names are supplied here.
cases_by_dhb_history = pd.read_csv(
    'cases_by_DHB.githistory',
    sep='|',
    header=None,
    names=['commit', 'date', 'body'],
)

# Month abbreviation -> zero-padded month number ('Jan' -> '01'), built once
# instead of on every call.
_MONTH_NUMBERS = {
    abbreviation: str(number).zfill(2)
    for number, abbreviation in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}


def extract_date(date):
    """Convert a six-field date string into ``YYYY-MM-DD``.

    The input is expected to have the whitespace-separated fields
    ``weekday month day time year timezone``, for example
    ``'Thu Apr 16 12:34:56 2020 +1200'``. Only the year, month and day
    fields are used; the time and timezone are discarded.

    Parameters
    ----------
    date : str
        Date string with an English three-letter month abbreviation.

    Returns
    -------
    str
        The calendar date as ``YYYY-MM-DD`` with zero-padded month and day.

    Raises
    ------
    ValueError
        If the string does not contain exactly six fields.
    KeyError
        If the month field is not one of the twelve known abbreviations.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so a space-padded day ("Apr  1") still
    # yields exactly six fields.
    weekday, month, day, time, year, timezone = date.split()
    return '-'.join([year, _MONTH_NUMBERS[month], day.zfill(2)])

# Replace the raw git date strings with plain calendar dates (YYYY-MM-DD)
cases_by_dhb_history['date'] = cases_by_dhb_history.date.apply(extract_date)

# Keep exactly one commit per calendar day.
# `git log` lists commits newest first, so the first row seen for a date is
# that day's most recent commit. drop_duplicates keeps whole rows, so the
# commit hash and message always come from the same commit. The previous
# sort-then-groupby().last() picked an arbitrary commit on days with several
# commits (the default sort is not stable) and could combine non-null fields
# from different rows.
cases_by_dhb_history = (cases_by_dhb_history
    .drop_duplicates(subset='date', keep='first')
    .sort_values('date')
    .loc[:, ['date', 'commit', 'body']]
    .reset_index(drop=True)
)

# Save the one-commit-per-day table, then display it as the cell output
cases_by_dhb_history.to_csv('cases_by_dhb_history.csv', index=False)
cases_by_dhb_history

Unnamed: 0,date,commit,body
0,2020-04-16,87324300a6589148b5e682fdaee604bfbc5e0a0e,auto update from covid-case_list-16-april.xlsx
1,2020-04-17,2bbcfd6030adc231aec3c55fbf58d3ff98a60e2a,auto update from covid-19-case-list-17-april-2...
2,2020-04-18,69342f8fc546d096d7faf09989efa8364aca66de,auto update from web-covid-confprob_20200418-2...
3,2020-04-19,594ce69343e6931b0bd3a08435b65496f9f57d64,rows now have th
4,2020-04-20,6d57c044784c19c9b7e2b6ac7d3379bf1d01bf70,auto update from covid-caselist-20april.xlsx
...,...,...,...
477,2021-10-20,e90804da2e38a396e4d12cc19c23fe2f8c61ab8c,auto update to 1pm 20 October 2021
478,2021-10-21,6296ad6039d5cec3b67984b1a2e1a386874bec40,auto update to 1pm 21 October 2021
479,2021-10-22,3d3f4af6e0b51cdb4a8c928a1c4807596d0105d1,auto update to 1pm 22 October 2021
480,2021-10-23,dae22dac976c858e38cb2e28fb8324b77e3585bf,auto update to 1pm 23 October 2021


In [131]:
# Create the folder that receives the dated copies of cases_by_DHB.csv;
# -p makes this a no-op when the folder already exists.
! mkdir -p cases_by_dhb

In [132]:
# Make sure the destination folder exists even if the mkdir cell was skipped
os.makedirs('cases_by_dhb', exist_ok=True)

try:
    for row in cases_by_dhb_history.itertuples(index=False):
        # Check out the cases_by_DHB.csv file as it was at this commit.
        # The `--` keeps the file name from being parsed as a revision.
        subprocess.check_output(
            ['git', 'checkout', row.commit, '--', 'cases_by_DHB.csv']
        )
        # Copy the csv to the new location, prefixed with its date
        copyfile(
            'cases_by_DHB.csv',
            os.path.join('cases_by_dhb', row.date + '_cases_by_DHB.csv'),
        )
finally:
    # Restore cases_by_DHB.csv to the version at HEAD, even if the loop failed.
    # `git checkout <commit> -- <path>` overwrites the index as well as the
    # working tree, so a plain `git checkout cases_by_DHB.csv` would only
    # re-copy the last historical version out of the index; naming HEAD
    # explicitly resets both.
    subprocess.check_output(
        ['git', 'checkout', 'HEAD', '--', 'cases_by_DHB.csv']
    )

b''

In [143]:
def read_cases_by_dhb(filepath):
    """Load one dated snapshot of the cases-by-DHB table.

    The snapshot date is taken from the file name: everything before the
    first underscore (e.g. ``2020-04-16`` in ``2020-04-16_cases_by_DHB.csv``)
    is stored as the ``Date`` index of every row.

    Parameters
    ----------
    filepath : str
        Path to a CSV file that has at least ``Active``, ``Recovered`` and
        ``Deceased`` among its first five columns.

    Returns
    -------
    pandas.DataFrame
        The first five columns of the CSV, indexed by ``Date``, with the
        ``Active``, ``Recovered`` and ``Deceased`` columns converted to
        integers (missing values become 0 and ``*`` markers are stripped).

    Raises
    ------
    KeyError
        If one of the three count columns is missing.
    ValueError
        If a count cell is still not numeric after ``*`` is removed.
    """
    # basename copes with bare file names and with any OS path separator,
    # unlike splitting on a hard-coded '/'.
    filename = os.path.basename(filepath)

    # .copy() so that adding the Date column does not write into a slice of
    # the frame returned by read_csv.
    df = pd.read_csv(filepath).iloc[:, :5].copy()
    df['Date'] = filename.split('_', 1)[0]
    # Every row carries the same Date value at this point.
    df = (df.set_index("Date")
            .sort_values("Date"))
    for col in ['Active', 'Recovered', 'Deceased']:
        df[col] = (df[col]
            .fillna(0)
            .apply(str)
            # regex=False: '*' must be removed as a literal character. As a
            # regular expression it is invalid, and relying on the default
            # makes the result depend on the installed pandas version.
            .str.replace('*', '', regex=False)
            # Go through float first so values such as '3.0' are accepted.
            .apply(float)
            .astype(int)
        )
    return df

# Only pick up the dated snapshot files (names ending in _cases_by_DHB.csv).
# Any other entry in the folder, such as a sub-directory or a stray metadata
# file, would otherwise be handed to read_csv and raise.
snapshot_files = sorted(
    name for name in os.listdir('cases_by_dhb')
    if name.endswith('_cases_by_DHB.csv')
)

# Stack the per-day tables into one long frame. The file names start with the
# date, so the sorted order above is chronological.
cases_by_dhb_over_time = pd.concat([
    read_cases_by_dhb(os.path.join('cases_by_dhb', name)) for name in snapshot_files
])

cases_by_dhb_over_time

  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0_level_0,DHB,Active,Recovered,Deceased,Total
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-16,Auckland,60,123,0,183
2020-04-16,West Coast,2,2,1,5
2020-04-16,Waitemata,105,100,0,205
2020-04-16,Wairarapa,0,8,0,8
2020-04-16,Waikato,106,76,0,182
...,...,...,...,...,...
2021-10-24,Capital and Coast,0,111,2,113
2021-10-24,Canterbury,0,156,12,168
2021-10-24,Bay of Plenty,0,48,0,48
2021-10-24,Managed Isolation & Quarantine,31,1298,1,1330


In [145]:
# Write the combined table to disk; the Date index is written as the first column
cases_by_dhb_over_time.to_csv(path_or_buf='cases_by_DHB_over_time.csv', index=True)

In [153]:
# Show only the rows whose DHB column is 'Auckland'
is_auckland = cases_by_dhb_over_time['DHB'].eq('Auckland')
cases_by_dhb_over_time.loc[is_auckland]

Unnamed: 0_level_0,DHB,Active,Recovered,Deceased,Total
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-16,Auckland,60,123,0,183
2020-04-17,Auckland,53,131,0,184
2020-04-18,Auckland,49,136,0,185
2020-04-19,Auckland,46,139,0,185
2020-04-20,Auckland,43,142,0,185
...,...,...,...,...,...
2021-10-20,Auckland,126,483,1,610
2021-10-21,Auckland,147,483,1,631
2021-10-22,Auckland,174,485,1,660
2021-10-23,Auckland,190,487,1,678
