# Visualising the data

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import json

In [18]:
# an iPython  "magic" that enables the embedding of matplotlib output
%matplotlib inline
# make figures larger
plt.rcParams['figure.dpi'] = 100

## Time series data

In [31]:
# Read the saved time series from disk. JSON text is UTF-8, so the encoding is
# stated explicitly instead of depending on the platform's default locale.
with open("timeseries.json", "rt", encoding="utf-8") as INFILE:
    data = json.load(INFILE)

In [32]:
# the list of per-date records sits under the top-level 'data' key
datalist = data['data']

[{'date': '2023-10-19', 'tests': None, 'cases': 10043},
 {'date': '2023-10-18', 'tests': None, 'cases': 0},
 {'date': '2023-10-17', 'tests': None, 'cases': 0},
 {'date': '2023-10-16', 'tests': None, 'cases': 0},
 {'date': '2023-10-15', 'tests': None, 'cases': 0},
 {'date': '2023-10-14', 'tests': None, 'cases': 0},
 {'date': '2023-10-13', 'tests': None, 'cases': 0},
 {'date': '2023-10-12', 'tests': None, 'cases': 16855},
 {'date': '2023-10-11', 'tests': None, 'cases': 0},
 {'date': '2023-10-10', 'tests': None, 'cases': 0},
 {'date': '2023-10-09', 'tests': None, 'cases': 0},
 {'date': '2023-10-08', 'tests': None, 'cases': 0},
 {'date': '2023-10-07', 'tests': None, 'cases': 0},
 {'date': '2023-10-06', 'tests': None, 'cases': 0},
 {'date': '2023-10-05', 'tests': None, 'cases': 16545},
 {'date': '2023-10-04', 'tests': None, 'cases': 0},
 {'date': '2023-10-03', 'tests': None, 'cases': 0},
 {'date': '2023-10-02', 'tests': None, 'cases': 0},
 {'date': '2023-10-01', 'tests': None, 'cases': 0},


In [33]:
# Gather the date string of every record in ascending order; the dates are
# written as YYYY-MM-DD, so plain string ordering is also chronological.
dates = sorted(record['date'] for record in datalist)

['2020-01-31',
 '2020-02-01',
 '2020-02-02',
 '2020-02-03',
 '2020-02-04',
 '2020-02-05',
 '2020-02-06',
 '2020-02-07',
 '2020-02-08',
 '2020-02-09',
 '2020-02-10',
 '2020-02-11',
 '2020-02-12',
 '2020-02-13',
 '2020-02-14',
 '2020-02-15',
 '2020-02-16',
 '2020-02-17',
 '2020-02-18',
 '2020-02-19',
 '2020-02-20',
 '2020-02-21',
 '2020-02-22',
 '2020-02-23',
 '2020-02-24',
 '2020-02-25',
 '2020-02-26',
 '2020-02-27',
 '2020-02-28',
 '2020-02-29',
 '2020-03-01',
 '2020-03-02',
 '2020-03-03',
 '2020-03-04',
 '2020-03-05',
 '2020-03-06',
 '2020-03-07',
 '2020-03-08',
 '2020-03-09',
 '2020-03-10',
 '2020-03-11',
 '2020-03-12',
 '2020-03-13',
 '2020-03-14',
 '2020-03-15',
 '2020-03-16',
 '2020-03-17',
 '2020-03-18',
 '2020-03-19',
 '2020-03-20',
 '2020-03-21',
 '2020-03-22',
 '2020-03-23',
 '2020-03-24',
 '2020-03-25',
 '2020-03-26',
 '2020-03-27',
 '2020-03-28',
 '2020-03-29',
 '2020-03-30',
 '2020-03-31',
 '2020-04-01',
 '2020-04-02',
 '2020-04-03',
 '2020-04-04',
 '2020-04-05',
 '2020-04-

In [34]:
def parse_date(datestring):
    """ Turn a 'YYYY-MM-DD' date string into a pandas datetime object """
    iso_day_format = "%Y-%m-%d"
    parsed = pd.to_datetime(datestring, format=iso_day_format)
    return parsed

In [35]:
# dates is sorted, so its first and last items bound the period covered
startdate, enddate = parse_date(dates[0]), parse_date(dates[-1])
print(startdate, ' to ', enddate)

2020-01-31 00:00:00  to  2023-10-19 00:00:00


In [36]:
# One row per calendar day from the first to the last date in the data, so
# days that are absent from the source still get a row.
index = pd.date_range(startdate, enddate, freq='D')
# dtype=float creates the empty (all-NaN) columns as numeric. Without it the
# columns are object dtype, and a later fillna() would depend on the silent
# object-to-float downcasting that pandas 2.2 deprecates.
timeseriesdf = pd.DataFrame(index=index, columns=['cases', 'tests'], dtype=float)
timeseriesdf

Unnamed: 0,cases,hospital,deaths
2020-01-31,,,
2020-02-01,,,
2020-02-02,,,
2020-02-03,,,
2020-02-04,,,
...,...,...,...
2023-10-15,,,
2023-10-16,,,
2023-10-17,,,
2023-10-18,,,


In [None]:
for entry in datalist: # each entry is a dictionary with date, tests and cases
    date = parse_date(entry['date'])
    for column in ['cases', 'tests']:
        value = entry.get(column)
        # None (or an absent key) means "no data": leave the cell empty for now
        # so that a duplicate entry for the same date can still supply the
        # value; whatever stays empty becomes 0.0 in the fill step below
        if value is None:
            continue
        # check that nothing is there yet - just in case some dates are
        # duplicated; the first value seen for a date/column is kept
        if pd.isna(timeseriesdf.loc[date, column]):
            # this is the way you access a specific location in the dataframe -
            # use .loc and put index,column in a single set of [ ]
            timeseriesdf.loc[date, column] = float(value)

# Fill in any remaining "holes" (missing dates and None values) with 0.0.
# Casting to float first keeps the columns numeric however the frame was
# created, so fillna() does not have to downcast object columns.
timeseriesdf = timeseriesdf.astype(float).fillna(0.0)

timeseriesdf

### Plotting

In [None]:
# Plot both columns against the date index. The title and axis labels let the
# figure be read on its own; the trailing ';' hides the text repr of the result.
ax = timeseriesdf.plot(title="Cases and tests over time")
ax.set(xlabel="Date", ylabel="Count");

In [None]:
# Same data on a logarithmic y axis. Zero values have no position on a log
# scale, so days with a count of 0 are not drawn.
ax = timeseriesdf.plot(logy=True, title="Cases and tests over time (log scale)")
ax.set(xlabel="Date", ylabel="Count (log scale)");

In [29]:
# write the finished dataframe to disk as a pickle file
timeseriesdf.to_pickle(path="timeseriesdf.pkl")