In [10]:
import requests
import json
import time
from http import HTTPStatus
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Endpoint that get_API_data passes to requests.get for every page request.
url = 'https://api.coronavirus.data.gov.uk/v1/data'

*Task 1*

In [11]:
def get_API_data(filters, structure):
    """Fetch every page of results for a query and return the combined records.

    Parameters
    ----------
    filters : dict
        Mapping with a ``'filters'`` key holding a list of filter strings;
        the strings are joined with ``;`` to form the ``filters`` query param.
    structure : dict
        JSON-serialisable mapping sent (compactly encoded) as the
        ``structure`` query param.

    Returns
    -------
    list
        The concatenation of the ``'data'`` list from each page, in page order.

    Raises
    ------
    RuntimeError
        If any response has a status code of 400 or above.
    """
    params = {
        "filters": ";".join(filters["filters"]),
        "structure": json.dumps(structure, separators=(",", ":")),
    }
    data = []
    page_number = 1
    request_counter = 0
    while True:
        # Progress indicator: number of pages fetched so far.
        print(request_counter)
        params["page"] = page_number

        response = requests.get(url, params=params)

        # Inspect the status BEFORE touching the body: an error response may
        # not contain a 'data' key, and a 204 has no JSON body to parse.
        if response.status_code >= HTTPStatus.BAD_REQUEST:
            raise RuntimeError(f'Request failed: {response.text}')
        if response.status_code == HTTPStatus.NO_CONTENT:
            break

        response_json = response.json()
        data.extend(response_json['data'])

        # The API signals the last page with a null "next" link.
        if response_json["pagination"]["next"] is None:
            break

        page_number += 1
        request_counter += 1

    return data

Task 2

In [12]:
# Short codes mapped to the areaType values used when building filters.
AREA_TYPE = dict(nat="nation", rgn="region")

# Output field name -> API metric name; sent as the "structure" of each request.
REQUEST_STRUCTURE = dict(
    date="date",
    name="areaName",
    daily_cases="newCasesBySpecimenDate",
    cumulative_cases="cumCasesBySpecimenDate",
    daily_deaths="newDeaths28DaysByPublishDate",
    cumulative_deaths="cumDeaths28DaysByPublishDate",
    cumulative_vaccinated="cumPeopleVaccinatedSecondDoseByVaccinationDate",
    vaccination_age="vaccinationsAgeDemographics",
)

def get_filters(area_type):
    """Build the filter mapping for a single area type.

    Returns a dict whose ``'filters'`` key holds a one-element list
    containing ``areaType=<area_type>``.
    """
    area_clause = "areaType={}".format(area_type)
    return {"filters": [area_clause]}



In [13]:
# National-level records.
national_filter = get_filters(AREA_TYPE["nat"])
results_json_national = get_API_data(national_filter, REQUEST_STRUCTURE)

# Regional-level records.
regional_filter = get_filters(AREA_TYPE["rgn"])
results_json_regional = get_API_data(regional_filter, REQUEST_STRUCTURE)

0
0
1
2


Task 3

In [14]:
# One flat list: national records first, then regional records.
combined_results_json = [*results_json_national, *results_json_regional]

Task 4

In [15]:
# Build a DataFrame from the combined list of records, then preview the first rows.
covid_data = pd.DataFrame(combined_results_json)
covid_data.head()

Unnamed: 0,date,name,daily_cases,cumulative_cases,daily_deaths,cumulative_deaths,cumulative_vaccinated,vaccination_age
0,2022-12-08,England,,,461.0,173821.0,,[]
1,2022-12-07,England,2718.0,20317848.0,0.0,173360.0,41987655.0,"[{'age': '05_11', 'VaccineRegisterPopulationBy..."
2,2022-12-06,England,4102.0,20315130.0,0.0,173360.0,41987099.0,"[{'age': '05_11', 'VaccineRegisterPopulationBy..."
3,2022-12-05,England,4447.0,20311028.0,0.0,173360.0,41986506.0,"[{'age': '05_11', 'VaccineRegisterPopulationBy..."
4,2022-12-04,England,3278.0,20306581.0,0.0,173360.0,41985981.0,"[{'age': '05_11', 'VaccineRegisterPopulationBy..."


Task 5

In [16]:
# Index labels of every row whose "name" is exactly "England".
england_rows = covid_data.index[covid_data["name"] == "England"]
# Remove those rows from covid_data in place.
covid_data.drop(index=england_rows, inplace=True)


Task 6

In [17]:
# Rename the "name" column to "area" in place, then preview the result.
covid_data.rename(columns={"name": "area"}, inplace=True)
covid_data.head()

Unnamed: 0,date,area,daily_cases,cumulative_cases,daily_deaths,cumulative_deaths,cumulative_vaccinated,vaccination_age
1044,2022-05-20,Northern Ireland,,,0.0,3445.0,,[]
1045,2022-05-19,Northern Ireland,190.0,713294.0,4.0,3445.0,,[]
1046,2022-05-18,Northern Ireland,266.0,713104.0,1.0,3441.0,,[]
1047,2022-05-17,Northern Ireland,318.0,712838.0,2.0,3440.0,,[]
1048,2022-05-16,Northern Ireland,356.0,712520.0,1.0,3438.0,,[]


Task 7

In [18]:
# Parse the "date" strings (YYYY-MM-DD) into pandas datetime values.
covid_data["date"] = pd.to_datetime(covid_data["date"], format="%Y-%m-%d")

# Show the dtype of every column to confirm the conversion.
print(covid_data.dtypes)

date                     datetime64[ns]
area                             object
daily_cases                     float64
cumulative_cases                float64
daily_deaths                    float64
cumulative_deaths               float64
cumulative_vaccinated           float64
vaccination_age                  object
dtype: object


Task 8

In [19]:
# Replace falsy vaccination_age values (e.g. an empty list) with NaN so that
# they are counted as missing by isnull()/dropna().
covid_data["vaccination_age"] = covid_data["vaccination_age"].apply(
    lambda value_list: value_list if value_list else np.nan
)

# Per-column non-null counts and dtypes.
covid_data.info()

# Count the nulls once and reuse the result for both reports.
missing_per_column = covid_data.isnull().sum()
print("\nMissing data from each columnn: \n{}".format(missing_per_column))
print("\nTotal number of missing values is: {}".format(missing_per_column.sum()))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12154 entries, 1044 to 13197
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   12154 non-null  datetime64[ns]
 1   area                   12154 non-null  object        
 2   daily_cases            12142 non-null  float64       
 3   cumulative_cases       12141 non-null  float64       
 4   daily_deaths           10179 non-null  float64       
 5   cumulative_deaths      10191 non-null  float64       
 6   cumulative_vaccinated  7306 non-null   float64       
 7   vaccination_age        7212 non-null   object        
dtypes: datetime64[ns](1), float64(5), object(2)
memory usage: 759.8+ KB

Missing data from each columnn: 
date                        0
area                        0
daily_cases                12
cumulative_cases           13
daily_deaths             1975
cumulative_deaths        1963
cumulative_v

Task 9

In [20]:
# Sort newest-first and group by area once. Within each area a back-fill then
# copies the value from the next row in that order (the next older date) into
# any gap, so a missing cumulative figure takes the most recent earlier value.
newest_first_by_area = covid_data.sort_values(by="date", ascending=False).groupby("area")

# GroupBy.bfill() replaces the deprecated fillna(method="bfill"); the result
# keeps the original index labels, so the assignment aligns row by row.
for cumulative_column in ("cumulative_cases", "cumulative_deaths", "cumulative_vaccinated"):
    covid_data[cumulative_column] = newest_first_by_area[cumulative_column].bfill()

Task 10

In [21]:
# Drop, in place, every row that still contains at least one missing value.
covid_data.dropna(axis="index", how="any", inplace=True)
# Summarise the remaining rows, then preview them.
covid_data.info()
covid_data.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7090 entries, 2044 to 12897
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   7090 non-null   datetime64[ns]
 1   area                   7090 non-null   object        
 2   daily_cases            7090 non-null   float64       
 3   cumulative_cases       7090 non-null   float64       
 4   daily_deaths           7090 non-null   float64       
 5   cumulative_deaths      7090 non-null   float64       
 6   cumulative_vaccinated  7090 non-null   float64       
 7   vaccination_age        7090 non-null   object        
dtypes: datetime64[ns](1), float64(5), object(2)
memory usage: 498.5+ KB


Unnamed: 0,date,area,daily_cases,cumulative_cases,daily_deaths,cumulative_deaths,cumulative_vaccinated,vaccination_age
2044,2022-06-02,Scotland,1101.0,1960631.0,28.0,12389.0,4097950.0,"[{'age': '05_11', 'VaccineRegisterPopulationBy..."
2045,2022-06-01,Scotland,1246.0,1959530.0,0.0,12361.0,4097561.0,"[{'age': '05_11', 'VaccineRegisterPopulationBy..."
2046,2022-05-31,Scotland,1329.0,1958284.0,0.0,12361.0,4097145.0,"[{'age': '05_11', 'VaccineRegisterPopulationBy..."
2047,2022-05-30,Scotland,1154.0,1956955.0,17.0,12361.0,4096767.0,"[{'age': '05_11', 'VaccineRegisterPopulationBy..."
2048,2022-05-29,Scotland,959.0,1955801.0,0.0,12344.0,4096402.0,"[{'age': '05_11', 'VaccineRegisterPopulationBy..."
