In [1]:
import numpy as np
import pandas as pd
from uk_covid19 import Cov19API

In [2]:
def getUkdf(area_type="utla", filename=None, min_confirmed=0) -> pd.DataFrame:
    """
    Download daily COVID-19 cases and deaths through ``Cov19API`` and tidy them.

    Parameters
    ----------
    area_type : str
        Value placed in the ``areaType=`` API filter.
    filename : str or None
        When truthy, the tidied frame is also written to this CSV path
        (without the index).
    min_confirmed : int
        Only rows whose ``dailyCases`` is strictly greater than this value
        are kept.

    Returns
    -------
    pd.DataFrame
        Columns ``date`` (``datetime.date`` objects), ``areaName``,
        ``areaCode``, ``dailyCases`` and ``dailyDeaths`` (int64, missing
        values replaced by 0) and ``day`` (whole days elapsed since the
        earliest date that survives the ``min_confirmed`` filter, computed
        over all areas together). If the API returns no rows, the empty
        frame is returned as-is after printing a message.
    """
    filters = [f"areaType={area_type}"]

    # Maps output column name -> API metric name.
    structure = {
        "date": "date",
        "areaName": "areaName",
        "areaCode": "areaCode",
        "dailyCases": "newCasesByPublishDate",
        "dailyDeaths": "newDeaths28DaysByPublishDate",
    }

    api = Cov19API(filters=filters, structure=structure)
    df = api.get_dataframe()

    # Nothing to clean if the API returned no rows.
    if df.empty:
        print("No data retrieved from the API")
        return df

    # Keep a datetime64 copy for vectorised arithmetic; the public ``date``
    # column stays as plain ``datetime.date`` objects.
    timestamps = pd.to_datetime(df["date"])
    df["date"] = timestamps.dt.date

    # Missing counts become 0 and the columns are stored as int64.
    # (``fillna(downcast=...)`` is deprecated in recent pandas, so the cast
    # is done explicitly.)
    for count_col in ("dailyDeaths", "dailyCases"):
        df[count_col] = df[count_col].fillna(0).astype("int64")

    # Filter first, then copy, so adding ``day`` below writes to an
    # independent frame instead of a view of the unfiltered data.
    keep = df["dailyCases"] > min_confirmed
    df = df.loc[keep].copy()
    kept_timestamps = timestamps.loc[keep]

    # Days since the earliest remaining date; the minimum is computed once
    # instead of once per row.
    df["day"] = (kept_timestamps - kept_timestamps.min()).dt.days
    df = df.reset_index(drop=True)

    # Check if daily data is correct: dates are expected in ascending order
    # over the whole frame.
    # NOTE(review): the frames shown in this notebook are listed newest-first
    # per area, so this warning fires for them - confirm the intended check.
    if not kept_timestamps.is_monotonic_increasing:
        print("Warning: Dates are not in order. The daily data might be incorrect.")

    if filename:
        df.to_csv(filename, index=False)

    return df



In [3]:
# Fetch nation-level data, keep rows with more than 5 daily cases, and save it as CSV.
nation_data = getUkdf(
    area_type="nation",
    filename="../data/Nation_region/nation_data.csv",
    min_confirmed=5,
)



In [4]:
# Display the nation-level frame returned by getUkdf.
nation_data

Unnamed: 0,date,areaName,areaCode,dailyCases,dailyDeaths,day
0,2023-05-25,England,E92000001,6748,355,1180
1,2023-05-18,England,E92000001,7258,361,1173
2,2023-05-11,England,E92000001,9014,407,1166
3,2023-05-04,England,E92000001,10352,588,1159
4,2023-04-27,England,E92000001,10486,614,1152
...,...,...,...,...,...,...
3143,2020-03-15,Wales,W92000004,34,0,14
3144,2020-03-14,Wales,W92000004,22,0,13
3145,2020-03-13,Wales,W92000004,13,0,12
3146,2020-03-12,Wales,W92000004,10,0,11


In [5]:
# Print only the rows of the nation-level frame that belong to England.
print(nation_data.loc[nation_data["areaName"] == "England"])

           date areaName   areaCode  dailyCases  dailyDeaths   day
0    2023-05-25  England  E92000001        6748          355  1180
1    2023-05-18  England  E92000001        7258          361  1173
2    2023-05-11  England  E92000001        9014          407  1166
3    2023-05-04  England  E92000001       10352          588  1159
4    2023-04-27  England  E92000001       10486          614  1152
..          ...      ...        ...         ...          ...   ...
853  2020-03-06  England  E92000001          39            1     5
854  2020-03-05  England  E92000001          25            0     4
855  2020-03-04  England  E92000001          32            0     3
856  2020-03-03  England  E92000001          11            0     2
857  2020-03-01  England  E92000001          12            0     0

[858 rows x 6 columns]


In [6]:
# Create a boolean mask: True where `day` differs by exactly one from the
# previous row of the SAME area. Differencing within each areaName prevents the
# first row of one area from being compared with the last row of the area
# listed before it.
mask = nation_data.groupby("areaName")["day"].diff().abs() == 1

# Apply the mask to the DataFrame. The first row of every area has no
# predecessor (its diff is NaN), so it is never selected.
sequential_days_df = nation_data[mask]

# Print the resulting DataFrame
print(sequential_days_df)

            date areaName   areaCode  dailyCases  dailyDeaths  day
48    2022-06-30  England  E92000001       21732           68  851
49    2022-06-29  England  E92000001       21428          117  850
50    2022-06-28  England  E92000001       21415          117  849
51    2022-06-27  England  E92000001       47332           84  848
53    2022-06-23  England  E92000001       17481           64  844
...          ...      ...        ...         ...          ...  ...
3143  2020-03-15    Wales  W92000004          34            0   14
3144  2020-03-14    Wales  W92000004          22            0   13
3145  2020-03-13    Wales  W92000004          13            0   12
3146  2020-03-12    Wales  W92000004          10            0   11
3147  2020-03-11    Wales  W92000004           9            0   10

[2850 rows x 6 columns]


In [7]:
# Display the rows flagged as one day apart from the preceding row.
sequential_days_df

Unnamed: 0,date,areaName,areaCode,dailyCases,dailyDeaths,day
48,2022-06-30,England,E92000001,21732,68,851
49,2022-06-29,England,E92000001,21428,117,850
50,2022-06-28,England,E92000001,21415,117,849
51,2022-06-27,England,E92000001,47332,84,848
53,2022-06-23,England,E92000001,17481,64,844
...,...,...,...,...,...,...
3143,2020-03-15,Wales,W92000004,34,0,14
3144,2020-03-14,Wales,W92000004,22,0,13
3145,2020-03-13,Wales,W92000004,13,0,12
3146,2020-03-12,Wales,W92000004,10,0,11


In [8]:
# Fetch UTLA-level data, keep rows with more than 1 daily case, and save it as CSV.
utla_data = getUkdf(
    area_type="utla",
    filename="../data/UTLA_region/utla_data.csv",
    min_confirmed=1,
)



In [9]:
# Display the UTLA-level frame returned by getUkdf.
utla_data

Unnamed: 0,date,areaName,areaCode,dailyCases,dailyDeaths,day
0,2023-05-25,Rutland,E06000017,3,0,1133
1,2023-05-18,Rutland,E06000017,4,1,1126
2,2023-05-11,Rutland,E06000017,8,1,1119
3,2023-05-04,Rutland,E06000017,7,0,1112
4,2023-04-27,Rutland,E06000017,12,0,1105
...,...,...,...,...,...,...
147020,2020-04-21,Worcestershire,E10000034,29,0,4
147021,2020-04-20,Worcestershire,E10000034,42,0,3
147022,2020-04-19,Worcestershire,E10000034,40,0,2
147023,2020-04-18,Worcestershire,E10000034,43,0,1


In [10]:
# Fetch LTLA-level data, keep rows with more than 1 daily case, and save it as CSV.
ltla_data = getUkdf(
    area_type="ltla",
    filename="../data/LTLA_region/ltla_data.csv",
    min_confirmed=1,
)



In [11]:
# Display the LTLA-level frame returned by getUkdf.
ltla_data

Unnamed: 0,date,areaName,areaCode,dailyCases,dailyDeaths,day
0,2023-05-25,South Gloucestershire,E06000025,48,0,1133
1,2023-05-18,South Gloucestershire,E06000025,34,2,1126
2,2023-05-11,South Gloucestershire,E06000025,39,0,1119
3,2023-05-04,South Gloucestershire,E06000025,67,2,1112
4,2023-04-27,South Gloucestershire,E06000025,63,7,1105
...,...,...,...,...,...,...
255165,2020-04-26,Basingstoke and Deane,E07000084,6,0,9
255166,2020-04-25,Basingstoke and Deane,E07000084,8,0,8
255167,2020-04-24,Basingstoke and Deane,E07000084,6,0,7
255168,2020-04-23,Basingstoke and Deane,E07000084,9,0,6


In [12]:
# Fetch region-level data, keep rows with more than 1 daily case, and save it as CSV.
region_data = getUkdf(
    area_type="region",
    filename="../data/NHS_region/region_data.csv",
    min_confirmed=1,
)



In [13]:
# Display the region-level frame returned by getUkdf.
region_data

Unnamed: 0,date,areaName,areaCode,dailyCases,dailyDeaths,day
0,2023-05-25,North West,E12000002,922,46,1133
1,2023-05-18,North West,E12000002,815,44,1126
2,2023-05-11,North West,E12000002,1003,60,1119
3,2023-05-04,North West,E12000002,1316,80,1112
4,2023-04-27,North West,E12000002,1337,93,1105
...,...,...,...,...,...,...
7251,2020-04-21,South East,E12000008,778,0,4
7252,2020-04-20,South East,E12000008,315,0,3
7253,2020-04-19,South East,E12000008,678,0,2
7254,2020-04-18,South East,E12000008,414,0,1


In [14]:
# List the distinct area names present in the region-level frame.
region_data.loc[:, "areaName"].unique()

array(['North West', 'Yorkshire and The Humber', 'East Midlands',
       'East of England', 'London', 'South West', 'North East',
       'West Midlands', 'South East'], dtype=object)

In [15]:
# Reload the LTLA frame from the CSV path that the earlier getUkdf("ltla", ...) call wrote to.
# This rebinds `ltla_data` to the on-disk copy.
ltla_data = pd.read_csv(filepath_or_buffer="../data/LTLA_region/ltla_data.csv")

In [16]:
# List the distinct area names present in the LTLA frame.
ltla_data.loc[:, "areaName"].unique()

array(['South Gloucestershire', 'Cheltenham', 'Surrey Heath', 'Rugby',
       'Tameside', 'Dudley', 'Antrim and Newtownabbey', 'Aylesbury Vale',
       'Craven', 'Staffordshire Moorlands', 'Rochdale',
       'Hackney and City of London', 'Gwynedd', 'Southend-on-Sea',
       'Gloucester', 'Hinckley and Bosworth', 'Newark and Sherwood',
       'Manchester', 'Sunderland', 'Luton', 'Shropshire', 'Sevenoaks',
       'South Ribble', 'Harborough', 'Ceredigion', 'Peterborough',
       'South Derbyshire', 'Swale', 'Fylde', 'North Kesteven',
       'Scarborough', 'Braintree', 'Mansfield', 'West Oxfordshire',
       'Barking and Dagenham', 'Camden', 'Redbridge',
       'Armagh City, Banbridge and Craigavon', 'Southampton', 'Cambridge',
       'Broxbourne', 'Blaby', 'Boston', 'Mole Valley',
       'North East Lincolnshire', 'Derby', 'Dorset',
       'East Cambridgeshire', 'Tendring', 'Argyll and Bute', 'Medway',
       'Eden', 'Breckland', 'North Norfolk', 'Redditch', 'North Ayrshire',
       'Sto

In [17]:
# Show the areaName column of the LTLA frame.
ltla_data["areaName"]

0         South Gloucestershire
1         South Gloucestershire
2         South Gloucestershire
3         South Gloucestershire
4         South Gloucestershire
                  ...          
255165    Basingstoke and Deane
255166    Basingstoke and Deane
255167    Basingstoke and Deane
255168    Basingstoke and Deane
255169    Basingstoke and Deane
Name: areaName, Length: 255170, dtype: object