# Import libs

In [None]:
import os
import glob
import pandas as pd
from datetime import datetime
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## NHS App

In [None]:
import os
import glob
import pandas as pd

def concat_datasets(path):
    """Import every .csv file in a folder and concatenate them into one dataframe.

    Parameters
    ----------
    path : str
        Folder to search (non-recursively) for files matching ``*.csv``.

    Returns
    -------
    pandas.DataFrame
        The rows of every matched file stacked in sorted file-path order.
        Each file keeps its own row index, so index labels may repeat.

    Raises
    ------
    ValueError
        If no .csv file is found in ``path``.
    """
    # use the ``path`` argument rather than a module-level variable so the
    # function works for whichever folder it is given; sort the matches so
    # the row order does not depend on the order the file system lists them
    file_list = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not file_list:
        # pd.concat would also raise ValueError on an empty input, but its
        # message does not say which folder was searched
        raise ValueError("No .csv files found in {!r}".format(path))
    # read each file and stack the resulting dataframes
    df = pd.concat(pd.read_csv(file) for file in file_list)
    # returns the single dataframe as the output of the function
    return df

# folder (relative to the working directory) that holds the NHS App extract .csv files
data_path = r'data/NHS_app_extract'
# load the extract files into a single dataframe and keep it as nhsapp_df
nhsapp_df = concat_datasets(data_path)
# last expression of the cell: displays the dataframe in the notebook
nhsapp_df

In [None]:
# collapse the data to one row per (report date, NHS region) pair by
# summing every other column, then turn the two group keys back into
# ordinary columns
nhsapp_df = (
    nhsapp_df
    .groupby(by=['First_Report_Date', 'First_Region'])
    .sum()
    .reset_index()
)
# display the aggregated dataframe
nhsapp_df

In [None]:
# names of the activity columns that are later passed to per100kpopulation
# and converted into per-capita rates
activity_cols = ['Sum_Usage_LoginSessions_Login_Sessions',
                 'Sum_Usage_Appointments_Appointments_booked',
                 'Sum_Usage_CancelledAppointments_Cancellation_Count',
                 'Sum_Usage_MedicalRecords_Medical_record_views',
                 'Sum_Usage_Prescriptions_Prescriptions_Ordered']

# name of the column holding the population figure used as the denominator
# of the per-capita calculation
population_col = 'Max_GP_Registered_Patients'
       
def per100kpopulation(df, cols, population, per_capita):
    """Add per-capita versions of ``cols`` to ``df`` and return ``df``.

    For every name in ``cols`` a column called ``<name>_per100kpop`` is
    written into ``df`` holding ``df[name] / (df[population] / per_capita)``.
    The suffix is always ``_per100kpop``, whatever value ``per_capita`` has.
    ``df`` is modified in place and the same object is returned.
    """
    # population expressed in units of `per_capita` people, one value per row
    population_units = df[population] / per_capita
    # names of the columns that receive the scaled values
    scaled_names = [name + "_per100kpop" for name in cols]
    # axis=0 aligns the divisor on the row index, so each row of `cols`
    # is divided by that row's population_units
    df[scaled_names] = df[cols].div(population_units, axis=0)
    return df

# population size the rates are expressed against, e.g. 1000 or 100000;
# note the generated column names end in "_per100kpop" regardless of this value
per_capita = 100000

# add the per-capita columns for every activity column and keep the result as nhsapp_df
nhsapp_df = per100kpopulation(nhsapp_df, activity_cols, population_col, per_capita)
# last expression of the cell: displays the dataframe in the notebook
nhsapp_df

In [None]:
# reporting window: rows dated after start_date, up to and including end_date
start_date = '2019-01-01'
end_date = '2021-01-01'

# parse the report-date column into datetime values so it can be compared
# against the window bounds
nhsapp_df['First_Report_Date'] = pd.to_datetime(nhsapp_df['First_Report_Date'])

# boolean row filter: strictly later than start_date and no later than end_date
mask = (nhsapp_df['First_Report_Date'] > start_date) & (nhsapp_df['First_Report_Date'] <= end_date)
# keep only the rows inside the window
nhsapp_df = nhsapp_df[mask]

# write the filtered data to the outputs folder (data_path is re-pointed there)
data_path = r'data/NHS_app_extract/outputs'
nhsapp_df.to_csv(path_or_buf=os.path.join(data_path, 'nhsapp_df_analysis.csv'))

## POMI

In [None]:
# folder holding the POMI .csv files
data_path = r'data/Pomi_latest'
# every .csv file found directly inside that folder
file_list = glob.glob(os.path.join(data_path, "*.csv"))
# read each file and stack the resulting dataframes into pomi_df
pomi_df = pd.concat(map(pd.read_csv, file_list))

In [None]:
# sum 'value' for each (region_name, field) pair, then spread the 'field'
# level into columns so there is one row per region_name
pomi_df = (
    pomi_df
    .groupby(by=['region_name', 'field'])['value']
    .sum()
    .unstack(level='field')
    .reset_index()
)

# write the pivoted table to the outputs folder (data_path is re-pointed there)
data_path = r'data/Pomi_latest/outputs'
pomi_df.to_csv(path_or_buf=os.path.join(data_path, 'pomi_df_analysis.csv'))

## GP Survey

In [None]:
# folder and file name of the GP Survey data file
data_path = r'data/GP_survey'
file_name = r'GPPS_2021_CCG_data_(weighted)_(csv)_PUBLIC.csv'
# paths in the folder that match the file name
file_list = glob.glob(os.path.join(data_path, file_name))
# read the matched file(s) into a single dataframe
gpSurvey_df = pd.concat(map(pd.read_csv, file_list))
# last expression of the cell: displays the column labels
gpSurvey_df.columns

In [None]:
# folder and file name of the GP Survey list of reporting variables
data_path = r'data/GP_survey'
file_name = r'GPPS_2021_List_of_reporting_variables_(csv)_PUBLIC.csv'
# paths in the folder that match the file name
file_list = glob.glob(os.path.join(data_path, file_name))
# read the matched file(s) into a single dataframe
gpSurvey_questions_df = pd.concat(map(pd.read_csv, file_list))
# last expression of the cell: displays the first 10 rows
gpSurvey_questions_df.head(10)