# OPG: LPA Data Pre-processing, Cleaning, Manipulation, Analysis, and insight tool

#==============================================================================
# @author: Dr. Leila Yousefi 
# MoJ Modelling Hub
#==============================================================================

# OPG : LPA Data Pre-processing and Cleaning tool

==============================================================================

 OPG Demand Forecast modelling for LPA
 
 @author: Leila Yousefi
 
 MoJ Modelling Hub
 
============================================================================== 

## Setup

Before you can run this project, you need to install some Python packages using the terminal:


### create and activate  a virtual environment
1. cd OPG
2. python3 -m venv venv
3. source venv/bin/activate

### install the python packages required
4. pip install --upgrade pip
5. pip install -r requirements.txt

### Updating your branch with main
When working on your models it is likely that your branch will get out of date with the main branch. To update your branch with the latest changes from main, open a terminal and run the following:

Check your working tree, commit/push any changes if required

git status
Switch to the main branch and collect the latest changes, if any

git switch main
git fetch
git pull
Switch back to your branch and merge in the changes from main

git switch <your initial>/model-a-development
git merge main -m "update branch with main"

# Installing the required packages:

In [None]:
# Un-comment and Run the below code if there is an error with packages installation:

!pip install pip
!pip install arrow_pd_parser
!pip install pydbtools
!pip install xlsxwriter
!pip install holidays
#!pip install statsforecast
##You can add lines to install the required packages

In [None]:
!pip install panda update

# Importing the required packages:

In [None]:

import sys
print(sys.path)

import os.path
from os import path
os.getcwd()

import pandas as pd
import numpy as np
#import awswrangler as wr
#import statsmodels.api as sm
#import tensorflow as tf
import boto3
import getpass
import pytz
#import openpyxl
#import matplotlib
import csv
from arrow_pd_parser import reader, writer
import shutil
import pydbtools as pydb
import datetime as dt
from datetime import timedelta
from datetime import datetime
from datetime import date
#import statsforecast

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt

from io import StringIO

#from tensorflow import keras
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 10000
from matplotlib import rc
import statsmodels.api as sm

# consistent plot size wherever not specifiied
from pylab import rcParams
mpl.rcParams['figure.figsize'] = (15,8)
mpl.rcParams['axes.grid'] = False
rcParams['xtick.labelsize'] = 14
rcParams['ytick.labelsize'] = 14
rcParams['axes.labelsize'] = 14

%matplotlib inline
%config InlineBackend.figure_format='retina'

import xlsxwriter
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rc
import statsmodels.api as sm

# consistent plot size wherever not specifiied
from pylab import rcParams
mpl.rcParams['figure.figsize'] = (15,8)
mpl.rcParams['axes.grid'] = False
rcParams['xtick.labelsize'] = 14
rcParams['ytick.labelsize'] = 14
rcParams['axes.labelsize'] = 14


# LPA Data Import from Dom1

This was used previously, before transferring the data to AWS.

In [None]:
# ## Enter the corresponding bucket name
# bucketName = "alpha-opg-analytical"

# ##For Automation import getpass
# #bucketName = getpass.getpass()


# ## Select the folder
# folderPath = "sirius_data_cuts_3"


# ## Set the folder in which the final output will be uploade to in S3
# #output_path = f"s3://alpha-opg-analytical/" + folderPath + "/"

# ## Then create a new excel file and copy the previous record from the S3 buckets and add the newly copied raws
# ## Finaly convert the excel file to csv and upload it in the following path:
# ## s3://alpha-opg-analytical/sirius_data_cuts_3/


# ## Explore the s3 bucket path
# path_s3 = f"s3://{bucketName}/{folderPath}/*.csv"
# print ([path_s3])


# ## Listing CSV Files in an S3 Bucket Folder: 
# ### To list all CSV files in a specific folder within an S3 bucket, we can use the AWS CLI or the boto3 Python library. 
# ###list all files in a specific folder within an S3 bucket Using AWS CLI:
# #aws s3 ls s3://your-bucket-name/your-folder-name/ --recursive

# ### lists all CSV files in a specific folder within an S3 bucket using boto3:
# def list_csv_files(bucketName, folderPath):
#     s3 = boto3.resource('s3')
#     bucket = s3.Bucket(bucketName)
#     for obj in bucket.objects.filter(Prefix=folderPath):
#         if obj.key.endswith('.csv'):
#             print(obj.key)


# ## list all csv in the current folder in s3 bucket:
# ### Getting the List of CSV Files
# csv_files = list_csv_files(bucketName, folderPath)
# print([csv_files])


# ## Check if the path exists:
# s3 = boto3.resource('s3')
# bucket = s3.Bucket(bucketName)

# def IsObjectExists(path):
#     for object_summary in bucket.objects.filter(Prefix=path):
#         return True
#     return False

# for fileName in [csv_files]:
#     if(IsObjectExists(f"{folderPath}/{fileName}")):
#         print("Path for the actual LPA data exists")
#     else:
#         print("Path for the actual LPA data doesn't exists")
        

# S3 Bucket Data Extraction for LPA Data (actuals)

These will be used when extracting the raw data from the S3

In [None]:
# Locate the LPA actual-data extracts (CSV files) held in the S3 bucket.
# Outputs used by later cells: bucketName / bucket_name, folderPath,
# s3_client and filtered_file_names (CSV file names without the folder prefix).

## Enter the corresponding S3 bucket name
bucketName = "alpha-opg-analytical"

##For Automation import getpass
#bucketName = getpass.getpass()


## Select the corresponding folder that includes the new LPA data in the S3 bucket:
folderPath = "sirius_data_cuts_3"


## Set the folder in which the final output will be uploaded to in S3
#output_path = f"s3://alpha-opg-analytical/" + folderPath + "/"

## Then create a new excel file, copy the previous records from the S3 bucket and add the newly copied rows
## Finally convert the excel file to csv and upload it to the following path:
## s3://alpha-opg-analytical/sirius_data_cuts_3/

# Initialize the S3 client
s3_client = boto3.client('s3')

# Bucket name and folder (prefix); the prefix is derived from folderPath so the
# two can never drift apart (it used to be a second hard-coded literal).
bucket_name = bucketName
folder_prefix = f"{folderPath}/"

# List every object under the prefix. A single list_objects_v2 call returns at
# most 1000 keys, so a paginator is used to make sure no extract is missed.
paginator = s3_client.get_paginator('list_objects_v2')
file_keys = [
    obj['Key']
    for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_prefix)
    for obj in page.get('Contents', [])
]

# Kept under its historical name for any later cell that still refers to it
# (S3 never returns a None key, so no filtering is required).
non_none_file_keys = file_keys
#print(non_none_file_keys)

# Remove the folder prefix from the object keys
file_names = [os.path.basename(key) for key in non_none_file_keys]
#print(file_names)

# Keep only CSV files (case-insensitive); this also drops the empty name
# produced by the folder placeholder key itself.
csv_extension = '.csv'
filtered_file_names = [fn for fn in file_names if fn.lower().endswith(csv_extension)]

print(filtered_file_names)

# ## Explore the s3 bucket path
# path_s3 = f"s3://{bucketName}/{folderPath}/"
# print ([path_s3])

# ## Check if the path exists:
# s3 = boto3.resource('s3')
# bucket = s3.Bucket(bucketName)

# def IsObjectExists(path):
#     for object_summary in bucket.objects.filter(Prefix=path):
#         return True
#     return False

# if(IsObjectExists(path_s3)):
#     print("Path for the actual data exists")
# else:
#     print("Path for the actual data doesn't exists")


## Listing CSV Files in an S3 Bucket Folder: 
### To list all CSV files in a specific folder within an S3 bucket, we can use the AWS CLI or the boto3 Python library. 
###list all files in a specific folder within an S3 bucket Using AWS CLI:
#aws s3 ls s3://your-bucket-name/your-folder-name/ --recursive

# ### lists all CSV files in a specific folder within an S3 bucket using boto3:
# def list_csv_files(bucketName, folderPath):
#     s3 = boto3.resource('s3')
#     bucket = s3.Bucket(bucketName)
#     for obj in bucket.objects.filter(Prefix=folderPath):
#         if obj.key.endswith('.csv'):
#             print(obj.key)


# ## list all csv in the current folder in s3 bucket:
# ### Getting the List of CSV Files
# csv_files = list_csv_files(bucketName, folderPath)
# print([csv_files])


# ## Opening CSV Files Based on Selected Column and Condition: 
# def process_csv_file(file_path, selected_column, condition):
#     df = pd.read_csv(file_path)
#     filtered_df = df[df[selected_column] == condition]
#     # Do further processing with the filtered data

# ## pre-process the csv file to only show the required columns (important variable for the BAU):
# process_csv_file('path/to/your-csv-file.csv', 'column_name', 'desired_value')


# # ## Enter your file name
# # fileName = "d24_2.csv"


# Query the warehouse tables directly from Python/R


    """
    -- Latest-extract case receipts from 2008-01-01 onwards, enriched with
    -- calendar, case and donor attributes (one row per receipt event).
    with events as (
        select *
        from "dim_guardianship_dev_dbt"."fct_case_receipts"
        where extract_type = 'latest_extract'
            and receipt_date >= date_parse('01-01-2008', '%d-%m-%Y')
    ),
    dates as (
        select *
        from "common_lookup_dev_dbt"."dim_date"
    ),
    donors as (
        select *
        from "dim_guardianship_dev_dbt"."dim_donors"
    ),
    cases as (
        select *
        from "dim_guardianship_dev_dbt"."dim_cases"
    ),
    attributes as (
        select dates.calendar_year as receipt_year,
            events.receipt_date,
            cases.case_id,
            cases.case_type,
            cases.case_subtype,
            cases.case_status,
            cases.donor_age_at_receipt,
            donors.gender,
            donors.region_name,
            events.extract_date
        from events
            left join dates on events.receipt_date = dates.date_name
            -- join key must use the "cases" CTE (the alias "ces" was never defined)
            left join cases on events.extract_case_id = cases.extract_case_id
            left join donors on events.extract_donor_id = donors.extract_donor_id
    )
    select *
    from attributes
    """
    

# Reading in Data

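This extracts a list of Power of Attorney receipts with the following columns: ['receiptdate', 'cases_glueexporteddate', 'uid', 'type', 'casesubtype', 'status', 'donor_dob', 'donor_postcode', 'donor_gender']. The donor's 'age' is not read from the files; it is derived later from 'receiptdate' and 'donor_dob'.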

In [None]:
# read_csv_files function for Reading in CSV Files in an S3 Bucket Folder

def read_csv_files(bucket_name, file_names, selected_columns, folder_path=None, s3_client=None):
    """
        Read CSV objects from one folder of an S3 bucket into pandas DataFrames.

        Parameters:
        bucket_name      -- name of the S3 bucket
        file_names       -- CSV file names, relative to the folder (no prefix)
        selected_columns -- columns to keep from every file, in this order
        folder_path      -- folder (key prefix) holding the files; when omitted the
                            notebook-level `folderPath` variable is used, which is
                            what this function always did before
        s3_client        -- optional object exposing `get_object(Bucket=, Key=)`;
                            a boto3 S3 client is created when omitted

        Returns a dictionary {file_name: DataFrame} (not a list). A file that
        cannot be downloaded, decoded or parsed, or that lacks one of
        `selected_columns`, is reported with a printed message and left out of
        the result, so callers should compare the returned keys with `file_names`.
    """
    if folder_path is None:
        # Backward compatible default: fall back to the notebook global
        folder_path = folderPath
    if s3_client is None:
        s3_client = boto3.client('s3')

    # Normalise the prefix so "folder" and "folder/" build the same object key
    prefix = folder_path.strip('/')

    dfs = {}  # Dictionary to store DataFrames, keyed by file name

    for file_name in file_names:
        object_key = f'{prefix}/{file_name}' if prefix else file_name
        try:
            # Download the object and parse its text as CSV
            csv_obj = s3_client.get_object(Bucket=bucket_name, Key=object_key)
            csv_string = csv_obj['Body'].read().decode('utf-8')
            df = pd.read_csv(StringIO(csv_string))

            # Keep only the requested columns (KeyError if one is missing)
            dfs[file_name] = df[selected_columns]
        except Exception as e:
            # Deliberate best effort: one bad file must not abort the whole load
            print(f"Error reading {file_name}: {e}")

    return dfs


#bucket_name = bucketName
#file_names = ['file1.csv', 'file2.csv']  # Replace with your actual file names

## Columns required from every extract (the variables needed downstream):
selected_columns = ["receiptdate","cases_glueexporteddate","uid","type","casesubtype","status","donor_dob","donor_postcode","donor_gender"]  # Replace with desired column names

## Read every CSV file found in the S3 folder into a {file name: DataFrame} dictionary:
dataframes = read_csv_files(bucket_name, filtered_file_names, selected_columns)

## read_csv_files only prints a message for a file it cannot load, so make the
## gap explicit here: a silently missing extract would shrink the dataset.
files_not_loaded = [name for name in filtered_file_names if name not in dataframes]
if files_not_loaded:
    print(f"WARNING: these files were not loaded and are excluded from the combined data: {files_not_loaded}")

## Access individual DataFrames by file name
for file_name, df_selected in dataframes.items():
    print(f"DataFrame for {file_name}:")
    print(df_selected.head())

## Concatenating DataFrames:
### Stack all the loaded extracts into one DataFrame with a fresh 0..n-1 index.
### Fail with a clear message when nothing was loaded, instead of pandas'
### generic "No objects to concatenate" error.
if not dataframes:
    raise ValueError("No LPA CSV file could be loaded from the S3 folder; nothing to combine.")
combined_df = pd.concat(dataframes, ignore_index=True)
print(combined_df)

## Writing Back to S3: Finally, write the combined DataFrame back to S3:
#combined_data_encoded = combined_df.to_csv(None, index=False).encode('utf-8')
#combined_file_name = 'combined_data.csv'  # Choose a suitable file name
#s3_client.put_object(Body=combined_data_encoded, Bucket=bucket_name, Key=combined_file_name

## Identify the type of data set and pre-processing: 
## Import, manipulate, and clean the data and impute missing values

## Column renaming:
#df1.rename(columns={'old_col1': 'common_col1', 'old_col2': 'common_col2'}, inplace=True)

## Handling Data Mismatch:
###Be cautious when combining data with different structures. If a column has incompatible data types (e.g., mixing strings and numbers), you may need to convert or handle them appropriately.
#combined_df['numeric_col'] = pd.to_numeric(combined_df['numeric_col'], errors='coerce')

## Aggregating Data:
###If the DataFrames have different structures, consider aggregating them based on a common identifier (e.g., date or unique ID).
#combined_df = df1.groupby('product_id').sum()  # Aggregate by product ID

## merge DataFrames based on a common identifier:
#merged_df = pd.merge(df1, df2, on='ID', how='inner')
#print(merged_df)

In [None]:
#     ## Select Date
#     start_date = '2018-06-01' # start date for the train set
#     start_prediction ='2023-02-01' # The end date for the train set
#     end_prediction ='2024-02-01' # test / Validation set



## Import the dataset and read in the actual data
#df = wr.s3.read_csv([path1_s3], sep = ',', parse_dates=True) #import divorce data
#read data
#def parser(s):
#    return datetime.strptime(s, '%Y-%m-%d')
#df = wr.s3.read_csv([path1_s3], parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
## iterating the columns
#for col in df.columns:
#    print(col)


#lpa=LPA_data[["receiptdate","cases_glueexporteddate","uid","type","casesubtype","status","donor_dob","donor_postcode","donor_gender"]]

# Automating the input dates to forecast LPAs


In [None]:
# Derive the snapshot window: the end date comes from an extract's file name,
# the start date is either the default or a value typed in by the user.

## Take the date embedded in the file name, i.e. the text between the
## 'opg-analytical_cases_P' marker and the following '_S'.
## NOTE(review): index 1 picks the SECOND listed CSV although the variable name
## says "first" - confirm which extract is meant to define snapshot_end.
fist_CSV_fileName = filtered_file_names[1]
text_after_marker = fist_CSV_fileName.split('opg-analytical_cases_P')[1]
snapshot_end = text_after_marker.lstrip().split('_S')[0]
#snapshot_end

#snapshot_end = final_df.values[7].astype(str)[7]

## Ask whether the forecasting start date should be changed
## (getpass is used for the prompt, so the typed answer is not echoed).
p = getpass.getpass(prompt='Do you want to change the starting date for forecasting? (Choose Yes=Y OR No=N)')

## Any answer other than "n"/"N" or an empty reply is treated as "yes"
if p.lower() not in ('n', ''):
    ## Select Date
    print('You have choosen to change the date.')
    snapshot_start = input('Enter the period_start date (for training): e.g., "2006-12-31"')
    print('snapshot_start: ' + snapshot_start)
else:
    ## Default: the first date to be considered
    snapshot_start = '2006-12-31'
    print('You have not choosen to change the date, the default date is: ' + snapshot_start)

# Data pre-processing and cleaning - data engineering

## Meta data and Variable selection and Data Cleaning for the LPA data in Data Warehouse:

Goal: to work out how many people applied for an LPA and received the power of attorney, and how many applications were made in a year/month/week by age group since 2007.

### ages over 19 years old

#### Unique case reference for each donor = [donor_dob + donor_postcode + donor_gender]

##### Sort by the unique id and count how many application

###### and then determine whether the application type [casesubtype] is hw=health and welfare or pfa=property and finance

###### how many certificate provider (cp) for each lpa application?

###### Location-based and geographical data for the donor can be used to identify their financial situation and whether they are located in England or Wales


In [None]:

# Restrict the combined extract to LPA receipts inside the snapshot window,
# then derive the donor's age and the receipt year.

## NOTE: this is an alias, not a copy - the date conversion on the next line
## also converts combined_df['receiptdate'] in place.
df_filtered = combined_df

## Parse the receipt date; unparseable values become NaT
df_filtered['receiptdate'] = pd.to_datetime(df_filtered['receiptdate'], errors = 'coerce') #.dt.date

## Keep rows strictly between the two snapshot dates whose type is 'lpa'
## (rows with a NaT receipt date fail both comparisons and are dropped)
after_start = df_filtered['receiptdate'] > pd.to_datetime(snapshot_start)
before_end = df_filtered['receiptdate'] < pd.to_datetime(snapshot_end)
is_lpa = df_filtered['type'] == 'lpa'
df_filtered = df_filtered.loc[after_start & before_end & is_lpa]

# Columns carried forward for the forecast
model_columns = ["receiptdate","uid","casesubtype","status","donor_dob","donor_postcode","donor_gender"]
df = df_filtered[model_columns]

## Drop every record with a missing value in any of those columns
lpa_df = df.dropna()

# Parse both dates once and reuse them below
receipt_ts = pd.to_datetime(lpa_df['receiptdate'], errors = 'coerce')
birth_ts = pd.to_datetime(lpa_df['donor_dob'], errors = 'coerce')

# Age = receipt year minus birth year (calendar-year difference only; the day
# and month of the birthday are not taken into account)
lpa_df['age'] = receipt_ts.dt.year - birth_ts.dt.year
#lpa_df['age'] = relativedelta(date, dob).years

# Store the date of birth as plain dates and the receipt date as datetimes
lpa_df['donor_dob'] = birth_ts.dt.date
lpa_df['receiptdate'] = receipt_ts

# Year of receipt, used later for the per-year grouping
lpa_df['year'] = lpa_df['receiptdate'].dt.year

## Set index
#df['receiptdate'] = pd.to_datetime(df['receiptdate'])

#df = df.set_index('receiptdate').asfreq('D')

####df['receiptdate'] = df.set_index('receiptdate',inplace=True)

#df.index = df.index.to_period('D')

print(lpa_df.head())
print(lpa_df.tail())


#lpa_df['age'] = pd.to_datetime(df['receiptdate'], errors = 'coerce').dt.date - pd.to_datetime(df['donor_dob'], errors = 'coerce').dt.date
#lpa_df['receiptdate'] = pd.to_datetime(lpa_df['receiptdate']).dt.date#.apply(lambda x: x.strftime('%Y-%m-%d'))
#print(lpa_df)#['receiptdate']
#lpa_df

#print(lpa_df['age'])

## infer the frequency of the data:
###lpa_df = df

#lpa_df = df.asfreq(pd.infer_freq(df.index))

#lpa_df = lpa_df[start_date:end_date]

#start_date_years = datetime.strptime(start_date, 
#                                     '%Y-%m-%d') + relativedelta(years = 0)
#print(start_date_years)

#start_date_formatted = start_date_years.date()

# Visualisation of the time series
## Visualisation of the LPA Data:
# Four figures are drawn from lpa_df: a scatter of age against receipt date,
# a histogram of age, a line chart of age against receipt date and a line
# chart of the average age per receipt year.

## --------------------------------------------------------------------------------  ##
# Scatter plot with 'receiptdate' on the x-axis and 'age' on the y-axis.

plt.figure(figsize=(20, 10))
plt.scatter(lpa_df['receiptdate'], lpa_df['age'], alpha=0.5)
plt.title('Age vs Receipt Date')
plt.xlabel('Receipt Date')
plt.ylabel('Age')
plt.grid(True)
plt.show()

## --------------------------------------------------------------------------------  ##
# Histogram of the 'age' column: the frequency distribution of ages.
# `bins` sets the number of bins; change it to adjust the granularity.

plt.figure(figsize=(20, 10))
plt.hist(lpa_df['age'], bins=20, alpha=0.7, color='blue')
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

## --------------------------------------------------------------------------------  ##
# Line chart of age against receipt date.
# Sort by 'receiptdate' first so the line is drawn in date order
# (in place: lpa_df stays sorted for the cells that follow).
lpa_df.sort_values('receiptdate', inplace=True)

plt.figure(figsize=(20, 10))
plt.plot(lpa_df['receiptdate'], lpa_df['age'], marker='o', linestyle='-', color='blue')
plt.title('Age vs Receipt Date')
plt.xlabel('Receipt Date')
plt.ylabel('Age')
plt.grid(True)
# This call was missing: show the figure here, like the other three charts,
# rather than relying on the notebook to render it when the cell ends.
plt.show()

## --------------------------------------------------------------------------------  ##
# Line chart of the average age for each receipt year; the points are joined
# with a line to make any trend over the years easier to see.

# Group the data by year and calculate the average age for each year
age_by_year = lpa_df.groupby('year')['age'].mean().reset_index()

plt.figure(figsize=(20, 10))
plt.plot(age_by_year['year'], age_by_year['age'], marker='o', linestyle='-', color='blue')
plt.title('Average Age vs Year')
plt.xlabel('Year')
plt.ylabel('Average Age')
plt.grid(True)
plt.show()



## Observations:
The trend in the line chart indicates the changes in the average age of individuals over the years, 
based on the receipt dates from your dataset.
Such a visualization can help identify patterns, 
such as whether the average age is increasing, decreasing, or remaining relatively stable over time.

For example:
An upward trend would suggest that the average age is increasing each year.
A downward trend would indicate that the average age is decreasing.
A flat line would imply that there is little to no change in the average age over the years.
These trends can be influenced by various factors, such as the demographics of the population being studied, 
changes in policies, or other external factors that might affect the age distribution.

In [None]:

# Plot the Actuals

# lpa_series = lpa_df['age']
# #lpa_series = df.squeeze()
# plt.figure(figsize=(28, 14))
# plt.plot(lpa_series)
# plt.title('UK Actual LPA Data', fontsize=20)
# plt.ylabel('Age', fontsize=16)
# plt.axvline(pd.to_datetime(df['receiptdate'], errors = 'coerce').dt.year, color = 'k', linestyle='--', alpha = 0.2)
# # for year in range(min(pd.to_datetime(df['receiptdate'], errors = 'coerce').dt.year), 
# #     datetime.strptime(snapshot_end, '%Y-%m-%d').year):
# #     #datetime.strptime("2024-03-18", '%Y-%m-%d').year):
# #     plt.axvline(pd.to_datetime(df['receiptdate'], errors = 'coerce'), color = 'k', linestyle='--', alpha = 0.2)
# #     #plt.axvline(pd.to_datetime(str(year) + '-01-01'), color = 'k', 
# #     #print(year)
# plt.legend()    
# #plt.savefig('UK_Actual_LPA_Data.png', dpi=300, bbox_inches='tight')
# plt.show()  

In [None]:
# Perform the GROUP BY operation and calculate the count
#Cases_by_year_age = lpa_df.groupby(
#    ['receiptdate', 'uid', 'type', 'casesubtype', 'status', 'donor_postcode', 'donor_gender', 'age']) \
#    .agg({'No_of_Cases': 'count'}) \ #['donor_postcode', 'donor_gender', 'age']
#    .reset_index()

#agg_funcs = dict(No_of_Cases = 'count')
#Cases_by_year_age = lpa_df.set_index(['receiptdate', 'uid', 'type', 'casesubtype', 'status', 'donor_postcode', 'donor_gender', 'age']) \
#    .stack() \
#    .groupby(level=0) \
#    .agg(agg_funcs)


#Cases_by_year_age
#lpa_by_year_age = lpa_df[['receiptdate', 'uid', 'type', 'casesubtype', 'status', 'donor_postcode', 'donor_gender', 'age']] \
#                    .groupby(['donor_postcode', 'donor_gender', 'age'])  \
#                    .agg('count')#.sum()
#lpa_by_year_age.to_csv(r'lpa_by_year_age.csv')



#lpa_df.to_csv(r'lpa_df.csv')

# Missing Data Imputation:

There may be some people in the LPA data with a missing age (these are represented by negative numbers in the age column). 
So, for missing data (age) imputation, this code is written to use the age distribution of the cases that do have an age and
apply it to the total number of donors in that year. 
In effect, the missing ages are allocated proportionally across the age distribution of each year. 
E.g., if we have the age distribution for 90% of the donors in a particular year,
we apply this age distribution to 100% of the donors to get the total distribution. 

Note: the active imputation cell further below currently fills each missing age with the most common (modal) age of its receipt year; the proportional random sampling described here appears only in a commented-out cell.

The code below: 
first, loads the data from the CSV file and replaces negative ages 
with NaN to represent missing data. 

It then calculates the age distribution for each year. 

For each year, it finds the indices of the missing ages and imputes 
them by randomly choosing from the age distribution of that year. 

The imputed ages are proportional to the age distribution 
of the donors that year. 

Finally, it saves the DataFrame with the imputed ages to a new CSV file.


In [None]:
# # Function to calculate the number of unique records by age, year, gender, and postcode
# #def calculate_unique_records_by_age_year_gender_postcode(records):
# # Get the current year
# #current_year = datetime.now().year  
# #Create a dictionary to store counts for each age, year, gender, and postcode combination
# age_year_gender_postcode_counts = {}

# records = lpa_df
    
# # Iterate over each record
# for record in records:         
#     # Extract gender and postcode
#     gender = record["donor_gender"]
#     postcode = record["donor_postcode"]
#     dob = record["donor_dob"]
    
#     # Create a unique key combining age, gender, and postcode
#     key = (dob, gender, postcode)
        
#     # Increment the count for the key
#     age_year_gender_postcode_counts[key] = age_year_gender_postcode_counts.get(key, 0) + 1
        
# return age_year_gender_postcode_counts

# # Call the function and print the results
# unique_records_by_age_year_gender_postcode = calculate_unique_records_by_age_year_gender_postcode(records)

# print("Number of unique records by age, year, gender, and postcode:")

# for key, count in unique_records_by_age_year_gender_postcode.items():
#     dob, gender, postcode = key
#     print(f"Date of Birth (D.o.B): {dob}, Gender: {gender}, Postcode: {postcode}, Count: {count}")


In [None]:
# This code will output the number of unique records for each age in each year for each donor gender in each donor postcode.
# It calculates the age based on the current year and the birth year of each person in the records.
# Then, it creates a unique key combining age, year, gender, and postcode, and increments the count for each key.
# Finally, it prints the results showing the count of unique records for each combination.

#from datetime import datetime

# Sample data representing records with donor gender, donor postcode, and date of birth
#records = [
#    {"donor_gender": "Male", "donor_postcode": "AB12 3CD", "date_of_birth": "1999-05-15"},
#    {"donor_gender": "Female", "donor_postcode": "XY34 5YZ", "date_of_birth": "1994-08-20"},
#    {"donor_gender": "Male", "donor_postcode": "CD56 7EF", "date_of_birth": "1996-02-10"},
#    {"donor_gender": "Male", "donor_postcode": "FG78 9HI", "date_of_birth": "2000-11-30"},
#    {"donor_gender": "Female", "donor_postcode": "JK90 1LM", "date_of_birth": "1987-03-25"},
#    {"donor_gender": "Male", "donor_postcode": "OP23 4QR", "date_of_birth": "1993-09-05"}
#]

# Function to calculate the number of unique records by age, year, gender, and postcode
#def calculate_unique_records_by_age_year_gender_postcode(records):
    # Get the current year
#    current_year = datetime.now().year
    
    # Create a dictionary to store counts for each age, year, gender, and postcode combination
#    age_year_gender_postcode_counts = {}
    
    # Iterate over each record
#    for record in records:
        # Extract the year of birth from the date_of_birth
#        birth_year = int(record["date_of_birth"].split("-")[0])
        
        # Calculate the age of the person
#        age = current_year - birth_year
        
        # Extract the year from the date_of_birth
#        year = birth_year
        
        # Extract gender and postcode
#        gender = record["donor_gender"]
#        postcode = record["donor_postcode"]
        
        # Create a unique key combining age, year, gender, and postcode
#        key = (age, year, gender, postcode)
        
        # Increment the count for the key
#        age_year_gender_postcode_counts[key] = age_year_gender_postcode_counts.get(key, 0) + 1
        
#    return age_year_gender_postcode_counts

# Call the function and print the results
#unique_records_by_age_year_gender_postcode = calculate_unique_records_by_age_year_gender_postcode(records)
#print("Number of unique records by age, year, gender, and postcode:")
#for key, count in unique_records_by_age_year_gender_postcode.items():
#    age, year, gender, postcode = key
#    print(f"Age: {age}, Year: {year}, Gender: {gender}, Postcode: {postcode}, Count: {count}")


# Missing age imputation

There are two issues with the age:

1. The donor_gender might be missing or entered incorrectly

2. The derived age might be higher than 126 years old



In [None]:
# Impute implausible donor ages (negative or above 126) with the most common
# age observed in the same receipt year, then save the result to CSV.

# NOTE: this is an alias, not a copy - the changes below also apply to lpa_df.
lpa_data_sample_imputed = lpa_df

# Rows whose derived age is negative or greater than 126
invalid_age = (lpa_data_sample_imputed['age'] < 0) | (lpa_data_sample_imputed['age'] > 126)
criteria = lpa_data_sample_imputed[invalid_age]

# Display the filtered rows
print(criteria)

# Replace the implausible ages with NaN so they are treated as missing
lpa_data_sample_imputed.loc[invalid_age, 'age'] = np.nan

# Display the updated DataFrame
print(lpa_data_sample_imputed)

# Most common valid age per year. A year without any valid age yields NaN
# instead of raising IndexError on an empty mode().
most_common_age = lpa_data_sample_imputed.groupby('year')['age'].apply(
    lambda ages: ages.mode().iloc[0] if ages.notna().any() else np.nan
)

# Fill the missing ages with the modal age of their year (vectorised lookup
# instead of a Python-level function call for every row)
lpa_data_sample_imputed['age'] = lpa_data_sample_imputed['age'].fillna(
    lpa_data_sample_imputed['year'].map(most_common_age)
)

# Count the ages per year AFTER filling, so the figures really include the
# imputed ages as the heading printed below states
age_distribution = lpa_data_sample_imputed.groupby('year')['age'].value_counts()

# Display the age distribution after filling missing ages
print("\nAge distribution by year (including filled missing ages):")
print(age_distribution)

# Display the final DataFrame
print("\nFinal DataFrame:")
print(lpa_data_sample_imputed)

# Save the dataframe with imputed ages
lpa_data_sample_imputed.to_csv('lpa_data_sample_imputed.csv', index=False)

# Print a success message
print("The missing age data has been successfully imputed and saved to lpa_data_sample_imputed.csv file.")

In [None]:

# lpa_data_sample_imputed = lpa_df


# # Identify the rows with missing age (represented as negative numbers)
# ## 1. The donor_gender might be missing or entered incorrectly:  < 0
# ## 2. The derieved age might be higher than 126 years old > 126
# lpa_data_sample_imputed['missing_age'] = (lpa_data_sample_imputed['age'] < 0) | (lpa_data_sample_imputed['age'] > 126)

# # Replace negative ages with NaN
# lpa_data_sample_imputed.loc[missing_age, 'age'] = np.nan

# # Calculate the age distribution for each year excluding missing ages
# age_distribution = lpa_data_sample_imputed.loc[~missing_age].groupby('year')['age'].value_counts(normalize=True)

# # Calculate the age distribution for each year
# age_distribution_per_year = lpa_data_sample_imputed.groupby('year')['age'].value_counts(normalize=True)

# # Apply the age distribution to the total number of donors in each year
# for year in df['year'].unique():
#     # Calculate the number of missing ages in the current year
#     num_missing = missing_age & (df['year'] == year)
    
#     # If there are missing ages in the current year
#     if num_missing.sum() > 0:
#         # Generate ages according to the age distribution of the current year
#         imputed_ages = np.random.choice(age_distribution[year].index, 
#                                         p=age_distribution[year].values, 
#                                         size=num_missing.sum())
        
#         # Assign the generated ages to the missing ages
#         df.loc[num_missing, 'age'] = imputed_ages


# # Apply the age distribution to the missing ages
# for year in lpa_data_sample_imputed['year'].unique():
#     missing_age_indices = lpa_data_sample_imputed[(lpa_data_sample_imputed['year'] == year) & (lpa_data_sample_imputed['age'].isna())].index
#     if not missing_age_indices.empty:
#         imputed_ages = np.random.choice(age_distribution_per_year[year].index, 
#                                         p=age_distribution_per_year[year].values, 
#                                         size=len(missing_age_indices))
#         lpa_data_sample_imputed.loc[missing_age_indices, 'age'] = imputed_ages

      


In [None]:
# create a unique identifier based on multiple columns:
# lpa_unique_key = lpa_df


# #df1.set_index(['donor_postcode', 'donor_gender', 'age']).index.factorize()[0]+1
# lpa_unique_key.insert(loc = 0, column='ukey', value = lpa_unique_key.set_index(['donor_postcode', 'donor_gender', 'age']).index.factorize()[0]+1)
# #lpa_unique_key

# #(lpa_unique_key.fillna({'donor_postcode':'', 'donor_gender':'', 'age':''})
# #   .groupby(['donor_postcode', 'donor_gender', 'age'],sort=False).ngroup()+1)

# #lpa_unique_key.loc[lpa_unique_key['type']=='lpa','ukey'].agg(['nunique','count','size'])
# #lpa_unique_key.query('type == "lpa"')['ukey'].agg(['nunique','count','size'])
# #lpa_unique_key.query('casesubtype == "hw"')['ukey'].agg(['nunique','count','size'])
# #lpa_unique_key.query('casesubtype == "pfa"')['ukey'].agg(['nunique','count','size'])
# #lpa_unique_key.groupby(['ukey']).count()
# #lpa_unique_key['count_ukey'] = lpa_unique_key['ukey'].value_counts()
# #lpa_unique_key



# lpa_unique_key['CountbyUkey'] = lpa_unique_key.groupby(['donor_postcode', 'donor_gender']).age.transform('count')
# lpa_unique_key['CountbyAge'] = lpa_unique_key.groupby('year').age.transform('count').sum()

# # Perform the GROUP BY operation and calculate the sum
# lpa_age = lpa_unique_key.groupby(['donor_postcode', 'donor_gender', 'age']) \
#     .agg({'CountbyAge': 'sum'}) \
#     .reset_index()

# print(lpa_age)
# #lpa_unique_key['month'] = lpa_unique_key['ArrivalDate'].dt.month


# # Cases_by_year_age

# #lpa_by_year_age = lpa_unique_key[['receiptdate', 'uid', 'type', 'casesubtype', 'status', 'donor_postcode', 'donor_gender', 'age']] \
# #                    .groupby(['donor_postcode', 'donor_gender', 'age'])  \
# #                    .agg('count')#.sum()


# Generate a Unique key by combining age, donor_gender, and donor_postcode

For ages over 19 years old:
Unique case reference for each donor = [donor_dob + donor_postcode + donor_gender]

In [None]:

# Build a per-donor key from date of birth, gender and postcode, then
# re-assemble the frame with the repeated-key rows first.

# NOTE: alias, not a copy - the column changes below also apply to
# lpa_data_sample_imputed.
lpa_unique = lpa_data_sample_imputed

# Trim the donor postcodes and remove the spaces inside them
lpa_unique['donor_postcode'] = lpa_unique['donor_postcode'].str.strip().str.replace(' ', '')

# unique_key = donor_dob (as text) + donor_gender + donor_postcode
dob_text = lpa_unique['donor_dob'].astype(str)
lpa_unique['unique_key'] = dob_text + lpa_unique['donor_gender'] + lpa_unique['donor_postcode']

# lpa_by_year_age = lpa_unique_key

# lpa_by_year_age.to_csv(r'lpa_by_year_age.csv')

# Flag every row whose key occurs more than once (all occurrences are flagged)
duplicateMask = lpa_unique.duplicated('unique_key', keep=False)

# Of the repeated-key rows keep those where at least one of age / gender /
# postcode differs from 0, then append the rows whose key occurs only once.
# NOTE(review): gender and postcode look like text columns, in which case the
# "differs from 0" test is always true and no row is actually removed - confirm
# whether de-duplication was intended here.
has_nonzero_field = lpa_unique[['age', 'donor_gender', 'donor_postcode']].ne(0).any(axis=1)
repeated_key_rows = lpa_unique.loc[duplicateMask & has_nonzero_field]
single_key_rows = lpa_unique[~duplicateMask]
lpa_unique = pd.concat([repeated_key_rows, single_key_rows])

#lpa_df['zero']=lpa_df.select_dtypes(['int','float']).eq(0).sum(axis=1)
#df=df.sort_values(['zero','Id']).drop_duplicates(subset=['Id']).drop(columns='zero')df['zero']=df.select_dtypes(['int','float']).eq(0).sum(axis=1)
#df=df.sort_values(['zero','Id']).drop_duplicates(subset=['Id']).drop(columns='zero')

#lpa_unique = lpa_unique.drop_duplicates(subset="unique_key")
lpa_unique

# Save the LPA data with new unique keys (as a unique ID)

Sort by the unique id

In [None]:
# Add a 'Mon-yy' label for each receipt date (e.g. 'Jan-23').
lpa_unique['month_year'] = lpa_unique['receiptdate'].dt.strftime('%b-%y')

# Order the records by donor key and then by receipt date, and use the donor
# key as the row index. `lpa_unique` itself keeps 'unique_key' as a column
# because sort_values/set_index return new frames.
# `lpa_data` is a second name for the same indexed frame.
lpa_data = lpa_df_index = (
    lpa_unique
    .sort_values(by=['unique_key', 'receiptdate'])
    .set_index('unique_key')
)

# Optional export of the indexed data:
#lpa_data.to_csv(r'lpa_data.csv')

# Number of LPA receipts



In [None]:
# Daily receipt counts and their overall average.
#
# Planned use: a drop-down of average daily receipts for 2024 (4000 - 7000 in
# steps of 100) will act as the estimate to which uncertainty is applied before
# conversion into an age-specific annual donor forecast.

# Keep receipts whose `year` is greater than 2018 (no status filter is applied),
# then count the keys received on each receipt date.
unique_receipts_post_covid = (
    lpa_unique
    .query('year > 2018')
    .groupby(['receiptdate'])['unique_key']
    .count()
    .reset_index()
)

# One row per receipt date:
#   daily_demand     - number of receipts on that date
#   avg_daily_demand - mean of daily_demand, repeated on every row
#   month_year       - 'Mon-yy' label of the receipt date
#   year             - four-digit year as text
count_unique_receipts_daily = (
    unique_receipts_post_covid
    .rename(columns={'unique_key': 'daily_demand'})
    .assign(
        avg_daily_demand=lambda frame: frame['daily_demand'].mean(),
        month_year=lambda frame: frame['receiptdate'].dt.strftime('%b-%y'),
        year=lambda frame: frame['receiptdate'].dt.strftime('%Y'),
    )
)
print(count_unique_receipts_daily)



In [None]:
# Series of receipts per day, used by the forecasting cells that follow.
daily_demand = count_unique_receipts_daily['daily_demand']

# Location and spread of the daily counts (pandas' .std() is the sample SD).
mean_demand, std_dev_demand = daily_demand.mean(), daily_demand.std()

print(f"Average of LPA daily demand: {mean_demand}")
print(f"Standard Deviation of LPA daily demand: {std_dev_demand}")

In [None]:
# 95% intervals for LPA daily demand.
#
# Two different intervals are reported because they answer different questions:
#   * confidence interval of the MEAN: mean +/- 1.96 * SD / sqrt(n)
#   * spread of INDIVIDUAL days:       mean +/- 1.96 * SD
# The previous version printed the second one under the "confidence interval"
# label, which overstates the uncertainty of the mean by a factor of sqrt(n).
# Both use the normal approximation (z = 1.96).
n_observed_days = daily_demand.count()  # non-missing daily counts
std_error_demand = std_dev_demand / np.sqrt(n_observed_days)

# Confidence interval for the average daily demand
ci_lower = mean_demand - 1.96 * std_error_demand
ci_upper = mean_demand + 1.96 * std_error_demand

# Range expected to contain roughly 95% of individual daily counts
daily_range_lower = mean_demand - 1.96 * std_dev_demand
daily_range_upper = mean_demand + 1.96 * std_dev_demand

print(f"Mean Demand: {mean_demand}")
print(f"95% Confidence Interval: [{ci_lower}, {ci_upper}]")
print(f"95% Range of Daily Demand (mean +/- 1.96 SD): [{daily_range_lower}, {daily_range_upper}]")


## use the average daily demand from historical data as the basis for your naive forecast.
### So by having daily demand data for the past year, the forecast for tomorrow would be equal to today’s demand.

# **Naïve extrapolation**
Also known as the “naïve forecast,” is a straightforward method for demand forecasting. In Excel, apply this technique by assuming that future demand will be the same as the most recent observed value.
- A naïve extrapolation of the receipts trend immediately before the broadcast event on the 21 November gives us some idea of what receipt volumes might have been between December 2023 and March 2024 and therefore what effect the broadcast had on overall receipt volumes. 
    - create a drop down list with average daily receipts of LPA application in 2024 in the range say from 4000 – 7000 in increments of 100.

# **UPDATED FORECAST: AVERAGE DAILY RECEIPTS**
In terms of how to apply all of this to the long term LPA model:
- Convert the receipts forecast to an annual total by multiplying by the number of working days. If we used the central estimate of 5600, then multiplying this by 256, which is the number of working days in 2024, gives an annual total for receipts of 1,433,600. This can be converted into an estimate of the number of donors based on the ratio of donors to receipts over (say) the last couple of years. The donor estimate can then be converted into age-specific estimates based on the distribution by age, again over (say) the last couple of years.

In [None]:
# Naïve forecast: carry the most recently observed daily demand forward
naïve_forecast = daily_demand.iat[-1]
naïve_forecast

In [None]:


# Simulate 1000 draws around the naïve forecast, using the historical standard
# deviation of daily demand as the spread.
forecast_with_uncertainty = np.random.normal(naïve_forecast, std_dev_demand, 1000)

# Summarise the simulated distribution: mean and the central 95% band
forecast_mean = forecast_with_uncertainty.mean()
forecast_ci_lower, forecast_ci_upper = np.percentile(forecast_with_uncertainty, [2.5, 97.5])

print(f"Forecast Mean: {forecast_mean}")
print(f"Forecast 95% CI: [{forecast_ci_lower}, {forecast_ci_upper}]")

#  Incorporating uncertainty and short-term impacts into your long-term forecasting model: 
- ensuring that both historical and forecasted data exclude weekends and UK holidays, providing a more accurate and realistic forecast.
- generate and analyze age-specific annual demand forecasts based on the overall forecast and specified age group proportions.
- convert the receipts forecast to an annual total, estimate the number of donors, and convert the donor estimate into age-specific estimates based on the distribution by age.
The aim is to apply uncertainty driven by short-term factors (post-COVID effects and advertisement impacts) and to carry the effect of this short-term forecast through to the long-term forecast of the number of daily receipts of Lasting Power of Attorney (LPA) applications. This uses average daily demand under different scenarios, together with a naïve extrapolation (informed by COVID-19 data for any future pandemic demand). The receipts forecast for next year should reflect the uncertainty around it and be updated quarterly. How can the receipts input be varied to reflect the uncertainty around this estimate, so that it is then also reflected in the longer-term age-specific forecast model?

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import holidays

# Synthetic history for demonstration: one Poisson(100) draw per calendar day
# from 2020-01-01 to 2023-12-31, with a fixed seed so the sample is repeatable.
np.random.seed(42)
dates = pd.date_range(start='2020-01-01', end='2023-12-31', freq='D')
historical_data = pd.DataFrame({'date': dates})
historical_data['demand'] = np.random.poisson(lam=100, size=len(historical_data))

# UK holiday calendar used by the working-day filter below
uk_holidays = holidays.UnitedKingdom()

# Function to remove weekends and holidays
def remove_weekends_and_holidays(data, holiday_calendar=None):
    """Return the rows of *data* that fall on working days.

    A working day is Monday-Friday and not present in the holiday calendar.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime-like ``date`` column.
    holiday_calendar : container of ``datetime.date``, optional
        Anything supporting ``date in holiday_calendar``. Defaults to the
        module-level ``uk_holidays`` calendar.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of *data* with two helper columns added:
        ``weekday`` (0 = Monday ... 6 = Sunday) and ``is_holiday`` (bool).
        The caller's frame is left untouched.
    """
    if holiday_calendar is None:
        holiday_calendar = uk_holidays

    result = data.copy()  # do not add helper columns to the caller's frame
    result['weekday'] = result['date'].dt.weekday

    # Membership is tested one date at a time. `Series.isin(calendar)` would
    # iterate the calendar instead, and a `holidays` calendar created without
    # explicit years is empty until a date is looked up, so `isin` would never
    # flag any holiday.
    result['is_holiday'] = pd.Series(
        [pd.notna(day) and day in holiday_calendar for day in result['date'].dt.date],
        index=result.index,
        dtype=bool,
    )
    return result[(result['weekday'] < 5) & (~result['is_holiday'])]

# Remove weekends and holidays from historical data
historical_data = remove_weekends_and_holidays(historical_data)

# Function to calculate average daily demand and uncertainty
def calculate_daily_demand_stats(data):
    """Return ``(mean, standard deviation)`` of the ``demand`` column of *data*.

    The standard deviation is pandas' default (sample) standard deviation.
    """
    demand = data['demand']
    return demand.mean(), demand.std()

# Function to perform Naïve extrapolation with uncertainty
def naive_extrapolation_with_uncertainty(last_observed, std_dev, num_days=90):
    """Draw *num_days* normally distributed values centred on *last_observed*.

    *std_dev* is the spread of the draws. The result is a NumPy array of
    length *num_days* taken from NumPy's global random state.
    """
    return np.random.normal(last_observed, std_dev, num_days)

# Function to update quarterly forecasts
def update_quarterly_forecast(data, num_quarters=4):
    """Extend *data* with *num_quarters* successive naive forecast blocks.

    For each block the spread is re-estimated from everything accumulated so
    far (history plus earlier simulated blocks), the last ``demand`` value is
    used as the centre, and 90 calendar days are simulated starting the day
    after the last date. Weekends and holidays are then dropped and the
    remaining rows are appended.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a datetime ``date`` column and a numeric ``demand`` column, and
        at least one row.
    num_quarters : int, default 4
        Number of forecast blocks to append.

    Returns
    -------
    pandas.DataFrame
        The input rows followed by the simulated working-day rows, with a
        fresh integer index. The input frame is not modified.
    """
    for _ in range(num_quarters):
        # Only the spread is needed here; the centre is the last observation.
        _, std_dev_demand = calculate_daily_demand_stats(data)
        last_observed = data['demand'].iloc[-1]
        quarterly_forecast = naive_extrapolation_with_uncertainty(last_observed, std_dev_demand)

        # Dates for the forecast period start the day after the last known date
        first_forecast_day = data['date'].iloc[-1] + pd.Timedelta(days=1)
        new_dates = pd.date_range(start=first_forecast_day, periods=len(quarterly_forecast))
        new_data = pd.DataFrame({'date': new_dates, 'demand': quarterly_forecast})

        # Keep working days only, then grow the series used by the next block
        new_data = remove_weekends_and_holidays(new_data)
        data = pd.concat([data, new_data], ignore_index=True)
    return data

# Function to apply scenario analysis
def apply_scenario_analysis(base_forecast, impact_factor):
    """Scale *base_forecast* by the multiplicative *impact_factor*.

    For example, an ``impact_factor`` of 1.1 gives a 10% uplift.
    """
    adjusted_forecast = base_forecast * impact_factor
    return adjusted_forecast

# Short-term drivers expressed as multiplicative uplifts
post_covid_impact = 1.1  # 10% increase
advertisement_impact = 1.2  # 20% increase

# Base forecast: 365 daily draws centred on the historical mean
mean_demand, std_dev_demand = calculate_daily_demand_stats(historical_data)
base_forecast = naive_extrapolation_with_uncertainty(mean_demand, std_dev_demand, num_days=365)

# Individual scenarios, then both uplifts applied together
forecast_post_covid = apply_scenario_analysis(base_forecast, post_covid_impact)
forecast_advertisement = apply_scenario_analysis(base_forecast, advertisement_impact)
combined_forecast = apply_scenario_analysis(forecast_post_covid, advertisement_impact)

# History extended with four simulated quarterly blocks
updated_forecast_data = update_quarterly_forecast(historical_data)

# Calendar dates of the forecast year: one per forecast value, starting the day
# after the last historical date. Used for both the plot and the saved file.
forecast_dates = pd.date_range(
    start=historical_data['date'].iloc[-1] + pd.Timedelta(days=1),
    periods=len(combined_forecast),
)

# Plot history and combined-impact forecast on the same axes
plt.figure(figsize=(14, 7))
plt.plot(historical_data['date'], historical_data['demand'], label='Historical Data', color='blue')
plt.plot(forecast_dates, combined_forecast, label='Forecast with Impacts', color='orange')
plt.xlabel('Date')
plt.ylabel('Daily Receipts')
plt.title('LPA Daily Receipts Forecast with Uncertainty and Short-term Drivers')
plt.legend()
plt.show()

print("Forecast with combined impacts for the next year:")
print(combined_forecast)

# Save the working-day rows of the forecast
forecast_df = remove_weekends_and_holidays(
    pd.DataFrame({'date': forecast_dates, 'forecast': combined_forecast})
)
forecast_df.to_csv('lpa_forecast.csv', index=False)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import holidays

# ForecastingModel: Encapsulates all forecasting-related methods and attributes.
class ForecastingModel:
    """Naive daily-receipts forecaster with scenario uplifts and donor conversion.

    The model draws daily receipts from a normal distribution fitted to the
    historical ``demand`` column, optionally applies multiplicative short-term
    uplifts, and converts the result into annual receipts, donors and
    age-specific donor counts.
    """

    def __init__(self, historical_data, post_covid_impact=1.1, advertisement_impact=1.2,
                 working_days_per_year=256, receipts_to_donors_ratio=0.75,
                 age_group_proportions=None):
        """Store the history and the model parameters.

        Parameters
        ----------
        historical_data : pandas.DataFrame
            Frame with a datetime ``date`` column and a numeric ``demand`` column.
        post_covid_impact, advertisement_impact : float
            Multiplicative uplifts applied by short-term forecasts.
        working_days_per_year : int, default 256
            Multiplier used to turn average daily receipts into an annual total.
        receipts_to_donors_ratio : float, default 0.75
            Donors per receipt. Example ratio, should be calculated based on
            historical data.
        age_group_proportions : mapping of str to float, optional
            Share of donors per age group. Defaults to 50-59: 0.25,
            60-69: 0.30, 70+: 0.45 (presumably illustrative values - confirm
            against the observed age distribution).
        """
        self.historical_data = historical_data
        self.post_covid_impact = post_covid_impact
        self.advertisement_impact = advertisement_impact
        self.uk_holidays = holidays.UnitedKingdom()
        if age_group_proportions is None:
            age_group_proportions = {
                '50-59': 0.25,
                '60-69': 0.30,
                '70+': 0.45
            }
        # Copy so later edits to the caller's mapping do not change the model.
        self.age_group_proportions = dict(age_group_proportions)
        self.working_days_per_year = working_days_per_year
        self.receipts_to_donors_ratio = receipts_to_donors_ratio

    # remove_weekends_and_holidays: Removes weekends and holidays from the data.
    def remove_weekends_and_holidays(self, data):
        """Return a filtered copy of *data* containing working days only.

        Adds ``weekday`` (0 = Monday) and ``is_holiday`` helper columns to the
        returned frame; the caller's frame is not modified.
        """
        result = data.copy()
        result['weekday'] = result['date'].dt.weekday
        # Test each date against the calendar individually. `Series.isin` would
        # iterate the calendar, which is empty until a date is looked up when
        # no years were given, so no holiday would ever be flagged.
        calendar = self.uk_holidays
        result['is_holiday'] = pd.Series(
            [pd.notna(day) and day in calendar for day in result['date'].dt.date],
            index=result.index,
            dtype=bool,
        )
        return result[(result['weekday'] < 5) & (~result['is_holiday'])]

    # calculate_daily_demand_stats: Computes mean and standard deviation of demand.
    def calculate_daily_demand_stats(self, data):
        """Return ``(mean, sample standard deviation)`` of ``data['demand']``."""
        demand = data['demand']
        return demand.mean(), demand.std()

    # naive_extrapolation_with_uncertainty: Generates a Naïve forecast with uncertainty.
    def naive_extrapolation_with_uncertainty(self, last_observed, std_dev, num_days=90):
        """Draw *num_days* normal values centred on *last_observed* with spread *std_dev*."""
        forecast = np.random.normal(loc=last_observed, scale=std_dev, size=num_days)
        return forecast

    # apply_scenario_analysis: Applies short-term impact factors to the base forecast.
    def apply_scenario_analysis(self, base_forecast, impact_factor):
        """Scale *base_forecast* by the multiplicative *impact_factor*."""
        return base_forecast * impact_factor

    # update_quarterly_forecast: Updates the forecast quarterly and removes weekends/holidays.
    def update_quarterly_forecast(self, data, num_quarters=4):
        """Append *num_quarters* simulated 90-calendar-day blocks to *data*.

        Each block is centred on the last ``demand`` value seen so far, uses
        the spread of everything accumulated so far, and keeps working days
        only. Returns the extended frame with a fresh integer index.
        """
        for _ in range(num_quarters):
            _, std_dev_demand = self.calculate_daily_demand_stats(data)
            last_observed = data['demand'].iloc[-1]
            quarterly_forecast = self.naive_extrapolation_with_uncertainty(last_observed, std_dev_demand)

            first_forecast_day = data['date'].iloc[-1] + pd.Timedelta(days=1)
            new_dates = pd.date_range(start=first_forecast_day, periods=len(quarterly_forecast))
            new_data = pd.DataFrame({'date': new_dates, 'demand': quarterly_forecast})
            new_data = self.remove_weekends_and_holidays(new_data)

            data = pd.concat([data, new_data], ignore_index=True)
        return data

    # generate_forecast: Generates a forecast for a specified number of days, with optional short-term impacts.
    def generate_forecast(self, num_days=365, short_term=True):
        """Return a working-day forecast frame (``date``, ``forecast`` + helper columns).

        Draws are centred on the historical mean demand. When *short_term* is
        true, the post-COVID and advertisement uplifts are both applied.
        Forecast dates start the day after the last historical date.
        """
        mean_demand, std_dev_demand = self.calculate_daily_demand_stats(self.historical_data)
        base_forecast = self.naive_extrapolation_with_uncertainty(mean_demand, std_dev_demand, num_days)

        if short_term:
            forecast_post_covid = self.apply_scenario_analysis(base_forecast, self.post_covid_impact)
            combined_forecast = self.apply_scenario_analysis(forecast_post_covid, self.advertisement_impact)
        else:
            combined_forecast = base_forecast

        forecast_dates = pd.date_range(start=self.historical_data['date'].iloc[-1] + pd.Timedelta(days=1), periods=num_days)
        forecast_df = pd.DataFrame({'date': forecast_dates, 'forecast': combined_forecast})
        forecast_df = self.remove_weekends_and_holidays(forecast_df)
        return forecast_df

    # calculate_annual_total_receipts: Calculates the annual total receipts by multiplying the average daily receipts by the number of working days in the year.
    def calculate_annual_total_receipts(self, forecast_df):
        """Return mean of ``forecast_df['forecast']`` times ``working_days_per_year``."""
        average_daily_receipts = forecast_df['forecast'].mean()
        return average_daily_receipts * self.working_days_per_year

    # estimate_number_of_donors: Estimates the number of donors based on the annual total receipts and the receipts-to-donors ratio.
    def estimate_number_of_donors(self, annual_total_receipts):
        """Return *annual_total_receipts* times ``receipts_to_donors_ratio``."""
        return annual_total_receipts * self.receipts_to_donors_ratio

    # calculate_age_specific_forecast: Converts the donor estimate into age-specific estimates based on the distribution by age.
    def calculate_age_specific_forecast(self, number_of_donors):
        """Return ``{age_group: number_of_donors * proportion}`` for every age group."""
        return {
            age_group: number_of_donors * proportion
            for age_group, proportion in self.age_group_proportions.items()
        }

    # plot_forecast: Plots the historical and forecasted data.
    def plot_forecast(self, forecast_df):
        """Plot the stored history and *forecast_df* on one figure and show it."""
        plt.figure(figsize=(14, 7))
        plt.plot(self.historical_data['date'].values, self.historical_data['demand'].values, label='Historical Data', color='blue')
        plt.plot(forecast_df['date'].values, forecast_df['forecast'].values, label='Forecast with Impacts', color='orange')
        plt.xlabel('Date')
        plt.ylabel('Daily Receipts')
        plt.title('LPA Daily Receipts Forecast with Uncertainty and Short-term Drivers')
        plt.legend()
        plt.show()



In [None]:
# LPA historical data: one row per receipt date with the number of receipts
historical_data = count_unique_receipts_daily[['receiptdate', 'daily_demand']].rename(columns={'receiptdate': 'date', 'daily_demand': 'demand'})

# Instantiate the ForecastingModel class
forecast_model = ForecastingModel(historical_data)

# Remove weekends and holidays from historical data, and hand the filtered
# history back to the model. Without this second step the model would keep the
# unfiltered frame it was constructed with, so its mean/spread (and the start
# date of the forecast) would still include weekends and holidays.
historical_data = forecast_model.remove_weekends_and_holidays(historical_data)
forecast_model.historical_data = historical_data

# Generate short-term forecast
short_term_forecast_df = forecast_model.generate_forecast(num_days=365, short_term=True)

# Generate long-term forecast (without short-term impacts)
long_term_forecast_df = forecast_model.generate_forecast(num_days=365, short_term=False)

# Short-term: annual receipts -> donors -> donors by age group
annual_total_receipts_short_term = forecast_model.calculate_annual_total_receipts(short_term_forecast_df)
number_of_donors_short_term = forecast_model.estimate_number_of_donors(annual_total_receipts_short_term)
age_specific_forecast_short_term = forecast_model.calculate_age_specific_forecast(number_of_donors_short_term)

# Long-term: annual receipts -> donors -> donors by age group
annual_total_receipts_long_term = forecast_model.calculate_annual_total_receipts(long_term_forecast_df)
number_of_donors_long_term = forecast_model.estimate_number_of_donors(annual_total_receipts_long_term)
age_specific_forecast_long_term = forecast_model.calculate_age_specific_forecast(number_of_donors_long_term)

# Plotting the short-term forecast
forecast_model.plot_forecast(short_term_forecast_df)

# Plotting the long-term forecast
forecast_model.plot_forecast(long_term_forecast_df)

# Save the short-term forecast data
short_term_forecast_df.to_csv('lpa_short_term_forecast.csv', index=False)

# Save the long-term forecast data
long_term_forecast_df.to_csv('lpa_long_term_forecast.csv', index=False)

# Print annual total receipts, number of donors, and age-specific forecasts
print("Annual total receipts (short-term):", annual_total_receipts_short_term)
print("Number of donors (short-term):", number_of_donors_short_term)
print("Age-specific forecast (short-term):", age_specific_forecast_short_term)

print("Annual total receipts (long-term):", annual_total_receipts_long_term)
print("Number of donors (long-term):", number_of_donors_long_term)
print("Age-specific forecast (long-term):", age_specific_forecast_long_term)

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import norm
import holidays

class ForecastingModel:
    """Naive daily-receipts forecaster with scenario uplifts and donor conversion.

    Daily receipts are drawn from a normal distribution fitted to the
    historical ``demand`` column, optionally scaled by short-term uplifts, and
    converted into annual receipts, donors and age-specific donor counts.
    """

    def __init__(self, historical_data, post_covid_impact=1.1, advertisement_impact=1.2,
                 working_days_per_year=256, receipts_to_donors_ratio=0.75,
                 age_group_proportions=None):
        """Store the history (``date``/``demand`` frame) and model parameters.

        ``working_days_per_year``, ``receipts_to_donors_ratio`` and
        ``age_group_proportions`` default to the values that were previously
        hard-coded (256, 0.75 and 50-59: 0.25 / 60-69: 0.30 / 70+: 0.45).
        """
        self.historical_data = historical_data
        self.post_covid_impact = post_covid_impact
        self.advertisement_impact = advertisement_impact
        self.uk_holidays = holidays.UnitedKingdom()
        if age_group_proportions is None:
            age_group_proportions = {
                '50-59': 0.25,
                '60-69': 0.30,
                '70+': 0.45
            }
        # Copy so later edits to the caller's mapping do not change the model.
        self.age_group_proportions = dict(age_group_proportions)
        self.working_days_per_year = working_days_per_year
        self.receipts_to_donors_ratio = receipts_to_donors_ratio  # Example ratio, should be calculated based on historical data

    def remove_weekends_and_holidays(self, data):
        """Return a filtered copy of *data* containing working days only.

        Adds ``weekday`` (0 = Monday) and ``is_holiday`` helper columns to the
        returned frame; the caller's frame is not modified.
        """
        result = data.copy()
        result['weekday'] = result['date'].dt.weekday
        # Look each date up in the calendar individually: `Series.isin` would
        # iterate the calendar, which holds no dates until one is looked up
        # when no years were given, so no holiday would ever be flagged.
        calendar = self.uk_holidays
        result['is_holiday'] = pd.Series(
            [pd.notna(day) and day in calendar for day in result['date'].dt.date],
            index=result.index,
            dtype=bool,
        )
        return result[(result['weekday'] < 5) & (~result['is_holiday'])]

    def calculate_daily_demand_stats(self, data):
        """Return ``(mean, sample standard deviation)`` of ``data['demand']``."""
        demand = data['demand']
        return demand.mean(), demand.std()

    def naive_extrapolation_with_uncertainty(self, last_observed, std_dev, num_days=90):
        """Draw *num_days* normal values centred on *last_observed* with spread *std_dev*."""
        forecast = np.random.normal(loc=last_observed, scale=std_dev, size=num_days)
        return forecast

    def apply_scenario_analysis(self, base_forecast, impact_factor):
        """Scale *base_forecast* by the multiplicative *impact_factor*."""
        return base_forecast * impact_factor

    def update_quarterly_forecast(self, data, num_quarters=4):
        """Append *num_quarters* simulated 90-calendar-day blocks to *data*.

        Each block is centred on the last ``demand`` value seen so far, uses
        the spread of everything accumulated so far, and keeps working days
        only. Returns the extended frame with a fresh integer index.
        """
        for _ in range(num_quarters):
            _, std_dev_demand = self.calculate_daily_demand_stats(data)
            last_observed = data['demand'].iloc[-1]
            quarterly_forecast = self.naive_extrapolation_with_uncertainty(last_observed, std_dev_demand)

            first_forecast_day = data['date'].iloc[-1] + pd.Timedelta(days=1)
            new_dates = pd.date_range(start=first_forecast_day, periods=len(quarterly_forecast))
            new_data = pd.DataFrame({'date': new_dates, 'demand': quarterly_forecast})
            new_data = self.remove_weekends_and_holidays(new_data)

            data = pd.concat([data, new_data], ignore_index=True)
        return data

    def generate_forecast(self, num_days=365, short_term=True):
        """Return a working-day forecast frame (``date``, ``forecast`` + helper columns).

        Draws are centred on the historical mean demand. When *short_term* is
        true, the post-COVID and advertisement uplifts are both applied.
        Forecast dates start the day after the last historical date.
        """
        mean_demand, std_dev_demand = self.calculate_daily_demand_stats(self.historical_data)
        base_forecast = self.naive_extrapolation_with_uncertainty(mean_demand, std_dev_demand, num_days)

        if short_term:
            forecast_post_covid = self.apply_scenario_analysis(base_forecast, self.post_covid_impact)
            combined_forecast = self.apply_scenario_analysis(forecast_post_covid, self.advertisement_impact)
        else:
            combined_forecast = base_forecast

        forecast_dates = pd.date_range(start=self.historical_data['date'].iloc[-1] + pd.Timedelta(days=1), periods=num_days)
        forecast_df = pd.DataFrame({'date': forecast_dates, 'forecast': combined_forecast})
        forecast_df = self.remove_weekends_and_holidays(forecast_df)
        return forecast_df

    def calculate_annual_total_receipts(self, forecast_df):
        """Return mean of ``forecast_df['forecast']`` times ``working_days_per_year``."""
        average_daily_receipts = forecast_df['forecast'].mean()
        return average_daily_receipts * self.working_days_per_year

    def estimate_number_of_donors(self, annual_total_receipts):
        """Return *annual_total_receipts* times ``receipts_to_donors_ratio``."""
        return annual_total_receipts * self.receipts_to_donors_ratio

    def calculate_age_specific_forecast(self, number_of_donors):
        """Return ``{age_group: number_of_donors * proportion}`` for every age group."""
        return {
            age_group: number_of_donors * proportion
            for age_group, proportion in self.age_group_proportions.items()
        }

    def plot_forecast(self, forecast_df):
        """Plot the stored history and *forecast_df* on one figure and show it."""
        plt.figure(figsize=(14, 7))
        plt.plot(self.historical_data['date'].values, self.historical_data['demand'].values, label='Historical Data', color='blue')
        plt.plot(forecast_df['date'].values, forecast_df['forecast'].values, label='Forecast with Impacts', color='orange')
        plt.xlabel('Date')
        plt.ylabel('Daily Receipts')
        plt.title('LPA Daily Receipts Forecast with Uncertainty and Short-term Drivers')
        plt.legend()
        plt.show()

# LPA historical data: one row per receipt date with the number of receipts
historical_data = count_unique_receipts_daily[['receiptdate', 'daily_demand']].rename(columns={'receiptdate': 'date', 'daily_demand': 'demand'})

# Instantiate the ForecastingModel class
forecast_model = ForecastingModel(historical_data)

# Remove weekends and holidays from historical data, and give the filtered
# history to the model. Otherwise the model keeps the unfiltered frame it was
# built with, and its statistics would still include weekends and holidays.
historical_data = forecast_model.remove_weekends_and_holidays(historical_data)
forecast_model.historical_data = historical_data

# Generate short-term forecast
short_term_forecast_df = forecast_model.generate_forecast(num_days=365, short_term=True)

# Generate long-term forecast (without short-term impacts)
long_term_forecast_df = forecast_model.generate_forecast(num_days=365, short_term=False)

# Short-term: annual receipts -> donors -> donors by age group
annual_total_receipts_short_term = forecast_model.calculate_annual_total_receipts(short_term_forecast_df)
number_of_donors_short_term = forecast_model.estimate_number_of_donors(annual_total_receipts_short_term)
age_specific_forecast_short_term = forecast_model.calculate_age_specific_forecast(number_of_donors_short_term)

# Long-term: annual receipts -> donors -> donors by age group
annual_total_receipts_long_term = forecast_model.calculate_annual_total_receipts(long_term_forecast_df)
number_of_donors_long_term = forecast_model.estimate_number_of_donors(annual_total_receipts_long_term)
age_specific_forecast_long_term = forecast_model.calculate_age_specific_forecast(number_of_donors_long_term)

# Plotting the short-term forecast
forecast_model.plot_forecast(short_term_forecast_df)

# Plotting the long-term forecast
forecast_model.plot_forecast(long_term_forecast_df)

# Save the short-term forecast data
short_term_forecast_df.to_csv('lpa_short_term_forecast.csv', index=False)

# Save the long-term forecast data
long_term_forecast_df.to_csv('lpa_long_term_forecast.csv', index=False)

# Print annual total receipts, number of donors, and age-specific forecasts
print("Annual total receipts (short-term):", annual_total_receipts_short_term)
print("Number of donors (short-term):", number_of_donors_short_term)
print("Age-specific forecast (short-term):", age_specific_forecast_short_term)

print("Annual total receipts (long-term):", annual_total_receipts_long_term)
print("Number of donors (long-term):", number_of_donors_long_term)
print("Age-specific forecast (long-term):", age_specific_forecast_long_term)
# Explanation
# Class Definition:

# Added working_days_per_year and receipts_to_donors_ratio attributes to the ForecastingModel class.
# Added calculate_annual_total_receipts, estimate_number_of_donors, and calculate_age_specific_forecast methods.
# Calculations:

# calculate_annual_total_receipts: Calculates the annual total receipts by multiplying the average daily receipts by the number of working days in the year.
# estimate_number_of_donors: Estimates the number of donors based on the annual total receipts and the receipts-to-donors ratio.
# calculate_age_specific_forecast: Converts the donor estimate into age-specific estimates based on the distribution by age.
# Execution:

# Generate short-term and long-term forecasts.
# Calculate the annual total receipts and the number of donors for both short-term and long-term forecasts.
# Calculate and print the age-specific forecasts for both short-term and long-term.
# This updated script includes the calculations to convert the receipts forecast to an annual total, estimate the number of donors, and convert the donor estimate into age-specific estimates based on the distribution by age.

# implement the above Python code in Excel, 
create the Excel worksheet, and upload the above Excel model for me here? 
by creating a drop down list with the average daily receipts of LPA application demands in 2024 in the range of 4000–7000 in increments of 100. 
Then this should be used as an estimate to apply uncertainty and be converted into an age-specific annual donor forecast?

In [None]:
import pandas as pd
import numpy as np
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.worksheet.datavalidation import DataValidation
from datetime import timedelta
import holidays

# Build an Excel version of the forecast model.
#
# Layout:
#   sheet "LPA Forecast Model" - drop-down input, parameters and formulas
#   sheet "Historical Data"    - date / demand rows
# The history lives on its own sheet because the model sheet uses fixed cells
# (A10:B30); writing both to one sheet made the parameter cells overwrite
# historical rows.

# Initialize the workbook and the model worksheet
wb = Workbook()
ws = wb.active
ws.title = "LPA Forecast Model"

# Create a dropdown list for average daily receipts: 4000-7000 in steps of 100.
# `showDropDown` is deliberately left at its default: in the Excel file format
# that flag HIDES the in-cell arrow when set to True.
receipt_options = ",".join(str(value) for value in range(4000, 7001, 100))
dv = DataValidation(type="list", formula1=f'"{receipt_options}"')
ws.add_data_validation(dv)
ws['A1'] = "Average Daily Receipts"
ws['B1'] = 5600  # Default value
dv.add(ws['B1'])

# Generate sample historical data (Poisson(100) per calendar day, fixed seed)
np.random.seed(42)
dates = pd.date_range(start='2020-01-01', end='2023-12-31', freq='D')
historical_data = pd.DataFrame({
    'date': dates,
    'demand': np.random.poisson(lam=100, size=len(dates))  # Simulating daily demands
})

# Add historical data to its own worksheet, header first
history_ws = wb.create_sheet("Historical Data")
history_ws.append(["Date", "Demand"])
for r in dataframe_to_rows(historical_data, index=False, header=False):
    history_ws.append(r)

# Calculate working days for the forecast year
forecast_dates = pd.date_range(start='2024-01-01', end='2024-12-31', freq='D')
forecast_data = pd.DataFrame({'date': forecast_dates})
forecast_data['weekday'] = forecast_data['date'].dt.weekday
# The calendar is keyed by `datetime.date`, so compare date objects; comparing
# the dates as strings never matches and would count holidays as working days.
uk_holidays_2024 = holidays.UnitedKingdom(years=2024)
forecast_data['is_holiday'] = [day in uk_holidays_2024 for day in forecast_data['date'].dt.date]
forecast_data['working_day'] = (forecast_data['weekday'] < 5) & (~forecast_data['is_holiday'])
working_days_2024 = int(forecast_data['working_day'].sum())  # plain int for the Excel cell

# Insert forecast model structure and calculations in Excel
ws['A10'] = "Forecast Parameters"
ws['A11'] = "Post-COVID Impact"
ws['B11'] = 1.1
ws['A12'] = "Advertisement Impact"
ws['B12'] = 1.2

ws['A14'] = "Working Days in 2024"
ws['B14'] = working_days_2024

ws['A16'] = "Receipts to Donors Ratio"
ws['B16'] = 0.75

ws['A18'] = "Age Group"
ws['B18'] = "Proportion"
ws['A19'] = "50-59"
ws['B19'] = 0.25
ws['A20'] = "60-69"
ws['B20'] = 0.30
ws['A21'] = "70+"
ws['B21'] = 0.45

# Calculation chain: selected daily receipts -> annual receipts -> donors
ws['A23'] = "Estimated Average Daily Receipts"
ws['B23'] = "=B1"

ws['A24'] = "Annual Total Receipts"
ws['B24'] = "=B23 * B14"

ws['A25'] = "Estimated Number of Donors"
ws['B25'] = "=B24 * B16"

# Donors split by the age-group proportions in B19:B21
ws['A27'] = "Age-Specific Annual Donor Forecast"
ws['A28'] = "50-59"
ws['B28'] = "=B25 * B19"
ws['A29'] = "60-69"
ws['B29'] = "=B25 * B20"
ws['A30'] = "70+"
ws['B30'] = "=B25 * B21"

# Save the workbook next to the notebook, like the CSV outputs elsewhere in
# this file; the previous absolute '/mnt/data/...' path only exists in a
# hosted sandbox and makes the save fail elsewhere.
file_path = 'LPA_Forecast_Model.xlsx'
wb.save(file_path)
file_path

In [None]:
Step-by-Step Guide for Excel Model
1. Historical Data
Create a Data Sheet:
Name a sheet Historical Data.
In column A, input your historical dates.
In column B, input your historical daily demand.
2. Remove Weekends and Holidays
Create a Holiday List:

Name a sheet Holidays.
List UK holidays in column A.
Add Formulas to Check for Weekends and Holidays:

In Historical Data, add columns for Weekday and IsHoliday.
Weekday: Use =WEEKDAY(A2, 2) to get the weekday number (1 for Monday, 7 for Sunday).
IsHoliday: Use =IF(COUNTIF(Holidays!$A$2:$A$100, A2) > 0, TRUE, FALSE) to check if the date is a holiday.
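Filter out weekends and holidays: Use =AND(C2<>6, C2<>7, NOT(D2)) to flag working days (Weekday is in column C and IsHoliday is in column D of the Historical Data sheet; column B holds the demand).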
3. Calculate Average and Standard Deviation
Calculate Statistics:
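Use =AVERAGEIFS(B2:B1000, E2:E1000, TRUE) to calculate the mean of working day demands (column E is the WorkingDay flag).
Use =STDEV.S(IF(E2:E1000, B2:B1000)) to calculate the standard deviation, using an array formula. STDEV.S is the sample standard deviation, which matches the default of pandas' .std() used in the Python script.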
4. Naïve Extrapolation with Uncertainty
Create a Forecast Sheet:

Name a sheet Forecast.
In column A, list future dates for the forecast period.
Use a formula like =A2 + 1 to generate sequential dates.
Generate Naïve Forecast:

In column B, use the average calculated earlier as the base forecast.
Add random noise to the forecast to introduce uncertainty using =NORMINV(RAND(), $B$1, $C$1), where $B$1 is the mean and $C$1 is the standard deviation.
5. Apply Short-Term Impacts
Post-COVID and Advertisement Impact:
Use two separate columns to apply impacts.
Post-COVID Impact: Multiply the base forecast by a factor, e.g., =B2 * 1.1.
Advertisement Impact: Multiply the post-COVID forecast by another factor, e.g., =C2 * 1.2.
6. Combine Impacts and Filter Weekends and Holidays
Combined Impact Forecast:

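Combine impacts in a new column: =D2. The Advertisement Impact column (D) is already the post-COVID forecast multiplied by the advertisement factor, so it is the combined forecast; =D2 * E2 would make the Combined Impact column (E) refer to itself.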
Filter out Weekends and Holidays:

Repeat the steps used in the historical data to remove weekends and holidays from the forecast.
7. Quarterly Updates
Create Quarterly Forecasts:
Use separate sections in the Forecast sheet for each quarter.
Update the base forecast using the last observed demand.
Excel Implementation Example
Here is an example layout of the Excel sheet with formulas.

Historical Data Sheet
Date	Demand	Weekday	IsHoliday	WorkingDay
01/01/2020	100	=WEEKDAY(A2, 2)	=IF(COUNTIF(Holidays!$A$2:$A$100, A2) > 0, TRUE, FALSE)	=AND(C2<>6, C2<>7, NOT(D2))
02/01/2020	120	...	...	...
...	...	...	...	...
Forecast Sheet
Date	Base Forecast	Post-COVID Impact	Advertisement Impact	Combined Impact	Weekday	IsHoliday	WorkingDay
01/01/2024	=AVERAGE($B$2:$B$1000)	=B2 * 1.1	=C2 * 1.2	=D2	=WEEKDAY(A2, 2)	=IF(COUNTIF(Holidays!$A$2:$A$100, A2) > 0, TRUE, FALSE)	=AND(F2<>6, F2<>7, NOT(G2))
02/01/2024	=NORMINV(RAND(), $B$1, $C$1)	...	...	...	...	...	...
...	...	...	...	...	...	...	...
Tips
Use Excel's Data Analysis Toolpak to assist with statistical functions if needed.
Create dynamic named ranges for holidays to easily update the list.
Use Excel's conditional formatting and filters to highlight weekends and holidays.
By following these steps and formulas, you can build an Excel model that captures the essence of the Python script, including handling uncertainty and applying short-term impacts on your forecasting.

In [None]:
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA

# Load your LPA demand data (replace with your actual data).
# Assumes 'daily_demand' is a pandas Series of daily demand values that was
# defined earlier in the notebook.

# Short-term forecasting (naïve extrapolation): each day's forecast is the
# previous day's observation, so the forecast for t+1 is the last observation.
naive_forecast = daily_demand.shift(1)
naive_forecast_next = daily_demand.iloc[-1]

# Fit ARIMA(1, 1, 1) and request a one-step-ahead forecast.
# The results object's forecast() returns only the point forecast; the
# standard error and interval come from get_forecast().
model = ARIMA(daily_demand, order=(1, 1, 1))
model_fit = model.fit()
forecast_result = model_fit.get_forecast(steps=1)
forecast = forecast_result.predicted_mean
stderr = forecast_result.se_mean
conf_int = forecast_result.conf_int(alpha=0.05)  # adjust alpha as needed

# Incorporate advertising impact (adjust as needed)
# Example: Multiply forecast by advertising factor (low, medium, high)

# Reflect post-COVID-19 impact (adjust as needed)
# Example: Adjust forecast based on historical COVID-19 data

# Long-term age-specific model (create your own)
# Apply age-specific factors to overall forecast

# Quarterly updates (revisit and adjust)

# Print results
print("Naïve Forecast for t+1:", naive_forecast_next)
print("Forecast for t+1:", forecast.iloc[0])
print("Prediction Interval (95%):", conf_int.iloc[0].tolist())

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import holidays

class ForecastingModel:
    """Naïve daily-demand forecaster with optional short-term impact factors.

    The model draws normally distributed daily values around a reference level
    (the historical mean or the last observation), optionally scales them by
    the post-COVID and advertisement multipliers, and drops weekends and UK
    holidays from the result.

    Frames passed in are expected to have a datetime ``date`` column; history
    frames also need a numeric ``demand`` column.
    """

    def __init__(self, historical_data, post_covid_impact=1.1, advertisement_impact=1.2):
        """Store the history, the two multiplicative impact factors and a UK holiday calendar."""
        self.historical_data = historical_data
        self.post_covid_impact = post_covid_impact
        self.advertisement_impact = advertisement_impact
        self.uk_holidays = holidays.UnitedKingdom()

    def remove_weekends_and_holidays(self, data):
        """Return the rows of ``data`` that fall on working days.

        A copy of ``data`` gains ``weekday`` and ``is_holiday`` columns and is
        filtered to Monday-Friday non-holidays; the caller's frame is left
        untouched.
        """
        data = data.copy()
        data['weekday'] = data['date'].dt.weekday
        # Test each date with ``in``: Series.isin() would only iterate the
        # calendar's keys, which are empty until a year has been looked up.
        data['is_holiday'] = (
            data['date'].dt.date.map(lambda day: day in self.uk_holidays).astype(bool)
        )
        return data[(data['weekday'] < 5) & (~data['is_holiday'])]

    def calculate_daily_demand_stats(self, data):
        """Return ``(mean, sample standard deviation)`` of ``data['demand']``."""
        mean_demand = data['demand'].mean()
        std_dev_demand = data['demand'].std()
        return mean_demand, std_dev_demand

    def naive_extrapolation_with_uncertainty(self, last_observed, std_dev, num_days=90):
        """Draw ``num_days`` values from Normal(``last_observed``, ``std_dev``).

        Uses NumPy's global random state, so seed it beforehand for
        reproducible output.
        """
        forecast = np.random.normal(loc=last_observed, scale=std_dev, size=num_days)
        return forecast

    def apply_scenario_analysis(self, base_forecast, impact_factor):
        """Scale ``base_forecast`` by the multiplicative ``impact_factor``."""
        return base_forecast * impact_factor

    def update_quarterly_forecast(self, data, num_quarters=4):
        """Extend ``data`` with ``num_quarters`` successive 90-day forecasts.

        Each round is centred on the last value currently in ``data`` (which,
        after the first round, is itself a simulated value) and uses the
        spread of everything accumulated so far. Weekends and holidays are
        removed from every new batch before it is appended.

        Returns the extended frame; the frame passed in is not modified.
        """
        for _ in range(num_quarters):
            # Only the spread is needed; the level comes from the last row.
            _, std_dev_demand = self.calculate_daily_demand_stats(data)
            last_observed = data['demand'].iloc[-1]
            quarterly_forecast = self.naive_extrapolation_with_uncertainty(last_observed, std_dev_demand)

            new_dates = pd.date_range(start=data['date'].iloc[-1] + pd.Timedelta(days=1), periods=len(quarterly_forecast))
            new_data = pd.DataFrame({'date': new_dates, 'demand': quarterly_forecast})
            new_data = self.remove_weekends_and_holidays(new_data)

            data = pd.concat([data, new_data], ignore_index=True)
        return data

    def generate_forecast(self, num_days=365, short_term=True):
        """Forecast ``num_days`` calendar days following the stored history.

        Values are drawn around the historical mean. With ``short_term`` true
        they are multiplied by both impact factors. Non-working days are
        dropped, so the returned frame (``date``, ``forecast`` plus helper
        columns) usually has fewer than ``num_days`` rows.
        """
        mean_demand, std_dev_demand = self.calculate_daily_demand_stats(self.historical_data)
        base_forecast = self.naive_extrapolation_with_uncertainty(mean_demand, std_dev_demand, num_days)

        if short_term:
            forecast_post_covid = self.apply_scenario_analysis(base_forecast, self.post_covid_impact)
            combined_forecast = self.apply_scenario_analysis(forecast_post_covid, self.advertisement_impact)
        else:
            combined_forecast = base_forecast

        forecast_dates = pd.date_range(start=self.historical_data['date'].iloc[-1] + pd.Timedelta(days=1), periods=num_days)
        forecast_df = pd.DataFrame({'date': forecast_dates, 'forecast': combined_forecast})
        forecast_df = self.remove_weekends_and_holidays(forecast_df)
        return forecast_df

    def plot_forecast(self, forecast_df):
        """Plot the stored history and ``forecast_df`` on one figure and show it."""
        plt.figure(figsize=(14, 7))
        plt.plot(self.historical_data['date'], self.historical_data['demand'], label='Historical Data', color='blue')
        plt.plot(forecast_df['date'], forecast_df['forecast'], label='Forecast with Impacts', color='orange')
        plt.xlabel('Date')
        plt.ylabel('Daily Receipts')
        plt.title('LPA Daily Receipts Forecast with Uncertainty and Short-term Drivers')
        plt.legend()
        plt.show()

# Sample historical data: one simulated Poisson(100) demand per calendar day.
np.random.seed(42)
dates = pd.date_range(start='2020-01-01', end='2023-12-31', freq='D')
historical_data = pd.DataFrame({
    'date': dates,
    'demand': np.random.poisson(lam=100, size=len(dates))  # Simulating daily demands
})

# Instantiate the ForecastingModel class
forecast_model = ForecastingModel(historical_data)

# Remove weekends and holidays from historical data, then hand the cleaned
# frame back to the model: otherwise its statistics, forecast start date and
# plots would still be based on the unfiltered history.
historical_data = forecast_model.remove_weekends_and_holidays(historical_data)
forecast_model.historical_data = historical_data

# Generate short-term forecast
short_term_forecast_df = forecast_model.generate_forecast(num_days=365, short_term=True)

# Generate long-term forecast (without short-term impacts)
long_term_forecast_df = forecast_model.generate_forecast(num_days=365, short_term=False)

# Plotting the short-term forecast
forecast_model.plot_forecast(short_term_forecast_df)

# Plotting the long-term forecast
forecast_model.plot_forecast(long_term_forecast_df)

# Save the short-term forecast data
short_term_forecast_df.to_csv('lpa_short_term_forecast.csv', index=False)

# Save the long-term forecast data
long_term_forecast_df.to_csv('lpa_long_term_forecast.csv', index=False)


In [None]:
Explanation
Class Definition:

ForecastingModel: Encapsulates all forecasting-related methods and attributes.
__init__: Initializes the class with historical data and impact factors for short-term scenarios.
remove_weekends_and_holidays: Removes weekends and holidays from the data.
calculate_daily_demand_stats: Computes mean and standard deviation of demand.
naive_extrapolation_with_uncertainty: Generates a Naïve forecast with uncertainty.
apply_scenario_analysis: Applies short-term impact factors to the base forecast.
update_quarterly_forecast: Updates the forecast quarterly and removes weekends/holidays.
generate_forecast: Generates a forecast for a specified number of days, with optional short-term impacts.
plot_forecast: Plots the historical and forecasted data.
Data Simulation and Forecasting:

Generates sample historical data.
Instantiates the ForecastingModel class.
Removes weekends and holidays from the historical data.
Generates both short-term and long-term forecasts.
Plots the forecasts.
Saves the forecast data to CSV files.
This OOP-based approach allows you to easily extend and customize the forecasting model, and to generate and plot forecasts for different scenarios (short-term and long-term) in a structured manner.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm

# Synthetic history: one Poisson(100) draw per calendar day from 2020 to 2023.
# The seed is fixed so the simulated series is reproducible.
np.random.seed(42)
dates = pd.date_range('2020-01-01', '2023-12-31', freq='D')
historical_data = pd.DataFrame(
    dict(date=dates, demand=np.random.poisson(lam=100, size=len(dates)))
)

# Function to calculate average daily demand and uncertainty
def calculate_daily_demand_stats(data):
    """Return ``(mean, sample standard deviation)`` of the ``demand`` column."""
    demand = data['demand']
    return demand.mean(), demand.std()

# Function to perform Naïve extrapolation with uncertainty
def naive_extrapolation_with_uncertainty(last_observed, std_dev, num_days=90):
    """Draw ``num_days`` values from Normal(``last_observed``, ``std_dev``).

    Sampling uses NumPy's global random state.
    """
    return np.random.normal(last_observed, std_dev, num_days)

# Function to update quarterly forecasts
def update_quarterly_forecast(data, num_quarters=4):
    """Extend ``data`` with ``num_quarters`` successive 90-day forecasts.

    Each round is centred on the last ``demand`` value currently in ``data``
    (after the first round that is a simulated value) and uses the spread of
    everything accumulated so far.

    Returns the extended frame; the frame passed in is not modified.
    """
    for _ in range(num_quarters):
        # Only the spread is needed; the level comes from the last row.
        _, std_dev_demand = calculate_daily_demand_stats(data)
        last_observed = data['demand'].iloc[-1]
        quarterly_forecast = naive_extrapolation_with_uncertainty(last_observed, std_dev_demand)
        new_dates = pd.date_range(start=data['date'].iloc[-1] + pd.Timedelta(days=1), periods=len(quarterly_forecast))
        new_data = pd.DataFrame({'date': new_dates, 'demand': quarterly_forecast})
        data = pd.concat([data, new_data], ignore_index=True)
    return data

# Function to apply scenario analysis
def apply_scenario_analysis(base_forecast, impact_factor):
    """Scale ``base_forecast`` by the multiplicative ``impact_factor``."""
    scaled_forecast = base_forecast * impact_factor
    return scaled_forecast

# Multipliers for the short-term drivers.
post_covid_impact = 1.1  # 10% increase
advertisement_impact = 1.2  # 20% increase

# Base forecast: one year of daily draws around the historical mean.
mean_demand, std_dev_demand = calculate_daily_demand_stats(historical_data)
base_forecast = naive_extrapolation_with_uncertainty(mean_demand, std_dev_demand, num_days=365)

# Each driver on its own, then both compounded.
forecast_post_covid = apply_scenario_analysis(base_forecast, post_covid_impact)
forecast_advertisement = apply_scenario_analysis(base_forecast, advertisement_impact)
combined_forecast = apply_scenario_analysis(forecast_post_covid, advertisement_impact)

# Rolling quarterly extension of the history (kept for inspection).
updated_forecast_data = update_quarterly_forecast(historical_data)

# The 365 calendar days following the last historical date; shared by the
# chart and the saved file.
forecast_dates = pd.date_range(start=historical_data['date'].iloc[-1] + pd.Timedelta(days=1), periods=365)

# Chart the history and the combined forecast together.
plt.figure(figsize=(14, 7))
plt.plot(historical_data['date'], historical_data['demand'], label='Historical Data', color='blue')
plt.plot(forecast_dates, combined_forecast, label='Forecast with Impacts', color='orange')
plt.xlabel('Date')
plt.ylabel('Daily Receipts')
plt.title('LPA Daily Receipts Forecast with Uncertainty and Short-term Drivers')
plt.legend()
plt.show()

print("Forecast with combined impacts for the next year:")
print(combined_forecast)

# Persist the dated forecast.
forecast_df = pd.DataFrame({'date': forecast_dates, 'forecast': combined_forecast})
forecast_df.to_csv('lpa_forecast.csv', index=False)


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import holidays

# Synthetic history: one Poisson(100) draw per calendar day from 2020 to 2023.
# The seed is fixed so the simulated series is reproducible.
np.random.seed(42)
dates = pd.date_range('2020-01-01', '2023-12-31', freq='D')
historical_data = pd.DataFrame(
    dict(date=dates, demand=np.random.poisson(lam=100, size=len(dates)))
)

# UK holidays
uk_holidays = holidays.UnitedKingdom()


# Function to remove weekends and holidays
def remove_weekends_and_holidays(data):
    """Return the rows of ``data`` whose ``date`` is a working day.

    A copy of ``data`` gains ``weekday`` and ``is_holiday`` columns and is
    filtered to Monday-Friday non-holidays; the caller's frame is left
    untouched.
    """
    data = data.copy()
    data['weekday'] = data['date'].dt.weekday
    # Test each date with ``in``: Series.isin() would only iterate the
    # calendar's keys, which are empty until a year has been looked up.
    data['is_holiday'] = (
        data['date'].dt.date.map(lambda day: day in uk_holidays).astype(bool)
    )
    return data[(data['weekday'] < 5) & (~data['is_holiday'])]


# Remove weekends and holidays from historical data
historical_data = remove_weekends_and_holidays(historical_data)

# Function to calculate average daily demand and uncertainty
def calculate_daily_demand_stats(data):
    """Return ``(mean, sample standard deviation)`` of the ``demand`` column."""
    demand = data['demand']
    return demand.mean(), demand.std()

# Function to perform Naïve extrapolation with uncertainty
def naive_extrapolation_with_uncertainty(last_observed, std_dev, num_days=90):
    """Draw ``num_days`` values from Normal(``last_observed``, ``std_dev``).

    Sampling uses NumPy's global random state.
    """
    return np.random.normal(last_observed, std_dev, num_days)

# Function to update quarterly forecasts
def update_quarterly_forecast(data, num_quarters=4):
    """Extend ``data`` with ``num_quarters`` successive 90-day forecasts.

    Each round is centred on the last ``demand`` value currently in ``data``
    (after the first round that is a simulated value) and uses the spread of
    everything accumulated so far. Weekends and holidays are removed from
    every new batch before it is appended.

    Returns the extended frame; the frame passed in is not modified.
    """
    for _ in range(num_quarters):
        # Only the spread is needed; the level comes from the last row.
        _, std_dev_demand = calculate_daily_demand_stats(data)
        last_observed = data['demand'].iloc[-1]
        quarterly_forecast = naive_extrapolation_with_uncertainty(last_observed, std_dev_demand)

        # Generate new dates for the forecast period
        new_dates = pd.date_range(start=data['date'].iloc[-1] + pd.Timedelta(days=1), periods=len(quarterly_forecast))
        new_data = pd.DataFrame({'date': new_dates, 'demand': quarterly_forecast})

        # Remove weekends and holidays from new data
        new_data = remove_weekends_and_holidays(new_data)

        data = pd.concat([data, new_data], ignore_index=True)
    return data

# Function to apply scenario analysis
def apply_scenario_analysis(base_forecast, impact_factor):
    """Scale ``base_forecast`` by the multiplicative ``impact_factor``."""
    scaled_forecast = base_forecast * impact_factor
    return scaled_forecast

# Apply short-term drivers (post-COVID and advertisement impacts)
post_covid_impact = 1.1  # 10% increase
advertisement_impact = 1.2  # 20% increase

# Generate base forecast: one year of daily draws around the historical mean
mean_demand, std_dev_demand = calculate_daily_demand_stats(historical_data)
base_forecast = naive_extrapolation_with_uncertainty(mean_demand, std_dev_demand, num_days=365)

# Apply scenario impacts
forecast_post_covid = apply_scenario_analysis(base_forecast, post_covid_impact)
forecast_advertisement = apply_scenario_analysis(base_forecast, advertisement_impact)

# Combine impacts
combined_forecast = apply_scenario_analysis(forecast_post_covid, advertisement_impact)

# Update quarterly forecasts
updated_forecast_data = update_quarterly_forecast(historical_data)

# Build the dated forecast once and drop non-working days before plotting,
# so the chart agrees with both the filtered history and the saved CSV.
forecast_dates = pd.date_range(start=historical_data['date'].iloc[-1] + pd.Timedelta(days=1), periods=len(combined_forecast))
forecast_df = pd.DataFrame({'date': forecast_dates, 'forecast': combined_forecast})
forecast_df = remove_weekends_and_holidays(forecast_df)

# Plotting the results
plt.figure(figsize=(14, 7))
plt.plot(historical_data['date'], historical_data['demand'], label='Historical Data', color='blue')
plt.plot(forecast_df['date'], forecast_df['forecast'], label='Forecast with Impacts', color='orange')
plt.xlabel('Date')
plt.ylabel('Daily Receipts')
plt.title('LPA Daily Receipts Forecast with Uncertainty and Short-term Drivers')
plt.legend()
plt.show()

print("Forecast with combined impacts for the next year:")
print(combined_forecast)

# Save the forecast data
forecast_df.to_csv('lpa_forecast.csv', index=False)


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import holidays

# Synthetic history: one Poisson(100) draw per calendar day from 2020 to 2023.
# The seed is fixed so the simulated series is reproducible.
np.random.seed(42)
dates = pd.date_range('2020-01-01', '2023-12-31', freq='D')
historical_data = pd.DataFrame(
    dict(date=dates, demand=np.random.poisson(lam=100, size=len(dates)))
)

# UK holidays
uk_holidays = holidays.UnitedKingdom()


# Function to remove weekends and holidays
def remove_weekends_and_holidays(data):
    """Return the rows of ``data`` whose ``date`` is a working day.

    A copy of ``data`` gains ``weekday`` and ``is_holiday`` columns and is
    filtered to Monday-Friday non-holidays; the caller's frame is left
    untouched.
    """
    data = data.copy()
    data['weekday'] = data['date'].dt.weekday
    # Test each date with ``in``: Series.isin() would only iterate the
    # calendar's keys, which are empty until a year has been looked up.
    data['is_holiday'] = (
        data['date'].dt.date.map(lambda day: day in uk_holidays).astype(bool)
    )
    return data[(data['weekday'] < 5) & (~data['is_holiday'])]


# Remove weekends and holidays from historical data
historical_data = remove_weekends_and_holidays(historical_data)

# Function to calculate average daily demand and uncertainty
def calculate_daily_demand_stats(data):
    """Return ``(mean, sample standard deviation)`` of the ``demand`` column."""
    demand = data['demand']
    return demand.mean(), demand.std()

# Function to perform Naïve extrapolation with uncertainty
def naive_extrapolation_with_uncertainty(last_observed, std_dev, num_days=90):
    """Draw ``num_days`` values from Normal(``last_observed``, ``std_dev``).

    Sampling uses NumPy's global random state.
    """
    return np.random.normal(last_observed, std_dev, num_days)

# Function to update quarterly forecasts
def update_quarterly_forecast(data, num_quarters=4):
    """Extend ``data`` with ``num_quarters`` successive 90-day forecasts.

    Each round is centred on the last ``demand`` value currently in ``data``
    (after the first round that is a simulated value) and uses the spread of
    everything accumulated so far. Weekends and holidays are removed from
    every new batch before it is appended.

    Returns the extended frame; the frame passed in is not modified.
    """
    for _ in range(num_quarters):
        # Only the spread is needed; the level comes from the last row.
        _, std_dev_demand = calculate_daily_demand_stats(data)
        last_observed = data['demand'].iloc[-1]
        quarterly_forecast = naive_extrapolation_with_uncertainty(last_observed, std_dev_demand)

        # Generate new dates for the forecast period
        new_dates = pd.date_range(start=data['date'].iloc[-1] + pd.Timedelta(days=1), periods=len(quarterly_forecast))
        new_data = pd.DataFrame({'date': new_dates, 'demand': quarterly_forecast})

        # Remove weekends and holidays from new data
        new_data = remove_weekends_and_holidays(new_data)

        data = pd.concat([data, new_data], ignore_index=True)
    return data

# Function to apply scenario analysis
def apply_scenario_analysis(base_forecast, impact_factor):
    """Scale ``base_forecast`` by the multiplicative ``impact_factor``."""
    scaled_forecast = base_forecast * impact_factor
    return scaled_forecast

# Apply short-term drivers (post-COVID and advertisement impacts)
post_covid_impact = 1.1  # 10% increase
advertisement_impact = 1.2  # 20% increase

# Generate base forecast: one year of daily draws around the historical mean
mean_demand, std_dev_demand = calculate_daily_demand_stats(historical_data)
base_forecast = naive_extrapolation_with_uncertainty(mean_demand, std_dev_demand, num_days=365)

# Apply scenario impacts
forecast_post_covid = apply_scenario_analysis(base_forecast, post_covid_impact)
forecast_advertisement = apply_scenario_analysis(base_forecast, advertisement_impact)

# Combine impacts
combined_forecast = apply_scenario_analysis(forecast_post_covid, advertisement_impact)

# Update quarterly forecasts
updated_forecast_data = update_quarterly_forecast(historical_data)

# Build the dated forecast once and drop non-working days before plotting,
# so the chart agrees with both the filtered history and the saved CSV.
forecast_dates = pd.date_range(start=historical_data['date'].iloc[-1] + pd.Timedelta(days=1), periods=len(combined_forecast))
forecast_df = pd.DataFrame({'date': forecast_dates, 'forecast': combined_forecast})
forecast_df = remove_weekends_and_holidays(forecast_df)

# Plotting the results
plt.figure(figsize=(14, 7))
plt.plot(historical_data['date'], historical_data['demand'], label='Historical Data', color='blue')
plt.plot(forecast_df['date'], forecast_df['forecast'], label='Forecast with Impacts', color='orange')
plt.xlabel('Date')
plt.ylabel('Daily Receipts')
plt.title('LPA Daily Receipts Forecast with Uncertainty and Short-term Drivers')
plt.legend()
plt.show()

print("Forecast with combined impacts for the next year:")
print(combined_forecast)

# Save the forecast data
forecast_df.to_csv('lpa_forecast.csv', index=False)


In [None]:
# Monthly view: average every column of the daily table within each
# (year, month_year) group and relabel the demand column.
count_unique_receipts_monthly = (
    count_unique_receipts_daily
    .groupby(['year', 'month_year'], as_index=False)
    .mean()
    .rename(columns={'daily_demand': 'avg_monthly_demand'})
)

# Display the result
print(count_unique_receipts_monthly)

# Annual view: average of the monthly averages per year, ordered by year.
count_unique_receipts_annual = (
    count_unique_receipts_monthly
    .groupby(['year'], as_index=False)
    .mean()
    .rename(columns={'avg_monthly_demand': 'avg_annual_demand'})
    .sort_values(by=['year'])
)

# Display the result
print(count_unique_receipts_annual)

In [None]:
# Records with year after 2021. An explicit copy is taken because the next
# cell assigns a column on this frame; without it pandas treats the frame as
# a slice of lpa_unique and emits SettingWithCopyWarning.
lpa_actuals = lpa_unique.query('year > 2021').copy()
lpa_actuals

In [None]:
## Set index
# Parse the receipt dates and index the frame by them (no fixed frequency is
# imposed on the index).
lpa_actuals['receiptdate'] = pd.to_datetime(lpa_actuals['receiptdate'])
lpa_actuals = lpa_actuals.set_index('receiptdate')

# Preview both ends of the re-indexed frame.
print(lpa_actuals.head(), lpa_actuals.tail(), sep='\n')

## Select the appropriate variable to be forecasted
lpa_actuals_data = lpa_actuals['unique_key']


In [None]:
# info() prints its own summary; the other results are printed explicitly
# because a notebook cell only displays its final expression.
lpa_unique.info()
print(lpa_unique.shape)
print(lpa_unique.describe())
# Correlate numeric/boolean columns only: DataFrame.corr() raises on string
# columns in pandas >= 2.0, where non-numeric columns are no longer dropped.
lpa_unique.select_dtypes(include=['number', 'bool']).corr()

In [None]:
## Visualisation of the time series
# Plot the actual LPA series.
plt.figure(figsize=(20, 8))
plt.plot(lpa_actuals_data)
plt.title('UK Actual LPA Data', fontsize=20)
plt.ylabel('Demands', fontsize=16)

# Faint dashed marker at 1 January of each year in the range (currently 2023 only).
for year in range(2023, 2024):
    year_start = pd.to_datetime(f'{year}-01-01')
    plt.axvline(year_start, color='k', linestyle='--', alpha=0.2)

# Save before show(), while the figure is still current.
plt.savefig('ActualLPAData.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Average daily receipts for 2024
#
# Planned use: offer a drop-down of average daily LPA receipts for 2024 (say
# 4000-7000 in increments of 100), use the selected value as the estimate to
# which uncertainty is applied, and convert it into an age-specific annual
# donor forecast.

# Keep only Registered applications received after 2023. The status filter
# has to be assigned back, otherwise its result is discarded; rows with a
# missing status are treated as not registered.
unique_receipts_post_covid = lpa_unique
unique_receipts_post_covid = unique_receipts_post_covid[
    unique_receipts_post_covid['status'].str.contains("Registered", na=False)
]
unique_receipts_post_covid = unique_receipts_post_covid.query('year > 2023')

# Number of non-null unique_key values per receipt date.
unique_receipts_post_covid = unique_receipts_post_covid.groupby(['receiptdate'])['unique_key'].agg('count').reset_index()

count_unique_receipts_daily = unique_receipts_post_covid.rename(columns={'unique_key': 'daily_count'})

# Overall mean of the daily counts, repeated on every row.
count_unique_receipts_daily['avg_daily_count'] = count_unique_receipts_daily['daily_count'].mean()

# Extract month letter and year
count_unique_receipts_daily['month_year'] = count_unique_receipts_daily['receiptdate'].dt.strftime('%b-%y')
count_unique_receipts_daily['year'] = count_unique_receipts_daily['receiptdate'].dt.strftime('%Y')
print(count_unique_receipts_daily)

# Monthly view: mean of the daily rows within each (year, month_year) group.
count_unique_receipts_monthly = count_unique_receipts_daily.groupby(['year', 'month_year'], as_index=False).mean()
count_unique_receipts_monthly = count_unique_receipts_monthly.rename(columns={'daily_count': 'avg_monthly_count'})

# Display the result
print(count_unique_receipts_monthly)

# Annual view: mean of the monthly means per year.
count_unique_receipts_annual = count_unique_receipts_monthly.groupby(['year'], as_index=False).mean()
count_unique_receipts_annual = count_unique_receipts_annual.rename(columns={'avg_monthly_count': 'avg_annual_count'})

# Display the result
print(count_unique_receipts_annual)

# TODO: work out how many people applied for an LPA and received the power of
# attorney, and how many applications there were per year/month/week by age
# group since 2007.

In [None]:
# Number of distinct unique_key values in each year.
d = lpa_unique.sort_values(by='unique_key')
count_unique_grouped_year = (
    d.groupby(['year'])['unique_key']
    .nunique()
    .reset_index(name='count')
)
count_unique_grouped_year

In [None]:
# Number of distinct unique_key values for each age.
d = lpa_unique.sort_values(by='unique_key')
count_unique_grouped_age = (
    d.groupby(['age'])['unique_key']
    .nunique()
    .reset_index(name='count')
)
count_unique_grouped_age

In [None]:
g = lpa_unique
# Count non-null unique_key values for every distinct combination of the
# descriptive columns listed below.
grouping_columns = [
    'receiptdate', 'uid', 'casesubtype', 'status', 'donor_dob',
    'donor_postcode', 'donor_gender', 'age', 'year',
]
count_unique_grouped_all = g.groupby(grouping_columns)['unique_key'].count().reset_index(name='count')
count_unique_grouped_all

In [None]:
g = lpa_unique
# Count non-null unique_key values per (age, year) pair, labelling the
# count column directly with its final name.
count_unique_grouped_age_year = (
    g.groupby(['age', 'year'])['unique_key']
    .count()
    .reset_index(name='Count_of_CASEID')
)

# Save the result into a csv file
count_unique_grouped_age_year.to_csv(r'count_unique_grouped_age_year.csv')

In [None]:
# Number of distinct unique_key values in each year.
d = lpa_unique.sort_values(by='unique_key')
count_unique_grouped_year = d.groupby(['year'])['unique_key'].nunique().reset_index(name='count')

# These two statements were fused onto one line, which raised a NameError.
# The save comes first so the cell still displays the yearly table.
count_unique_grouped_age_year.to_csv(r'count_unique_grouped_age_year.csv')
count_unique_grouped_year

In [None]:
# Number of distinct unique_key values for each age.
d = lpa_unique.sort_values(by='unique_key')
count_unique_grouped_age = (
    d.groupby(['age'])['unique_key']
    .nunique()
    .reset_index(name='count')
)
count_unique_grouped_age

In [None]:
g = lpa_unique
# Count non-null unique_key values for every distinct combination of the
# descriptive columns listed below.
grouping_columns = [
    'receiptdate', 'uid', 'casesubtype', 'status', 'donor_dob',
    'donor_postcode', 'donor_gender', 'age', 'year',
]
count_unique_grouped_all = g.groupby(grouping_columns)['unique_key'].count().reset_index(name='count')
count_unique_grouped_all

In [None]:
g = lpa_unique
# Count non-null unique_key values per (age, year) pair, labelling the
# count column directly with its final name.
count_unique_grouped_age_year = (
    g.groupby(['age', 'year'])['unique_key']
    .count()
    .reset_index(name='Count_of_CASEID')
)

# Save the result into a csv file
count_unique_grouped_age_year.to_csv(r'count_unique_grouped_age_year.csv')

In [None]:
# Number of distinct unique_key values for each (year, donor_gender, age)
# combination.
distinct_keys = lpa_data_no_index.groupby(['year', 'donor_gender', 'age'])['unique_key'].nunique()
count_unique_records = distinct_keys.reset_index(name='count')

# Open questions for later analysis:
# - Determine whether the application type [casesubtype] is hw=health and
#   welfare or pfa=property and finance.
# - How many certificate providers (cp) are there for each LPA application?
# - Location-based and geographical data for the donor can be used to identify
#   their financial situation and whether they are located in England or Wales.

In [None]:
# Extract the year from the receiptdate
#receipt_year = birth_year        

# Calculate the age of the person
#age = receiptdate - birth_year
#lpa_df['a'] = 
############(pd.to_datetime(lpa_df['receiptdate'], errors = 'coerce').dt.day - pd.to_datetime(lpa_df['donor_dob'], errors = 'coerce').dt.day) # / 365.25
#lpa_df

# Create an Excel writer
# Write one worksheet per year to a single workbook. The context manager
# saves and closes the file exactly once on exit; ExcelWriter.save() no longer
# exists in pandas >= 2.0, and calling it before close() closed the workbook
# twice on older versions.
with pd.ExcelWriter('LPA_Data_actuals_Years.xlsx', engine='xlsxwriter') as writer:
    # Iterate through unique years and save data to separate sheets
    for year in count_unique_grouped_age_year['year'].unique():
        year_data = count_unique_grouped_age_year[count_unique_grouped_age_year['year'] == year]
        year_data.to_excel(writer, sheet_name=str(year), index=False)

# Display the rows written for the last year processed.
year_data

In [None]:
# Cross-tabulate ages (rows) against years (columns); each cell holds the
# number of non-null unique_key values for that pair.
pivot_table = lpa_unique.pivot_table(
    values='unique_key',
    index='age',
    columns='year',
    aggfunc='count',
)

# Age/year pairs with no records come back as NaN; show them as zero.
pivot_table_filled = pivot_table.fillna(0)

print(pivot_table_filled)

In [None]:
# Save the result into a csv file
pivot_table_filled.to_csv(r'count_unique_records.csv')

# Define the source path of the CSV file (assuming it's in the current directory)
source_csv_path = "count_unique_records.csv"

# Define the target directory where the CSV file should be placed
target_directory = "csv_files"

# Create the target directory if needed. exist_ok avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(target_directory, exist_ok=True)

# Move the CSV file to the target directory
shutil.move(source_csv_path, os.path.join(target_directory, "count_unique_records.csv"))

# Print a success message
print(f"The CSV file {source_csv_path} was successfully moved to {target_directory}.")

# Mortality Statistics
## Source Data For Mortality Statistics and Modelled Age Specific Survival Rates (Model Input Set By Control Assumptions)

# LPA Control Assumptions
## Specific Key Assumptions that control expected demand , LPA market size and saturation.


# Meta data and Variable selection and Data Cleaning for the Mortality statistics data based on population projections:

## Goal: 
### What proportion of the UK population are likely to buy LPA and still alive?
*How many people are still alive (living donors who bought an LPA)?*
*Based on ONS data for the population of England and Wales, how many people are still alive and how many of them have died?*
*e.g., if there are 1000 people and 100 of them are still alive and have bought an LPA,
then 900 of them have not yet bought an LPA.*



**1. These rates are standardised to the 2013 European Standard Population, expressed per million population; 
they allow comparisons between populations with different age structures, including between males and females and over time. 
**2.  Deaths per 1,000 live births. 
**3.  Death figures are based on deaths registered rather than deaths occurring in a calendar year.

### For information on registration delays for a range of causes, see: 
    https://webarchive.nationalarchives.gov.uk/ukgwa/20160106020016/http://www.ons.gov.uk/ons/guide-method/user-guidance/health-and-life-events/impact-of-registration-delays-on-mortality-statistics/index.html

A limiting factor in modelling numbers of surviving LPA holders aged 90+ has been the absence of single age specific mortality rates 
for this group. Estimates* suggested that previously applied mortality rates were too low, increasing the apparent numbers of 
surviving LPA holders aged 90+ and therefore over-estimating the "saturation" of this market.

For the 2018 LPA forecast , Age specific mortality rates for those aged 90+ have therefore been extrapolated based on 
a standard log power law that best fits existing mortality rates to age. 

*numbers of surviving LPA holders were estimated to exceed the total projected  population in each age group which was 
clearly not possible.


# LPA SURVIVAL TABLES:
 LPA MODEL/LPA SURVIVAL TABLES
percentage of people who died in one year


# if 1000 40-year-old males bought an LPA in 2008, what proportion of them are still alive today?

# The model takes each age category (categorical variable), assumes single age-specific rates 
# within the age range 18 to 90, and gives the percentage of males in each category who died within one year.

## e.g., in the 15-19 age category, 0.3 per 1000 males died within one year in the UK (0.03 percent)
## e.g., in the 25-29 age category, 0.6 per 1000 males died within one year in the UK (0.06 percent)
## e.g., in the 70-74 age category, 23.7 per 1000 males died within one year in the UK (2.37 percent)

## if you started at age 18, 7 years and become 25 years old ahead, 
## as the ages goes up you will fall into a higher mortality category (from 0.3 to 0.6)




# calculate naïve extrapolation for demand forecasting and calculate the low, central, and high planning estimates

In [None]:
# Extract the month of reciept from the receiptdate
#lpa_data_no_index['month'] = lpa_data_no_index['receiptdate'].dt.month
#lpa_data_no_index

In [None]:


# Count the distinct case ids (uid) for every year / month_year / age combination
grouping_keys = ['year', 'month_year', 'age']
distinct_cases = lpa_data.groupby(grouping_keys)['uid'].nunique()

# Flatten the grouped result back into a regular table, naming the count column directly
count_unique_month = distinct_cases.reset_index(name='Count_of_CASEID_month')

# Display the result
print(count_unique_month)

# Optional: save the result into a csv file
#count_unique_month.to_csv(r'count_unique_month.csv')

In [None]:
# Restrict the monthly counts to an age band and to a fixed set of months,
# then export the filtered table.
age_lower_limit = 50
age_upper_limit = 70
months_of_interest = ['Jan-24', 'Feb-24', 'Mar-24']

# Keep rows whose age lies in [age_lower_limit, age_upper_limit] (both ends inclusive)
count_unique_month1 = count_unique_month.loc[
    count_unique_month["age"].between(age_lower_limit, age_upper_limit)
]

# Keep only the selected month_year labels
count_unique_month2 = count_unique_month1[count_unique_month1['month_year'].isin(months_of_interest)]

# Take an explicit copy: count_unique_month2 is the result of chained boolean filtering,
# and adding columns to such a slice later triggers pandas' SettingWithCopyWarning
# (the write may not behave as intended). A copy makes the table safe to extend.
count_unique_month_age = count_unique_month2.copy()

# Save the result into a csv file
count_unique_month_age.to_csv(r'count_unique_month_age.csv')

In [None]:


# Naive demand forecast: each month's forecast is the previous month's observed count,
# with low / central / high planning estimates derived from it, plus MAPE and MAD.

# Work on an independent copy so that adding columns never writes into a filtered view
count_unique_month_age = count_unique_month_age.copy()

# The table holds one row per (month_year, age), so a plain shift(1) over the whole
# frame would take the neighbouring age row rather than the previous month.
# month_year is a '%b-%y' label (e.g. 'Jan-24'); parse it so months sort chronologically
# instead of alphabetically, then lag the count within each age.
month_start = pd.to_datetime(count_unique_month_age['month_year'], format='%b-%y')
count_unique_month_age['Naive_Forecast'] = (
    count_unique_month_age
    .assign(_month_start=month_start)
    .sort_values('_month_start')
    .groupby('age')['Count_of_CASEID_month']
    .shift(1)  # previous available month for the same age; NaN for the first month
)  # the assignment aligns on the index, so the original row order is kept

# Define planning estimate factors
low_factor = 0.9
high_factor = 1.1

# Calculate planning estimates
count_unique_month_age['Low_Planning_Estimate'] = count_unique_month_age['Naive_Forecast'] * low_factor
count_unique_month_age['Central_Planning_Estimate'] = count_unique_month_age['Naive_Forecast']
count_unique_month_age['High_Planning_Estimate'] = count_unique_month_age['Naive_Forecast'] * high_factor

# Forecast error per row (NaN where there is no previous month to compare against)
forecast_error = count_unique_month_age['Count_of_CASEID_month'] - count_unique_month_age['Naive_Forecast']

# Calculate MAPE (Mean Absolute Percentage Error); mean() skips the NaN rows
count_unique_month_age['Absolute_Percentage_Error'] = forecast_error.abs() / count_unique_month_age['Count_of_CASEID_month']
mape = count_unique_month_age['Absolute_Percentage_Error'].mean() * 100

# Calculate MAD (Mean Absolute Deviation)
count_unique_month_age['Absolute_Deviation'] = forecast_error.abs()
mad = count_unique_month_age['Absolute_Deviation'].mean()

# Display results
print(count_unique_month_age)
print(f"MAPE: {mape:.2f}%")
print(f"MAD: {mad:.2f}")
print("\nPlanning Estimates:")
# iloc[-1] reports the estimates attached to the last row of the table
print(f"Low Planning Estimate: {count_unique_month_age['Low_Planning_Estimate'].iloc[-1]:.2f}")
print(f"Central Planning Estimate: {count_unique_month_age['Central_Planning_Estimate'].iloc[-1]:.2f}")
print(f"High Planning Estimate: {count_unique_month_age['High_Planning_Estimate'].iloc[-1]:.2f}")

In [None]:
# Create a line plot
# Line plot of the low / central / high planning estimates.
# The estimate columns are created on count_unique_month_age; the previously referenced
# df_count_unique_month is only ever assigned in commented-out code, so plotting from it
# fails with a NameError.
plt.figure(figsize=(8, 6))

# NOTE(review): the x values are the observed counts, while the axis is labelled 'Month' -
# confirm whether month_year was the intended x-axis.
observed_counts = count_unique_month_age['Count_of_CASEID_month']
plt.plot(observed_counts, count_unique_month_age['Low_Planning_Estimate'], label='Low Estimate', marker='o')
plt.plot(observed_counts, count_unique_month_age['Central_Planning_Estimate'], label='Central Estimate', marker='s')
plt.plot(observed_counts, count_unique_month_age['High_Planning_Estimate'], label='High Estimate', marker='^')

# Add labels and title
plt.xlabel('Month')
plt.ylabel('Sales Estimate')
plt.title('Demand Forecasting Estimates')
plt.grid(True)
plt.legend()

In [None]:

# Monthly receipts: distinct receipt dates per year / month_year.
# lpa_reciepts is an alias of lpa_unique (not a copy), so the month_year column
# written below is added to lpa_unique as well.
lpa_reciepts = lpa_unique

# Month label in abbreviated-month / two-digit-year form, e.g. 'Jan-24'
lpa_reciepts['month_year'] = lpa_reciepts['receiptdate'].dt.strftime('%b-%y')

# nunique() counts the distinct receiptdate values inside each (year, month_year) group;
# the count column is named directly when the group keys are turned back into columns
count_reciepts_month = (
    lpa_reciepts
    .groupby(['year', 'month_year'])['receiptdate']
    .nunique()
    .reset_index(name='Count_of_reciepts_month')
)

# Display the result
print(count_reciepts_month)

# Optional: save the result into a csv file
#count_reciepts_month.to_csv(r'count_reciepts_month.csv')

In [None]:

# Annual receipts: distinct receipt dates per year.
# lpa_reciepts is an alias of lpa_data (not a copy), so the month_year column
# written below is (re)written on lpa_data as well.
lpa_reciepts = lpa_data

# Month label in abbreviated-month / two-digit-year form, e.g. 'Jan-24'
lpa_reciepts['month_year'] = lpa_reciepts['receiptdate'].dt.strftime('%b-%y')

# nunique() counts the distinct receiptdate values inside each year;
# the count column is named directly when the year key is turned back into a column
Count_of_reciepts_annual = (
    lpa_reciepts
    .groupby(['year'])['receiptdate']
    .nunique()
    .reset_index(name='Count_of_reciepts_annual')
)

# Display the result
print(Count_of_reciepts_annual)

# Optional: save the result into a csv file
####Count_of_reciepts_annual.to_csv(r'Count_of_reciepts_annual.csv')