In [None]:
import os
import pandas as pd
import numpy as np
import boto3
import getpass
import warnings
warnings.filterwarnings('ignore')

import datetime as dt
from datetime import timedelta, datetime
from dateutil.relativedelta import relativedelta

import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO

!pip install xlsxwriter
import xlsxwriter
#from scipy.stats import kstest, probplot

#import pymc as pm
#import arviz as az

In [None]:
## ------------------Step 1: DataHandler Class-------------------- ##

class DataHandler:
    """Load the CSV data cuts stored under one S3 prefix into a single DataFrame.

    Typical use: ``list_files`` -> ``read_csv_files`` -> ``concatenate_dataframes``
    -> ``set_dates``. The combined frame is stored on ``combined_df`` and the
    analysis window on ``snapshot_start`` / ``snapshot_end`` (date strings).
    """

    # Default (exclusive) lower bound of the analysis window.
    DEFAULT_SNAPSHOT_START = '2021-12-31'  # '2006-12-31'
    # The snapshot date is the text between these two markers in a file name,
    # e.g. '...opg-analytical_cases_P<snapshot date>_S...'.
    _SNAPSHOT_PREFIX = 'opg-analytical_cases_P'
    _SNAPSHOT_SUFFIX = '_S'

    def __init__(self, bucket_name, folder_path):
        """Create an S3 client and remember where the data cuts live.

        Args:
            bucket_name: Name of the S3 bucket.
            folder_path: Key prefix (folder) inside the bucket that holds the CSV files.
        """
        self.bucket_name = bucket_name
        self.folder_path = folder_path
        self.s3_client = boto3.client('s3')
        self.selected_columns = ["receiptdate", "cases_glueexporteddate", "uid", "type", "casesubtype", "status", "donor_dob", "donor_postcode", "donor_gender"]
        self.combined_df = None
        self.snapshot_start = self.DEFAULT_SNAPSHOT_START
        self.snapshot_end = None  # '2023-12-31'

    def list_files(self):
        """Return the base names of every ``.csv`` object under ``folder_path``.

        A paginator is used because a single ``list_objects_v2`` call returns at
        most 1000 keys; without it larger folders would be silently truncated.
        """
        paginator = self.s3_client.get_paginator('list_objects_v2')
        pages = paginator.paginate(Bucket=self.bucket_name, Prefix=self.folder_path)

        csv_file_names = []
        for page in pages:
            for obj in page.get('Contents', []):
                key = obj.get('Key')
                if key is None:
                    continue
                file_name = os.path.basename(key)
                if file_name.lower().endswith('.csv'):
                    csv_file_names.append(file_name)
        return csv_file_names

    def read_csv_files(self, file_names):
        """Download each CSV and keep only ``selected_columns``.

        Args:
            file_names: Base names as returned by ``list_files``.

        Returns:
            dict mapping file name -> DataFrame restricted to ``selected_columns``.
            Files that cannot be read or lack a selected column are reported with
            ``print`` and left out of the result (best-effort load).
        """
        dfs = {}
        for file_name in file_names:
            try:
                csv_obj = self.s3_client.get_object(Bucket=self.bucket_name, Key=f'{self.folder_path}/{file_name}')
                csv_string = csv_obj['Body'].read().decode('utf-8')
                df = pd.read_csv(StringIO(csv_string))
                dfs[file_name] = df[self.selected_columns]
            except Exception as e:
                # Deliberately best-effort: one bad file must not abort the whole load.
                print(f"Error reading {file_name}: {e}")
        return dfs

    def concatenate_dataframes(self, dataframes):
        """Stack the per-file frames into ``combined_df`` with a fresh 0..n-1 index.

        Raises:
            ValueError: if ``dataframes`` is empty (nothing was loaded).
        """
        if not dataframes:
            raise ValueError(
                f"No dataframes to concatenate: no CSV file could be read from "
                f"s3://{self.bucket_name}/{self.folder_path}"
            )
        self.combined_df = pd.concat(dataframes.values(), ignore_index=True)

    def set_dates(self, start_date=None, end_date=None):
        """Set the analysis window.

        Args:
            start_date: Window start; falls back to ``DEFAULT_SNAPSHOT_START`` when falsy.
            end_date: Window end; when falsy it is parsed from a data-cut file name
                (the text between ``opg-analytical_cases_P`` and ``_S``).

        Raises:
            ValueError: if ``end_date`` is not given and no CSV file name carries
                the snapshot marker.
        """
        self.snapshot_start = start_date if start_date else self.DEFAULT_SNAPSHOT_START
        if end_date:
            self.snapshot_end = end_date
            return

        file_names = self.list_files()
        if not file_names:
            raise ValueError(
                f"Cannot infer snapshot_end: no CSV files under "
                f"s3://{self.bucket_name}/{self.folder_path}"
            )
        # NOTE(review): the original code read index 1 (the second file) into a
        # variable called "first". That choice is kept when two or more files
        # exist; with a single file we fall back to it instead of raising
        # IndexError. Confirm whether index 0 was intended.
        reference_name = file_names[1] if len(file_names) > 1 else file_names[0]

        parts = reference_name.split(self._SNAPSHOT_PREFIX)
        if len(parts) < 2:
            raise ValueError(
                f"Cannot infer snapshot_end: {reference_name!r} does not contain "
                f"{self._SNAPSHOT_PREFIX!r}"
            )
        self.snapshot_end = parts[1].lstrip().split(self._SNAPSHOT_SUFFIX)[0]

## -------------------

# Name of the S3 bucket that holds the data cuts.
bucketName = "alpha-opg-analytical"

# Folder (key prefix) inside the bucket that contains the new LPA data.
folderPath = "sirius_data_cuts_3"

# Load pipeline: list the CSV files, read them, stack them into
# data_handler.combined_df, then set the snapshot window. set_dates() is called
# without arguments, so the handler chooses snapshot_start / snapshot_end itself.
data_handler = DataHandler(bucket_name=bucketName, folder_path=folderPath)
file_names = data_handler.list_files()
dataframes = data_handler.read_csv_files(file_names)
data_handler.concatenate_dataframes(dataframes)
data_handler.set_dates()


# Quick sanity check: show the first two rows of the combined frame.
print(data_handler.combined_df.head(2))


In [None]:

class DataPreprocessor:
    """Clean the combined case extract and derive receipt / unique-donor counts.

    ``preprocess_data`` must run first; it stores its result on
    ``lpa_data_sample_imputed``, which the other methods read.
    """

    # Columns carried forward from the combined extract.
    _LPA_COLUMNS = ["receiptdate", "uid", "casesubtype", "status", "donor_dob", "donor_postcode", "donor_gender"]

    def __init__(self, combined_df, snapshot_start, snapshot_end):
        """Store the input frame and the (exclusive) snapshot window bounds."""
        self.combined_df = combined_df
        self.snapshot_start = snapshot_start
        self.snapshot_end = snapshot_end
        self.lpa_data_sample_imputed = None

    def preprocess_data(self):
        """Filter to LPA cases in the snapshot window, derive age/year and impute.

        Steps:
          1. Keep rows with ``type == 'lpa'`` and
             ``snapshot_start < receiptdate < snapshot_end``; drop rows with any
             missing value in the carried columns.
          2. ``age`` = receipt year minus birth year; ``year`` = receipt year.
          3. Ages below 0 or above 126 are blanked and replaced by the most
             common age of the same receipt year.
          4. Right-join onto ages 18..106: rows outside that range are dropped
             and every missing age gets one placeholder row.
          5. Remaining gaps in non-key columns are filled with the per-age mode.

        The input ``combined_df`` is not modified. The result is stored on
        ``self.lpa_data_sample_imputed``.
        """
        source = self.combined_df
        # Parse once into a separate Series so the caller's frame is left untouched.
        receipt_dates = pd.to_datetime(source['receiptdate'], errors='coerce')
        in_scope = (
            (receipt_dates > pd.to_datetime(self.snapshot_start))
            & (receipt_dates < pd.to_datetime(self.snapshot_end))
            & (source['type'] == 'lpa')
        )
        lpa_df = (
            source.loc[in_scope, self._LPA_COLUMNS]
            .assign(receiptdate=receipt_dates[in_scope])
            .dropna()
            .copy()
        )

        birth_dates = pd.to_datetime(lpa_df['donor_dob'], errors='coerce')
        lpa_df['age'] = lpa_df['receiptdate'].dt.year - birth_dates.dt.year
        lpa_df['donor_dob'] = birth_dates.dt.date
        lpa_df['year'] = lpa_df['receiptdate'].dt.year

        # Blank implausible ages. Series.mask upcasts to float only when something
        # is actually replaced, unlike assigning NaN into an integer column.
        implausible_age = (lpa_df['age'] < 0) | (lpa_df['age'] > 126)
        lpa_df['age'] = lpa_df['age'].mask(implausible_age)

        def _first_mode(values):
            # A year whose ages are all missing has no mode; leave it missing.
            modes = values.mode()
            return modes.iloc[0] if not modes.empty else np.nan

        # Vectorised replacement for the former row-wise apply(axis=1).
        most_common_age = lpa_df.groupby('year')['age'].apply(_first_mode)
        lpa_df['age'] = lpa_df['age'].fillna(lpa_df['year'].map(most_common_age))

        # Ensure all ages from 18 to 106 are present
        all_ages = pd.DataFrame({'age': range(18, 107)})
        lpa_df = lpa_df.merge(all_ages, on='age', how='right')

        def _fill_with_group_mode(values):
            # Groups with no observed value (placeholder ages) stay missing.
            modes = values.mode()
            if modes.empty:
                return values
            return values.fillna(modes.iloc[0])

        # Fill remaining gaps in every non-key column with that age's most common value.
        for column in lpa_df.columns:
            if column in ('age', 'year'):
                continue
            lpa_df[column] = lpa_df.groupby('age')[column].transform(_fill_with_group_mode)

        self.lpa_data_sample_imputed = lpa_df

    @staticmethod
    def _count_working_days(year):
        """Number of Monday-Friday days in ``year`` (public holidays are not excluded)."""
        return len(pd.date_range(f'{year}-01-01', f'{year}-12-31', freq='B'))

    def calculate_annual_receipts(self):
        """Summarise receipts per year from the preprocessed data.

        For each year: the mean and standard deviation of receipts per receipt
        date, and both scaled by the number of working days in that year.

        Returns:
            DataFrame sorted by ``year`` with columns ``year``,
            ``Average Daily Reciept``, ``Yearly Receipt``,
            ``Standard Deviation Daily Reciept``, ``Yearly Standard Deviation``.
        """
        df = self.lpa_data_sample_imputed

        results = {
            'year': [],
            'Average Daily Reciept': [],
            'Yearly Receipt': [],
            'Standard Deviation Daily Reciept': [],
            'Yearly Standard Deviation': []
        }

        # Placeholder rows added for missing ages have no year; skip them and
        # normalise float years (2022.0) so the date strings stay valid.
        for year in df['year'].dropna().unique():
            year = int(year)
            daily_reciept = df[df['year'] == year].groupby('receiptdate').size()
            average_daily_reciept = daily_reciept.mean()
            std_dev_daily_reciept = daily_reciept.std()
            working_days = self._count_working_days(year)

            results['year'].append(year)
            results['Average Daily Reciept'].append(average_daily_reciept)
            results['Yearly Receipt'].append(average_daily_reciept * working_days)
            results['Standard Deviation Daily Reciept'].append(std_dev_daily_reciept)
            # Fixed: this used the daily mean, duplicating 'Yearly Receipt'.
            # NOTE(review): linear scaling mirrors the yearly-receipt formula; if
            # days are treated as independent, std * sqrt(working_days) applies.
            results['Yearly Standard Deviation'].append(std_dev_daily_reciept * working_days)

        annual_receipts = pd.DataFrame(results)
        annual_receipts = annual_receipts.sort_values(by=['year'])

        return annual_receipts

    def create_unique_identifier(self):
        """Add a donor key (dob + gender + postcode without spaces) and ``month_year``.

        Note: ``donor_postcode`` is normalised and ``unique_key`` is added on the
        stored ``lpa_data_sample_imputed`` frame itself. The returned frame is a
        new object sorted by ``receiptdate``.
        """
        lpa_unique = self.lpa_data_sample_imputed
        lpa_unique['donor_postcode'] = lpa_unique['donor_postcode'].str.strip().str.replace(' ', '', regex=False)
        lpa_unique['unique_key'] = lpa_unique['donor_dob'].astype(str) + lpa_unique['donor_gender'] + lpa_unique['donor_postcode']

        duplicate_mask = lpa_unique.duplicated('unique_key', keep=False)
        has_nonzero_field = lpa_unique[['age', 'donor_gender', 'donor_postcode']].ne(0).any(axis=1)
        lpa_unique = pd.concat([lpa_unique.loc[duplicate_mask & has_nonzero_field],
                                lpa_unique[~duplicate_mask]])

        lpa_unique['month_year'] = lpa_unique['receiptdate'].dt.strftime('%b-%y')
        lpa_unique = lpa_unique.sort_values(by=['receiptdate'])
        return lpa_unique

    def calculate_unique_donors_per_year(self, lpa_df_index):
        """Count distinct ``unique_key`` values per (year, age).

        Side effect kept from the original: ``lpa_df_index`` is re-indexed in
        place by ``receiptdate``. The re-index is skipped when ``receiptdate`` is
        no longer a column, so calling this twice (e.g. re-running a cell) works.
        """
        if 'receiptdate' in lpa_df_index.columns:
            lpa_df_index.set_index('receiptdate', inplace=True)
        annual_unique_donors = lpa_df_index.groupby(['year', 'age'])['unique_key'].nunique().reset_index(name='count')
        annual_unique_donors = annual_unique_donors.sort_values(by=['year', 'age'])
        return annual_unique_donors

    def calculate_unique_donors_count_monthly(self, lpa_df_index):
        """Count distinct ``unique_key`` values per (year, month, donor_gender, age)."""
        lpa_df_no_index = lpa_df_index.reset_index()
        lpa_df_no_index['month'] = lpa_df_no_index['receiptdate'].dt.month
        group_keys = ['year', 'month', 'donor_gender', 'age']
        monthly_unique_donors = lpa_df_no_index.groupby(group_keys)['unique_key'].nunique().reset_index(name='count')
        monthly_unique_donors = monthly_unique_donors.sort_values(by=group_keys)
        return monthly_unique_donors

    def calculate_unique_donors_per_day(self, lpa_df_index):
        """Count distinct ``unique_key`` values per receipt date, donor_gender and age."""
        lpa_df_no_index = lpa_df_index.reset_index()
        lpa_df_no_index['month'] = lpa_df_no_index['receiptdate'].dt.month
        lpa_df_no_index['day'] = lpa_df_no_index['receiptdate'].dt.day
        daily_unique_donors = lpa_df_no_index.groupby(['year', 'month', 'day', 'receiptdate', 'donor_gender', 'age'])['unique_key'].nunique().reset_index(name='count')
        daily_unique_donors = daily_unique_donors.sort_values(by=['receiptdate', 'year', 'month', 'day', 'donor_gender', 'age'])
        return daily_unique_donors

    def calculate_unique_donors_age_specific(self, lpa_df_index):
        """Count distinct ``unique_key`` values per (year, age) without touching the index."""
        annual_unique_donors_age_specific = lpa_df_index.groupby(['year', 'age'])['unique_key'].nunique().reset_index(name='count')
        annual_unique_donors_age_specific = annual_unique_donors_age_specific.sort_values(by=['year', 'age'])
        return annual_unique_donors_age_specific

    def calculate_unique_donors_count_daily(self, lpa_df_index):
        """Return a Series of distinct ``unique_key`` counts indexed by year.

        Despite the method name, the grouping is by ``year`` only.
        """
        annual_unique_counts = lpa_df_index.groupby(lpa_df_index['year'])['unique_key'].nunique()
        return annual_unique_counts


In [None]:
# ## ------------------Step 2: DataPreprocessor Class-------------------- ##

# class DataPreprocessor:
#     def __init__(self, combined_df, snapshot_start, snapshot_end):
#         self.combined_df = combined_df
#         self.snapshot_start = snapshot_start
#         self.snapshot_end = snapshot_end
#         self.lpa_data_sample_imputed = None

#     def preprocess_data(self):
#         df_filtered = self.combined_df
#         df_filtered['receiptdate'] = pd.to_datetime(df_filtered['receiptdate'], errors='coerce')
#         df_filtered = df_filtered.loc[(df_filtered['receiptdate'] > pd.to_datetime(self.snapshot_start))]
#         df_filtered = df_filtered.loc[(df_filtered['receiptdate'] < pd.to_datetime(self.snapshot_end))]
#         df_filtered = df_filtered.loc[(df_filtered['type'] == 'lpa')]
#         df = df_filtered[["receiptdate", "uid", "casesubtype", "status", "donor_dob", "donor_postcode", "donor_gender"]]
#         lpa_df = df.dropna()
#         lpa_df['age'] = pd.to_datetime(lpa_df['receiptdate'], errors='coerce').dt.year - pd.to_datetime(lpa_df['donor_dob'], errors='coerce').dt.year
#         lpa_df['donor_dob'] = pd.to_datetime(lpa_df['donor_dob'], errors='coerce').dt.date
#         lpa_df['receiptdate'] = pd.to_datetime(lpa_df['receiptdate'], errors='coerce')
#         lpa_df['year'] = lpa_df['receiptdate'].dt.year
#         lpa_data_sample_imputed = lpa_df

#         criteria = lpa_data_sample_imputed[(lpa_data_sample_imputed['age'] < 0) | (lpa_data_sample_imputed['age'] > 126)]
#         lpa_data_sample_imputed.loc[criteria.index, 'age'] = np.nan

#         most_common_age = lpa_data_sample_imputed.groupby('year')['age'].apply(lambda x: x.mode().iloc[0])
#         lpa_data_sample_imputed['age'] = lpa_data_sample_imputed.apply(lambda row: most_common_age[row['year']] if pd.isna(row['age']) else row['age'], axis=1)

#         self.lpa_data_sample_imputed = lpa_data_sample_imputed

#     def calculate_annual_receipts(self):
#         df = self.lpa_data_sample_imputed

#         def count_working_days(year):
#             start_date = f'{year}-01-01'
#             end_date = f'{year}-12-31'
#             date_range = pd.date_range(start_date, end_date, freq='B')
#             return len(date_range)

#         results = {
#             'year': [],
#             'Average Daily Reciept': [],
#             'Yearly Receipt': [],
#             'Standard Deviation Daily Reciept': [],
#             'Yearly Standard Deviation': []
#         }

#         for year in df['year'].unique():
#             df_year = df[df['year'] == year]
#             daily_reciept = df_year.groupby('receiptdate').size()
#             average_daily_reciept = daily_reciept.mean()
#             std_dev_daily_reciept = daily_reciept.std()
#             working_days = count_working_days(year)
#             yearly_reciept = average_daily_reciept * working_days
#             yearly_std_dev = average_daily_reciept * working_days

#             results['year'].append(year)
#             results['Average Daily Reciept'].append(average_daily_reciept)
#             results['Yearly Receipt'].append(yearly_reciept)
#             results['Standard Deviation Daily Reciept'].append(std_dev_daily_reciept)
#             results['Yearly Standard Deviation'].append(yearly_std_dev)

#         annual_receipts = pd.DataFrame(results)
#         annual_receipts = annual_receipts.sort_values(by=['year'])

#         return annual_receipts

#     def create_unique_identifier(self):
#         lpa_unique = self.lpa_data_sample_imputed
#         lpa_unique['donor_postcode'] = lpa_unique['donor_postcode'].str.strip().str.replace(' ', '')
#         lpa_unique['unique_key'] = lpa_unique['donor_dob'].astype(str) + lpa_unique['donor_gender'] + lpa_unique['donor_postcode']

#         duplicate_mask = lpa_unique.duplicated('unique_key', keep=False)
#         lpa_unique = pd.concat([lpa_unique.loc[duplicate_mask & lpa_unique[['age', 'donor_gender', 'donor_postcode']].ne(0).any(axis=1)], 
#                                 lpa_unique[~duplicate_mask]])

#         lpa_unique['month_year'] = lpa_unique['receiptdate'].dt.strftime('%b-%y')
#         lpa_unique = lpa_unique.sort_values(by=['receiptdate'])
#         return lpa_unique

#     def calculate_unique_donors_per_year(self, lpa_df_index):
#         lpa_df_index.set_index('receiptdate', inplace=True)
#         annual_unique_donors = lpa_df_index.groupby(['year', 'donor_gender', 'age'])['unique_key'].nunique().reset_index(name='count')
#         annual_unique_donors = annual_unique_donors.sort_values(by=['year', 'donor_gender', 'age'])
#         return annual_unique_donors

#     def calculate_unique_donors_count_monthly(self, lpa_df_index):
#         lpa_df_no_index = lpa_df_index.reset_index()
#         lpa_df_no_index['month'] = lpa_df_no_index['receiptdate'].dt.month
#         monthly_unique_donors = lpa_df_no_index.groupby(['year', 'month', 'donor_gender', 'age'])['unique_key'].nunique().reset_index(name='count')
#         monthly_unique_donors = monthly_unique_donors.sort_values(by=['year', 'month', 'donor_gender', 'age'])
#         return monthly_unique_donors

#     def calculate_unique_donors_per_day(self, lpa_df_index):     
#         lpa_df_no_index = lpa_df_index.reset_index()
#         lpa_df_no_index['month'] = lpa_df_no_index['receiptdate'].dt.month
#         lpa_df_no_index['day'] = lpa_df_no_index['receiptdate'].dt.day
#         daily_unique_donors = lpa_df_no_index.groupby(['year', 'month', 'day', 'receiptdate', 'donor_gender', 'age'])['unique_key'].nunique().reset_index(name='count')
#         daily_unique_donors = daily_unique_donors.sort_values(by=['receiptdate', 'year', 'month', 'day', 'donor_gender', 'age'])
#         return daily_unique_donors    
    
#     def calculate_unique_donors_age_specific(self, lpa_df_index):
#        #lpa_df_index.set_index('receiptdate', inplace=True)
#         annual_unique_donors_age_specific = lpa_df_index.groupby(['year', 'age'])['unique_key'].nunique().reset_index(name='count')
#         annual_unique_donors_age_specific = annual_unique_donors_age_specific.sort_values(by=['year', 'age'])
#         return annual_unique_donors_age_specific
        
#     def calculate_unique_donors_count_daily(self, lpa_df_index):
#         #daily_unique_donors = lpa_df_index.groupby(['year'])['unique_key'].nunique().reset_index(name='count')
#         #daily_unique_donors = daily_unique_donors.sort_values(by=['year'])
#         # Group by 'year' and count unique 'unique_id'
#         annual_unique_counts = lpa_df_index.groupby(lpa_df_index['year'])['unique_key'].nunique()    
#         return annual_unique_counts
    
#     # def aggregate_by_age_group(self, bins, labels):
        
#     #     daily_unique_donors = lpa_df_index.groupby(['year', 'month_year', 'donor_gender', 'age'])['unique_key'].nunique().reset_index(name='count')

#     #     # Create age groups
#     #     bins = list(range(18, 106))
#     #     labels = [str(i) for i in range(18, 107)]
        
#     #     # Create a new column 'AgeGroup' with the age bins
#     #     self.data['AgeGroup'] = pd.cut(self.data['age'], bins=bins, labels=labels, right=False)
        
#     #     # Aggregate data within each age group
#     #     aggregated_data = self.data.groupby('AgeGroup').agg({
#     #         'demand': 'sum',
#     #         'name': 'count'
#     #     }).rename(columns={'name': 'count'})
        
#     #     return aggregated_data    
 



# Short-term forecast of age-specific donor demand for the remaining months of 2024, based on short-term drivers such as the Martin Lewis broadcast
Only the first quarter of age-specific donor demand is available for 2024 (annual_donors_age_2024), so the remaining quarters of 2024 are forecast as follows:
- Estimate the short-term forecast for the remainder of 2024: take the average number of receipts per working day in 2024 (currently 5,600) and multiply it by the total number of working days in the year (256) to get the total number of receipts expected for the year (5,600 × 256 ≈ 1.43 million receipts across the whole of 2024).
    1. Calculate the average daily receipts for 2024 from the available data (only 6 months) = average_daily_reciepts_2024
    2. Multiply average_daily_reciepts_2024 by the number of working days in 2024 (num_working_day_2024) = total_annual_reciepts_2024
    3. Estimate the ratio of total_annual_reciepts_2024 to annual_donors_age_2024: ratio_reciept_donor_2024 = total_annual_reciepts_2024 / annual_donors_age_2024
- Once the total number of receipts for 2024 has been calculated, use the 2022/2023 historical data to count how many receipts and how many donors there were in 2022/2023, to get the overall ratio between the two. The starting point is the total number of receipts (just over 1.4 million on the figures above), which is then scaled back using the 2022/2023 ratio: e.g., at 70% that would suggest roughly 1,000,000 donors. To keep it simple, the same overall 2022/2023 ratio (e.g., 70%) is used for 2024, as 2022 and 2023 are the nearest years to 2024. This gives an estimate of the total number of donors we might expect in 2024, based on the average receipts seen across 2024.
    1. Calculate the average daily receipts across the whole of 2023 = average_daily_reciepts_2023
    2. Multiply average_daily_reciepts_2023 by the number of working days in 2023 (num_working_day_2023) = total_annual_reciepts_2023
    3. Multiply average_daily_reciepts_2022 by the number of working days in 2022 (num_working_day_2022) = total_annual_reciepts_2022
    4. Estimate the ratio of total_annual_reciepts_2023 to annual_donors_age_2023: ratio_reciept_donor_2023 = total_annual_reciepts_2023 / annual_donors_age_2023
    5. Estimate the ratio of total_annual_reciepts_2022 to annual_donors_age_2022: ratio_reciept_donor_2022 = total_annual_reciepts_2022 / annual_donors_age_2022
- Then scale the 2024 receipts back to the number of donors expected in 2024, and use the historical 2022/2023 age distribution to break that total down by age so that the figures can be plugged in for 2024. In other words, once the number of donors in 2024 has been calculated, work out what percentage of those donors would be aged 18, 19, 20, 21, ..., 106, based on the previous distribution (2022, 2023, or both years) together with the total number of receipts for 2022/2023. This gives the estimated number of donors at each single year of age from 18 to 106 for 2024. These figures are then plugged back into the age-specific annual donor demand.


# Monte-Carlo simulation 
The aim is a future-proof forecasting model with more reliable outputs, achieved by building uncertainty into the forecast.
Average daily receipts are volatile in the short term because of short-term variation in external factors / drivers (e.g., advertisement).
The simulation captures these short-term changes in the pattern of the data and carries the associated uncertainty through
to the long-term forecast of age-specific donor demand. For example, the average daily receipts vary in the short term
across a range of 4000 to 7000 (in increments of 100 in the drop-down list),
so each time the model is run, the forecast output of age-specific donor demand changes.

# Dealing with Uncertainty & data volatility
This introduces uncertainty into the model by simulating different scenarios for average daily receipts.
This approach captures the short-term volatility in donor receipts, leading to more reliable and robust long-term forecasts.

## Set Up Monte Carlo Simulation Parameters
Set up the simulation to run multiple iterations, where each iteration randomly selects an average daily receipt value within
a given range (e.g., 4000 to 7000). The model then calculates the donor demand based on this value,
and the results are aggregated to produce a distribution of possible outcomes.

## Run the model
- Run the Monte Carlo simulation for a specified number of iterations.
- Randomly generate an average daily receipt within the specified range for each iteration.
- Aggregate and analyse the results to provide a distribution of forecasted donor numbers.

Monte Carlo Simulation Setup:

The iterations variable defines how many times the simulation will run (in this example, 1000 iterations).
lowerBound and upperBound set the range for possible average daily receipts, reflecting the volatility (from 4000 to 7000).
Generating Random Average Daily Receipts:

For each iteration, a random average daily receipt is selected from the specified range using Application.WorksheetFunction.RandBetween(lowerBound, upperBound).
Running the Forecasting Model:

For each random receipt value, the total receipts for 2024 are calculated.
Using the ratios from 2022 and 2023, the estimated number of donors for 2024 is calculated for each iteration.
Storing and Analyzing Results:

The results of all iterations are stored in an array (monteCarloResults).
The mean and standard deviation of the forecasted donor numbers for each age group are calculated across all iterations.
Visualizations:

Two charts are added to visualize the results:
A clustered column chart showing the mean forecasted number of donors by age.
A line chart showing the standard deviation of forecasted donors by age, indicating the uncertainty.
Output:

The final results are saved in the "Forecast_2024" sheet, and the charts are added for visual analysis.
Running the Simulation:
Each time you run the ForecastWithMonteCarlo macro, it will generate different results based on the random variation in average daily receipts. This randomness helps to account for short-term volatility, making the forecast more robust and reliable over the long term.
Interpretation of Results:
The mean values give an expected forecast for each age group based on the Monte Carlo simulation.
The standard deviation indicates the level of uncertainty or volatility in the forecasted numbers. A higher standard deviation means more variability in the donor forecasts for that age group, suggesting that predictions for this group are more sensitive to changes in daily receipts.
This approach ensures that the forecasting model accounts for potential short-term variations and provides a more nuanced understanding of the possible outcomes.

## Additional Columns:

Year: Added to specify the forecast year.
AgeDistribution2022 & AgeDistribution2023: These columns store the age-specific distribution of donors for the years 2022 and 2023.
AverageAgeDistribution: This column contains the average age distribution calculated from 2022 and 2023 data.
RandomReceipt: Stores the random daily receipt value used in each iteration.
MonteCarloResults: Holds the forecasted donor number for each iteration.
Sum of Values and Sum of Squared Values: These are intermediate calculations needed to compute the mean and standard deviation for the forecasted donor numbers.
Final Forecast (Mean) and Final Forecast (StdDev): The mean and standard deviation of the forecasted donor numbers after all iterations.
Monte Carlo Simulation:

Each iteration generates a random receipt value and calculates the forecasted donor numbers. The results are stored and used to compute the final forecast.
Visualizations:

Clustered Column Chart: Visualizes the mean forecasted donor numbers by age group.
Line Chart: Shows the standard deviation (uncertainty) in the forecasted donor numbers for each age group.
Interpretation:
Year column gives context to the forecast year, which is 2024 in this scenario.
Age Distribution Columns show how donor distributions are evolving over time.
Random Receipt column illustrates the volatility and randomness considered in each iteration.
Monte Carlo Results provide insight into the forecasted donor numbers for each random receipt value.
Sum of Values and Sum of Squared Values are used to calculate the final forecast's mean and standard deviation, indicating the expected donor numbers and their uncertainty.
The charts visually summarize the data, showing how different age groups' donor forecasts change under various scenarios and the associated uncertainties.
This detailed output will help in analyzing the Monte Carlo simulation results in a more comprehensive manner.

In [None]:
# Calculate Uncertainty Ratio: average annual donor demands and average annual receipts:
# Calculate the Ratio of Short-Term to Long-Term Demand:
# Compute the ratio of the average short-term demand to the average long-term demand. 
# This ratio represents the relative impact of the short-term period on the long-term forecast.

## -------------------
# Instantiate the DataPreprocessor with the combined frame and the snapshot
# window chosen by the DataHandler, then run the preprocessing step.
data_preprocessor = DataPreprocessor(data_handler.combined_df, data_handler.snapshot_start, data_handler.snapshot_end)
data_preprocessor.preprocess_data()

In [None]:
# Calculate the total annual receipts per year.
annual_receipts = data_preprocessor.calculate_annual_receipts()
results = annual_receipts[['year', 'Yearly Receipt']]

print(f"Total annual reciepts: {results}")

# Look each figure up by its year label rather than by row position, so a
# change of snapshot window cannot silently attach the wrong year to a variable.
# A missing year now raises KeyError instead of returning another year's value.
yearly_receipt_by_year = results.set_index('year')['Yearly Receipt']

total_annual_reciepts_2022 = yearly_receipt_by_year.loc[2022]
print(f"total_annual_reciepts_2022: {total_annual_reciepts_2022}")

total_annual_reciepts_2023 = yearly_receipt_by_year.loc[2023]
print(f"total_annual_reciepts_2023: {total_annual_reciepts_2023}")

total_annual_reciepts_2024 = yearly_receipt_by_year.loc[2024]
print(f"total_annual_reciepts_2024: {total_annual_reciepts_2024}")

In [None]:
# Build the donor-level frame (adds 'unique_key' and 'month_year'), then derive
# the unique-donor counts at each granularity from it.
lpa_df_index = data_preprocessor.create_unique_identifier()

# Counts per (year, month, gender, age) and per (receipt date, gender, age).
monthly_unique_donors = data_preprocessor.calculate_unique_donors_count_monthly(lpa_df_index)
daily_unique_donors = data_preprocessor.calculate_unique_donors_per_day(lpa_df_index)
# Despite the method name, this returns one unique-donor count per year.
annual_donor_demands = data_preprocessor.calculate_unique_donors_count_daily(lpa_df_index)
# Counts per (year, age).
annual_unique_donors_age_specific = data_preprocessor.calculate_unique_donors_age_specific(lpa_df_index)
# Counts per (year, age); this call also re-indexes lpa_df_index in place by 'receiptdate'.
annual_unique_donors = data_preprocessor.calculate_unique_donors_per_year(lpa_df_index)

In [None]:
# Calculate the total annual donor demands by summing the per-day unique-donor
# counts within each year.
# NOTE(review): a donor who appears on several days (or in several gender/age
# groups) is counted once per group here, so this can exceed the number of
# distinct donors in the year (see annual_donor_demands) - confirm this is intended.
# The former `daily_unique_donors.sort_values(...)` call was removed: its result
# was discarded, so it had no effect.
total_annual_donor = daily_unique_donors[['year', 'count']]
total_annual_donor = total_annual_donor.groupby(['year']).sum()

# Select rows by year label instead of position so each variable always holds
# the year its name claims; a missing year raises KeyError.
total_annual_donor_2022 = total_annual_donor.loc[2022]
total_annual_donor_2023 = total_annual_donor.loc[2023]
total_annual_donor_2024 = total_annual_donor.loc[2024]

print(f"Total annual donor demands: {total_annual_donor}")

print(f"Total annual donor demands for 2022: {total_annual_donor_2022}")
print(f"Total annual donor demands for 2023: {total_annual_donor_2023}")
print(f"Total annual donor demands for 2024: {total_annual_donor_2024}")
        

In [None]:
def _rename_filter_export(counts, csv_path):
    """Rename 'count' to 'demand', keep ages 18-106 inclusive and write them to csv_path.

    Returns the renamed frame and the age-filtered frame, in that order.
    """
    renamed = counts.rename(columns={'count': 'demand'})
    within_age_range = renamed[renamed['age'].between(18, 106)]
    within_age_range.to_csv(csv_path, index=False)
    return renamed, within_age_range


# Annual, monthly and daily unique-donor counts, restricted to ages 18 to 106.
annual_unique_donors, historical_data_df = _rename_filter_export(
    annual_unique_donors, 'annual_unique_donors.csv')

monthly_unique_donors, historical_demand_monthly = _rename_filter_export(
    monthly_unique_donors, 'historical_demand_monthly.csv')

daily_unique_donors, historical_demand_daily = _rename_filter_export(
    daily_unique_donors, 'historical_demand_daily.csv')

In [None]:

# Yearly receipt totals only (year + 'Yearly Receipt').
annual_receipts_historical = annual_receipts[['year', 'Yearly Receipt']]
# Save to CSV file
annual_receipts_historical.to_csv('annual_receipts.csv', index=False)
# Aliases for the yearly unique-donor counts (overall and by age); no copy is made.
annual_demands_historical = annual_donor_demands
annual_demands_age_specific_historical = annual_unique_donors_age_specific
# Save the age-specific counts (column still named 'count' at this point).
annual_unique_donors_age_specific.to_csv('annual_unique_donors_age_specific.csv', index=False)

In [None]:
# The year filters below are currently commented out, so each "filtered_*"
# name is simply an alias of the full historical data.
filtered_annual_receipts = annual_receipts_historical #[annual_receipts_historical['year'].isin([2023, 2022, 2021, 2020])]
print(f"annual_donor_receipts is: {filtered_annual_receipts}")


filtered_annual_demands = annual_demands_historical #[(annual_demands_historical.index == 2020) | (annual_demands_historical.index == 2021) | (annual_demands_historical.index == 2022) | (annual_demands_historical.index == 2023) | (annual_demands_historical.index == 2024)]
print(f"annual_donor_demands is: {filtered_annual_demands}")

filtered_annual_demands_age_specific = annual_demands_age_specific_historical #[annual_demands_age_specific_historical['year'].isin([2023, 2022, 2021, 2020])]
# Rename the count column to 'demand' before exporting.
filtered_annual_demands_age_specific = filtered_annual_demands_age_specific.rename(columns={'count': 'demand'})

print(f"annual_donor_demands_age_specific is: {filtered_annual_demands_age_specific}") 

# Save to CSV file
filtered_annual_demands_age_specific.to_csv('annual_demands_age_specific.csv', index=False)

In [None]:

# Load and Preprocess Data: Load the historical data and filter it for the age range 18-106.
# Calculate Annual Receipts: Calculate the total annual receipts for 2022, 2023, and 2024 based on the provided daily averages and working days.
# Estimate Ratios: Calculate the ratios of receipts to donors for 2022 and 2023.
# Forecast Donors for 2024: Estimate the total number of donors for 2024 based on the historical ratios.
# Age-Specific Distribution: Use historical age distributions from 2022 and 2023 to distribute the forecasted donors by age for 2024.
# Combine Forecasts: Integrate these estimates into the final forecast for age-specific donor demands for the remaining months in 2024.

# Short-term forecast of the age-specific count of donor demands for the remaining months of 2024, based on short-term drivers such as the Martin Lewis broadcast.
# Only the first quarter of age-specific donor demands (annual_donors_age_2024) is available, so the remaining quarters of 2024 are forecast as follows:
# - Estimate the short-term forecasts for the remaining of 2024 by first, take the average number for receipts at working day across 2024 which currently is 5600 multiplied by the total number of working days in the year which is 256 give me the total number of expected receipts for the year (5600 * 256 = 1.433 million receipts across the whole of 2024). 
#     1. Calculate the number of receipts across the whole year (for the available data (only 6 months) on average are going to be for 2024 = average_daily_reciepts_2024
#     2. Multiply (average_daily_reciepts_2024) to the Number of working days in 2024 (num_working_day_2024) = total_annual_reciepts_2024
#     3. Estimate the Ratio of total_annual_reciepts_2024 to annual_donors_age_2024: ratio_reciept_donor_2024 = total_annual_reciepts_2024 / annual_donors_age_2024
# - Once you have calculated the total number of receipts for 2024 then by looking at 2022/2023 historical data count how many receipts there were in 2022/2023 and how many donors were in 2022/2023 just to get the overall ratio. The starting point is just the total number for receipts so according to this that would be just over 1.4 million more receipts then scale that back based on total number for receipts 2022/2023 e.g., 70% then that would suggest that would be 1,000,000 donors roughly. E.g., just to keep it simple we consider the same overall ratio in 2022/2023 (e.g., 70%) should be use in 2024, as 2022 and 2023 are the nearest year to 2024. This will give us an estimate for the total number of donors that we might expect in 2024 based on what we're seeing about average receipts across the year in 2024.      
#     1. Calculate the number of receipts across the whole year on average for 2023 = average_daily_reciepts_2023
#     2. Multiply (average_daily_reciepts_2023) to the Number of working days in 2023 (num_working_day_2023) = total_annual_reciepts_2023
#     3. Multiply (average_daily_reciepts_2022) to the Number of working days in 2022 (num_working_day_2022) = total_annual_reciepts_2022                                                                    
#(Note: assume we have already calculated the above figures:  
# total_annual_reciepts_2022: 751967.5346260389
# total_annual_reciepts_2023: 970035.0282485875
# total_annual_reciepts_2024: 1124615.1902439024                                                              
# Total annual donor demands for 2022: count    675089
# Total annual donor demands for 2023: count    852627
# Total annual donor demands for 2024: count    568367  
#     4. Estimate the Ratio of total_annual_reciepts_2023 to annual_donors_age_2023: ratio_reciept_donor_2023 = total_annual_reciepts_2023 / annual_donors_age_2023
#     5. Estimate the Ratio of total_annual_reciepts_2022 to annual_donors_age_2022: ratio_reciept_donor_2022 = total_annual_reciepts_2022 / annual_donors_age_2022  
# - Then scale that back to work out however how many donors that is expected to be in 2024 and then using the historical age distribution in 2022/2023 just to break that down by age so that we can plug in those figures for 2024. So once calculated the number of donors in 2024, then need to find out of those donors what percentage would be aged 18 what percentage the age 19, 20, 21, …, 106 based on the previous distribution so 2023 or 2022 and both years as well as the total number for receipts for 2022/2023. Then once you've got the total number of estimated donors for 2024 by considering 2022/2023 and we say how many donors were aged 18, how many were aged 19, how many were aged 20, … , etc 
# (we should have all ages incremented by a year from 18 to 106. Then take those figures for 2024 and the plug come back in the age-specific number of annual donor demands. based on the attached data?  

# Use the in-memory annual unique-donor table (the CSV path below is disabled).
#historical_data_path = 'annual_unique_donors_22_23_24.csv'
historical_data_df = annual_unique_donors

# Keep only donors aged 18 to 106, both ends inclusive.
filtered_data_df = historical_data_df[historical_data_df['age'].between(18, 106)]

# # Add a column for the month (assuming the month is in the 'month' column in the dataset)
# # If the dataset doesn't have month data, this step needs to be adjusted accordingly
# # Here we assume that the dataset has a 'month' column with values from 1 to 12
# filtered_data_df['month'] = pd.to_datetime(filtered_data_df['date']).dt.month

# # Define a function to calculate quarters
# def calculate_quarter(month):
#     if 1 <= month <= 3:
#         return 1
#     elif 4 <= month <= 6:
#         return 2
#     elif 7 <= month <= 9:
#         return 3
#     elif 10 <= month <= 12:
#         return 4

# # Apply the function to add the 'quarter' column
# filtered_data_df['quarter'] = filtered_data_df['month'].apply(calculate_quarter)

# Total donor demand per year (ages 18-106 only).
annual_donor_demands = filtered_data_df.groupby('year')['demand'].sum()

# Number of working days assumed for each year.
num_working_days_2022 = 256
num_working_days_2023 = 256
num_working_days_2024 = 256

# NOTE(review): total_annual_reciepts_2022 / total_annual_reciepts_2023 (spelled
# "reciepts") are not assigned in this cell and must already exist in the kernel --
# confirm where they are defined. They are distinct from the total_annual_receipts_*
# names assigned below.
average_daily_receipts_2022 = total_annual_reciepts_2022 / num_working_days_2022
average_daily_receipts_2023 = total_annual_reciepts_2023 / num_working_days_2023
average_daily_receipts_2024 = 5600  # Given average daily receipts for 2024

# Total receipts per year = average daily receipts * working days.
total_annual_receipts_2022 = round(average_daily_receipts_2022 * num_working_days_2022, 2)
print(f"The total number of receipts for 2022: {total_annual_receipts_2022}")

total_annual_receipts_2023 = round(average_daily_receipts_2023 * num_working_days_2023, 2)
print(f"The total number of receipts for 2023: {total_annual_receipts_2023}")

total_annual_receipts_2024 = round(average_daily_receipts_2024 * num_working_days_2024, 2)
print(f"The total number of receipts for 2024: {total_annual_receipts_2024}")

# Receipts-per-donor ratios for the two complete reference years.
ratio_receipt_donor_2022 = round(total_annual_receipts_2022 / annual_donor_demands[2022], 2)
print(f"The Ratio of total number of receipt to donor demands for 2022: {ratio_receipt_donor_2022}")

ratio_receipt_donor_2023 = round(total_annual_receipts_2023 / annual_donor_demands[2023], 2)
print(f"The Ratio of total number of receipt to donor demands for 2023: {ratio_receipt_donor_2023}")

# Expected 2024 donors = expected 2024 receipts / mean of the 2022 and 2023 ratios.
estimated_donors_2024 = round(total_annual_receipts_2024 / ((ratio_receipt_donor_2022 + ratio_receipt_donor_2023) / 2), 2)
print(f"The estimate total number of donors for 2024 based on historical ratios: {estimated_donors_2024}")

# Share of each year's donors by age.
age_distribution_2022 = filtered_data_df[filtered_data_df['year'] == 2022].groupby('age')['demand'].sum() / annual_donor_demands[2022]
print(f"Age-specific distribution based on 2022: {age_distribution_2022}")

age_distribution_2023 = filtered_data_df[filtered_data_df['year'] == 2023].groupby('age')['demand'].sum() / annual_donor_demands[2023]
print(f"Age-specific distribution based on 2023: {age_distribution_2023}")

# Average the two distributions. An age present in only one of the two years is
# treated as a zero share in the other year (fill_value=0); a plain `+` would align
# on age and turn such ages into NaN, which would then propagate into the forecast.
average_age_distribution = round(age_distribution_2022.add(age_distribution_2023, fill_value=0) / 2, 6)
print(f"Average age-specific distribution based on 2022/2023: {average_age_distribution}")

# Spread the estimated 2024 donors over ages using the averaged distribution.
age_specific_donors_2024 = estimated_donors_2024 * average_age_distribution
print(f"Age-specific distribution for 2024: {age_specific_donors_2024}")




# Ask the user to specify the available quarters for 2024
#available_quarters = input("Enter the quarters that data is available for 2024 (e.g., 1, 2): ")
#available_quarters = list(map(int, available_quarters.split(',')))
#available_quarters

# Donor demand already observed in 2024. The variable names say "first quarter", but
# the filter selects every 2024 row present in the data (the quarter condition is disabled).
first_quarter_data = filtered_data_df[(filtered_data_df['year'] == 2024)] #& (filtered_data_df['quarter'] == 1)]
print(f"The first quarter available data for 2024: {first_quarter_data}")

total_first_quarter_demand = first_quarter_data['demand'].sum()
print(f"The total first quarter available donor demands for 2024: {total_first_quarter_demand}")

# Remaining donors = estimated donors for the whole of 2024 minus donors already observed.
# The estimate must be the donor figure (estimated_donors_2024); subtracting observed
# donors from the receipts total would mix units and overstate the remaining demand.
remaining_demand_2024 = estimated_donors_2024 - total_first_quarter_demand
print(f"The remaining donor demands for 2024: {remaining_demand_2024}")

# Distribute the remaining demand across ages based on the average age distribution
remaining_age_specific_donors_2024 = remaining_demand_2024 * average_age_distribution
print(f"The remaining age-specific donor demands for 2024: {remaining_age_specific_donors_2024}")

# One row per age with the forecasted donors for the remaining months of 2024.
forecast_df = pd.DataFrame({
    'age': average_age_distribution.index,
    'forecasted_donors': remaining_age_specific_donors_2024
}).reset_index(drop=True)

print("The age-specific donor demands for 2024:")
print(forecast_df)
forecast_df.to_csv('estimated age-specific donor demands for 2024.csv')

In [None]:
# # Initialization: The class is initialized with the data file and constants for the number of working days and average daily receipts for 2024.
# # Calculate Ratios: The calculate_ratios method computes the ratio of total annual receipts to annual donors for 2022, 2023, and 2024.
# # Forecast Donors for 2024: The forecast_donors_2024 method estimates the total number of donors for 2024 based on the average ratio from 2022 and 2023.
# # Age Distribution: The age_distribution method calculates the age distribution of donors for a given year.
# # Forecast Age-Specific Donors for 2024: The forecast_age_specific_donors_2024 method estimates the age-specific count of donors for 2024 by applying the average age distribution from 2022 and 2023
# # to the estimated total number of donors for 2024.

# class DonorDemandForecast:
#     def __init__(self, data_file):
#         self.data = pd.read_csv(data_file)
#         self.num_working_days_2024 = 256
#         self.average_daily_receipts_2024 = 5600
#         self.total_annual_receipts_2024 = self.average_daily_receipts_2024 * self.num_working_days_2024
#         self.total_annual_receipts_2022 = 751967.5346260389
#         self.total_annual_receipts_2023 = 970035.0282485875
#         self.total_annual_donors_2022 = 675089
#         self.total_annual_donors_2023 = 852627
#         self.total_annual_donors_2024 = 568367

#     def calculate_ratios(self):
#         ratio_receipt_donor_2022 = self.total_annual_receipts_2022 / self.total_annual_donors_2022
#         ratio_receipt_donor_2023 = self.total_annual_receipts_2023 / self.total_annual_donors_2023
#         ratio_receipt_donor_2024 = self.total_annual_receipts_2024 / self.total_annual_donors_2024
#         return ratio_receipt_donor_2022, ratio_receipt_donor_2023, ratio_receipt_donor_2024

#     def forecast_donors_2024(self):
#         ratio_receipt_donor_2022, ratio_receipt_donor_2023, ratio_receipt_donor_2024 = self.calculate_ratios()
#         estimated_donors_2024 = self.total_annual_receipts_2024 / ((ratio_receipt_donor_2022 + ratio_receipt_donor_2023) / 2)
#         return estimated_donors_2024

#     def age_distribution(self, year):
#         age_dist = self.data[self.data['year'] == year].groupby('age')['demand'].sum()
#         return age_dist / age_dist.sum()

#     def forecast_age_specific_donors_2024(self):
#         estimated_donors_2024 = self.forecast_donors_2024()
#         age_dist_2022 = self.age_distribution(2022)
#         age_dist_2023 = self.age_distribution(2023)
#         average_age_dist = (age_dist_2022 + age_dist_2023) / 2
#         age_specific_donors_2024 = average_age_dist * estimated_donors_2024
#         return age_specific_donors_2024

# # Usage
# forecast = DonorDemandForecast('annual_unique_donors.csv')
# age_specific_donors_2024 = forecast.forecast_age_specific_donors_2024()
# print(age_specific_donors_2024)
# age_specific_donors_2024.to_csv('age_specific_donors_2024.csv')

In [None]:
# Apply uncertainty to the short-term forecasts of the number of demands (two years from April 2024)
# by multiplying the SARIMA forecasted values by two ratios:
#   1. annual_donor_receipts (receipts per year) divided by annual_donor_demands (donor demands per year), and
#   2. the age-specific demand ratio (the percentage of each year's demands that falls on each age from 18 to 106).
# If there are no demands for a specific age in the corresponding year, use a zero demand for that age.
# The inputs are the age-specific counts of demands in 2022, 2023, and January, February and March 2024.
# To accomplish this task, we need to:

# Forecast the number of demands for the next two years using a SARIMA model.
# Apply uncertainty by multiplying the forecasted values by the specified ratios:
# The ratio of annual donor receipts to annual donor demands.
# The age-specific demands ratio.
# Here's a step-by-step Python function to achieve this:

# Forecasting using SARIMA: I'll assume the SARIMA model is already built and forecasted values are available.
# Ratio Calculation:
# Calculate the ratio of annual donor receipts to annual donor demands.
# Calculate age-specific demands ratio.
# Apply Ratios: Adjust the forecasted values by these ratios.

# 1. Forecasting the overall demand.
# 2. Distributing the forecasted demand among different age groups using age-specific ratios.
# 3. Adjusting these distributed forecasts using the month and age-specific ratios.

import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings
import statsmodels.api as sm
# Suppress warnings from statsmodels
warnings.filterwarnings("ignore")

# Load the historical data
#historical_data_path = '/annual_unique_donors.csv'
#pd.read_csv(historical_data_path)

#historical_demand_daily['month'] = historical_demand_daily['month_year'].dt.month
#historical_demand_daily['day'] = historical_demand_daily['receiptdate'].dt.day
# Calculate annual donor receipts and demands
# annual_donor_receipts = historical_data_df.groupby('year')['demand'].sum().reset_index(name='Yearly Receipt')
# annual_donor_demands = historical_data_df.groupby('year')['demand'].sum()
# Annual receipts and demands DataFrames
# Inputs for the uncertainty adjustment, taken from the pass-through tables above.

# Receipts table. Illustrative shape (from the original inline example):
#   pd.DataFrame({'year': [2022, 2023, 2024], 'Yearly Receipt': [7.519675e+05, 9.700350e+05, 1.124615e+06]})
annual_donor_receipts = filtered_annual_receipts

# Yearly donor demands. Illustrative shape (from the original inline example):
#   pd.Series({2022: 601254, 2023: 752898, 2024: 505219}, name='unique_key')
annual_donor_demands = filtered_annual_demands

# Age-specific demands; both names are bound to the same table. Illustrative shape:
#   pd.DataFrame({'year': [2022, 2022, 2022, 2023, 2023, 2023, 2024, 2024, 2024],
#                 'age': [18, 19, 20, 18, 19, 20, 18, 19, 20],
#                 'demand': [199, 286, 192, 249, 345, 243, 122, 267, 196]})
age_specific_demands = annual_donor_demands_age_specific = filtered_annual_demands_age_specific

# # Define a function to fit SARIMA model and forecast future demands
# def fit_sarima_and_forecast(data, start_year, forecast_periods):
#     model = sm.tsa.statespace.SARIMAX(data,
#                                       order=(1, 1, 1),
#                                       seasonal_order=(1, 1, 1, 12),
#                                       enforce_stationarity=False,
#                                       enforce_invertibility=False)
#     results = model.fit(disp=False)
#     forecast = results.get_forecast(steps=forecast_periods)
#     forecast_values = forecast.predicted_mean
#     forecast_index = pd.date_range(start=start_year, periods=forecast_periods, freq='MS')
#     forecast_df = pd.DataFrame({'Forecast': forecast_values}, index=forecast_index)
#     return forecast_df

In [None]:


# Define a function to perform Naïve extrapolation
def naive_extrapolation(data, forecast_periods, freq='Y'):
    """Naive forecast: repeat the last observed value over a future date index.

    Parameters
    ----------
    data : pandas.Series
        Historical values; only the last element and the last index label are used.
        The last index label must be a number (the year) for ``freq='Y'``, a tuple
        whose first and last items are year and month for ``freq='M'``, and a
        ``(year, month, day)`` tuple for ``freq='D'``.
    forecast_periods : int
        Number of future periods to generate.
    freq : {'Y', 'M', 'D'}, default 'Y'
        Frequency of the forecast index (year starts, month starts or calendar days).

    Returns
    -------
    pandas.DataFrame
        A single 'Forecast' column holding the last observed value, indexed by the
        generated dates.

    Raises
    ------
    ValueError
        If ``freq`` is not one of 'Y', 'M' or 'D'.
    """
    last_observed_value = data.iloc[-1]
    forecast_values = np.repeat(last_observed_value, forecast_periods)
    last_key = data.index[-1]

    if freq == 'Y':
        # First forecast year is the year after the last observed one.
        forecast_index = pd.date_range(start=f"{last_key + 1}", periods=forecast_periods, freq='YS')
    elif freq == 'M':
        year, *_, month = last_key
        # Step exactly one calendar month ahead. Adding a fixed number of days to the
        # first of the month overshoots the next month start for months shorter than
        # 31 days, which made the 'MS' range skip a month.
        first_forecast_month = pd.Timestamp(year=int(year), month=int(month), day=1) + pd.DateOffset(months=1)
        forecast_index = pd.date_range(start=first_forecast_month, periods=forecast_periods, freq='MS')
    elif freq == 'D':
        year, month, day = last_key
        # NOTE(review): the daily forecast starts 3 days after the last observation
        # (kept from the original); confirm whether this gap is intended.
        first_forecast_day = dt.date(int(year), int(month), int(day)) + dt.timedelta(3)
        forecast_index = pd.date_range(start=first_forecast_day.isoformat(), periods=forecast_periods, freq='D')
    else:
        raise ValueError(f"Unsupported freq {freq!r}; expected 'Y', 'M' or 'D'.")

    forecast_df = pd.DataFrame({'Forecast': forecast_values}, index=forecast_index)
    return forecast_df

In [None]:
# Forecast demand using SARIMA
#start_year = '2024-04'
#forecast_periods = 2 #24
# Forecast overall demand using Naïve extrapolation
# Forecast horizons per frequency.
forecast_periods_annual = 5  # 5 years of forecast
forecast_periods_monthly = 24  # 24 months of forecast (2 years)
forecast_periods_daily = 730  # 730 days of forecast (2 years)

# Annual forecast
# NOTE(review): this uses historical_data_df (not age-filtered), whereas the monthly and
# daily inputs are restricted to ages 18-106 -- confirm this is intended.
historical_demand = historical_data_df.groupby('year')['demand'].sum()
forecast_df_annual = naive_extrapolation(historical_demand, forecast_periods_annual, freq='Y')

# Monthly forecast: total demand per (year, month), built the same way as the daily
# series below (previously the reset_index() result was overwritten without being used).
historical_demand_mon = historical_demand_monthly.reset_index().groupby(['year', 'month'])['demand'].sum()
forecast_df_monthly = naive_extrapolation(historical_demand_mon, forecast_periods_monthly, freq='M')


# Daily forecast: total demand per (year, month, day).
historical_demand_day = historical_demand_daily.reset_index()
historical_demand_day = historical_demand_day.groupby(['year', 'month', 'day'])['demand'].sum()
forecast_df_daily = naive_extrapolation(historical_demand_day, forecast_periods_daily, freq='D')


In [None]:



#forecast_df = fit_sarima_and_forecast(historical_data_df.set_index('year')['demand'], start_year, forecast_periods)
#forecast_df = fit_sarima_and_forecast(historical_demand, start_year, forecast_periods)
#forecast_df = naive_extrapolation(historical_demand, forecast_periods)

# Calculate age-specific demand ratios
# Share of each year's demand by age. A year whose total demand is not positive
# gets an all-zero share for ages 18 to 106.
age_specific_ratios = {}
for year in historical_data_df['year'].unique():
    print(f"year: {year}")
    age_demand = annual_donor_demands_age_specific.loc[annual_donor_demands_age_specific['year'] == year]
    total_demand = age_demand['demand'].sum()
    age_specific_ratios[year] = (
        age_demand.set_index('age')['demand'] / total_demand
        if total_demand > 0
        else pd.Series(0, index=range(18, 107))
    )
# Function to apply uncertainty adjustments to the forecasts
def apply_uncertainty_adjustments(forecast_df, annual_donor_receipts, annual_donor_demands, age_specific_ratios, forecast_start_year, freq='Y'):
    """Split each forecast value by age and scale it by a receipts-to-demand ratio.

    For forecast row ``i`` a reference year is chosen by stepping BACKWARDS from the
    year before ``forecast_start_year``: one year per row for 'Y', per 12 rows for
    'M' and per 365 rows for 'D'. Each age column then receives
    ``Forecast * round(receipts / demands, 2) * age_ratio`` for that reference year.
    NOTE(review): later forecast rows therefore use progressively older reference
    years -- confirm this mapping is intended.

    Parameters
    ----------
    forecast_df : pandas.DataFrame
        Forecast with a 'Forecast' column; its index is reused for the result.
    annual_donor_receipts : pandas.DataFrame
        Table with 'year' and 'Yearly Receipt' columns.
    annual_donor_demands : pandas.Series
        Demand per year, looked up by year label.
    age_specific_ratios : dict
        Maps year to a Series of per-age shares indexed by age.
    forecast_start_year : date-like
        Anything ``pd.to_datetime`` accepts; only its year is used.
    freq : {'Y', 'M', 'D'}, default 'Y'
        Frequency of the rows in ``forecast_df``.

    Returns
    -------
    pandas.DataFrame
        Same index as ``forecast_df`` with one column per age from 18 to 106. Rows
        whose reference year is missing from the receipts or demands are all zero,
        as are ages missing from that year's ratios.

    Raises
    ------
    ValueError
        If ``freq`` is not one of 'Y', 'M' or 'D'.
    """
    # Number of forecast rows that share one reference year, per frequency.
    periods_per_year = {'Y': 1, 'M': 12, 'D': 365}
    if freq not in periods_per_year:
        raise ValueError(f"Unsupported freq {freq!r}; expected 'Y', 'M' or 'D'.")

    adjusted_forecast = pd.DataFrame(index=forecast_df.index, columns=range(18, 107))

    start_year = pd.to_datetime(forecast_start_year).year - 1

    for i in range(len(forecast_df)):
        forecast_year = start_year - (i // periods_per_year[freq])
        print(f" i: {i}  forecast_year: {forecast_year}")

        # Only years present in both the receipts table and the demand series can be scaled.
        if forecast_year in annual_donor_receipts['year'].values and forecast_year in annual_donor_demands.index:
            print("## ------------------- ##")
            yearly_receipt = annual_donor_receipts[annual_donor_receipts['year'] == forecast_year]['Yearly Receipt'].values[0]
            month_ratio = round((yearly_receipt / annual_donor_demands[forecast_year]), 2)

            # The per-age shares depend only on the reference year, so look them up once per row.
            age_specific_ratio = age_specific_ratios.get(forecast_year, pd.Series(0, index=range(18, 107)))
            for age in range(18, 107):
                if age in age_specific_ratio:
                    # Distribute the forecast to this age and apply the receipts/demand ratio.
                    adjusted_forecast.iloc[i, age - 18] = (forecast_df.iloc[i]['Forecast'] * month_ratio * age_specific_ratio[age])
                    print(f" for i: {i}, year: {forecast_year}, age: {age} ==> month_ratio = {month_ratio}, age_specific_ratio[age]: {age_specific_ratio[age]}, adjusted_forecast: {adjusted_forecast.iloc[i, age - 18]}")
                else:  # If a specific age has no data, we assign a zero demand.
                    adjusted_forecast.iloc[i, age - 18] = 0
        else:
            # If the year is not found, assign zero demand
            adjusted_forecast.iloc[i, :] = 0

    return adjusted_forecast

# Apply the uncertainty adjustments to the forecasts
#adjusted_forecast_df = apply_uncertainty_adjustments(forecast_df, annual_donor_receipts, annual_donor_demands, age_specific_ratios, start_year)

# Apply the uncertainty adjustments to the forecasts

# Anchor date for the adjustment; only its year is used by apply_uncertainty_adjustments.
# NOTE(review): using today's date makes the output depend on when the notebook is run.
start_year = dt.date.today() #2024
forecast_start_year = start_year

# Adjusted annual forecast
adjusted_forecast_annual_df = apply_uncertainty_adjustments(forecast_df_annual, annual_donor_receipts, annual_donor_demands, age_specific_ratios, start_year, freq='Y')
# NOTE(review): index=False drops the forecast dates from the CSV -- confirm this is wanted.
adjusted_forecast_annual_df.to_csv("adjusted_forecast_annual.csv", index=False)

# Adjusted monthly forecast
#adjusted_forecast_monthly_df = apply_uncertainty_adjustments(forecast_df_monthly, annual_donor_receipts, annual_donor_demands, age_specific_ratios, start_year, freq='M')
#adjusted_forecast_monthly_df.to_csv(f"adjusted_forecast_monthly.csv", index=False)

# Adjusted daily forecast
#adjusted_forecast_daily_df = apply_uncertainty_adjustments(forecast_df_daily, annual_donor_receipts, annual_donor_demands, age_specific_ratios, start_year, freq='D')
#adjusted_forecast_daily_df.to_csv(f"adjusted_forecast_daily.csv", index=False)

# Display the adjusted forecasts
print("Adjusted Annual Forecast:")
print(adjusted_forecast_annual_df.head())

#print("Adjusted Monthly Forecast:")
#print(adjusted_forecast_monthly_df.head())

#print("Adjusted Daily Forecast:")
#print(adjusted_forecast_daily_df.head())

In [None]:
# Load the historical data
historical_data_df = annual_unique_donors

# Keep only donors aged 18 to 106, both ends inclusive.
filtered_data_df = historical_data_df[historical_data_df['age'].between(18, 106)]

# Total demand per year, as a Series indexed by year.
annual_donor_demands = filtered_data_df.groupby('year')['demand'].sum()
# NOTE(review): 'Yearly Receipt' is built from the same demand totals here, so the
# receipts/demand ratio is 1 for every year -- confirm this is intended.
annual_donor_receipts = annual_donor_demands.reset_index(name='Yearly Receipt')

# Demand per (year, age) as a flat table with 'year', 'age' and 'demand' columns.
demand_by_year_and_age = filtered_data_df.groupby(['year', 'age'])['demand'].sum()
annual_donor_demands_age_specific = demand_by_year_and_age.reset_index()

# Define a function to perform Naïve extrapolation
def naive_extrapolation(data, forecast_periods):
    """Repeat the last observed value over the following calendar years.

    NOTE(review): this redefines ``naive_extrapolation``; an earlier cell in this
    notebook defines a version with an extra ``freq`` parameter, which this
    annual-only definition replaces once the cell has run.

    Parameters
    ----------
    data : pandas.Series
        Yearly values whose last index label is a number (the last observed year).
    forecast_periods : int
        Number of years to forecast.

    Returns
    -------
    pandas.DataFrame
        A 'Forecast' column holding the last observed value, indexed by year-start
        dates beginning the year after the last observed year.
    """
    repeated_last = np.repeat(data.iloc[-1], forecast_periods)
    first_forecast_year = data.index[-1] + 1
    yearly_index = pd.date_range(start=f"{first_forecast_year}-01-01", periods=forecast_periods, freq='YS')
    return pd.DataFrame({'Forecast': repeated_last}, index=yearly_index)

# Forecast overall demand using Naïve extrapolation
# Forecast horizon: two years.
forecast_periods = 2
# Total demand per year for ages 18-106; shown as the cell output.
historical_demand = filtered_data_df.groupby('year')['demand'].agg('sum')
historical_demand

In [None]:

# Naive yearly forecast of total demand; shown as the cell output.
forecast_df = naive_extrapolation(data=historical_demand, forecast_periods=forecast_periods)
forecast_df

In [None]:


# # Calculate age-specific demand ratios
# age_specific_ratios = {}
# for year in filtered_data_df['year'].unique():
#     age_demand = annual_donor_demands_age_specific[annual_donor_demands_age_specific['year'] == year]
#     total_demand = age_demand['demand'].sum()
#     if total_demand > 0:
#         age_specific_ratios[year] = age_demand.set_index('age')['demand'] / total_demand
#     else:
#         age_specific_ratios[year] = pd.Series(0, index=range(18, 107))

# # Function to apply uncertainty adjustments to the forecasts
# def apply_uncertainty_adjustments(forecast_df, annual_donor_receipts, annual_donor_demands, age_specific_ratios, forecast_start_year):
#     adjusted_forecast = pd.DataFrame(index=forecast_df.index, columns=range(18, 107))
#     start_year = pd.to_datetime(forecast_start_year).year
#     for i in range(len(forecast_df)):
#         forecast_year = start_year + i
#         # Check if the year exists in both DataFrames
#         if forecast_year in annual_donor_receipts['year'].values and forecast_year in annual_donor_demands.index:
#             month_ratio = (annual_donor_receipts[annual_donor_receipts['year'] == forecast_year]['Yearly Receipt'].values[0] /
#                            annual_donor_demands[forecast_year])
#             for age in range(18, 107):
#                 age_specific_ratio = age_specific_ratios.get(forecast_year, pd.Series(0, index=range(18, 107)))
#                 if age in age_specific_ratio:
#                     adjusted_forecast.iloc[i, age - 18] = (forecast_df.iloc[i]['Forecast'] * month_ratio * age_specific_ratio[age])
#                 else:
#                     adjusted_forecast.iloc[i, age - 18] = 0
#         else:
#             # If the year is not found, assign zero demand
#             adjusted_forecast.iloc[i, :] = 0
#     return adjusted_forecast

# # Apply the uncertainty adjustments to the forecasts
# start_year = 2024
# adjusted_forecast_df = apply_uncertainty_adjustments(forecast_df, annual_donor_receipts, annual_donor_demands, age_specific_ratios, start_year)

# # Display the adjusted forecast
# print(adjusted_forecast_df.head())


In [None]:
# Creates a pivot table with age as the index and year as the columns.
# Calculates the growth rate for each age cohort.
# Computes the average growth rate for each age cohort.
# Uses the last year’s data to forecast the next year’s demand.
# Incorporates uncertainty by multiplying the forecast by the average Uncertainty_Ratio.
# Creates a pivot table with age as the index and year as the columns.
# Calculates the growth rate for each age cohort.
# Computes the average growth rate for each age cohort.
# Uses the last year’s data to forecast the next year’s demand.
# Incorporates uncertainty by multiplying the forecast by the average Uncertainty_Ratio.

# Creates a pivot table with age as the index and year as the columns.
# Applies seasonal decomposition to separate the trend and seasonal components.
# Calculates the average growth rate for the trend component.
# Forecasts the next year’s trend component and combines it with the seasonal component.
# Incorporates uncertainty by multiplying the forecast by the average Uncertainty_Ratio.

def cohort_analysis_forecast(df):
    """Forecast next year's demand per age from average historical growth.

    Args:
        df: DataFrame with 'age', 'year' and 'demand' columns. An optional
            'Uncertainty_Ratio' column scales the forecast by its overall mean.

    Returns:
        Series indexed by age: the last year's demand multiplied by
        (1 + mean year-over-year growth) and by the uncertainty ratio
        (1 when the column is absent).
    """
    # Ages as rows, years as columns, demand summed per (age, year)
    pivot = df.pivot_table(index='age', columns='year', values='demand', aggfunc='sum')

    # Year-over-year growth rate for each age cohort
    growth_rates = pivot.pct_change(axis=1)

    # A zero in the previous year yields +/-inf growth; a single such year would
    # otherwise make the mean growth (and the forecast) infinite, so ignore it.
    growth_rates = growth_rates.replace([np.inf, -np.inf], np.nan)

    # Average growth rate for each age cohort (NaN entries are skipped)
    avg_growth_rates = growth_rates.mean(axis=1)

    # Most recent year of data for each age cohort
    last_year_data = pivot.iloc[:, -1]

    # Next year's demand: last year's demand grown by the average rate
    forecast = last_year_data * (1 + avg_growth_rates)

    # Scale by the mean 'Uncertainty_Ratio' when the column is available
    uncertainty_ratio = df['Uncertainty_Ratio'].mean() if 'Uncertainty_Ratio' in df.columns else 1
    forecast_with_uncertainty = forecast * uncertainty_ratio

    return forecast_with_uncertainty

# Run the cohort forecast on the annual donor demand table.
# `annual_donor_demands` is not assigned in this cell; it must already exist in the kernel.
df = annual_donor_demands
# Call the function and print the forecast
forecast = cohort_analysis_forecast(df)
print(forecast)


In [None]:
# def apply_uncertainty(forecast, annual_donor_receipts, annual_donor_demands, age_specific_demands):
#     # Calculate the annual ratios
#     annual_ratios = annual_donor_receipts['Yearly Receipt'] / annual_donor_demands

#     # Calculate the age-specific demand ratios
#     age_specific_ratios = age_specific_demands.groupby('age')['demand'].sum() / age_specific_demands['demand'].sum()

#     # Create a DataFrame to store the adjusted forecast
#     adjusted_forecast = pd.DataFrame(index=forecast.index, columns=forecast.columns)

#     for year in range(2024, 2026):
#         # Get the annual ratio for the current year
#         annual_ratio = annual_ratios.loc[year]

#         for age in range(18, 107):
#             # Get the age-specific ratio for the current age
#             age_ratio = age_specific_ratios.get(age, 0)

#             # Apply the uncertainty to the forecast
#             adjusted_forecast.loc[(forecast.index.year == year) & (forecast.columns == age), age] = (
#                 forecast.loc[(forecast.index.year == year) & (forecast.columns == age), age] * annual_ratio * age_ratio
#             )

#     # Fill NaN values with 0
#     adjusted_forecast = adjusted_forecast.fillna(0)

#     return adjusted_forecast


# def cohort_analysis_forecast_with_seasonality(df):
#     # Create a pivot table with 'year' as columns, 'age' as index, and 'demand' as values
#     pivot = df.pivot_table(index='age', columns='year', values='demand', aggfunc='sum')
    
#     # Initialize a DataFrame to store the forecast
#     forecast_df = pd.DataFrame(index=pivot.index)
    
#     # Loop through each age cohort to apply seasonal decomposition and forecast
#     for age in pivot.index:
#         # Get the time series for the current age cohort
#         ts = pivot.loc[age].dropna()
        
#         # Apply seasonal decomposition
#         decomposition = seasonal_decompose(ts, model='multiplicative', period=12)
        
#         # Extract the trend and seasonal components
#         trend = decomposition.trend.dropna()
#         seasonal = decomposition.seasonal.dropna()
        
#         # Calculate the average growth rate for the trend component
#         growth_rate = trend.pct_change().mean()
        
#         # Forecast the next year's trend component
#         last_trend_value = trend.iloc[-1]
#         forecast_trend = last_trend_value * (1 + growth_rate)
        
#         # Get the seasonal component for the next period (assuming seasonality repeats)
#         next_seasonal_value = seasonal.iloc[-12 % len(seasonal)]
        
#         # Combine the forecasted trend and seasonal components
#         forecast_value = forecast_trend * next_seasonal_value
        
#         # Incorporate uncertainty by multiplying the forecast by the 'Uncertainty_Ratio'
#         uncertainty_ratio = df['Uncertainty_Ratio'].mean() if 'Uncertainty_Ratio' in df.columns else 1
#         forecast_with_uncertainty = forecast_value * uncertainty_ratio
        
#         # Store the forecast in the DataFrame
#         forecast_df.loc[age, 'Forecast'] = forecast_with_uncertainty
    
#     # Return the forecast DataFrame
#     return forecast_df

# # Call the function and print the forecast
# forecast = cohort_analysis_forecast_with_seasonality(df)
# print(forecast)


# Creates a pivot table with age as the index and year as the columns.
# Applies STL decomposition to separate the trend and seasonal components.
# Calculates the average growth rate for the trend component.
# Forecasts the next year’s trend component and combines it with the seasonal component.
# Incorporates uncertainty by multiplying the forecast by the average Uncertainty_Ratio.

# import pandas as pd
# from statsmodels.tsa.seasonal import seasonal_decompose
# from statsmodels.tsa.holtwinters import ExponentialSmoothing

# # Load the dataset
# df = pd.read_csv('annual_unique_donors.csv')

# # Assuming the dataframe has a 'year' column and a 'demand' column
# df['year'] = pd.to_datetime(df['year'], format='%Y')
# df.set_index('year', inplace=True)

# # Decompose the time series to observe seasonal patterns
# decomposition = seasonal_decompose(df['demand'], model='additive', period=1)

# # Fit the TSL model with seasonal components
# model = ExponentialSmoothing(df['demand'], trend='add', seasonal='add', seasonal_periods=1)
# fit_model = model.fit()

# # Forecast the next period
# forecast = fit_model.forecast(1)

# # Add the forecast to the original function
# def cohort_analysis_forecast(df, forecast_value):
#     # Create a pivot table with 'year' as columns, 'age' as index, and 'demand' as values
#     pivot = df.pivot_table(index='age', columns='year', values='demand', aggfunc='sum')
    
#     # Calculate the growth rate for each age cohort
#     growth_rates = pivot.pct_change(axis=1)
    
#     # Calculate the average growth rate for each age cohort
#     avg_growth_rates = growth_rates.mean(axis=1)
    
#     # Get the last year of data for each age cohort
#     last_year_data = pivot.iloc[:, -1]
    
#     # Forecast the next year's demand by applying the average growth rate to the last year's data
#     forecast = last_year_data * (1 + avg_growth_rates)
    
#     # Incorporate uncertainty by multiplying the forecast by the 'Uncertainty_Ratio'
#     # Assuming 'Uncertainty_Ratio' is a column in the dataframe
#     uncertainty_ratio = df['Uncertainty_Ratio'].mean() if 'Uncertainty_Ratio' in df.columns else 1
#     forecast_with_uncertainty = forecast * uncertainty_ratio
    
#     # Add the TSL forecast value
#     forecast_with_uncertainty['TSL_Forecast'] = forecast_value
    
#     # Return the forecast with uncertainty
#     return forecast_with_uncertainty

# # Call the function and print the forecast with TSL seasonal forecasting
# forecast_with_tsl = cohort_analysis_forecast(df, forecast.values[0])
# print(forecast_with_tsl)

# This code integrates STL seasonal forecasting into your existing cohort analysis forecast function. 
# It uses the STL decomposition from the statsmodels library to decompose the time series data and 
# then applies the Holt-Winters method for forecasting. 
# The forecast value is then adjusted by the age-specific Uncertainty_Ratio.

# import pandas as pd
# from statsmodels.tsa.seasonal import STL
# from statsmodels.tsa.holtwinters import ExponentialSmoothing

# # Load the dataset
# df = pd.read_csv('annual_unique_donors.csv')

# # Assuming the dataframe has 'year' and 'age' columns and a 'demand' column
# df['year'] = pd.to_datetime(df['year'], format='%Y')
# df.set_index(['age', 'year'], inplace=True)

# # Function to apply STL and forecast
# def cohort_analysis_forecast_with_stl(df):
#     # Initialize a DataFrame to store the forecast
#     forecast_df = pd.DataFrame()

#     # Loop through each age cohort to apply STL and forecast
#     for age in df.index.get_level_values(0).unique():
#         # Get the time series for the current age cohort
#         ts = df.xs(age, level='age')['demand'].dropna()
        
#         # Apply STL decomposition
#         stl = STL(ts, seasonal=13)
#         result = stl.fit()
        
#         # Forecast the next year's demand using Holt-Winters method
#         model = ExponentialSmoothing(result.trend, trend='add', seasonal='add', seasonal_periods=13)
#         fit_model = model.fit()
#         forecast_value = fit_model.forecast(1).iloc[0]
        
#         # Get the corresponding 'Uncertainty_Ratio' for the specific age and year
#         uncertainty_ratio = df.xs(age, level='age')['Uncertainty_Ratio'].mean() if 'Uncertainty_Ratio' in df.columns else 1
        
#         # Multiply the forecast value by the 'Uncertainty_Ratio'
#         forecast_with_uncertainty = forecast_value * uncertainty_ratio
        
#         # Store the forecast in the DataFrame
#         forecast_df.loc[age, 'Forecast'] = forecast_with_uncertainty
    
#     # Return the forecast DataFrame
#     return forecast_df

# # Call the function and print the forecast
# forecast = cohort_analysis_forecast_with_stl(df)
# print(forecast)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.holtwinters import ExponentialSmoothing

class DemandForecasting:
    """Variability statistics, plots and simple forecasts for annual demand by age.

    ``self.df`` holds the input table indexed by ``(age, year)``, where ``year``
    is a timestamp for 1 January of that calendar year. A ``demand`` column is
    required; an ``Uncertainty_Ratio`` column is optional and, when present,
    scales the forecasts.
    """

    def __init__(self, file_path, years=(2020, 2021, 2022, 2023)):
        """Load the demand table and index it by ``(age, year)``.

        Args:
            file_path: CSV source accepted by ``pd.read_csv``; it must provide
                'age', 'year' (four-digit year) and 'demand' columns.
            years: Calendar years to keep. ``None`` keeps every year.
        """
        frame = pd.read_csv(file_path)
        frame['year'] = pd.to_datetime(frame['year'], format='%Y')
        if years is not None:
            # Compare on the calendar year: once 'year' holds timestamps, isin()
            # against plain integers matches nothing and silently empties the frame.
            frame = frame[frame['year'].dt.year.isin(list(years))]
        self.df = frame.set_index(['age', 'year'])
        self.variability_measures = None
        self.forecast_df = None

    def _uncertainty_ratio(self, age):
        """Return the mean 'Uncertainty_Ratio' for ``age``, or 1 when the column is absent."""
        if 'Uncertainty_Ratio' not in self.df.columns:
            return 1
        return self.df.xs(age, level='age')['Uncertainty_Ratio'].mean()

    def calculate_variability_measures(self):
        """Compute dispersion statistics of demand for every age.

        Returns:
            DataFrame indexed by age with the columns 'Standard Deviation',
            'Variance', 'Range', 'IQR', 'CV' (percent), 'MAD' (mean absolute
            deviation around the mean) and 'QD' (half the IQR). The result is
            also stored on ``self.variability_measures``.
        """
        columns = ['Standard Deviation', 'Variance', 'Range', 'IQR', 'CV', 'MAD', 'QD']
        ages = []
        rows = []
        for age in self.df.index.get_level_values('age').unique():
            ts = self.df.xs(age, level='age')['demand'].dropna()
            mean_demand = ts.mean()
            std_dev = ts.std()
            iqr = ts.quantile(0.75) - ts.quantile(0.25)
            ages.append(age)
            rows.append({
                'Standard Deviation': std_dev,
                'Variance': ts.var(),
                'Range': ts.max() - ts.min(),
                'IQR': iqr,
                # Coefficient of variation in percent; 0 when the mean is 0
                'CV': (std_dev / mean_demand) * 100 if mean_demand != 0 else 0,
                # Series.mad() was removed in pandas 2.0, so compute the mean
                # absolute deviation around the mean directly.
                'MAD': (ts - mean_demand).abs().mean(),
                'QD': iqr / 2,
            })
        # Build the frame once instead of growing it cell by cell
        measures = pd.DataFrame(rows, index=ages, columns=columns)
        self.variability_measures = measures
        return measures

    def cohort_analysis_forecast_with_stl(self, period=None, seasonal=13, seasonal_periods=13):
        """Forecast one step ahead per age from the STL trend of its demand series.

        For each age the demand series is decomposed with STL, an additive
        Holt-Winters model is fitted to the trend component, and its one-step
        forecast is multiplied by the age's uncertainty ratio.

        Args:
            period: ``period`` passed to ``STL`` (``None`` lets it infer one).
            seasonal: ``seasonal`` smoother length passed to ``STL``.
            seasonal_periods: ``seasonal_periods`` passed to ``ExponentialSmoothing``.

        Returns:
            DataFrame indexed by age with a single 'Forecast' column; also
            stored on ``self.forecast_df``.

        NOTE(review): the series here are annual with only a handful of points.
        The default ``period=None`` / ``seasonal_periods=13`` presumably cannot be
        satisfied by such data (STL needs a period of at least 2 and seasonal
        Holt-Winters needs full seasonal cycles) - confirm against statsmodels
        and pass workable values.
        """
        forecasts = {}
        for age in self.df.index.get_level_values('age').unique():
            print(f"age: {age}")
            ts = self.df.xs(age, level='age')['demand'].dropna()
            # Ensure that the time series has a frequency for the STL decomposition
            if not ts.index.inferred_freq:
                # The index sits on 1 January (year start). Year-end 'Y' would
                # reindex onto 31 December and turn every value into NaN.
                ts = ts.asfreq('YS')
            result = STL(endog=ts, period=period, seasonal=seasonal).fit()
            fit_model = ExponentialSmoothing(
                result.trend, trend='add', seasonal='add', seasonal_periods=seasonal_periods
            ).fit()
            forecast_value = fit_model.forecast(1).iloc[0]
            forecasts[age] = forecast_value * self._uncertainty_ratio(age)
        forecast_df = pd.DataFrame({'Forecast': pd.Series(forecasts, dtype=float)})
        self.forecast_df = forecast_df
        return forecast_df

    def visualize_forecast_with_uncertainty(self):
        """Plot historical demand, the stored forecast and its ratio-scaled version per age.

        Raises:
            RuntimeError: if ``cohort_analysis_forecast_with_stl`` has not been
                run yet, so there is no forecast to plot.
        """
        if self.forecast_df is None:
            raise RuntimeError(
                "No forecast available: run cohort_analysis_forecast_with_stl() first."
            )
        plt.figure(figsize=(14, 7))
        for age in self.df.index.get_level_values('age').unique():
            ts = self.df.xs(age, level='age')['demand']
            plt.plot(ts.index, ts, label=f'Original Demand (Age {age})', marker='o')
        for age in self.forecast_df.index:
            plt.plot(self.forecast_df.columns, self.forecast_df.loc[age], label=f'Forecasted Demand (Age {age})', marker='x')
        for age in self.forecast_df.index:
            # Same guard as the forecast step: ratio is 1 without the column.
            # NOTE(review): forecast_df already contains the uncertainty ratio,
            # so this applies it a second time - confirm that is intended.
            adjusted_forecast = self.forecast_df.loc[age] * self._uncertainty_ratio(age)
            plt.plot(self.forecast_df.columns, adjusted_forecast, label=f'Adjusted Forecast (Age {age})', linestyle='--')
        plt.xlabel('Year')
        plt.ylabel('Demand')
        plt.title('Demand Forecast with Uncertainty Adjustment')
        plt.legend()
        plt.show()

    def visualize_variability_measures(self):
        """Show box, histogram, violin, error-bar and heatmap views of demand by age."""
        # 'age' must be a column for seaborn; rows missing age or demand are dropped
        df_reset = self.df.reset_index().dropna(subset=['age', 'demand'])

        if not df_reset.empty:
            plt.figure(figsize=(12, 6))
            sns.boxplot(x='age', y='demand', data=df_reset)
            plt.title('Box Plot of Demand by Age')
            plt.xlabel('Age')
            plt.ylabel('Demand')
            plt.show()
        else:
            print("Not enough data to create a boxplot.")

        plt.figure(figsize=(12, 6))
        sns.histplot(self.df['demand'], bins=20, kde=True)
        plt.title('Histogram of Demand')
        plt.xlabel('Demand')
        plt.ylabel('Frequency')
        plt.show()

        plt.figure(figsize=(12, 6))
        sns.violinplot(x='age', y='demand', data=self.df.reset_index())
        plt.title('Violin Plot of Demand by Age')
        plt.xlabel('Age')
        plt.ylabel('Demand')
        plt.show()

        # Mean demand per age with one standard deviation as the error bar
        demand_by_age = self.df.groupby('age')['demand']
        mean_demand = demand_by_age.mean()
        std_demand = demand_by_age.std()
        plt.figure(figsize=(12, 6))
        plt.errorbar(mean_demand.index, mean_demand, yerr=std_demand, fmt='o', capsize=5)
        plt.title('Mean Demand with Standard Deviation by Age')
        plt.xlabel('Age')
        plt.ylabel('Demand')
        plt.show()

        pivot_table = self.df.reset_index().pivot_table(index='age', columns='year', values='demand', aggfunc='mean')
        plt.figure(figsize=(12, 6))
        sns.heatmap(pivot_table, cmap='coolwarm', annot=True, fmt=".1f")
        plt.title('Heatmap of Mean Demand by Age and Year')
        plt.xlabel('Year')
        plt.ylabel('Age')
        plt.show()

    def apply_uncertainty_to_forecast(self,start_period,end_period,average_annual_receipts,total_annual_donors,annual_donor_demands_age_specific):
        """Naively extrapolate the last known demand and scale it by two ratios.

        The last row of ``self.df`` supplies both the value carried forward and
        the age whose demand share is used. Each forecast day is multiplied by
        the receipts-to-donors ratio and by that age's share of demand for the
        day's calendar year. Forecast years with no ratio reuse the most recent
        year that has one, in line with the naive extrapolation of the value.

        Args:
            start_period: First forecast day.
            end_period: Last forecast day (inclusive).
            average_annual_receipts: DataFrame indexed by year with a 'demand' column.
            total_annual_donors: Series of total donors indexed by year.
            annual_donor_demands_age_specific: DataFrame with 'year', 'age'
                and 'demand' columns.

        Returns:
            Series indexed by each day from ``start_period`` to ``end_period``.

        Raises:
            ValueError: if ``self.df`` has no rows to extrapolate from.
        """
        if self.df.empty:
            raise ValueError("No demand data available to extrapolate from.")

        # Receipts per donor for each year, rounded to two decimals
        ratio_receipts_to_demands = (average_annual_receipts['demand'] / total_annual_donors).round(2)

        # Share of each year's demand that belongs to each age
        demand_by_year_age = annual_donor_demands_age_specific.groupby(['year', 'age'])['demand'].sum()
        demand_by_year = annual_donor_demands_age_specific.groupby('year')['demand'].sum()
        ratio_age_specific = demand_by_year_age.div(demand_by_year, level='year')

        # Naive extrapolation for the specified period
        forecast_dates = pd.date_range(start=start_period, end=end_period, freq='D')
        last_known_value = self.df['demand'].iloc[-1]
        # 'age' is an index level of self.df (never a column), so read it from the index
        last_known_age = self.df.index.get_level_values('age')[-1]

        def reference_year(available_years, year):
            # Use the year's own ratio when it exists, else the latest known one
            return year if year in available_years else available_years.max()

        # Both ratios are constant within a calendar year, so resolve them once per year
        multipliers = {}
        for year in sorted({int(y) for y in forecast_dates.year}):
            ratio_1 = ratio_receipts_to_demands.loc[reference_year(ratio_receipts_to_demands.index, year)]
            share_key = (
                reference_year(ratio_age_specific.index.get_level_values('year'), year),
                last_known_age,
            )
            # An age absent from the age-specific table contributes no demand
            ratio_2 = ratio_age_specific.loc[share_key] if share_key in ratio_age_specific.index else 0
            multipliers[year] = ratio_1 * ratio_2

        year_factors = pd.Series(forecast_dates.year, index=forecast_dates).map(multipliers)
        forecast_with_uncertainty = year_factors * last_known_value

        # Return the forecast with applied uncertainty
        return forecast_with_uncertainty
    
# Build the forecaster first: every call below needs `forecasting`, so creating
# it after its first use raises NameError on a fresh kernel (Restart & Run All).
file_path = 'annual_unique_donors.csv'
forecasting = DemandForecasting(file_path)

# Calculate variability measures
variability_measures = forecasting.calculate_variability_measures()
print(variability_measures)

# Perform cohort analysis forecast with STL
forecast = forecasting.cohort_analysis_forecast_with_stl()
print(forecast)

# Visualize the forecast with uncertainty
forecasting.visualize_forecast_with_uncertainty()

# Visualize variability measures
#forecasting.visualize_variability_measures()

# Apply uncertainty to forecast using naive extrapolation
start_period = pd.Timestamp('2024-04-01')
end_period = pd.Timestamp('2026-03-31')

# Yearly receipts, renamed to 'demand' and indexed by year for the ratio calculation
annual_donor_receipts = pd.read_csv('annual_receipts.csv')
average_annual_receipts = annual_donor_receipts.rename(columns={'Yearly Receipt': 'demand'})
average_annual_receipts = average_annual_receipts[average_annual_receipts['year'].isin([2020, 2021, 2022, 2023])]
# Plain assignment instead of inplace=True: the frame above is a filtered slice
average_annual_receipts = average_annual_receipts.set_index('year')
#pd.DataFrame({'year': [2022, 2023, 2024],'Yearly Receipt': [7.519675e+05, 9.700350e+05, 1.124615e+06]})

# Total demand per year across all ages
annual_donor_demands = pd.read_csv('annual_unique_donors.csv') #pd.Series([601254, 752898, 505219], index=[2022, 2023, 2024])
total_annual_donors = annual_donor_demands
total_annual_donors = total_annual_donors[total_annual_donors['year'].isin([2020, 2021, 2022, 2023])]
total_annual_donors = total_annual_donors.groupby('year')['demand'].sum()

# Demand per (year, age), used for the age-share ratio
annual_donor_demands_age_specific = pd.read_csv('annual_demands_age_specific.csv') #pd.DataFrame({})
annual_donor_demands_age_specific = annual_donor_demands_age_specific[annual_donor_demands_age_specific['year'].isin([2020, 2021, 2022, 2023])]

forecast_with_uncertainty = forecasting.apply_uncertainty_to_forecast(start_period, end_period, average_annual_receipts, total_annual_donors, annual_donor_demands_age_specific)
print(forecast_with_uncertainty)

In [None]:
# Demand series for age 18. `annual_donor_demands` is a plain read_csv frame at
# this point, so 'age' has to become an index level before xs() can select on it.
annual_donor_demands.set_index(['age', 'year']).xs(18, level='age')['demand'].dropna()

In [None]:
# Display the 'demand' column of the annual donor demand table.
annual_donor_demands['demand']

In [None]:
# import pandas as pd
# import numpy as np
# from statsmodels.tsa.statespace.sarimax import SARIMAX

# class DemandForecasting:
#     def __init__(self, file_path):
#         self.df = pd.read_csv(file_path)
#         self.df['year'] = pd.to_datetime(self.df['year'], format='%Y')
#         self.df = self.df[self.df['year'].isin([2020, 2021, 2022, 2023])]
#         self.df.set_index(['age', 'year'], inplace=True)
#         self.variability_measures = None
#         self.forecast_df = None

#     def calculate_variability_measures(self):
#         measures = pd.DataFrame()
#         for age in self.df.index.get_level_values(0).unique():
#             ts = self.df.xs(age, level='age')['demand'].dropna()
#             std_dev = ts.std()
#             variance = ts.var()
#             range_val = ts.max() - ts.min()
#             iqr = ts.quantile(0.75) - ts.quantile(0.25)
#             cv = (std_dev / ts.mean()) * 100 if ts.mean() != 0 else 0
#             mad = ts.mad()
#             qd = iqr / 2
#             measures.loc[age, 'Standard Deviation'] = std_dev
#             measures.loc[age, 'Variance'] = variance
#             measures.loc[age, 'Range'] = range_val
#             measures.loc[age, 'IQR'] = iqr
#             measures.loc[age, 'CV'] = cv
#             measures.loc[age, 'MAD'] = mad
#             measures.loc[age, 'QD'] = qd
#         self.variability_measures = measures
#         return measures

#     def cohort_analysis_forecast_with_stl(self):
#         forecast_df = pd.DataFrame()
#         for age in self.df.index.get_level_values(0).unique():
#             print(f"age: {age}")
#             ts = self.df.xs(age, level='age')['demand'].dropna()
#             stl = STL(ts, seasonal=13)
#             result = stl.fit()
#             model = ExponentialSmoothing(result.trend, trend='add', seasonal='add', seasonal_periods=13)
#             fit_model = model.fit()
#             forecast_value = fit_model.forecast(1).iloc[0]
#             uncertainty_ratio = self.df.xs(age, level='age')['Uncertainty_Ratio'].mean() if 'Uncertainty_Ratio' in self.df.columns else 1
#             forecast_with_uncertainty = forecast_value * uncertainty_ratio
#             forecast_df.loc[age, 'Forecast'] = forecast_with_uncertainty
#         self.forecast_df = forecast_df
#         return forecast_df

#     def visualize_forecast_with_uncertainty(self):
#         plt.figure(figsize=(14, 7))
#         for age in self.df.index.get_level_values(0).unique():
#             ts = self.df.xs(age, level='age')['demand']
#             plt.plot(ts.index, ts, label=f'Original Demand (Age {age})', marker='o')
#         for age in self.forecast_df.index:
#             plt.plot(self.forecast_df.columns, self.forecast_df.loc[age], label=f'Forecasted Demand (Age {age})', marker='x')
#         for age in self.forecast_df.index:
#             adjusted_forecast = self.forecast_df.loc[age] * self.df.xs(age, level='age')['Uncertainty_Ratio'].mean()
#             plt.plot(self.forecast_df.columns, adjusted_forecast, label=f'Adjusted Forecast (Age {age})', linestyle='--')
#         plt.xlabel('Year')
#         plt.ylabel('Demand')
#         plt.title('Demand Forecast with Uncertainty Adjustment')
#         plt.legend()
#         plt.show()

#     def visualize_variability_measures(self):
#         plt.figure(figsize=(12, 6))
#         sns.boxplot(x='age', y='demand', data=self.df.reset_index())
#         plt.title('Box Plot of Demand by Age')
#         plt.xlabel('Age')
#         plt.ylabel('Demand')
#         plt.show()

#         plt.figure(figsize=(12, 6))
#         sns.histplot(self.df['demand'], bins=20, kde=True)
#         plt.title('Histogram of Demand')
#         plt.xlabel('Demand')
#         plt.ylabel('Frequency')
#         plt.show()

#         plt.figure(figsize=(12, 6))
#         sns.violinplot(x='age', y='demand', data=self.df.reset_index())
#         plt.title('Violin Plot of Demand by Age')
#         plt.xlabel('Age')
#         plt.ylabel('Demand')
#         plt.show()

#         mean_demand = self.df.groupby('age')['demand'].mean()
#         std_demand = self.df.groupby('age')['demand'].std()
#         plt.figure(figsize=(12, 6))
#         plt.errorbar(mean_demand.index, mean_demand, yerr=std_demand, fmt='o', capsize=5)
#         plt.title('Mean Demand with Standard Deviation by Age')
#         plt.xlabel('Age')
#         plt.ylabel('Demand')
#         plt.show()

#         pivot_table = self.df.reset_index().pivot_table(index='age', columns='year', values='demand', aggfunc='mean')
#         plt.figure(figsize=(12, 6))
#         sns.heatmap(pivot_table, cmap='coolwarm', annot=True, fmt=".1f")
#         plt.title('Heatmap of Mean Demand by Age and Year')
#         plt.xlabel('Year')
#         plt.ylabel('Age')
#         plt.show()

#     def apply_uncertainty_to_forecast(
#         self, sarima_order, seasonal_order, start_period, end_period, average_annual_receipts, total_annual_donors, annual_donor_demands_age_specific):
#         # Load additional data
#         average_annual_receipts = average_annual_receipts
#         total_annual_donors = total_annual_donors 
#         annual_donor_demands_age_specific = annual_donor_demands_age_specific

#         # Calculate the ratios
#         ratio_receipts_to_demands = round((average_annual_receipts['demand']) / (total_annual_donors), 2)
#         ratio_age_specific = annual_donor_demands_age_specific.groupby(['year', 'age'])['demand'].sum() / annual_donor_demands_age_specific.groupby('year')['demand'].sum()

#         # Fit SARIMA model
#         sarima_model = SARIMAX(self.df['demand'], order=sarima_order, seasonal_order=seasonal_order)
#         sarima_results = sarima_model.fit()

#         # Forecast for the specified period
#         #forecast = sarima_results.get_forecast(steps=(end_period - start_period).days)
#         forecast_dates = pd.date_range(start=start_period, end=end_period, freq='D')
#         forecast = sarima_results.get_forecast(steps=len(forecast_dates))
#         forecast_values = forecast.predicted_mean

#         # Apply the ratios to the forecasted values
#         forecast_with_uncertainty = forecast_values.copy()
#         for date in forecast_dates: #forecast_values.index:
#             year = date.year
#             print(f"For year: {year}")
#             age = self.df.loc[self.df.index.get_level_values('year') == date.year, 'age'].unique()[0] if 'age' in self.df.columns else None
#             ratio_1 = ratio_receipts_to_demands.loc[year]
#             ratio_2 = ratio_age_specific.loc[(year, age)] if age in ratio_age_specific.index.get_level_values('age') else 0
#             forecast_with_uncertainty[date] = forecast_values[date] * ratio_1 * ratio_2
#             print(f"""For year: {year}, the ratio_receipts_to_demands is: {ratio_1} 
#             and the ratio_receipts_to_demands is: {ratio_2}""")           
#             #age = self.df.loc[date, 'age'] if 'age' in self.df.columns else None
#             #age = self.df.loc[self.df.index.get_level_values('year') == date.year, 'age'].unique()[0]
#             #ratio_1 = ratio_receipts_to_demands.loc[year]
#             #ratio_1 = ratio_receipts_to_demands.reset_index().loc[:].demand #.year
#             #ratio_2 = ratio_age_specific.loc[(year, age)] if age in ratio_age_specific.index.get_level_values('age') else 0

#         # Return the forecast with applied uncertainty
#         return forecast_with_uncertainty

# # Example usage
# file_path = 'annual_unique_donors.csv'
# forecasting = DemandForecasting(file_path)
# sarima_order = (1, 1, 1)
# seasonal_order = (1, 1, 1, 12)
# start_period = pd.Timestamp('2024-01-01')  #('2024-04-01')
# end_period = pd.Timestamp('2026-01-01')
# annual_donor_receipts = pd.read_csv('annual_receipts.csv')
# average_annual_receipts = annual_donor_receipts.rename(columns={'Yearly Receipt': 'demand'})
# average_annual_receipts = average_annual_receipts[average_annual_receipts['year'].isin([2020, 2021, 2022, 2023])]
# average_annual_receipts.set_index('year', inplace=True)
# #pd.DataFrame({'year': [2022, 2023, 2024],'Yearly Receipt': [7.519675e+05, 9.700350e+05, 1.124615e+06]})

# annual_donor_demands = pd.read_csv('annual_unique_donors.csv') #pd.Series([601254, 752898, 505219], index=[2022, 2023, 2024])
# total_annual_donors = annual_donor_demands
# total_annual_donors = total_annual_donors[total_annual_donors['year'].isin([2020, 2021, 2022, 2023])]
# total_annual_donors = total_annual_donors.groupby('year')['demand'].sum()

# annual_donor_demands_age_specific = pd.read_csv('annual_demands_age_specific.csv') #pd.DataFrame({})
# annual_donor_demands_age_specific = annual_donor_demands_age_specific[annual_donor_demands_age_specific['year'].isin([2020, 2021, 2022, 2023])]

# forecast_with_uncertainty = forecasting.apply_uncertainty_to_forecast(
#     sarima_order, seasonal_order, start_period, end_period, average_annual_receipts, total_annual_donors, annual_donor_demands_age_specific)
# print(forecast_with_uncertainty)


In [None]:
# Display the forecast date range.
# NOTE(review): in the cells visible here `forecast_dates` is only assigned as a
# local inside DemandForecasting.apply_uncertainty_to_forecast - confirm an earlier
# cell defines it at module level, otherwise this raises NameError on a fresh kernel.
forecast_dates

In [None]:
# Receipts-to-donors ratio per year, rounded to two decimals.
ratio_receipts_to_demands = (average_annual_receipts['demand'] / total_annual_donors).round(2)
# Scratch check: the 'year' column of the reset frame, i.e. the year labels rather
# than the ratio values (ratio_receipts_to_demands.loc[year] gives a single ratio).
ratio_1 = ratio_receipts_to_demands.reset_index().year

#ratio_age_specific = annual_donor_demands_age_specific.groupby(['year', 'age'])['demand'].sum() / annual_donor_demands_age_specific.groupby('year')['demand'].sum()
#ratio_2 = ratio_age_specific.loc[(year, age)] if age in ratio_age_specific.index.get_level_values('age') else 0
#forecast_with_uncertainty[date] = forecast_values[date] * ratio_1 * ratio_2


In [None]:
# Label-based lookup of the receipts-to-donors ratio stored under the key 2020.
ratio_receipts_to_demands.loc[2020]

In [None]:
# Preview the daily ('D') date range between start_period and end_period;
# the result is only displayed, not stored.
pd.date_range(start=start_period, end=end_period, freq='D')

In [None]:
# Display `forecast_values` as the cell output. It is not assigned in this part of
# the notebook — presumably left in the kernel by an earlier cell; TODO confirm.
forecast_values

In [None]:
# import pandas as pd
# import numpy as np
# from statsmodels.tsa.statespace.sarimax import SARIMAX

# class DemandForecasting:
#     def __init__(self, file_path):
#         self.df = pd.read_csv(file_path)
#         self.df['year'] = pd.to_datetime(self.df['year'], format='%Y')
#         self.df.set_index(['age', 'year'], inplace=True)
#         self.variability_measures = None
#         self.forecast_df = None

#     def calculate_variability_measures(self):
#         measures = pd.DataFrame()
#         for age in self.df.index.get_level_values(0).unique():
#             ts = self.df.xs(age, level='age')['demand'].dropna()
#             std_dev = ts.std()
#             variance = ts.var()
#             range_val = ts.max() - ts.min()
#             iqr = ts.quantile(0.75) - ts.quantile(0.25)
#             cv = (std_dev / ts.mean()) * 100 if ts.mean() != 0 else 0
#             mad = ts.mad()
#             qd = iqr / 2
#             measures.loc[age, 'Standard Deviation'] = std_dev
#             measures.loc[age, 'Variance'] = variance
#             measures.loc[age, 'Range'] = range_val
#             measures.loc[age, 'IQR'] = iqr
#             measures.loc[age, 'CV'] = cv
#             measures.loc[age, 'MAD'] = mad
#             measures.loc[age, 'QD'] = qd
#         self.variability_measures = measures
#         return measures

#     def cohort_analysis_forecast_with_stl(self):
#         forecast_df = pd.DataFrame()
#         for age in self.df.index.get_level_values(0).unique():
#             ts = self.df.xs(age, level='age')['demand'].dropna()
#             stl = STL(ts, seasonal=13)
#             result = stl.fit()
#             model = ExponentialSmoothing(result.trend, trend='add', seasonal='add', seasonal_periods=13)
#             fit_model = model.fit()
#             forecast_value = fit_model.forecast(1).iloc[0]
#             uncertainty_ratio = self.df.xs(age, level='age')['Uncertainty_Ratio'].mean() if 'Uncertainty_Ratio' in self.df.columns else 1
#             forecast_with_uncertainty = forecast_value * uncertainty_ratio
#             forecast_df.loc[age, 'Forecast'] = forecast_with_uncertainty
#         self.forecast_df = forecast_df
#         return forecast_df

#     def visualize_forecast_with_uncertainty(self):
#         plt.figure(figsize=(14, 7))
#         for age in self.df.index.get_level_values(0).unique():
#             ts = self.df.xs(age, level='age')['demand']
#             plt.plot(ts.index, ts, label=f'Original Demand (Age {age})', marker='o')
#         for age in self.forecast_df.index:
#             plt.plot(self.forecast_df.columns, self.forecast_df.loc[age], label=f'Forecasted Demand (Age {age})', marker='x')
#         for age in self.forecast_df.index:
#             adjusted_forecast = self.forecast_df.loc[age] * self.df.xs(age, level='age')['Uncertainty_Ratio'].mean()
#             plt.plot(self.forecast_df.columns, adjusted_forecast, label=f'Adjusted Forecast (Age {age})', linestyle='--')
#         plt.xlabel('Year')
#         plt.ylabel('Demand')
#         plt.title('Demand Forecast with Uncertainty Adjustment')
#         plt.legend()
#         plt.show()

#     def visualize_variability_measures(self):
#         plt.figure(figsize=(12, 6))
#         sns.boxplot(x='age', y='demand', data=self.df.reset_index())
#         plt.title('Box Plot of Demand by Age')
#         plt.xlabel('Age')
#         plt.ylabel('Demand')
#         plt.show()

#         plt.figure(figsize=(12, 6))
#         sns.histplot(self.df['demand'], bins=20, kde=True)
#         plt.title('Histogram of Demand')
#         plt.xlabel('Demand')
#         plt.ylabel('Frequency')
#         plt.show()

#         plt.figure(figsize=(12, 6))
#         sns.violinplot(x='age', y='demand', data=self.df.reset_index())
#         plt.title('Violin Plot of Demand by Age')
#         plt.xlabel('Age')
#         plt.ylabel('Demand')
#         plt.show()

#         mean_demand = self.df.groupby('age')['demand'].mean()
#         std_demand = self.df.groupby('age')['demand'].std()
#         plt.figure(figsize=(12, 6))
#         plt.errorbar(mean_demand.index, mean_demand, yerr=std_demand, fmt='o', capsize=5)
#         plt.title('Mean Demand with Standard Deviation by Age')
#         plt.xlabel('Age')
#         plt.ylabel('Demand')
#         plt.show()

#         pivot_table = self.df.reset_index().pivot_table(index='age', columns='year', values='demand', aggfunc='mean')
#         plt.figure(figsize=(12, 6))
#         sns.heatmap(pivot_table, cmap='coolwarm', annot=True, fmt=".1f")
#         plt.title('Heatmap of Mean Demand by Age and Year')
#         plt.xlabel('Year')
#         plt.ylabel('Age')
#         plt.show()

#     def apply_uncertainty_to_forecast(self, sarima_order, seasonal_order, start_period, end_period):
#         # Load additional data
#         annual_donor_receipts = pd.DataFrame({
#             'year': [2022, 2023, 2024],
#             'Yearly Receipt': [7.519675e+05, 9.700350e+05, 1.124615e+06]
#         })
#         annual_donor_demands = pd.Series([601254, 752898, 505219], index=[2022, 2023, 2024])
#         annual_donor_demands_age_specific = pd.DataFrame({
#             # Assuming this DataFrame contains the age-specific demand data as shown in the context
#         })

#         # Calculate the ratios
#         ratio_receipts_to_demands = annual_donor_receipts['Yearly Receipt'] / annual_donor_demands
#         ratio_age_specific = annual_donor_demands_age_specific.groupby(['year', 'age'])['demand'].sum() / annual_donor_demands_age_specific.groupby('year')['demand'].sum()

#         # Fit SARIMA model
#         sarima_model = SARIMAX(self.df['demand'], order=sarima_order, seasonal_order=seasonal_order)
#         sarima_results = sarima_model.fit()

#         # Forecast for the specified period
#         forecast = sarima_results.get_forecast(steps=(end_period - start_period).days)
#         forecast_values = forecast.predicted_mean

#         # Apply the ratios to the forecasted values
#         forecast_with_uncertainty = forecast_values.copy()
#         for date in forecast_values.index:
#             year = date.year
#             age = self.df.loc[date, 'age'] if 'age' in self.df.columns else None
#             ratio_1 = ratio_receipts_to_demands.loc[year]
#             ratio_2 = ratio_age_specific.loc[(year, age)] if age in ratio_age_specific.index.get_level_values('age') else 0
#             forecast_with_uncertainty[date] = forecast_values[date] * ratio_1 * ratio_2

#         # Return the forecast with applied uncertainty
#         return forecast_with_uncertainty

# # Example usage
# file_path = 'annual_unique_donors.csv'
# forecasting = DemandForecasting(file_path)
# sarima_order = (1, 1, 1)
# seasonal_order = (1, 1, 1, 12)
# start_period = pd.Timestamp('2024-04-01')
# end_period = pd.Timestamp('2026-03-31')
# forecast_with_uncertainty = forecasting.apply_uncertainty_to_forecast(sarima_order, seasonal_order, start_period, end_period)
# print(forecast_with_uncertainty)

In [None]:
# Display the current value of `end_period` (its only visible assignments in this
# part of the notebook are commented out, so it presumably comes from another cell).
end_period

In [None]:
# Summed demand per (year, age) divided by summed demand per year, i.e. each age's
# share of that year's total demand.
# NOTE(review): the numerator is indexed by (year, age) and the denominator by year
# only; this relies on pandas aligning the two on the shared 'year' level — confirm.
annual_donor_demands_age_specific.groupby(['year', 'age'])['demand'].sum() / annual_donor_demands_age_specific.groupby('year')['demand'].sum()

In [None]:
# # Explanation
# # Class Initialization:
# # Loads the dataset and sets up the DataFrame.
# # Calculate Variability Measures:
# # Computes various measures of variability for each age cohort.
# # Cohort Analysis Forecast with STL:
# # Applies STL decomposition and Holt-Winters forecasting, adjusting the forecast with the uncertainty ratio.
# # Visualize Forecast with Uncertainty:
# # Plots the original demand, forecasted demand, and adjusted forecast with uncertainty.
# # Visualize Variability Measures:
# # Creates various plots (box plot, histogram, violin plot, scatter plot with error bars, and heatmap) to visualize the variability measures.
# # You can use this class to load your data, perform the analysis, and visualize the results. Feel free to run the code and let me know if you need any further assistance!

# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# from statsmodels.tsa.seasonal import STL
# from statsmodels.tsa.holtwinters import ExponentialSmoothing

# class DemandForecasting:
#     def __init__(self, historical_data):
#         self.df = historical_data
#         self.df['year'] = pd.to_datetime(self.df['year'], format='%Y')
#         self.df.set_index(['age', 'year'], inplace=True)
#         self.variability_measures = None
#         self.forecast_df = None

#     def calculate_variability_measures(self):
#         measures = pd.DataFrame()
#         for age in self.df.index.get_level_values(0).unique():
#             ts = self.df.xs(age, level='age')['demand'].dropna()
#             std_dev = ts.std()
#             variance = ts.var()
#             range_val = ts.max() - ts.min()
#             iqr = ts.quantile(0.75) - ts.quantile(0.25)
#             cv = (std_dev / ts.mean()) * 100 if ts.mean() != 0 else 0
#             mad = ts.mad()
#             qd = iqr / 2
#             measures.loc[age, 'Standard Deviation'] = std_dev
#             measures.loc[age, 'Variance'] = variance
#             measures.loc[age, 'Range'] = range_val
#             measures.loc[age, 'IQR'] = iqr
#             measures.loc[age, 'CV'] = cv
#             measures.loc[age, 'MAD'] = mad
#             measures.loc[age, 'QD'] = qd
#         self.variability_measures = measures
#         return measures

#     def cohort_analysis_forecast_with_stl(self):
#         forecast_df = pd.DataFrame()
#         for age in self.df.index.get_level_values(0).unique():
#             ts = self.df.xs(age, level='age')['demand'].dropna()
#             stl = STL(ts, seasonal=13)
#             result = stl.fit()
#             model = ExponentialSmoothing(result.trend, trend='add', seasonal='add', seasonal_periods=13)
#             fit_model = model.fit()
#             forecast_value = fit_model.forecast(1).iloc[0]
#             uncertainty_ratio = self.df.xs(age, level='age')['Uncertainty_Ratio'].mean() if 'Uncertainty_Ratio' in self.df.columns else 1
#             forecast_with_uncertainty = forecast_value * uncertainty_ratio
#             forecast_df.loc[age, 'Forecast'] = forecast_with_uncertainty
#         self.forecast_df = forecast_df
#         return forecast_df

#     def visualize_forecast_with_uncertainty(self):
#         plt.figure(figsize=(14, 7))
#         for age in self.df.index.get_level_values(0).unique():
#             ts = self.df.xs(age, level='age')['demand']
#             plt.plot(ts.index, ts, label=f'Original Demand (Age {age})', marker='o')
#         for age in self.forecast_df.index:
#             plt.plot(self.forecast_df.columns, self.forecast_df.loc[age], label=f'Forecasted Demand (Age {age})', marker='x')
#         for age in self.forecast_df.index:
#             adjusted_forecast = self.forecast_df.loc[age] * self.df.xs(age, level='age')['Uncertainty_Ratio'].mean()
#             plt.plot(self.forecast_df.columns, adjusted_forecast, label=f'Adjusted Forecast (Age {age})', linestyle='--')
#         plt.xlabel('Year')
#         plt.ylabel('Demand')
#         plt.title('Demand Forecast with Uncertainty Adjustment')
#         plt.legend()
#         plt.show()

#     def visualize_variability_measures(self):
#         plt.figure(figsize=(12, 6))
#         sns.boxplot(x='age', y='demand', data=self.df.reset_index())
#         plt.title('Box Plot of Demand by Age')
#         plt.xlabel('Age')
#         plt.ylabel('Demand')
#         plt.show()

#         plt.figure(figsize=(12, 6))
#         sns.histplot(self.df['demand'], bins=20, kde=True)
#         plt.title('Histogram of Demand')
#         plt.xlabel('Demand')
#         plt.ylabel('Frequency')
#         plt.show()

#         plt.figure(figsize=(12, 6))
#         sns.violinplot(x='age', y='demand', data=self.df.reset_index())
#         plt.title('Violin Plot of Demand by Age')
#         plt.xlabel('Age')
#         plt.ylabel('Demand')
#         plt.show()

#         mean_demand = self.df.groupby('age')['demand'].mean()
#         std_demand = self.df.groupby('age')['demand'].std()
#         plt.figure(figsize=(12, 6))
#         plt.errorbar(mean_demand.index, mean_demand, yerr=std_demand, fmt='o', capsize=5)
#         plt.title('Mean Demand with Standard Deviation by Age')
#         plt.xlabel('Age')
#         plt.ylabel('Demand')
#         plt.show()

#         pivot_table = self.df.reset_index().pivot_table(index='age', columns='year', values='demand', aggfunc='mean')
#         plt.figure(figsize=(12, 6))
#         sns.heatmap(pivot_table, cmap='coolwarm', annot=True, fmt=".1f")
#         plt.title('Heatmap of Mean Demand by Age and Year')
#         plt.xlabel('Year')
#         plt.ylabel('Age')
#         plt.show()

# # Example usage
# historical_data = annual_unique_donors
# forecasting = DemandForecasting(historical_data)
# variability_measures = forecasting.calculate_variability_measures()
# forecast = forecasting.cohort_analysis_forecast_with_stl()
# forecasting.visualize_forecast_with_uncertainty()
# forecasting.visualize_variability_measures()


In [None]:
can you add TSL seasonal forecasting to the code below based on the attached data?
text
annual_unique_donors.csv

Copilot
Executing Task:

add TSL seasonal forecasting to the code based on the attached data


You
Sent by you:
# can you add TSL seasonal forecasting to the code below based on the attached data?
def cohort_analysis_forecast(df):
    """Forecast next-period demand per age cohort from its average growth rate.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``age``, ``year`` and ``demand`` columns. If an
        ``Uncertainty_Ratio`` column is present, its overall mean scales the
        forecast; otherwise a factor of 1 is used.

    Returns
    -------
    pandas.Series
        Indexed by ``age``. Each value is the cohort's demand in the last year
        column, grown by the cohort's mean year-over-year growth rate and
        multiplied by the uncertainty factor.
    """
    # Create a pivot table with 'year' as columns, 'age' as index, and 'demand' as values
    pivot = df.pivot_table(index='age', columns='year', values='demand', aggfunc='sum')

    # Calculate the growth rate for each age cohort (column-to-column change)
    growth_rates = pivot.pct_change(axis=1)

    # Calculate the average growth rate for each age cohort
    avg_growth_rates = growth_rates.mean(axis=1)

    # Get the last year of data for each age cohort
    last_year_data = pivot.iloc[:, -1]

    # Forecast the next year's demand by applying the average growth rate to the last year's data
    forecast = last_year_data * (1 + avg_growth_rates)

    # Incorporate uncertainty by multiplying the forecast by the mean 'Uncertainty_Ratio'.
    # The mean is taken over the whole frame, so it is one factor for all ages.
    uncertainty_ratio = df['Uncertainty_Ratio'].mean() if 'Uncertainty_Ratio' in df.columns else 1
    forecast_with_uncertainty = forecast * uncertainty_ratio

    # Return the forecast with uncertainty
    return forecast_with_uncertainty

# Call the function and print the forecast
forecast = cohort_analysis_forecast(df)
print(forecast)
text
annual_unique_donors.csv
Copilot
Executing Task:

add TSL seasonal forecasting to the code based on the attached data


Sent by Copilot:
It looks like there was an issue with the code execution because the dataset does not contain a ‘Date’ column. Let’s adjust the code to add TSL seasonal forecasting based on the data you provided.

Here’s the updated code:

Python

import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Load the dataset
df = pd.read_csv('annual_unique_donors.csv')

# Assuming the dataframe has a 'year' column and a 'demand' column
df['year'] = pd.to_datetime(df['year'], format='%Y')
df.set_index('year', inplace=True)
# NOTE(review): if the CSV holds one row per (age, year), `df['demand']` has repeated
# year labels; it may need aggregating per year before modelling — confirm.

# Decompose the time series to observe seasonal patterns.
# NOTE(review): with period=1 there is no within-period cycle to extract, so the
# seasonal component is presumably trivial — confirm this step is still wanted.
decomposition = seasonal_decompose(df['demand'], model='additive', period=1)

# Fit an additive-trend exponential smoothing model. No seasonal term is used:
# statsmodels rejects seasonal_periods=1 ("seasonal_periods must be larger than 1"),
# and one observation per year carries no sub-annual cycle to model.
model = ExponentialSmoothing(df['demand'], trend='add')
fit_model = model.fit()

# Forecast the next period
forecast = fit_model.forecast(1)

# Add the forecast to the original function
def cohort_analysis_forecast(df, forecast_value):
    """Growth-rate forecast per age cohort, with an extra 'TSL_Forecast' entry.

    The frame is pivoted to age x year demand totals; each cohort's last-year
    demand is grown by its mean year-over-year rate and scaled by the mean of
    ``Uncertainty_Ratio`` (1 when that column is absent). ``forecast_value`` is
    then stored in the same age-indexed Series under the label 'TSL_Forecast'.
    """
    demand_by_age = df.pivot_table(index='age', columns='year', values='demand', aggfunc='sum')

    # Mean of the column-to-column percentage changes, one value per age
    mean_growth = demand_by_age.pct_change(axis=1).mean(axis=1)

    # Latest observed year, projected one step ahead
    projected = demand_by_age.iloc[:, -1] * (1 + mean_growth)

    # One global scaling factor: the mean ratio over the whole frame
    if 'Uncertainty_Ratio' in df.columns:
        scale = df['Uncertainty_Ratio'].mean()
    else:
        scale = 1
    result = projected * scale

    # The externally computed forecast is stored alongside the per-age values
    result['TSL_Forecast'] = forecast_value
    return result

# Call the function and print the forecast with TSL seasonal forecasting
forecast_with_tsl = cohort_analysis_forecast(df, forecast.values[0])
print(forecast_with_tsl)
AI-generated code. Review and use carefully. More info on FAQ.
This code integrates TSL seasonal forecasting into your existing cohort analysis forecast function. It uses the ExponentialSmoothing model from the statsmodels library to forecast the next period based on the seasonal decomposition of the demand data. The forecast value is then incorporated into the final forecast with uncertainty.

Feel free to run this code and let me know if you encounter any issues or need further adjustments!

Learn more

1

annual_unique_donors.csv
You
Sent by you:
How can I visualize the forecast results?
Copilot
Sent by Copilot:
You can visualize the forecast results using various plotting libraries in Python, such as matplotlib or seaborn. Here’s an example of how you can visualize the forecast results using matplotlib:

Python

import pandas as pd
import matplotlib.pyplot as plt

# Assuming 'forecast_with_tsl' is the DataFrame returned by the cohort_analysis_forecast function
# and it includes the 'TSL_Forecast' column

def visualize_forecast(df, forecast_with_tsl):
    """Plot historical demand and the forecast on one matplotlib figure.

    Parameters
    ----------
    df : object with an ``index`` and a ``'demand'`` column
        Plotted as ``df['demand']`` against ``df.index``.
    forecast_with_tsl : indexable forecast values
        Plotted against its own ``index``; the last index label is marked with
        a vertical dashed red line.

    NOTE(review): the forecast returned by cohort_analysis_forecast(df, forecast_value)
    is indexed by age plus a 'TSL_Forecast' label, whereas ``df.index`` is used as
    the x-axis for the history — confirm both share a meaningful x-axis.
    """
    # Plot the original demand data
    plt.figure(figsize=(12, 6))
    plt.plot(df.index, df['demand'], label='Original Demand', marker='o')
    
    # Plot the forecasted demand
    plt.plot(forecast_with_tsl.index, forecast_with_tsl, label='Forecasted Demand', marker='x')
    
    # Highlight the TSL forecast: vertical line at the last index label of the forecast
    plt.axvline(x=forecast_with_tsl.index[-1], color='r', linestyle='--', label='TSL Forecast')
    
    # Add labels and title
    plt.xlabel('Year')
    plt.ylabel('Demand')
    plt.title('Demand Forecast with TSL Seasonal Forecasting')
    plt.legend()
    
    # Show the plot
    plt.show()

# Call the visualization function
visualize_forecast(df, forecast_with_tsl)
AI-generated code. Review and use carefully. More info on FAQ.
This code will create a line plot showing the original demand data and the forecasted demand, with a vertical line highlighting the TSL forecast. You can customize the plot further by adding more details or adjusting the style to suit your needs.

Feel free to run this code and let me know if you need any further assistance!

You
Sent by you:
# can you add STL for forecasting time series data forecasting model and adjust and change the above code
# to ensure that multiplying the forecast value for a specific age in a specific year to be corresponded
# to the related age and year when choosing age_specific Uncertainty_Ratio to the code below based on
# the attached data?

# Load the dataset
df = pd.read_csv('annual_unique_donors.csv')


def cohort_analysis_forecast_with_seasonality(df):
    """Forecast one step ahead per age cohort from a trend x seasonal decomposition.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``age``, ``year`` and ``demand`` columns. If an
        ``Uncertainty_Ratio`` column is present, its overall mean scales every
        forecast; otherwise a factor of 1 is used.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``age`` with a single ``'Forecast'`` column.

    NOTE(review): seasonal_decompose is called with period=12 on a series that has
    one value per year column; statsmodels presumably needs at least two full
    cycles (24 observations) per cohort — confirm the data is long enough.
    """
    # Create a pivot table with 'year' as columns, 'age' as index, and 'demand' as values
    pivot = df.pivot_table(index='age', columns='year', values='demand', aggfunc='sum')

    # Initialize a DataFrame to store the forecast
    forecast_df = pd.DataFrame(index=pivot.index)

    # Loop through each age cohort to apply seasonal decomposition and forecast
    for age in pivot.index:
        # Get the time series for the current age cohort
        ts = pivot.loc[age].dropna()

        # Apply seasonal decomposition
        decomposition = seasonal_decompose(ts, model='multiplicative', period=12)

        # Extract the trend and seasonal components
        trend = decomposition.trend.dropna()
        seasonal = decomposition.seasonal.dropna()

        # Calculate the average growth rate for the trend component
        growth_rate = trend.pct_change().mean()

        # Forecast the next year's trend component
        last_trend_value = trend.iloc[-1]
        forecast_trend = last_trend_value * (1 + growth_rate)

        # Get the seasonal component for the next period (assuming seasonality repeats).
        # Unary minus binds tighter than %, so this is (-12) % len(seasonal): the
        # position 12 steps back from the end when there are at least 12 values.
        next_seasonal_value = seasonal.iloc[-12 % len(seasonal)]

        # Combine the forecasted trend and seasonal components
        forecast_value = forecast_trend * next_seasonal_value

        # Incorporate uncertainty by multiplying the forecast by the mean 'Uncertainty_Ratio'.
        # The mean is taken over the whole frame, so it is not age-specific.
        uncertainty_ratio = df['Uncertainty_Ratio'].mean() if 'Uncertainty_Ratio' in df.columns else 1
        forecast_with_uncertainty = forecast_value * uncertainty_ratio

        # Store the forecast in the DataFrame
        forecast_df.loc[age, 'Forecast'] = forecast_with_uncertainty

    # Return the forecast DataFrame
    return forecast_df

# Call the function and print the forecast
forecast = cohort_analysis_forecast_with_seasonality(df)
print(forecast)
text
annual_unique_donors.csv
Copilot
Executing Task:

add STL for forecasting time series data forecasting model and adjust and change the code to ensure that multiplying the forecast value for a specific age in a specific year to be corresponded to the related age and year when choosing age_specific Uncertainty_Ratio to the code based on the attached data


Sent by Copilot:
It looks like there was an issue with the code execution because the dataset does not contain a ‘Date’ column. Let’s adjust the code to add STL seasonal forecasting based on the data you provided.

Here’s the updated code:

Python

import pandas as pd
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Load the dataset
df = pd.read_csv('annual_unique_donors.csv')

# Assuming the dataframe has 'year' and 'age' columns and a 'demand' column.
# Parse the 4-digit year values into timestamps ...
df['year'] = pd.to_datetime(df['year'], format='%Y')
# ... and index by (age, year) so that each cohort's series can be selected
# with df.xs(age, level='age'). Mutates `df` in place.
df.set_index(['age', 'year'], inplace=True)

# Function to apply STL and forecast
def cohort_analysis_forecast_with_stl(df):
    """Forecast one step ahead per age cohort from the STL trend component.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by a MultiIndex whose first level is ``age`` (selected with
        ``xs(age, level='age')``), with a ``demand`` column and optionally an
        ``Uncertainty_Ratio`` column.

    Returns
    -------
    pandas.DataFrame
        One row per age with a single ``'Forecast'`` column holding the
        one-step forecast multiplied by the cohort's mean uncertainty ratio
        (1 when the column is absent).
    """
    # Initialize a DataFrame to store the forecast
    forecast_df = pd.DataFrame()

    # Loop through each age cohort to apply STL and forecast
    for age in df.index.get_level_values(0).unique():
        # Get the time series for the current age cohort
        ts = df.xs(age, level='age')['demand'].dropna()
        
        # Apply STL decomposition
        # NOTE(review): no `period` is passed, so STL presumably has to infer it
        # from the index — confirm this works for a yearly index.
        stl = STL(ts, seasonal=13)
        result = stl.fit()
        
        # Forecast the next step from the STL trend using Holt-Winters
        # NOTE(review): seasonal_periods=13 presumably needs considerably more than
        # 13 observations per cohort — confirm the series are long enough.
        model = ExponentialSmoothing(result.trend, trend='add', seasonal='add', seasonal_periods=13)
        fit_model = model.fit()
        forecast_value = fit_model.forecast(1).iloc[0]
        
        # Mean 'Uncertainty_Ratio' over all of this age's rows (not a single year's
        # value); falls back to 1 when the column is absent
        uncertainty_ratio = df.xs(age, level='age')['Uncertainty_Ratio'].mean() if 'Uncertainty_Ratio' in df.columns else 1
        
        # Multiply the forecast value by the 'Uncertainty_Ratio'
        forecast_with_uncertainty = forecast_value * uncertainty_ratio
        
        # Store the forecast in the DataFrame
        forecast_df.loc[age, 'Forecast'] = forecast_with_uncertainty
    
    # Return the forecast DataFrame
    return forecast_df

# Call the function and print the forecast
forecast = cohort_analysis_forecast_with_stl(df)
print(forecast)
AI-generated code. Review and use carefully. More info on FAQ.
This code integrates STL seasonal forecasting into your existing cohort analysis forecast function. It uses the STL decomposition from the statsmodels library to decompose the time series data and then applies the Holt-Winters method for forecasting. The forecast value is then adjusted by the age-specific Uncertainty_Ratio.

Feel free to run this code and let me know if you encounter any issues or need further adjustments!

Learn more

1

annual_unique_donors.csv
You
Sent by you:
How can I interpret the forecast results?
Copilot
Sent by Copilot:
Interpreting the forecast results involves understanding the different components and what they represent. Here’s a breakdown of how to interpret the forecast results from the code provided:

Original Demand Data:
This represents the historical demand data for each age cohort over the years. It’s the baseline data used for forecasting.
STL Decomposition:
Seasonal Component: This shows the repeating patterns or cycles in the data, such as yearly seasonality.
Trend Component: This indicates the long-term progression of the data, showing whether the demand is generally increasing or decreasing over time.
Residual Component: This represents the random noise or irregular fluctuations in the data that are not explained by the trend or seasonal components.
Forecasted Demand:
This is the predicted demand for the next period based on the historical data and the STL decomposition. It combines the trend and seasonal components to project future demand.
Uncertainty Ratio:
This factor adjusts the forecasted demand to account for uncertainty. It can vary by age and year, reflecting the variability in the data.
Forecast with Uncertainty:
This is the final forecasted demand after adjusting for uncertainty. It provides a more realistic prediction by incorporating potential variations.
Example Interpretation
Let’s say you have the following forecast results for a specific age cohort:

Original Demand Data: Shows an increasing trend over the past 5 years.
Seasonal Component: Indicates higher demand during certain months of the year.
Trend Component: Shows a steady upward trend.
Residual Component: Contains some random fluctuations but no significant outliers.
Forecasted Demand: Predicts an increase in demand for the next year.
Uncertainty Ratio: Adjusts the forecast slightly higher due to observed variability.
Visualization
To visualize these results, you can plot the original demand data, the decomposed components (seasonal, trend, residual), and the forecasted demand. This will help you see how the forecast is derived and how the different components contribute to the final prediction.

Here’s an example of how to visualize the forecast results using matplotlib:

Python

import matplotlib.pyplot as plt

def visualize_forecast(df, forecast_df):
    """Plot each age cohort's historical demand and its forecast row.

    Parameters
    ----------
    df : pandas.DataFrame
        MultiIndexed with ``age`` as first level; ``demand`` is plotted per age.
    forecast_df : pandas.DataFrame
        One row per age; each row is plotted against ``forecast_df.columns``.

    NOTE(review): this reuses the name ``visualize_forecast`` from the earlier
    snippet (which takes ``forecast_with_tsl``); if both are run in one kernel
    this definition replaces the earlier one.
    """
    # Plot the original demand data
    plt.figure(figsize=(14, 7))
    for age in df.index.get_level_values(0).unique():
        ts = df.xs(age, level='age')['demand']
        plt.plot(ts.index, ts, label=f'Original Demand (Age {age})', marker='o')
    
    # Plot the forecasted demand
    # NOTE(review): the x values here are the forecast frame's column labels, while
    # the history above uses ts.index — confirm the two share one x-axis sensibly.
    for age in forecast_df.index:
        plt.plot(forecast_df.columns, forecast_df.loc[age], label=f'Forecasted Demand (Age {age})', marker='x')
    
    # Add labels and title
    plt.xlabel('Year')
    plt.ylabel('Demand')
    plt.title('Demand Forecast with STL Seasonal Forecasting')
    plt.legend()
    
    # Show the plot
    plt.show()

# Call the visualization function
visualize_forecast(df, forecast)
AI-generated code. Review and use carefully. More info on FAQ.
This code will create a line plot showing the original demand data and the forecasted demand for each age cohort. You can customize the plot further to highlight specific components or add more details.

Feel free to run this code and let me know if you need any further assistance!

You
Sent by you:
How do I interpret the uncertainty ratio?
Copilot
Sent by Copilot:
The uncertainty ratio is a factor used to adjust the forecasted demand to account for variability and potential errors in the prediction. Here’s how you can interpret it:

Understanding the Uncertainty Ratio
Purpose:
The uncertainty ratio is used to incorporate the inherent uncertainty in the forecast. It adjusts the forecasted values to reflect the possible range of outcomes, considering the variability in the historical data.
Calculation:
In your code, the uncertainty ratio is calculated as the mean of the ‘Uncertainty_Ratio’ column for each specific age cohort. If this column is not present, a default value of 1 is used, indicating no adjustment.
Impact on Forecast:
The forecasted demand is multiplied by the uncertainty ratio. If the uncertainty ratio is greater than 1, it increases the forecasted value, indicating higher variability and potential for greater demand. If it is less than 1, it decreases the forecasted value, indicating lower variability and potential for lesser demand.
Example Interpretation
Let’s say you have a forecasted demand of 100 units for a specific age cohort, and the uncertainty ratio for that cohort is 1.2. The adjusted forecast would be:

$$\text{Adjusted Forecast} = \text{Forecasted Demand} \times \text{Uncertainty Ratio}$$
$$\text{Adjusted Forecast} = 100 \times 1.2 = 120$$

This means that, considering the variability in the data, the forecasted demand is adjusted to 120 units.

Practical Use
Higher Uncertainty Ratio: Indicates more variability in the historical data, suggesting that the forecasted demand could be higher than initially predicted.
Lower Uncertainty Ratio: Indicates less variability, suggesting that the forecasted demand could be lower or more stable.
Visualization
To visualize the impact of the uncertainty ratio, you can plot the original forecasted values and the adjusted values side by side. This will help you see how the uncertainty ratio modifies the forecast.

Here’s an example of how to visualize this using matplotlib:

Python

import matplotlib.pyplot as plt

def visualize_forecast_with_uncertainty(df, forecast_df):
    """Plot historical demand, the forecast, and the ratio-adjusted forecast per age.

    Parameters
    ----------
    df : pandas.DataFrame
        MultiIndexed with ``age`` as first level and a ``demand`` column. If an
        ``Uncertainty_Ratio`` column is present, its per-age mean scales the
        adjusted line; otherwise a factor of 1 is used, so the plot is still
        drawn instead of raising ``KeyError``.
    forecast_df : pandas.DataFrame
        One row per age; each row is plotted against ``forecast_df.columns``.

    NOTE(review): if ``forecast_df`` comes from cohort_analysis_forecast_with_stl,
    it has already been multiplied by the same ratio, so the "adjusted" line
    applies the ratio a second time — confirm that is intended.
    """
    # Plot the original demand data
    plt.figure(figsize=(14, 7))
    for age in df.index.get_level_values(0).unique():
        ts = df.xs(age, level='age')['demand']
        plt.plot(ts.index, ts, label=f'Original Demand (Age {age})', marker='o')

    # Plot the forecasted demand
    for age in forecast_df.index:
        plt.plot(forecast_df.columns, forecast_df.loc[age], label=f'Forecasted Demand (Age {age})', marker='x')

    # Highlight the adjusted forecast with uncertainty. The column check mirrors
    # the fallback used when the forecast itself is computed.
    has_ratio = 'Uncertainty_Ratio' in df.columns
    for age in forecast_df.index:
        ratio = df.xs(age, level='age')['Uncertainty_Ratio'].mean() if has_ratio else 1
        adjusted_forecast = forecast_df.loc[age] * ratio
        plt.plot(forecast_df.columns, adjusted_forecast, label=f'Adjusted Forecast (Age {age})', linestyle='--')

    # Add labels and title
    plt.xlabel('Year')
    plt.ylabel('Demand')
    plt.title('Demand Forecast with Uncertainty Adjustment')
    plt.legend()

    # Show the plot
    plt.show()

# Call the visualization function
visualize_forecast_with_uncertainty(df, forecast)
AI-generated code. Review and use carefully. More info on FAQ.
This code will create a line plot showing the original demand data, the forecasted demand, and the adjusted forecast with uncertainty for each age cohort. This visualization helps in understanding how the uncertainty ratio affects the forecasted values.

Feel free to run this code and let me know if you need any further assistance!

You
Sent by you:
How can I calculate the uncertainty ratio for my specific data?
Copilot
Sent by Copilot:
To calculate the uncertainty ratio for your specific data, you need to quantify the variability or uncertainty in your historical demand data. This can be done using statistical measures such as the standard deviation or variance. Here’s a step-by-step guide on how to calculate the uncertainty ratio:

Step-by-Step Guide
Load Your Data: Ensure your data is loaded into a DataFrame and properly formatted.
Calculate the Standard Deviation: Compute the standard deviation of the demand for each age cohort. The standard deviation measures the amount of variation or dispersion in the data.
Calculate the Mean Demand: Compute the mean (average) demand for each age cohort.
Calculate the Uncertainty Ratio: The uncertainty ratio can be calculated as the ratio of the standard deviation to the mean demand. This gives a relative measure of variability.
Example Code
Here’s an example of how you can calculate the uncertainty ratio for each age cohort in your dataset:

Python

import pandas as pd

# Load the dataset
df = pd.read_csv('annual_unique_donors.csv')

# Assuming the dataframe has 'year', 'age', and 'demand' columns
df['year'] = pd.to_datetime(df['year'], format='%Y')
df.set_index(['age', 'year'], inplace=True)

# Calculate the uncertainty ratio for each age cohort
def calculate_uncertainty_ratio(df):
    """Compute the std/mean ratio (coefficient of variation) of demand per age cohort.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a MultiIndex containing a level named ``'age'`` and a
        ``'demand'`` column.

    Returns
    -------
    pandas.DataFrame
        One row per age with a single float column ``'Uncertainty_Ratio'``
        holding ``std / mean`` of the non-missing demand values (``0`` when the
        mean is exactly zero). The column is present even when ``df`` is empty.
    """
    ratios = {}

    # Select the level by name so the iteration matches the xs(..., level='age') lookup.
    for age in df.index.get_level_values('age').unique():
        ts = df.xs(age, level='age')['demand'].dropna()
        mean_demand = ts.mean()
        # Guard the division: a zero mean would otherwise give inf/NaN.
        ratios[age] = ts.std() / mean_demand if mean_demand != 0 else 0

    # Build the frame once instead of enlarging an empty DataFrame row by row.
    return pd.DataFrame({'Uncertainty_Ratio': pd.Series(ratios, dtype=float)})

# Call the function and print the uncertainty ratios
uncertainty_ratios = calculate_uncertainty_ratio(df)
print(uncertainty_ratios)
AI-generated code. Review and use carefully. More info on FAQ.
Explanation
Standard Deviation (std_dev): Measures the spread of the demand values around the mean. A higher standard deviation indicates more variability.
Mean Demand (mean_demand): The average demand for the age cohort.
Uncertainty Ratio (uncertainty_ratio): The ratio of the standard deviation to the mean demand. This provides a relative measure of the variability in the demand data.
Using the Uncertainty Ratio
Once you have calculated the uncertainty ratios, you can use them to adjust your forecasted demand values. This will help you account for the variability in the historical data and provide a more realistic forecast.

Feel free to run this code with your data and let me know if you need any further assistance!

You
Sent by you:
How can I interpret the uncertainty ratio in my specific context?
Copilot
Sent by Copilot:
Interpreting the uncertainty ratio in your specific context involves understanding how it reflects the variability and reliability of your forecasted demand. Here’s a detailed guide to help you interpret it:

What the Uncertainty Ratio Represents
Variability in Historical Data:
The uncertainty ratio is a measure of how much the demand has varied historically for a specific age cohort. A higher ratio indicates more variability, while a lower ratio indicates more stability.
Adjustment to Forecast:
The ratio adjusts the forecasted demand to account for this variability. It helps in providing a range of possible outcomes rather than a single point estimate, making the forecast more realistic.
Interpreting Different Values
High Uncertainty Ratio (> 1):
Implication: There is significant variability in the historical demand data.
Interpretation: The forecasted demand is adjusted upwards to account for potential higher demand. This suggests that the future demand could be more volatile and less predictable.
Example: If the forecasted demand is 100 units and the uncertainty ratio is 1.5, the adjusted forecast would be 150 units.
Low Uncertainty Ratio (< 1):
Implication: There is less variability in the historical demand data.
Interpretation: The forecasted demand is adjusted downwards, indicating more stable and predictable future demand.
Example: If the forecasted demand is 100 units and the uncertainty ratio is 0.8, the adjusted forecast would be 80 units.
Uncertainty Ratio = 1:
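Implication: Multiplying by 1 leaves the forecast unchanged. Note, however, that with the ratio defined above as standard deviation ÷ mean, a value of 1 means the standard deviation is as large as the mean, i.e. the historical variability is high rather than negligible.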
Interpretation: The forecasted demand remains unchanged, suggesting that the historical data is a good predictor of future demand without significant adjustments.
Example: If the forecasted demand is 100 units and the uncertainty ratio is 1, the adjusted forecast remains 100 units.
Practical Application
Decision Making: Use the adjusted forecast to make informed decisions about inventory, resource allocation, and planning. A higher uncertainty ratio might prompt you to prepare for a wider range of outcomes.
Risk Management: Understand the potential risks associated with high variability. A higher uncertainty ratio indicates a need for contingency plans to handle unexpected changes in demand.
Performance Evaluation: Evaluate the performance of your forecasting model by comparing the adjusted forecasts with actual outcomes. This can help in refining the model and improving accuracy.
Visualization
Visualizing the impact of the uncertainty ratio can help in better understanding its effect on the forecast. Here’s an example of how to visualize the original forecast and the adjusted forecast:

Python

import matplotlib.pyplot as plt

def visualize_forecast_with_uncertainty(df, forecast_df, uncertainty_ratios):
    """Plot historical demand, the forecast and the ratio-scaled forecast for each age.

    Parameters
    ----------
    df : pandas.DataFrame
        Historical data with an index level named ``'age'`` and a ``'demand'`` column.
    forecast_df : pandas.DataFrame
        Forecast values with one row per age; the column labels are used as x values.
    uncertainty_ratios : pandas.DataFrame
        Indexed by age, with an ``'Uncertainty_Ratio'`` column used as the multiplier.
    """
    plt.figure(figsize=(14, 7))

    # Historical series, one line per age cohort.
    cohort_ages = df.index.get_level_values(0).unique()
    for cohort in cohort_ages:
        history = df.xs(cohort, level='age')['demand']
        plt.plot(history.index, history, label=f'Original Demand (Age {cohort})', marker='o')

    horizon = forecast_df.columns

    # Unadjusted forecast.
    for cohort in forecast_df.index:
        plt.plot(horizon, forecast_df.loc[cohort], label=f'Forecasted Demand (Age {cohort})', marker='x')

    # Forecast multiplied by the cohort's ratio.
    for cohort in forecast_df.index:
        ratio = uncertainty_ratios.loc[cohort, 'Uncertainty_Ratio']
        scaled = forecast_df.loc[cohort] * ratio
        plt.plot(horizon, scaled, label=f'Adjusted Forecast (Age {cohort})', linestyle='--')

    plt.xlabel('Year')
    plt.ylabel('Demand')
    plt.title('Demand Forecast with Uncertainty Adjustment')
    plt.legend()
    plt.show()

# Assuming 'uncertainty_ratios' is the DataFrame containing the calculated uncertainty ratios
visualize_forecast_with_uncertainty(df, forecast, uncertainty_ratios)
AI-generated code. Review and use carefully. More info on FAQ.
This visualization will help you see how the uncertainty ratio modifies the forecasted values and provides a clearer picture of potential future demand.

Feel free to run this code and let me know if you need any further assistance!

You
Sent by you:
What other statistical measures can help assess variability?
Copilot
Sent by Copilot:
In addition to the standard deviation and mean, several other statistical measures can help assess variability in your data. Here are some key measures:

1. Variance
Definition: Variance is the average of the squared differences from the mean. It provides a measure of how much the data points differ from the mean.
Formula:
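$$\sigma^2 = \frac{\sum_{i=1}^{N} (x_i - \mu)^2}{N}$$
(This is the population variance. pandas `Series.var()`, used in the example code below, divides by $N - 1$ by default.)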

Interpretation: A higher variance indicates more spread out data points.
2. Range
Definition: The range is the difference between the maximum and minimum values in the dataset.
Formula:
$$\text{Range} = \max(x) - \min(x)$$

Interpretation: A larger range indicates a wider spread of data points.
3. Interquartile Range (IQR)
Definition: The IQR is the range between the first quartile (Q1) and the third quartile (Q3). It measures the spread of the middle 50% of the data.
Formula:
$$\text{IQR} = Q_3 - Q_1$$

Interpretation: A larger IQR indicates more variability in the middle 50% of the data.
4. Coefficient of Variation (CV)
Definition: The CV is the ratio of the standard deviation to the mean, expressed as a percentage. It provides a standardized measure of dispersion.
Formula:
$$\text{CV} = \frac{\sigma}{\mu} \times 100$$

Interpretation: A higher CV indicates greater relative variability.
5. Mean Absolute Deviation (MAD)
Definition: MAD is the average of the absolute differences between each data point and the mean.
Formula:
$$\text{MAD} = \frac{\sum_{i=1}^{N} \lvert x_i - \mu \rvert}{N}$$

Interpretation: A higher MAD indicates more variability.
6. Quartile Deviation (QD)
Definition: QD is half of the IQR and provides a measure of the spread of the middle 50% of the data.
Formula:
$$\text{QD} = \frac{Q_3 - Q_1}{2}$$

Interpretation: A larger QD indicates more variability in the middle 50% of the data.
Example Code to Calculate These Measures
Here’s an example of how you can calculate these measures using Python:

Python

import pandas as pd

# Load the dataset
df = pd.read_csv('annual_unique_donors.csv')

# Assuming the dataframe has 'year', 'age', and 'demand' columns
df['year'] = pd.to_datetime(df['year'], format='%Y')
df.set_index(['age', 'year'], inplace=True)

# Function to calculate variability measures
def calculate_variability_measures(df):
    """Compute dispersion statistics of demand for each age cohort.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a MultiIndex whose first level is named ``'age'`` and a
        ``'demand'`` column.

    Returns
    -------
    pandas.DataFrame
        One row per age with columns ``'Standard Deviation'``, ``'Variance'``,
        ``'Range'``, ``'IQR'``, ``'CV'`` (percent, ``0`` when the mean is zero),
        ``'MAD'`` (mean absolute deviation around the mean) and ``'QD'`` (IQR / 2).
    """
    columns = ['Standard Deviation', 'Variance', 'Range', 'IQR', 'CV', 'MAD', 'QD']
    ages, rows = [], []

    for age in df.index.get_level_values(0).unique():
        ts = df.xs(age, level='age')['demand'].dropna()

        mean_demand = ts.mean()
        std_dev = ts.std()
        iqr = ts.quantile(0.75) - ts.quantile(0.25)

        ages.append(age)
        rows.append({
            'Standard Deviation': std_dev,
            'Variance': ts.var(),
            'Range': ts.max() - ts.min(),
            'IQR': iqr,
            'CV': (std_dev / mean_demand) * 100 if mean_demand != 0 else 0,
            # Series.mad() was removed in pandas 2.0, so compute it explicitly.
            'MAD': (ts - mean_demand).abs().mean(),
            'QD': iqr / 2,
        })

    # Build the frame once instead of enlarging it cell by cell with .loc.
    return pd.DataFrame(rows, index=ages, columns=columns)

# Call the function and print the variability measures
variability_measures = calculate_variability_measures(df)
print(variability_measures)
AI-generated code. Review and use carefully. More info on FAQ.
Interpretation of Results
Standard Deviation and Variance: Indicate the overall spread of the data.
Range and IQR: Provide insights into the spread of the data, with IQR focusing on the middle 50%.
Coefficient of Variation: Standardizes the measure of dispersion, making it easier to compare across different datasets.
Mean Absolute Deviation and Quartile Deviation: Offer alternative measures of variability that are less sensitive to outliers.
These measures can help you better understand the variability in your data and make more informed decisions based on the forecasted demand.

Feel free to run this code and let me know if you need any further assistance!

You
Sent by you:
How can I choose the most appropriate variability measure for my specific analysis?
Copilot
Sent by Copilot:
Choosing the most appropriate variability measure for your specific analysis depends on several factors, including the nature of your data, the goals of your analysis, and the specific characteristics you want to capture. Here are some guidelines to help you make an informed decision:

1. Nature of the Data
Continuous Data: Measures like standard deviation, variance, and coefficient of variation are suitable for continuous data, as they provide insights into the spread and relative variability.
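Ordinal Data: For ordered categories (ranks, ratings), rank-based measures like the range and interquartile range (IQR) might be more appropriate, as they depend only on the ordering of the values. For unordered (nominal) categories none of these measures is defined; use category frequencies instead.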
2. Goals of the Analysis
Understanding Overall Spread: If your goal is to understand the overall spread of the data, standard deviation and variance are good choices. They provide a comprehensive measure of variability.
Comparing Relative Variability: If you need to compare the variability of different datasets or groups, the coefficient of variation (CV) is useful. It standardizes the measure of dispersion, making it easier to compare across different scales.
Identifying Outliers: If you are interested in identifying outliers or understanding the spread of the middle 50% of the data, the interquartile range (IQR) and mean absolute deviation (MAD) are helpful.
3. Specific Characteristics
Sensitivity to Outliers: Standard deviation and variance are sensitive to outliers, which can skew the results. If your data contains outliers, you might prefer measures like IQR or MAD, which are less affected by extreme values.
Ease of Interpretation: Some measures are easier to interpret than others. For example, the range is straightforward and easy to understand, while the coefficient of variation provides a relative measure that can be more informative in certain contexts.
Practical Examples
Standard Deviation and Variance:
Use When: You want to understand the overall spread of continuous data.
Example: Analyzing the variability in monthly sales figures.
Coefficient of Variation (CV):
Use When: You need to compare the relative variability of different datasets.
Example: Comparing the variability of sales figures across different regions.
Interquartile Range (IQR):
Use When: You want to focus on the spread of the middle 50% of the data and identify potential outliers.
Example: Analyzing the distribution of test scores to identify students who are performing significantly above or below the median.
Mean Absolute Deviation (MAD):
Use When: You need a measure of variability that is less sensitive to outliers.
Example: Assessing the consistency of delivery times for a logistics company.
Example Code to Calculate and Compare Measures
Here’s an example of how you can calculate and compare different variability measures using Python:

Python

import pandas as pd

# Load the dataset
df = pd.read_csv('annual_unique_donors.csv')

# Assuming the dataframe has 'year', 'age', and 'demand' columns
df['year'] = pd.to_datetime(df['year'], format='%Y')
df.set_index(['age', 'year'], inplace=True)

# Function to calculate and compare variability measures
def compare_variability_measures(df):
    """Tabulate several dispersion statistics of demand side by side for each age cohort.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a MultiIndex whose first level is named ``'age'`` and a
        ``'demand'`` column.

    Returns
    -------
    pandas.DataFrame
        One row per age with columns ``'Standard Deviation'``, ``'Variance'``,
        ``'Range'``, ``'IQR'``, ``'CV'`` (percent, ``0`` when the mean is zero)
        and ``'MAD'`` (mean absolute deviation around the mean).
    """
    columns = ['Standard Deviation', 'Variance', 'Range', 'IQR', 'CV', 'MAD']
    ages, rows = [], []

    for age in df.index.get_level_values(0).unique():
        ts = df.xs(age, level='age')['demand'].dropna()

        mean_demand = ts.mean()
        std_dev = ts.std()

        ages.append(age)
        rows.append({
            'Standard Deviation': std_dev,
            'Variance': ts.var(),
            'Range': ts.max() - ts.min(),
            'IQR': ts.quantile(0.75) - ts.quantile(0.25),
            'CV': (std_dev / mean_demand) * 100 if mean_demand != 0 else 0,
            # Series.mad() was removed in pandas 2.0, so compute it explicitly.
            'MAD': (ts - mean_demand).abs().mean(),
        })

    # Build the frame once instead of enlarging it cell by cell with .loc.
    return pd.DataFrame(rows, index=ages, columns=columns)

# Call the function and print the measures
variability_measures = compare_variability_measures(df)
print(variability_measures)
AI-generated code. Review and use carefully. More info on FAQ.
Summary
Standard Deviation and Variance: Best for understanding overall spread.
Coefficient of Variation (CV): Ideal for comparing relative variability.
Interquartile Range (IQR) and Mean Absolute Deviation (MAD): Useful for identifying outliers and understanding the spread of the middle 50% of the data.
By considering these factors, you can choose the most appropriate variability measure for your specific analysis, ensuring that your insights are accurate and meaningful. If you have any specific questions or need further assistance, feel free to ask!

You
Sent by you:
How can I visualize different variability measures for better insights?
Copilot
Sent by Copilot:
Visualizing different variability measures can provide a clearer understanding of the data’s spread and help identify patterns or outliers. Here are some common ways to visualize these measures using Python’s matplotlib and seaborn libraries:

1. Box Plot
A box plot (or whisker plot) is useful for visualizing the distribution of data and highlighting the interquartile range (IQR), median, and potential outliers.

Python

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('annual_unique_donors.csv')

# Assuming the dataframe has 'year', 'age', and 'demand' columns
df['year'] = pd.to_datetime(df['year'], format='%Y')

# Create a box plot
plt.figure(figsize=(12, 6))
sns.boxplot(x='age', y='demand', data=df)
plt.title('Box Plot of Demand by Age')
plt.xlabel('Age')
plt.ylabel('Demand')
plt.show()
AI-generated code. Review and use carefully. More info on FAQ.
2. Histogram
A histogram shows the frequency distribution of the data, which helps in understanding the spread and central tendency.

Python

plt.figure(figsize=(12, 6))
sns.histplot(df['demand'], bins=20, kde=True)
plt.title('Histogram of Demand')
plt.xlabel('Demand')
plt.ylabel('Frequency')
plt.show()
AI-generated code. Review and use carefully. More info on FAQ.
3. Violin Plot
A violin plot combines aspects of a box plot and a kernel density plot. It shows the distribution of the data across different categories.

Python

plt.figure(figsize=(12, 6))
sns.violinplot(x='age', y='demand', data=df)
plt.title('Violin Plot of Demand by Age')
plt.xlabel('Age')
plt.ylabel('Demand')
plt.show()
AI-generated code. Review and use carefully. More info on FAQ.
4. Scatter Plot with Error Bars
A scatter plot with error bars can show the mean demand and the variability (e.g., standard deviation) for each age cohort.

Python

# Calculate mean and standard deviation for each age cohort
mean_demand = df.groupby('age')['demand'].mean()
std_demand = df.groupby('age')['demand'].std()

# Create a scatter plot with error bars
plt.figure(figsize=(12, 6))
plt.errorbar(mean_demand.index, mean_demand, yerr=std_demand, fmt='o', capsize=5)
plt.title('Mean Demand with Standard Deviation by Age')
plt.xlabel('Age')
plt.ylabel('Demand')
plt.show()
AI-generated code. Review and use carefully. More info on FAQ.
5. Heatmap
A heatmap can visualize the variability measures across different age cohorts and years.

Python

# Pivot the data to create a matrix for the heatmap
pivot_table = df.pivot_table(index='age', columns='year', values='demand', aggfunc='mean')

# Create a heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, cmap='coolwarm', annot=True, fmt=".1f")
plt.title('Heatmap of Mean Demand by Age and Year')
plt.xlabel('Year')
plt.ylabel('Age')
plt.show()
AI-generated code. Review and use carefully. More info on FAQ.
Summary
Box Plot: Visualizes the distribution, IQR, median, and outliers.
Histogram: Shows the frequency distribution of the data.
Violin Plot: Combines box plot and kernel density plot features.
Scatter Plot with Error Bars: Displays mean demand with variability.
Heatmap: Visualizes variability measures across different categories.
These visualizations can help you gain better insights into the variability of your data and make more informed decisions based on the analysis. Feel free to run these examples with your data and let me know if you need any further assistance!

In [None]:


import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings
import statsmodels.api as sm
# Suppress warnings from statsmodels
warnings.filterwarnings("ignore")


def apply_uncertainty_to_forecast(forecast, annual_receipts, annual_demands, age_specific_demands):
    """Scale each forecast row by the yearly receipt/demand ratio and the age's share of demand.

    Parameters
    ----------
    forecast : pandas.DataFrame
        Rows to adjust; must provide ``'date'`` (values exposing ``.year``),
        ``'age'`` and ``'forecast'`` columns.
    annual_receipts : pandas.DataFrame
        Must provide a ``'Yearly Receipt'`` column. Years are looked up in the
        index of the resulting ratio, so this is assumed to be indexed by
        calendar year -- TODO confirm against the caller.
    annual_demands : pandas.Series
        Yearly demand; divided into the receipts with index alignment.
    age_specific_demands : pandas.DataFrame
        Must provide ``'year'``, ``'age'`` and ``'demand'`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``'date'``, ``'age'`` and ``'adjusted_forecast'``, one row per
        input row in input order. The adjusted value is
        ``forecast * annual_ratio[year] * age_share[year, age]`` and is ``0``
        when the year is missing from either ratio table or the (year, age)
        pair has no recorded demand.
    """
    # Calculate the annual donor receipt to demand ratio for each year
    annual_ratio = annual_receipts['Yearly Receipt'] / annual_demands

    # Share of each year's total demand contributed by each age
    age_specific_ratio = age_specific_demands.groupby(['year', 'age']).sum().reset_index()
    age_specific_ratio['total_demands'] = age_specific_ratio.groupby('year')['demand'].transform('sum')
    age_specific_ratio['age_ratio'] = age_specific_ratio['demand'] / age_specific_ratio['total_demands']

    # (year, age) pairs are unique after the groupby, so a dict lookup replaces
    # the per-row boolean filtering of the ratio table.
    age_share = age_specific_ratio.set_index(['year', 'age'])['age_ratio'].to_dict()
    years_with_age_data = {year for year, _ in age_share}

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called inside a loop.
    records = []
    for date, age, forecast_value in zip(forecast['date'], forecast['age'], forecast['forecast']):
        year = date.year
        if year in annual_ratio.index and year in years_with_age_data:
            adjusted_value = forecast_value * annual_ratio.loc[year] * age_share.get((year, age), 0)
        else:
            adjusted_value = 0
        records.append({'date': date, 'age': age, 'adjusted_forecast': adjusted_value})

    return pd.DataFrame(records, columns=['date', 'age', 'adjusted_forecast'])

def forecast_demand_with_sarima(historical_data, steps, seasonal_order=(1,1,1,12)):
    """Fit a SARIMA(1,1,1) model with the given seasonal order and forecast ahead.

    Parameters
    ----------
    historical_data
        Series passed to ``SARIMAX`` as the endogenous data.
    steps : int
        Number of periods to forecast.
    seasonal_order : tuple, optional
        Seasonal ``(P, D, Q, s)`` order handed to ``SARIMAX``.

    Returns
    -------
    The result of ``forecast(steps=steps)`` on the fitted model.
    """
    # disp=False silences the optimiser's convergence output.
    fitted = SARIMAX(
        historical_data,
        order=(1, 1, 1),
        seasonal_order=seasonal_order,
    ).fit(disp=False)

    return fitted.forecast(steps=steps)

# LPA historical data for fitting SARIMA.
# NOTE(review): this is 36 monthly points of unseeded random integers in [100, 1000), so every
# run yields a different series -- presumably a placeholder for the real monthly demand; confirm.
historical_data = pd.Series(np.random.randint(100, 1000, 36), 
                            index=pd.date_range(start='2021-01-01', periods=36, freq='M'))

# Forecast for the next 24 months (2 years)
sarima_forecast = forecast_demand_with_sarima(historical_data, steps=24)

# Prepare the forecast DataFrame for the next 24 months for ages 18 to 106
# NOTE(review): historical_data ends in 2023-12 but forecast_dates starts in 2024-04, so the
# 'date' column may not match the periods sarima_forecast was produced for -- confirm intent.
# NOTE(review): np.random.randint(18, 106, 24) excludes 106 and draws one random age per month
# instead of covering every age -- presumably placeholder data; confirm.
forecast_dates = pd.date_range(start='2024-04-01', periods=24, freq='M')
forecast_ages = np.random.randint(18, 106, 24)
forecast_df = pd.DataFrame({'date': forecast_dates, 'age': forecast_ages, 'forecast': sarima_forecast})

# Annual receipts and demands DataFrames (defined in earlier cells of the notebook).
# NOTE(review): apply_uncertainty_to_forecast looks calendar years up in the index of
# receipts / demands, so both are presumably indexed by year -- verify, otherwise every
# adjusted value falls back to 0.
annual_receipts = annual_donor_receipts  #pd.DataFrame({'year': [2022, 2023, 2024], 'Yearly Receipt': [7.519675e+05, 9.700350e+05, 1.124615e+06]})

annual_demands = annual_donor_demands  #pd.Series({2022: 601254, 2023: 752898, 2024: 505219}, name='unique_key')

# Age-specific demands DataFrame
age_specific_demands = annual_donor_demands_age_specific #pd.DataFrame({
#     'year': [2022, 2022, 2022, 2023, 2023, 2023, 2024, 2024, 2024],
#     'age': [18, 19, 20, 18, 19, 20, 18, 19, 20],
#     'demand': [199, 286, 192, 249, 345, 243, 122, 267, 196]})

# Apply uncertainty to the SARIMA forecast
adjusted_forecast = apply_uncertainty_to_forecast(forecast_df, annual_receipts, annual_demands, age_specific_demands)

print(adjusted_forecast)


In [None]:


#def uncertainty_ratio(self, lpa_df_index, month_year):


# Estimate the Uncertainty:
# The ratio computed here is annual demands divided by annual receipts; both operands
# come from earlier cells of the notebook.

#covid_impact_ratio = pre_covid_mean_demand / post_covid_mean_demand
impact_ratio = filtered_annual_demands / filtered_annual_receipts 

# Apply the Uncertainty to Future Long-Term Forecasts:
# Multiply the future long-term forecasts by the estimated uncertainty to incorporate the impact of the short-term period.

# NOTE(review): the message reports `impact_ratio * 100` as "% higher", but a ratio r corresponds
# to (r - 1) * 100 percent higher; it also describes a COVID short-term vs long-term comparison
# while the value is demands / receipts -- confirm the intended wording and quantity.
print(f"""The ratio of {impact_ratio} indicates that the short-term demand during COVID 
        was {impact_ratio * 100}% higher than the long-term average.""")

# # Apply the Uncertainty:
# Adjusted_Long-Term_Forecast = Future_Long_Term_Forecast * covid_impact_ratio

   
# Rename the 'count' column to 'demand' for the analysis below (nothing is written to CSV here).
lpa_data = monthly_unique_donors.rename(columns={'count': 'demand'})

print(lpa_data.head(2))

In [None]:
#%%
# This script is designed to be modular, allowing you to call specific methods to perform different parts of the analysis.
# split the code into three separate classes:

# DataHandler: For handling file operations and initial data loading.
# DataPreprocessor: For preprocessing the data and performing calculations on the data.
# DataAnalyzer: For analyzing the data and Bayesian modeling.

# Load and Preprocess Data:

# Extract relevant columns.
# Handle missing values.
# Calculate age from the date of birth and receipt date.
# Filter data by year and age range.
# Calculate Age-Specific Demand:

# Group data by year, gender, and age.
# Calculate the number of unique donors.
# Calculate Annual Demands Ratio:

# Use the annual average daily application receipts.
# Apply the annual demands ratio to incorporate uncertainty.
# Forecast Age-Specific Demand:

# Use the Bayesian model to forecast future demands.
# Adjust the model to account for age-specific data.

# use the Pymc Python package and Bayesian criteria while using Monte Carlo simulation for time series forecasting of donor application demands to 
# account for uncertainty and make more robust predictions, especially in scenarios with significant disruptions like post-COVID impacts after March 2021, 
# by having age-specific demand data (column names: year, gender, age, number of unique donors) from 2006 to 2024? Also, how do you make sure that, 
# from the actual data, you choose the right prior based on the distribution of donor demands per year and fluctuation before and after COVID? 
# to learn from the pre-COVID data and post-COVID data to decide which distribution should be used for the priors for pre-COVID and post-COVID parameters? 
# In addition, imagine we already have calculated average annual daily application receipts and then you can use average annual daily application receipts 
# and annual demands ratio (without considering age) to deal with uncertainty to improve long term forecasting of  donor application demands?


## -------------------
#%%



## -------------------
#%%

#%%


#%%
## ------------------Step 3: DataAnalyzer Class-------------------- ##

class DataAnalyzer:
    """Exploratory plots, distribution checks and a simple Bayesian model for monthly demand.

    Parameters
    ----------
    data : pandas.DataFrame
        Stored on ``self.data``. The methods operate on the arguments passed to
        them rather than on this attribute.
    """

    def __init__(self, data):
        self.data = data

    def exploratory_data_analysis(self, data):
        """Plot the demand series and the pre-/post-COVID demand distributions.

        ``data`` is re-indexed by ``'month_year'`` **in place**; the cells that
        follow rely on this when slicing with ``data.loc[:'Mar-21']``. If
        ``'month_year'`` is already the index the step is skipped, so the
        method can be run again on the same frame.

        Raises
        ------
        KeyError
            If ``'month_year'`` is neither a column nor the index of ``data``.
        """
        # The previous version called data.reset_index() and discarded the result (a no-op),
        # then raised KeyError on a second run because the column had already been moved.
        if 'month_year' in data.columns:
            data.set_index('month_year', inplace=True)
        elif data.index.name != 'month_year':
            raise KeyError("'month_year' must be a column or the index of `data`")

        data['demand'].plot(figsize=(14, 7))
        plt.title('Donor Application Demands')
        plt.xlabel('Month-Year')
        plt.ylabel('Demand')
        plt.show()

        # NOTE(review): .loc label slices include both end points, so the 'Mar-21' row
        # presumably lands in both periods -- confirm whether that overlap is intended.
        pre_covid = data.loc[:'Mar-21', 'demand']
        post_covid = data.loc['Mar-21':, 'demand']

        plt.figure(figsize=(14, 7))
        sns.histplot(pre_covid, kde=True, label='Pre-COVID', color='blue', bins=50)
        sns.histplot(post_covid, kde=True, label='Post-COVID', color='red', bins=50)
        plt.legend()
        plt.title('Distribution of Donor Application Demands')
        plt.xlabel('Demand')
        plt.ylabel('Frequency')
        plt.show()

    def determine_priors(self, pre_covid, post_covid):
        """Print Kolmogorov-Smirnov results for each period against three candidate distributions.

        For each sample the normal, log-normal and Student's t distributions are
        parameterised from the sample itself (the log-normal parameters use only
        the strictly positive values) and compared with ``scipy.stats.kstest``.

        Parameters
        ----------
        pre_covid, post_covid
            Demand samples supporting boolean-mask indexing (pandas Series or
            NumPy arrays).
        """
        # Imported locally: the scipy.stats import near the top of the notebook is
        # commented out, so `kstest` would otherwise be undefined here.
        from scipy.stats import kstest

        def report(sample):
            # Moments of the raw sample and of the log of its positive part.
            mean, std = np.mean(sample), np.std(sample)
            log_positive = np.log(sample[sample > 0])

            print("Normal:", kstest(sample, 'norm', args=(mean, std)))
            print("Log-Normal:", kstest(sample, 'lognorm', args=(np.std(log_positive), 0, np.exp(np.mean(log_positive)))))
            print("Student's t:", kstest(sample, 't', args=(len(sample) - 1, mean, std)))

        print("Pre-COVID Data:")
        report(pre_covid)

        print("\nPost-COVID Data:")
        report(post_covid)

    def bayesian_modeling(self, pre_covid, post_covid):
        """Fit independent Normal models to the two periods and return the sampled trace.

        Each period gets a Normal prior on its mean (centred on the sample mean,
        scaled by the sample standard deviation) and a HalfNormal prior on its
        standard deviation. The trace plot is shown before returning.

        Relies on ``pm`` (PyMC) and ``az`` (ArviZ) being available in the notebook.
        """
        model = pm.Model()
        with model:
            # NOTE(review): HalfNormal(sigma=1) is a very tight prior if demand is on a scale of
            # hundreds or more; the Step 6 model in this notebook uses np.std(pre_covid) -- confirm.
            pre_covid_mean = pm.Normal('pre_covid_mean', mu=np.mean(pre_covid), sigma=np.std(pre_covid))
            pre_covid_sd = pm.HalfNormal('pre_covid_sd', sigma=1)
            post_covid_mean = pm.Normal('post_covid_mean', mu=np.mean(post_covid), sigma=np.std(post_covid))
            post_covid_sd = pm.HalfNormal('post_covid_sd', sigma=1)

            pre_covid_likelihood = pm.Normal('pre_covid_likelihood', mu=pre_covid_mean, sigma=pre_covid_sd, observed=pre_covid)
            post_covid_likelihood = pm.Normal('post_covid_likelihood', mu=post_covid_mean, sigma=post_covid_sd, observed=post_covid)

            trace = pm.sample(1000, tune=2000, return_inferencedata=True)

        az.plot_trace(trace)
        plt.show()

        return trace
    

# `data` is the same object as `lpa_data` (no copy is made), so the in-place
# set_index('month_year') done by exploratory_data_analysis also changes `lpa_data`.
data = lpa_data
print(data.head(2))
# Instantiate the DataAnalyzer
data_analyzer = DataAnalyzer(data)
data_analyzer.exploratory_data_analysis(data)
# The label slices below depend on the 'month_year' index set by the call above.
# NOTE(review): both slices include the 'Mar-21' label, so that row presumably appears in
# pre_covid and post_covid alike -- confirm whether the overlap is intended.
pre_covid, post_covid = data.loc[:'Mar-21', 'demand'], data.loc['Mar-21':, 'demand']
data_analyzer.determine_priors(pre_covid, post_covid)
trace= data_analyzer.bayesian_modeling(pre_covid, post_covid)
## ------------------


# %%

## ------------------Step 4: forecasting-------------------- ##
# Unpack both slices; previously the whole (pre, post) tuple was bound to `post_covid_data`.
pre_covid_data, post_covid_data = data.loc[:'Mar-21', 'demand'], data.loc['Mar-21':, 'demand']



## ------------------

    
# %%

az.style.use("arviz-darkgrid")

RANDOM_SEED = 58
rng = np.random.default_rng(RANDOM_SEED)

# Same two-period Normal model as DataAnalyzer.bayesian_modeling, kept at module level so the
# model object stays available for posterior-predictive sampling.
model = pm.Model()
with model:
    # NOTE(review): HalfNormal(sigma=1) is a very tight prior if demand is on a scale of hundreds
    # or more; the Step 6 model uses np.std(pre_covid) for the equivalent prior -- confirm.
    pre_covid_mean = pm.Normal('pre_covid_mean', mu=np.mean(pre_covid), sigma=np.std(pre_covid))
    pre_covid_sd = pm.HalfNormal('pre_covid_sd', sigma=1)
    post_covid_mean = pm.Normal('post_covid_mean', mu=np.mean(post_covid), sigma=np.std(post_covid))
    post_covid_sd = pm.HalfNormal('post_covid_sd', sigma=1)

    pre_covid_likelihood = pm.Normal('pre_covid_likelihood', mu=pre_covid_mean, sigma=pre_covid_sd, observed=pre_covid)
    post_covid_likelihood = pm.Normal('post_covid_likelihood', mu=post_covid_mean, sigma=post_covid_sd, observed=post_covid)

    # Seed the sampler as well: RANDOM_SEED was defined but only the posterior-predictive
    # step used it, so the posterior draws changed on every run.
    trace = pm.sample(1000, tune=2000, return_inferencedata=True, random_seed=RANDOM_SEED)
    # extend_inferencedata=True adds the posterior_predictive group to `trace` in place.
    pm.sample_posterior_predictive(trace, extend_inferencedata=True, random_seed=rng)
    
print(trace.posterior_predictive)   #

az.plot_ppc(trace, num_pp_samples=100); 


# %%

## ------------------Step 5: Monte Carlo simulations for forecasting-------------------- ##
pre_covid_data, post_covid_data = data.loc[:'Mar-21', 'demand'], data.loc['Mar-21':, 'demand']

trace = data_analyzer.bayesian_modeling(pre_covid_data, post_covid_data)
# Move the 'month_year' index back into a column so it can serve as the x-axis.
pre_covid_data, post_covid_data = pre_covid.reset_index(), post_covid.reset_index()

# Draw posterior-predictive samples for the two observed variables.
# The previous version assigned the likelihood variables onto `trace` and sampled inside a
# brand-new empty model under names ('pre_covid_pred', 'post_covid_pred') that no model defines.
# `model` is the module-level model built in the earlier cell; it declares the same variable
# names as DataAnalyzer.bayesian_modeling, so the freshly sampled trace can be replayed through it.
with model:
    post_pred = pm.sample_posterior_predictive(
        trace,
        var_names=['pre_covid_likelihood', 'post_covid_likelihood'],
        random_seed=rng,
    )
predictive = post_pred.posterior_predictive

# Plot the results
fig, axes = plt.subplots(2, 1, figsize=(12, 8))
panels = (
    (axes[0], pre_covid_data, 'pre_covid_likelihood', 'Pre-COVID'),
    (axes[1], post_covid_data, 'post_covid_likelihood', 'Post-COVID'),
)
for ax, observed, var_name, period in panels:
    # Collapse (chain, draw, observation) into (sample, observation).
    draws = predictive[var_name].values
    draws = draws.reshape(-1, draws.shape[-1])
    lower, upper = np.percentile(draws, [5, 95], axis=0)

    ax.plot(observed['month_year'], observed['demand'], label=f'Observed {period}')
    ax.plot(observed['month_year'], draws.mean(axis=0), label=f'Predicted {period} Mean')
    # The 5th-95th percentile band covers 90% of the draws, so it is labelled as such.
    ax.fill_between(observed['month_year'], lower, upper, alpha=0.2, label='90% interval')
    ax.legend()
    ax.set_title(f'{period} Predictions')

plt.tight_layout()
plt.show()
## ------------------
## ------------------


# %%


## ------------------Step 6: Specify the Model with PyMC-------------------- ##

# PyMC is given the full demand history as a plain array.
demand_data = data['demand'].values

# Fixed structural break: positions before `switchpoint` use the pre-COVID
# parameters, positions from `switchpoint` onwards use the post-COVID ones.
switchpoint = len(pre_covid)
idx = np.arange(len(demand_data))

with pm.Model() as model:
    # Pre-COVID regime: priors centred and scaled on the pre-COVID sample.
    mu_pre = pm.Normal('mu_pre', mu=np.mean(pre_covid), sigma=np.std(pre_covid))
    sigma_pre = pm.HalfNormal('sigma_pre', sigma=np.std(pre_covid))

    # Post-COVID regime: priors centred and scaled on the post-COVID sample.
    mu_post = pm.Normal('mu_post', mu=np.mean(post_covid), sigma=np.std(post_covid))
    sigma_post = pm.HalfNormal('sigma_post', sigma=np.std(post_covid))

    # Pick each observation's mean and SD according to its side of the break.
    in_pre_regime = idx < switchpoint
    mu = pm.math.switch(in_pre_regime, mu_pre, mu_post)
    sigma = pm.math.switch(in_pre_regime, sigma_pre, sigma_post)

    # Likelihood of the observed demand under the regime-specific Normal.
    y = pm.Normal('y', mu=mu, sigma=sigma, observed=demand_data)




## -----------------
#%%

## ------------------Step 5: Fit the Model Using MCMC-------------------- ##

# Draw 2000 posterior samples per chain after 1000 tuning steps.
# The sampler is seeded with the notebook-wide RANDOM_SEED so a re-run
# reproduces the same posterior (it was previously unseeded).
with model:
    trace = pm.sample(2000, tune=1000, return_inferencedata=True, random_seed=RANDOM_SEED)

# Inspect the trace
az.plot_trace(trace)
plt.show()



## -----------------
#%%


## ------------------Step 6: Posterior Predictive Checks-------------------- ##

# Posterior-predictive draws of the observed node, kept for model checking.
with model:
    ppc = pm.sample_posterior_predictive(trace, var_names=['y'], random_seed=RANDOM_SEED)

# Simulate future demands
# Horizon in periods; the plotting cell lays these out at daily frequency.
# NOTE(review): the history is sliced with month-style labels ('Mar-21') -
# confirm whether one period is a day or a month.
n_future = 365

# The likelihood `y` models the demand LEVEL of each period, so a future
# period under the post-COVID regime is a draw from Normal(mu_post, sigma_post).
# One simulated path is generated per posterior draw, which carries parameter
# uncertainty into the forecast. (The previous loop indexed `ppc` like a dict,
# read each predictive sample past its len(demand_data) entries whenever
# n_future exceeded the history length, and cumulatively summed levels as if
# they were increments.)
mu_post_draws = trace.posterior['mu_post'].values.reshape(-1)
sigma_post_draws = trace.posterior['sigma_post'].values.reshape(-1)

forecast_rng = np.random.default_rng(RANDOM_SEED)
# Shape (n_posterior_draws, n_future): column k holds the simulations for period k.
future_simulations = forecast_rng.normal(
    loc=mu_post_draws[:, None],
    scale=sigma_post_draws[:, None],
    size=(mu_post_draws.size, n_future),
)

# Analyze future simulations
mean_forecast = np.mean(future_simulations, axis=0)
cred_interval = np.percentile(future_simulations, [2.5, 97.5], axis=0)


## -----------------
#%%

## ------------------Step 7: Analyse and Visualize Results-------------------- ##

# Forecast x-axis: n_future daily stamps starting at the last historical label.
forecast_dates = pd.date_range(data.index[-1], periods=n_future, freq='D')

fig, ax = plt.subplots(figsize=(14, 7))

# History, forecast mean, and the 2.5%-97.5% band around the forecast.
ax.plot(data.index, demand_data, label='Historical Data')
ax.plot(forecast_dates, mean_forecast, label='Mean Forecast')
ax.fill_between(forecast_dates, cred_interval[0], cred_interval[1],
                color='gray', alpha=0.5, label='95% Credible Interval')

# Vertical marker at the first observation governed by the post-COVID parameters.
ax.axvline(data.index[switchpoint], color='r', linestyle='--', label='Post-COVID Impact')

ax.legend()
ax.set_title('Forecast of Daily Application Demands')
ax.set_xlabel('Date')
ax.set_ylabel('Demand')
plt.show()


## -----------------

import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split

class LPAForecast:
    """Forecast LPA demand with ARIMA models that use ``covid_impact`` as an exogenous regressor.

    The working frame ``self.data`` (built by ``preprocess_data`` from
    ``historical_data``) is read through these columns: ``demand`` and
    ``covid_impact`` (ARIMA fits), ``age_group`` (per-group forecasts), and
    ``age`` / ``name`` (age-band aggregation).

    NOTE(review): ``post_covid_data`` and ``covid_data`` are stored but not read
    by any method - the merge that consumed them is disabled.
    """

    def __init__(self, post_covid_data, covid_data, historical_data): #, economic_data):
        self.post_covid_data = post_covid_data
        self.covid_data = covid_data
        #self.economic_data = economic_data
        self.historical_data = historical_data

    def preprocess_data(self):
        """Build ``self.data`` from the instance's historical data with NaNs set to 0.

        Reads ``self.historical_data`` (not a module-level global) and fills into
        a new frame, so the caller's DataFrame is left untouched.
        """
        self.data = self.historical_data.fillna(0)

    def _future_exog(self, history, steps, future_exog=None):
        """Return a ``steps``-row ``covid_impact`` frame to drive an out-of-sample forecast.

        Parameters
        ----------
        history : DataFrame with a ``covid_impact`` column (must be non-empty
            when ``future_exog`` is None).
        steps : number of future periods required.
        future_exog : optional DataFrame with a ``covid_impact`` column holding
            the caller's own scenario; must have exactly ``steps`` rows.

        Raises
        ------
        ValueError : if ``future_exog`` is given with a row count other than ``steps``.

        When no scenario is supplied, the last observed ``covid_impact`` value is
        carried forward unchanged - a persistence assumption, not an estimate.
        """
        if future_exog is not None:
            exog = future_exog[['covid_impact']]
            if len(exog) != steps:
                raise ValueError(
                    f"future_exog must have {steps} rows, got {len(exog)}"
                )
            return exog
        last_impact = history['covid_impact'].iloc[-1]
        return pd.DataFrame({'covid_impact': [last_impact] * steps})

    def short_term_forecast(self, order=(5,1,0)):
        """Fit ARIMA on the first 80% of ``self.data`` and forecast the held-out 20%.

        Stores the fitted model in ``self.model_fit``, the forecast in
        ``self.short_term_predictions`` and the hold-out frame in ``self.test``.
        """
        # Chronological split: shuffle=False keeps the last 20% as the test window.
        train, test = train_test_split(self.data, test_size=0.2, shuffle=False)

        # Fit ARIMA model
        model = ARIMA(train['demand'], order=order, exog=train[['covid_impact']]) #, 'economic_indicator']])
        self.model_fit = model.fit()

        # Forecast over the hold-out window using its observed covid_impact values.
        self.short_term_predictions = self.model_fit.forecast(steps=len(test), exog=test[['covid_impact']]) #, 'economic_indicator']])
        self.test = test

    def quantify_uncertainty(self):
        """Store the hold-out forecast's confidence intervals in ``self.conf_int``."""
        self.conf_int = self.model_fit.get_forecast(steps=len(self.test), exog=self.test[['covid_impact']]).conf_int() #, 'economic_indicator']]).conf_int()

    def long_term_forecast(self, adjustment_factor=1.2, future_exog=None, steps=24):
        """Forecast ``steps`` periods ahead and scale the result by ``adjustment_factor``.

        Parameters
        ----------
        adjustment_factor : multiplier applied to the baseline forecast.
        future_exog : optional DataFrame with a ``covid_impact`` column and
            ``steps`` rows; when omitted the last observed value is carried forward.
        steps : forecast horizon in periods (default 24).

        NOTE(review): ``self.model_fit`` is fitted on the training split only, so
        the forecast origin is the end of that split - confirm whether the model
        should be refitted on all of ``self.data`` first.
        """
        exog = self._future_exog(self.data, steps, future_exog)
        baseline_forecast = self.model_fit.forecast(steps=steps, exog=exog) #, 'economic_indicator']])

        # Adjust for COVID-19 impact
        self.long_term_predictions = baseline_forecast * adjustment_factor

    # TODO: write a function that applies exponential smoothing to the age-specific
    # short-term forecasts to incorporate COVID impact and run sensitivity tests.
    # Follow the Excel formula in the LPA model and control assumptions:
    # =GB9*'LPA Control Assumptions'!$B$123+(1-'LPA Control Assumptions'!$B$123)*(GB9-GA9+GB9)
    #def age_specific_uncertainty(self)

    def forecast_by_age_group(self, age_groups, future_exog=None, steps=24):
        """Fit one ARIMA(5,1,0) per age group and store ``steps``-period forecasts.

        Results go into ``self.age_group_forecasts`` keyed by age group. Groups
        with no rows in ``self.data`` are reported and skipped, since there is
        nothing to fit a model on.
        """
        self.age_group_forecasts = {}
        for age_group in age_groups:
            # Filter data for the age group
            age_group_data = self.data[self.data['age_group'] == age_group]
            if age_group_data.empty:
                print(f"No rows for age group {age_group!r}; skipping forecast")
                continue

            # Fit ARIMA model
            model = ARIMA(age_group_data['demand'], order=(5,1,0), exog=age_group_data[['covid_impact']]) #, 'economic_indicator']])
            model_fit = model.fit()

            # Forecast with the supplied scenario, or this group's last observed impact.
            exog = self._future_exog(age_group_data, steps, future_exog)
            forecast = model_fit.forecast(steps=steps, exog=exog) #, 'economic_indicator']])
            self.age_group_forecasts[age_group] = forecast

    def validate_model(self):
        """Store hold-out residuals (actual minus predicted) in ``self.validation_results``.

        The subtraction is positional: the forecast's index can differ from the
        test frame's index, and a label-aligned subtraction would then yield NaN.
        The result keeps the test frame's index.
        """
        predicted = np.asarray(self.short_term_predictions)
        self.validation_results = self.test['demand'] - predicted

    def aggregate_by_age_group(self, bins, labels):
        """Bucket rows by age and return total demand and row count per bucket.

        Parameters
        ----------
        bins : bin edges for ``pd.cut``; intervals are left-closed (``right=False``).
        labels : one label per interval, i.e. ``len(bins) - 1`` entries.

        Adds an ``AgeGroup`` column to ``self.data`` as a side effect.
        """
        # Create a new column 'AgeGroup' with the age bins
        self.data['AgeGroup'] = pd.cut(self.data['age'], bins=bins, labels=labels, right=False)

        # observed=False keeps every label in the output, even with zero rows.
        aggregated_data = self.data.groupby('AgeGroup', observed=False).agg({
            'demand': 'sum',
            'name': 'count'
        }).rename(columns={'name': 'count'})

        return aggregated_data

    def run_forecast(self, age_groups, bins, labels, future_exog=None):
        """Run the full pipeline and return the age-band aggregation.

        ``future_exog`` (optional) is passed to both long-horizon forecasts.
        """
        self.preprocess_data()
        self.short_term_forecast()
        self.quantify_uncertainty()
        self.long_term_forecast(future_exog=future_exog)
        self.forecast_by_age_group(age_groups, future_exog=future_exog)
        self.validate_model()
        return self.aggregate_by_age_group(bins, labels)

# usage
#historical_data = pd.read_csv('historical_data.csv')  # Replace with actual data source
#covid_data = pd.read_csv('covid_data.csv')  # Replace with actual data source
#economic_data = pd.read_csv('economic_data.csv')  # Replace with actual data source
age_groups = ['18-20', '21-25', '26-30', '31-35', '36-40','41-45','46-50', '51-55', '56-60', '61-65', '66-70', '71+']
# One left-closed interval per label (the class cuts with right=False):
# [18, 21) -> '18-20', [21, 26) -> '21-25', ..., [71, inf) -> '71+'.
# pd.cut requires len(bins) == len(labels) + 1; the previous 8 edges defined
# only 7 intervals for 12 labels and did not line up with the label bands.
bins = [18, 21, 26, 31, 36, 41, 46, 51, 56, 61, 66, 71, float('inf')]
labels = ['18-20', '21-25', '26-30', '31-35', '36-40','41-45','46-50', '51-55', '56-60', '61-65', '66-70', '71+']

# NOTE(review): `covid_data` and `historical_data` are not assigned in this
# part of the notebook (their loaders above are commented out) - confirm they
# are defined before this cell runs.
forecast = LPAForecast(post_covid_data, covid_data, historical_data)# , economic_data)
aggregated_data = forecast.run_forecast(age_groups, bins, labels)

print(aggregated_data)

#age_groups = ['18-30', '31-50', '51-70', '71+']

#forecast = LPAForecast(post_covid_data, covid_data)
#forecast.run_forecast(age_groups)
#print(forecast.long_term_predictions)
#print(forecast.age_group_forecasts)
