# **Example Notebook for Client Testing**

### **Imports**

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from pathlib import Path

# dummy data to test out donor imputation with
from sklearn.datasets import load_iris
import random

# for nearest neighbour imputation
from sklearn.impute import KNNImputer
from sklearn.neighbors import NearestNeighbors, BallTree
from sklearn.preprocessing import MinMaxScaler

# for type hints
import typing
from typing import List, Set, Dict, Tuple, Union, Optional

# for warning messages
import warnings

In [2]:
# Notebook-wide pandas display limits, so wide/long frames render in full.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 150)

## Parse Imputation Specifications

We could define the format of the file that lists the variables to impute, their imputation class variables, the type of imputation to apply, and any restrictions on the imputed values. Use a YAML config file.

# **Donor Imputation**

## Utility Functions

Original data in: df

1st imputation: mortgage
    Run `data_preprocessing_store_donor_info` to format the data to prepare for imputation.
    df1 = data_preprocessing_store_donor_info(df, ...)
    Apply imputation on df1.
    

In [3]:
def data_preprocessing_store_donor_info(
    data: pd.DataFrame,
    var_to_impute: str,
    imputation_class_vars: List[str],
    knn_distance: bool = True,
) -> pd.DataFrame:
    """Return a copy of ``data`` with the bookkeeping columns used during imputation.

    Every added column is named ``<var_to_impute>_<suffix>``. The columns are
    appended in this order, with these initial values:

        - ``_post_imputation``   : copy of the ``var_to_impute`` column
        - ``_imputation_flag``   : 0
        - ``_imputation_type``   : '' (blank string; later e.g. historical, KNN)
        - ``_imputed_value``     : NA
        - ``_donor_userid``      : -1
        - ``_donor_count``       : 0
        - ``_donor_<class var>`` : NA, one column per imputation class variable
        - ``_donor_distance``    : NA

    The donor columns are only meaningful for KNN imputation.

    Parameters
    ----------

    data : pd.DataFrame
        Pandas DataFrame of input data. It is not modified; a copy is returned.

    var_to_impute : str
        Name of variable to impute. Must be a column of ``data``.

    imputation_class_vars : list of str
        Names of imputation class variables.

    knn_distance : boolean (default = True)
        Accepted for interface compatibility; the function body never reads it.

    Returns
    -------

    data_prepped : pd.DataFrame
        Copy of the input data with the additional imputation-detail columns.
    """

    data_prepped = data.copy()

    # Working copy of the variable; imputed values are written here so the
    # original column stays untouched.
    data_prepped[f"{var_to_impute}_post_imputation"] = data_prepped[var_to_impute]

    # (suffix, initial value) pairs, listed in the order the columns are appended.
    column_defaults = [
        ("imputation_flag", 0),  # set when a value has been imputed
        ("imputation_type", ""),  # e.g. KNN, historical
        ("imputed_value", pd.NA),  # stays NA if no imputation / no donor found
        ("donor_userid", -1),  # stays -1 if no imputation / no donor found
        ("donor_count", 0),
    ]
    # Donor characteristics: one column per imputation class variable.
    column_defaults.extend(
        (f"donor_{class_var}", pd.NA) for class_var in imputation_class_vars
    )
    # Stays NA when the donor was not obtained via KNN.
    column_defaults.append(("donor_distance", pd.NA))

    for suffix, initial_value in column_defaults:
        data_prepped[f"{var_to_impute}_{suffix}"] = initial_value

    return data_prepped

In [4]:
def continuous_to_range(
    data: pd.DataFrame,
    column: str,
    range_interval_index: pd.IntervalIndex,
    range_label_dict: Dict,
) -> pd.Series:
    """Replace each continuous value of a column with the label of its bucket.

    Parameters
    ----------

    data : pd.DataFrame
        Pandas DataFrame of input data.

    column : str
        Name of the column whose continuous values are bucketed.

    range_interval_index : pd.IntervalIndex
        Pandas Interval Index type that defines the buckets.
        Note that the left bound is exclusive, right bound is inclusive.

    range_label_dict : dict
        Dictionary with the intervals of ``range_interval_index`` as keys and the
        corresponding integer labels as values.

    Returns
    -------

    continuous_to_bucketed_series: pd.Series
        Series aligned with ``data`` holding the bucket label of every value.
        Values that fall in none of the intervals come back as missing.
    """

    # pd.cut ignores any `labels` argument when the bins are an IntervalIndex, so
    # each value is first assigned its interval and then mapped to its label.
    interval_per_row = pd.cut(data[column], bins=range_interval_index)
    return interval_per_row.map(range_label_dict)

In [5]:
def range_to_random_continuous(
    data: pd.DataFrame, column: str, range_to_continuous_dict: Dict
) -> pd.Series:
    """Function that converts bucketed ranges to a randomly selected value within the range.

    Parameters
    ----------

    data : pd.DataFrame
        Pandas DataFrame of input data.

    column : str
        Name of the column holding bucket labels to convert back to numbers.

    range_to_continuous_dict : dict
        Dictionary with integer labels of buckets as keys and ``(lower, upper)``
        tuples of integer bounds as values.

    Returns
    -------

    bucketed_to_continuous_series: pd.Series
        Object-dtype Series sharing the index of ``data``. Each entry is an integer
        drawn with ``random.randint(lower, upper)`` from the row's bucket, or NA
        where the bucket label is missing.

    Raises
    ------

    KeyError
        If a non-missing bucket label is not a key of ``range_to_continuous_dict``.

    Notes
    -----

    ``random.randint`` includes both endpoints, so the lower bound itself can be
    drawn. NOTE(review): the bucket definitions in this notebook describe the lower
    bound as exclusive; confirm whether the draw should start at ``lower + 1``.
    """

    # Build the values positionally and attach the original index afterwards.
    # Writing `series[row_num] = ...` treats the row number as an index *label*,
    # which lands on the wrong rows (or enlarges the Series) whenever `data` does
    # not carry a default 0..n-1 index, e.g. after filtering.
    sampled_values = [
        pd.NA
        if pd.isna(bucket_label)
        else random.randint(*range_to_continuous_dict[bucket_label])
        for bucket_label in data[column]
    ]

    # dtype=object keeps integers and NA side by side without coercion.
    return pd.Series(sampled_values, index=data.index, dtype=object)

In [6]:
# Buckets for monetary columns, ex. Purchase_Val_Amt, Value_Hse_Loan.
# Each bucket is a (lower, upper) tuple; the lower bound is exclusive and the
# upper bound is inclusive. Consecutive buckets share an edge, so the buckets are
# built by pairing up neighbouring entries of one ascending list of edges.
_monetary_bucket_edges = [
    -0.001,  # just below zero so that the first bucket allows for zeros
    *range(20, 100, 20),  # 20 ... 80, step 20
    *range(100, 1_000, 100),  # 100 ... 900, step 100
    *range(1_000, 10_000, 500),  # 1,000 ... 9,500, step 500
    *range(10_000, 100_000, 5_000),  # 10,000 ... 95,000, step 5,000
    *range(100_000, 500_000, 25_000),  # 100,000 ... 475,000, step 25,000
    *range(500_000, 1_000_000, 50_000),  # 500,000 ... 950,000, step 50,000
    *range(1_000_000, 2_000_000, 250_000),  # 1.00M ... 1.75M, step 250,000
    *range(2_000_000, 3_000_000, 500_000),  # 2.0M, 2.5M
    *range(3_000_000, 20_000_000, 1_000_000),  # 3M ... 19M, step 1M
    *range(20_000_000, 100_000_000, 20_000_000),  # 20M ... 80M, step 20M
    *range(100_000_000, 1_000_000_000, 100_000_000),  # 100M ... 900M, step 100M
    1_000_000_000,  # upper bound of the last bucket
]
monetary_buckets = list(zip(_monetary_bucket_edges[:-1], _monetary_bucket_edges[1:]))

In [7]:
# Buckets for variables that are strictly positive: identical to the monetary
# buckets except that the first bucket is (0, 20), whose exclusive lower bound
# leaves 0 out.
non_monetary_buckets = [(0, 20), *monetary_buckets[1:]]

# Pandas IntervalIndex versions of both bucket lists.
monetary_bins = pd.IntervalIndex.from_tuples(monetary_buckets)
non_monetary_bins = pd.IntervalIndex.from_tuples(non_monetary_buckets)

# Interval -> integer label, numbering the buckets from 1 in ascending order.
monetary_bins_to_enumerated_val = {
    interval: label for label, interval in enumerate(monetary_bins, start=1)
}
non_monetary_bins_to_enumerated_val = {
    interval: label for label, interval in enumerate(non_monetary_bins, start=1)
}

# Integer label -> (lower, upper) tuple, taken from the non-monetary buckets.
enumerated_val_to_range = dict(enumerate(non_monetary_buckets, start=1))

## Nearest Neighbour Imputation

In [8]:
def nearest_neighbour_imputation_categorical(
    data: pd.DataFrame,
    var_to_impute: str,
    imputation_class_vars: List[str],
    categorical_imputation_class_vars: List[str],
    missing_values_to_impute: List[int],
    bucket_imputation: bool,
    bins: Optional[pd.IntervalIndex] = None,
    bins_to_labels: Optional[Dict] = None,
    multiple_possible_class_vars: Optional[List[str]] = None,
    max_donor_use: int = 5,
    userid_col_name: str = "userid",
) -> pd.DataFrame:
    """1-Nearest Neighbour imputation function for categorical imputation class variables.

    The data are split into slices by the categorical imputation class variables.
    Within each slice the remaining (numeric) imputation class variables are
    min-max scaled and every row's neighbours are ranked by distance with a
    BallTree. A row whose ``<var_to_impute>_post_imputation`` value is one of
    ``missing_values_to_impute`` receives the value of its closest eligible donor
    from the same slice. A donor is eligible when it

        - has been used fewer than ``max_donor_use`` times so far,
        - is not the recipient itself, and
        - holds a value that is neither NA nor one of ``missing_values_to_impute``.

    For every imputed row the imputation flag, imputation type ("KNN"), imputed
    value, donor user ID, donor characteristics and donor distance columns are
    filled in. Rows for which no eligible donor exists are left unchanged and a
    single warning reports how many there were.

    Parameters
    ----------

    data : pd.Dataframe
        Pandas DataFrame of input data. It must already contain the
        ``<var_to_impute>_*`` imputation-detail columns that this function reads
        and writes (as created by ``data_preprocessing_store_donor_info``).

    var_to_impute : str
        Name of variable to be imputed.

    imputation_class_vars : list of strings
        Names of all variables used to define the imputation class.

    categorical_imputation_class_vars : list of strings
        Names of categorical variables used to define the imputation class.

    missing_values_to_impute : list of integers
        List of missing values to impute for.

    bucket_imputation : bool
        Not used by this function; accepted so the signature mirrors
        ``nearest_neighbour_imputation``.

    bins : pd.IntervalIndex, optional
        Not used by this function.

    bins_to_labels : dict, optional
        Not used by this function.

    multiple_possible_class_vars : list of strings, optional
        Not used by this function.

    max_donor_use : int, default = 5
        Maximum number of times a donor can be used.

    userid_col_name : str, default = 'userid'
        Name of user ID variable.

    Returns
    -------

    data_post_imputation : pd.DataFrame
        Copy of the input data with imputations applied to the post-imputation
        column and the imputation-detail columns populated. The donor count column
        of an imputed row holds the total number of times that row's donor was used.
    """

    # Make copy of data to implement imputations.
    data_post_imputation = data.copy()

    # Names of the imputation-detail and donor-detail columns.
    post_imputation_col_name = f"{var_to_impute}_post_imputation"
    imputation_flag_col_name = f"{var_to_impute}_imputation_flag"
    imputation_type_col_name = f"{var_to_impute}_imputation_type"
    imputed_val_col_name = f"{var_to_impute}_imputed_value"
    donor_userid_col_name = f"{var_to_impute}_donor_userid"
    donor_count_col_name = f"{var_to_impute}_donor_count"
    donor_distance_col_name = f"{var_to_impute}_donor_distance"
    donor_characteristics_col_names = [
        f"{var_to_impute}_donor_{var}" for var in imputation_class_vars
    ]

    # Number of times each user has donated so far.
    donor_use_track_dict = {k: 0 for k in data_post_imputation[userid_col_name]}

    # Imputation class variables that are not categorical, in their given order.
    categorical_vars = set(categorical_imputation_class_vars)
    numeric_imputation_class_vars = [
        var
        for var in dict.fromkeys(imputation_class_vars)
        if var not in categorical_vars
    ]

    # One sub-frame per combination of categorical values. Each sub-frame gets a
    # fresh 0..n-1 index (so row label == row position == BallTree row) and keeps
    # its original index in the "original_index" column.
    data_post_imputation_by_slices = {
        class_key: sub_df.reset_index(names="original_index")
        for class_key, sub_df in data_post_imputation.groupby(
            categorical_imputation_class_vars
        )
    }

    minmax_scaler = MinMaxScaler()
    recipients_without_donor = 0

    for sub_df in data_post_imputation_by_slices.values():

        # Rank every row's neighbours within the slice. Done one slice at a time
        # so only a single n x n distance matrix is alive at once.
        scaled_class_vars = minmax_scaler.fit_transform(
            sub_df[numeric_imputation_class_vars]
        )
        n_candidates = scaled_class_vars.shape[0]
        neighbour_distances, neighbour_positions = BallTree(scaled_class_vars).query(
            scaled_class_vars,
            k=n_candidates,
            return_distance=True,
            dualtree=True,
        )

        for row_pos in range(n_candidates):

            # Only rows holding one of the designated missing values are imputed;
            # blanks (NA) are disregarded.
            current_value = sub_df.at[row_pos, post_imputation_col_name]
            if not (
                pd.notna(current_value) and current_value in missing_values_to_impute
            ):
                continue

            donor_found = False

            # Walk the neighbours from closest to farthest. The search is bounded
            # by the slice size, so a row without any eligible donor ends the
            # search instead of indexing past the neighbour array.
            for neighbour_rank in range(n_candidates):
                donor_pos = int(neighbour_positions[row_pos, neighbour_rank])
                donor_userid = sub_df.at[donor_pos, userid_col_name]

                # first check: donor has not yet reached max_donor_use.
                if donor_use_track_dict[donor_userid] >= max_donor_use:
                    continue

                # second check: a row cannot donate to itself.
                if donor_pos == row_pos:
                    continue

                # third check: donor must hold a usable value.
                donor_value = sub_df.at[donor_pos, post_imputation_col_name]
                if pd.isna(donor_value) or donor_value in missing_values_to_impute:
                    continue

                # Donor is eligible: impute and record the details.
                sub_df.at[row_pos, post_imputation_col_name] = donor_value
                sub_df.at[row_pos, imputation_flag_col_name] = 1
                sub_df.at[row_pos, imputation_type_col_name] = "KNN"
                sub_df.at[row_pos, imputed_val_col_name] = donor_value
                sub_df.at[row_pos, donor_userid_col_name] = donor_userid
                donor_use_track_dict[donor_userid] += 1

                # save donor characteristics
                for class_var, donor_col_name in zip(
                    imputation_class_vars, donor_characteristics_col_names
                ):
                    sub_df.at[row_pos, donor_col_name] = sub_df.at[
                        donor_pos, class_var
                    ]

                # save donor distance
                sub_df.at[row_pos, donor_distance_col_name] = neighbour_distances[
                    row_pos, neighbour_rank
                ]

                donor_found = True
                break

            if not donor_found:
                recipients_without_donor += 1

    if recipients_without_donor:
        warnings.warn(
            f"No eligible donor found for {recipients_without_donor} row(s) of "
            f"'{var_to_impute}'; these rows were left unimputed."
        )

    for sub_df in data_post_imputation_by_slices.values():
        # Restore the original index, then copy the post-imputation information
        # back into the full data set (DataFrame.update aligns on index/columns).
        restored = sub_df.set_index("original_index").rename_axis(index=None)
        data_post_imputation.update(restored)

    # populate donor use count column: each imputed row stores how many times its
    # donor was used in total.
    donor_count_col_pos = data_post_imputation.columns.get_loc(donor_count_col_name)
    for row_num, user_id in enumerate(data_post_imputation[donor_userid_col_name]):
        if user_id != -1:
            data_post_imputation.iloc[row_num, donor_count_col_pos] = (
                donor_use_track_dict[user_id]
            )

    return data_post_imputation

In [9]:
def nearest_neighbour_imputation(
    data: pd.DataFrame,
    var_to_impute: str,
    imputation_class_vars: List[str],
    missing_values_to_impute: List[int],
    bucket_imputation: bool,
    bins: Optional[pd.IntervalIndex] = None,
    bins_to_labels: Optional[Dict] = None,
    multiple_possible_class_vars: Optional[List[str]] = None,
    max_donor_use: int = 5,
    userid_col_name: str = "userid",
) -> pd.DataFrame:
    """1-Nearest Neighbour imputation function.

    Parameters
    ----------

    data : pd.Dataframe
        Pandas DataFrame of input data.

    var_to_impute : str
        Name of variable to be imputed.

    imputation_class_vars : list of strings
        Names of variables used to define the imputation class.

    missing_values_to_impute : list of integers
        List of missing values to impute for.

    bucket_imputation : bool
        Boolean indicator for whether to generate an additional, bucketed version of the imputed column.

    bins : pd.IntervalIndex, optional
        Only required if bucket_imputation == True.
        Pandas Interval Index type that defines the buckets.
        Note that the left bound is exclusive, right bound is inclusive.

    bins_to_labels : dict, optional
        Only required if bucket_imputation == True.
        Dictionary with Pandas Interval Indices as keys and corresponding integer labels as values.

    multiple_possible_class_vars : list of strings, optional
        Names of imputation class variables that can be used in place of the other
        ex. either work income or business earnings can be used as an imputation class variable.
        Note that the function only accomodates for ONE PAIR of interchangeable
        imputation class variables.

    max_donor_use : int, default = 5
        Maximum number of times a donor can be used.

    userid_col_name : str, default = 'userid'
        Name of user ID variable.

    Returns
    -------

    data_post_imputation : pd.DataFrame
        Pandas DataFrame of output data, with imputations applied to variable to be imputed.
    """

    # make a copy to apply imputation
    data_post_imputation = data.copy()

    # get imputation details and donor details' variable names
    imputation_flag_col_name = f"{var_to_impute}_imputation_flag"
    imputation_type_col_name = f"{var_to_impute}_imputation_type"
    post_imputation_col_name = f"{var_to_impute}_post_imputation"
    imputed_val_col_name = f"{var_to_impute}_imputed_value"
    donor_userid_col_name = f"{var_to_impute}_donor_userid"
    donor_count_col_name = f"{var_to_impute}_donor_count"
    donor_characteristics_col_names = [
        f"{var_to_impute}_donor_{var}" for var in imputation_class_vars
    ]
    donor_distance_col_name = f"{var_to_impute}_donor_distance"

    # initialize donor use tracking dictionary
    donor_use_track_dict = {k: 0 for k in data_post_imputation[userid_col_name]}

    # create copy of post-imputation variable column and imputation class variables with missing values replaced with NA
    imputation_class_vars_w_na = [f"{var}_w_na" for var in imputation_class_vars]
    post_imputation_col_name_w_na = f"{post_imputation_col_name}_w_na"
    data_post_imputation[imputation_class_vars_w_na] = data_post_imputation[
        imputation_class_vars
    ].replace(missing_values_to_impute, pd.NA)
    data_post_imputation[post_imputation_col_name_w_na] = data_post_imputation[
        post_imputation_col_name
    ].replace(missing_values_to_impute, pd.NA)

    if multiple_possible_class_vars is not None:
        multiple_possible_class_vars_w_na = [
            f"{var}_w_na" for var in multiple_possible_class_vars
        ]
        data_post_imputation[multiple_possible_class_vars_w_na] = data_post_imputation[
            multiple_possible_class_vars
        ].replace(missing_values_to_impute, pd.NA)

    # initialize min-max scaler
    minmax_scaler = MinMaxScaler()

    # subset data to remove blanks and potential donors with missing imputation class variables
    knn_impute_subset = data_post_imputation[
        (data_post_imputation[imputation_class_vars_w_na].notna().all(axis=1))
    ].copy()

    # Need to determine if any imputation class variables are strings.
    # If so, then call KNN imputation function with categorical variable.
    categorical_imputation_class_vars = list(
        knn_impute_subset[imputation_class_vars].columns[
            knn_impute_subset[imputation_class_vars].dtypes == "object"
        ]
    )
    if categorical_imputation_class_vars:
        data_post_imputation = nearest_neighbour_imputation_categorical(
            data=knn_impute_subset,
            var_to_impute=var_to_impute,
            imputation_class_vars=imputation_class_vars,
            categorical_imputation_class_vars=categorical_imputation_class_vars,
            missing_values_to_impute=missing_values_to_impute,
            bucket_imputation=bucket_imputation,
            bins=bins,
            bins_to_labels=bins_to_labels,
            multiple_possible_class_vars=multiple_possible_class_vars,
            max_donor_use=max_donor_use,
            userid_col_name=userid_col_name,
        )

    elif multiple_possible_class_vars is None:

        # subset data to remove blanks and potential donors with missing imputation class variables
        knn_impute_subset = data_post_imputation[
            (data_post_imputation[imputation_class_vars_w_na].notna().all(axis=1))
        ].copy()

        # reset index of subset, but retain the old index to facilitate updating the full imput data set later
        knn_impute_subset = knn_impute_subset.reset_index(names="original_index")

        # get column numbers for indexing use later
        data_columns = knn_impute_subset.columns
        imputation_flag_col_idx = data_columns.get_loc(imputation_flag_col_name)
        imputation_type_col_idx = data_columns.get_loc(imputation_type_col_name)
        post_imputation_col_idx = data_columns.get_loc(post_imputation_col_name)
        imputed_val_col_idx = data_columns.get_loc(imputed_val_col_name)
        donor_userid_col_idx = data_columns.get_loc(donor_userid_col_name)
        donor_count_col_idx = data_columns.get_loc(donor_count_col_name)
        donor_characteristics_col_idxs = [
            data_columns.get_loc(var) for var in donor_characteristics_col_names
        ]
        donor_distance_col_idx = data_columns.get_loc(donor_distance_col_name)

        # Need to min-max scale imputation class variables before determining nearest neighbours.
        imputation_class_vars_transformed = minmax_scaler.fit_transform(
            knn_impute_subset[imputation_class_vars_w_na]
        )

        # Use scikit-learn's BallTree function to get nearest neighbours and their indices.
        tree = BallTree(imputation_class_vars_transformed)
        neighbour_distances_indices = tree.query(
            imputation_class_vars_transformed,
            k=knn_impute_subset.shape[0],
            return_distance=True,
            dualtree=True,
        )

        for tree_index_num, data_index_num in enumerate(knn_impute_subset.index):

            # tree_index_num tracks the rows of the BallTree / distance + donor index array
            # data_index_num tracks the row of knn_impute_subset, the subset of the data
            # that does not contain missingness in imputation class variables

            # Search for neighbours if row has missing value
            # First condition disregards blanks.
            # Second condition filters for rows that contain a missing value as defined by missing_values_to_impute.
            # condition formerly pd.isna(data_post_imputation[post_imputation_col_name][data_index_num])
            if pd.notna(
                knn_impute_subset[post_imputation_col_name][data_index_num]
            ) and (
                knn_impute_subset[post_imputation_col_name][data_index_num]
                in missing_values_to_impute
            ):

                neighbour_rank = 0
                continue_neighbour_search = True

                while continue_neighbour_search:

                    donor_idx = neighbour_distances_indices[1][
                        tree_index_num, neighbour_rank
                    ]
                    donor_dist = neighbour_distances_indices[0][
                        tree_index_num, neighbour_rank
                    ]
                    donor_userid = knn_impute_subset[userid_col_name][donor_idx]

                    # first check: donor use does not exceed max_donor_use.
                    if donor_use_track_dict[donor_userid] > max_donor_use:
                        neighbour_rank += 1

                    # second check: if closest donor is itself, then continue neighbour search
                    elif donor_idx == data_index_num:
                        neighbour_rank += 1

                    # third check: if donor has missing value, then continue neighbour search
                    elif (
                        knn_impute_subset[post_imputation_col_name][donor_idx]
                        in missing_values_to_impute
                    ):
                        neighbour_rank += 1

                    # else, donor is eligible; end neighbour search
                    else:
                        continue_neighbour_search = False

                        # impute value into post-imputation variable
                        knn_impute_subset.iloc[
                            data_index_num, post_imputation_col_idx
                        ] = knn_impute_subset[post_imputation_col_name][donor_idx]

                        # indicate imputation
                        knn_impute_subset.iloc[
                            data_index_num, imputation_flag_col_idx
                        ] = 1

                        # indicate type of imputation
                        knn_impute_subset.iloc[
                            data_index_num, imputation_type_col_idx
                        ] = "KNN"

                        # save imputed value
                        knn_impute_subset.iloc[data_index_num, imputed_val_col_idx] = (
                            knn_impute_subset[post_imputation_col_name][donor_idx]
                        )

                        # save donor's user id
                        knn_impute_subset.iloc[data_index_num, donor_userid_col_idx] = (
                            donor_userid
                        )

                        # increment donor's use count in donor use tracking dictionary
                        donor_use_track_dict[donor_userid] += 1

                        # save donor characteristics
                        for n, var in enumerate(imputation_class_vars):
                            knn_impute_subset.iloc[
                                data_index_num, donor_characteristics_col_idxs[n]
                            ] = knn_impute_subset[var][donor_idx]

                        # save donor distance
                        knn_impute_subset.iloc[
                            data_index_num, donor_distance_col_idx
                        ] = donor_dist

                        # populate donor use count column
                        for row_num, user_id in enumerate(
                            knn_impute_subset[donor_userid_col_name]
                        ):
                            if user_id != -1:
                                knn_impute_subset.iloc[row_num, donor_count_col_idx] = (
                                    donor_use_track_dict[user_id]
                                )

                        # bucket imputed column if bucket_imputation == True
                        if bucket_imputation:
                            bucketed_post_imputation_col_name = (
                                f"{var_to_impute}_post_imputation_bucketed"
                            )
                            knn_impute_subset[bucketed_post_imputation_col_name] = (
                                continuous_to_range(
                                    knn_impute_subset,
                                    post_imputation_col_name,
                                    bins,
                                    bins_to_labels,
                                )
                            )

                        # reset index for subset that underwent imputation
                        knn_impute_subset = knn_impute_subset.set_index(
                            "original_index"
                        )
                        knn_impute_subset = knn_impute_subset.rename_axis(index=None)

                        # update full data set to include post-imputation information
                        data_post_imputation.update(knn_impute_subset)

    # if there are imputation class variables that can be used in lieu of the other,
    # then separate distance matrices need to be computed per variable in multiple_possible_class_vars
    else:
        # this list stores tuples of distances and neighbour indices per multiple_possible_class_vars variable
        list_neighbour_distances_indices = []

        for var in multiple_possible_class_vars_w_na:

            # subset data to remove blanks and potential donors with missing imputation class variables
            knn_impute_subset = data_post_imputation[
                (
                    data_post_imputation[imputation_class_vars_w_na + var]
                    .notna()
                    .all(axis=1)
                )
            ]

            # Need to min-max scale imputation class variables before determining nearest neighbours.
            imputation_class_vars_transformed = minmax_scaler.fit_transform(
                knn_impute_subset[imputation_class_vars_w_na + var]
            )

            # Use scikit-learn's BallTree function to get nearest neighbours and their indices.
            tree = BallTree(imputation_class_vars_transformed)
            list_neighbour_distances_indices.append(
                tree.query(
                    imputation_class_vars_transformed,
                    k=knn_impute_subset.shape[0],
                    return_distance=True,
                    dualtree=True,
                )
            )

        # re-subset the data to retain rows where none of imputation_class_vars are blank / missing
        # and at least one of multiple_possible_class_vars_w_na is populated
        knn_impute_subset = data_post_imputation[
            (data_post_imputation[imputation_class_vars_w_na].notna().all(axis=1))
        ].copy()

        knn_impute_subset = knn_impute_subset[
            (knn_impute_subset[multiple_possible_class_vars_w_na].notna().any(axis=1))
        ]

        # reset index of subset, but retain the old index to facilitate updating the full imput data set later
        knn_impute_subset = knn_impute_subset.reset_index(names="original_index")

        # get column numbers for indexing use later
        data_columns = knn_impute_subset.columns
        imputation_flag_col_idx = data_columns.get_loc(imputation_flag_col_name)
        imputation_type_col_idx = data_columns.get_loc(imputation_type_col_name)
        post_imputation_col_idx = data_columns.get_loc(post_imputation_col_name)
        imputed_val_col_idx = data_columns.get_loc(imputed_val_col_name)
        donor_userid_col_idx = data_columns.get_loc(donor_userid_col_name)
        donor_count_col_idx = data_columns.get_loc(donor_count_col_name)
        donor_characteristics_col_idxs = [
            data_columns.get_loc(var) for var in donor_characteristics_col_names
        ]
        donor_distance_col_idx = data_columns.get_loc(donor_distance_col_name)

        for tree_index_num, data_index_num in enumerate(knn_impute_subset.index):

            # Search for neighbours if row has missing value
            # condition formerly pd.isna(data_post_imputation[post_imputation_col_name][data_index_num])
            if pd.notna(
                knn_impute_subset[post_imputation_col_name][data_index_num]
            ) and (
                knn_impute_subset[post_imputation_col_name][data_index_num]
                in missing_values_to_impute
            ):

                neighbour_rank = 0
                continue_neighbour_search = True

                while continue_neighbour_search:

                    # Determine which multiple_possible_class_vars variable to use
                    # by finding the first variable with non-missing value
                    tree_to_use = 0
                    continue_class_vars_search = True

                    while continue_class_vars_search:
                        # This case only occurs if all variables in multiple_possible_class_vars are missing in a given row.
                        # This should not happen as a base assumption is that at least one of the imputation class variables
                        # for recipients have no missingness.
                        if tree_to_use >= len(multiple_possible_class_vars):
                            continue_class_vars_search = False
                            userid_missing_class_vars = knn_impute_subset[
                                data_index_num
                            ][userid_col_name]
                            warnings.warn(
                                f"All variables in multiple_possible_class_vars are missing for user id {userid_missing_class_vars}."
                            )

                        elif pd.isna(
                            knn_impute_subset[multiple_possible_class_vars[tree_to_use]]
                        ):
                            tree_to_use += 1

                        # first variable in multiple_possible_class_vars that is not missing
                        else:
                            continue_class_vars_search = False

                    neighbour_distances_indices = list_neighbour_distances_indices[
                        tree_to_use
                    ]

                    donor_idx = neighbour_distances_indices[1][
                        tree_index_num, neighbour_rank
                    ]
                    donor_dist = neighbour_distances_indices[0][
                        tree_index_num, neighbour_rank
                    ]
                    donor_userid = knn_impute_subset[userid_col_name][donor_idx]

                    # first check: donor use does not exceed max_donor_use.
                    if donor_use_track_dict[donor_userid] > max_donor_use:
                        neighbour_rank += 1

                    # second check: if closest donor is itself, then continue neighbour search
                    elif donor_idx == data_index_num:
                        neighbour_rank += 1

                    # third check: if donor has missing value, then continue neighbour search
                    elif (
                        knn_impute_subset[post_imputation_col_name][donor_idx]
                        in missing_values_to_impute
                    ):
                        neighbour_rank += 1

                    # else, donor is eligible; end neighbour search
                    else:
                        continue_neighbour_search = False

                        # impute value into post-imputation variable
                        knn_impute_subset.iloc[
                            data_index_num, post_imputation_col_idx
                        ] = knn_impute_subset[post_imputation_col_name][donor_idx]

                        # indicate imputation
                        knn_impute_subset.iloc[
                            data_index_num, imputation_flag_col_idx
                        ] = 1

                        # indicate type of imputation
                        knn_impute_subset.iloc[
                            data_index_num, imputation_type_col_idx
                        ] = "KNN"

                        # save imputed value
                        knn_impute_subset.iloc[data_index_num, imputed_val_col_idx] = (
                            knn_impute_subset[post_imputation_col_name][donor_idx]
                        )

                        # save donor's user id
                        knn_impute_subset.iloc[data_index_num, donor_userid_col_idx] = (
                            donor_userid
                        )

                        # increment donor's use count in donor use tracking dictionary
                        donor_use_track_dict[donor_userid] += 1

                        # save donor characteristics
                        for n, var in enumerate(imputation_class_vars):
                            knn_impute_subset.iloc[
                                data_index_num, donor_characteristics_col_idxs[n]
                            ] = knn_impute_subset[var][donor_idx]

                        # save donor distance
                        knn_impute_subset.iloc[
                            data_index_num, donor_distance_col_idx
                        ] = donor_dist

                        # populate donor use count column
                        for row_num, user_id in enumerate(
                            knn_impute_subset[donor_userid_col_name]
                        ):
                            if user_id != -1:
                                knn_impute_subset.iloc[row_num, donor_count_col_idx] = (
                                    donor_use_track_dict[user_id]
                                )

                        # bucket imputed column if bucket_imputation == True
                        if bucket_imputation:
                            bucketed_post_imputation_col_name = (
                                f"{var_to_impute}_post_imputation_bucketed"
                            )
                            knn_impute_subset[bucketed_post_imputation_col_name] = (
                                continuous_to_range(
                                    knn_impute_subset,
                                    post_imputation_col_name,
                                    bins,
                                    bins_to_labels,
                                )
                            )

                        # reset index for subset that underwent imputation
                        knn_impute_subset = knn_impute_subset.set_index(
                            "original_index"
                        )
                        knn_impute_subset = knn_impute_subset.rename_axis(index=None)

                        # update full data set to include post-imputation information
                        data_post_imputation.update(knn_impute_subset)

    return data_post_imputation

Read in the iris data set for a mock-up example.

In [22]:
# With return_X_y=True, load_iris returns a (features, target) pair; as_frame=True
# makes the features a DataFrame, so iris[0] is the feature frame previewed here.
iris = load_iris(as_frame=True, return_X_y=True)
iris[0].head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [16]:
# easy dummy data to work with: iris features plus the target stored as "species"
iris = load_iris(return_X_y=True, as_frame=True)
iris_df = iris[0]
iris_df["species"] = iris[1]

# every fifth row position (0, 5, ..., 145) gets -1 as its sepal width
na_mask = np.arange(0, 150, 5)
iris_df.iloc[na_mask, iris_df.columns.get_loc("sepal width (cm)")] = -1

In [17]:
# Columns passed to the imputation functions below as the imputation class
# variables, and the column whose values are to be imputed.
imputation_class_vars = ["species", "sepal length (cm)"]
var_to_impute = "sepal width (cm)"

In [18]:
# work on a copy and give every row a sequential user id
iris_knn_imputed_df = iris_df.copy()
iris_knn_imputed_df["userid"] = range(len(iris_knn_imputed_df))

# set "species" to -1 on the row right after each row in na_mask
iris_knn_imputed_df.iloc[
    na_mask + 1, iris_knn_imputed_df.columns.get_loc("species")
] = -1

# add the imputation bookkeeping columns for var_to_impute
iris_knn_imputed_df = data_preprocessing_store_donor_info(
    data=iris_knn_imputed_df,
    var_to_impute=var_to_impute,
    imputation_class_vars=imputation_class_vars,
)

In [19]:
# Preview the first ten rows returned by data_preprocessing_store_donor_info.
iris_knn_imputed_df.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,userid,sepal width (cm)_post_imputation,sepal width (cm)_imputation_flag,sepal width (cm)_imputation_type,sepal width (cm)_imputed_value,sepal width (cm)_donor_userid,sepal width (cm)_donor_count,sepal width (cm)_donor_species,sepal width (cm)_donor_sepal length (cm),sepal width (cm)_donor_distance
0,5.1,-1.0,1.4,0.2,0,0,-1.0,0,,,-1,0,,,
1,4.9,3.0,1.4,0.2,-1,1,3.0,0,,,-1,0,,,
2,4.7,3.2,1.3,0.2,0,2,3.2,0,,,-1,0,,,
3,4.6,3.1,1.5,0.2,0,3,3.1,0,,,-1,0,,,
4,5.0,3.6,1.4,0.2,0,4,3.6,0,,,-1,0,,,
5,5.4,-1.0,1.7,0.4,0,5,-1.0,0,,,-1,0,,,
6,4.6,3.4,1.4,0.3,-1,6,3.4,0,,,-1,0,,,
7,5.0,3.4,1.5,0.2,0,7,3.4,0,,,-1,0,,,
8,4.4,2.9,1.4,0.2,0,8,2.9,0,,,-1,0,,,
9,4.9,3.1,1.5,0.1,0,9,3.1,0,,,-1,0,,,


In [20]:
# same frame, with the numeric species codes 0/1/2 swapped for the labels a/b/c
iris_str_species = iris_knn_imputed_df.assign(
    species=iris_knn_imputed_df["species"].replace({0: "a", 1: "b", 2: "c"})
)

In [21]:
# apply nearest neighbour imputation
iris_knn_post_imputed_df = nearest_neighbour_imputation(
    iris_str_species,
    var_to_impute,
    imputation_class_vars,
    [-1],
    True,
    non_monetary_bins,
    non_monetary_bins_to_enumerated_val,
)

In [22]:
# Display the frame returned by nearest_neighbour_imputation.
iris_knn_post_imputed_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,userid,sepal width (cm)_post_imputation,sepal width (cm)_imputation_flag,sepal width (cm)_imputation_type,sepal width (cm)_imputed_value,sepal width (cm)_donor_userid,sepal width (cm)_donor_count,sepal width (cm)_donor_species,sepal width (cm)_donor_sepal length (cm),sepal width (cm)_donor_distance,species_w_na,sepal length (cm)_w_na,sepal width (cm)_post_imputation_w_na
0,5.1,-1.0,1.4,0.2,a,0,3.3,1,KNN,3.3,23,1,a,5.1,0.0,a,5.1,
2,4.7,3.2,1.3,0.2,a,2,3.2,0,,,-1,0,,,,a,4.7,3.2
3,4.6,3.1,1.5,0.2,a,3,3.1,0,,,-1,0,,,,a,4.6,3.1
4,5.0,3.6,1.4,0.2,a,4,3.6,0,,,-1,0,,,,a,5.0,3.6
5,5.4,-1.0,1.7,0.4,a,5,4.2,1,KNN,4.2,33,1,a,5.5,0.066667,a,5.4,
7,5.0,3.4,1.5,0.2,a,7,3.4,0,,,-1,0,,,,a,5.0,3.4
8,4.4,2.9,1.4,0.2,a,8,2.9,0,,,-1,0,,,,a,4.4,2.9
9,4.9,3.1,1.5,0.1,a,9,3.1,0,,,-1,0,,,,a,4.9,3.1
10,5.4,-1.0,1.5,0.2,a,10,4.2,1,KNN,4.2,5,1,a,5.4,0.0,a,5.4,
12,4.8,3.0,1.4,0.1,a,12,3.0,0,,,-1,0,,,,a,4.8,3.0


In [23]:
# Count how often each imputed value occurs.
iris_knn_post_imputed_df["sepal width (cm)_imputed_value"].value_counts()

sepal width (cm)_imputed_value
3.4    5
3.3    3
4.2    3
3.8    3
2.8    3
2.3    3
2.9    3
3.1    2
3.0    2
2.2    2
3.6    1
Name: count, dtype: int64

In [24]:
# Count how often each value of the donor use count column occurs.
iris_knn_post_imputed_df["sepal width (cm)_donor_count"].value_counts()

sepal width (cm)_donor_count
0    90
1    15
2    12
3     3
Name: count, dtype: int64

In [25]:
# rows whose donor use count column equals 2 or 3
iris_knn_post_imputed_df.loc[
    lambda frame: frame["sepal width (cm)_donor_count"].isin([2, 3])
]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,userid,sepal width (cm)_post_imputation,sepal width (cm)_imputation_flag,sepal width (cm)_imputation_type,sepal width (cm)_imputed_value,sepal width (cm)_donor_userid,sepal width (cm)_donor_count,sepal width (cm)_donor_species,sepal width (cm)_donor_sepal length (cm),sepal width (cm)_donor_distance,species_w_na,sepal length (cm)_w_na,sepal width (cm)_post_imputation_w_na
25,5.0,-1.0,1.6,0.2,a,25,3.4,1,KNN,3.4,7,3,a,5.0,0.0,a,5.0,
30,4.8,-1.0,1.6,0.2,a,30,3.4,1,KNN,3.4,24,2,a,4.8,0.0,a,4.8,
35,5.0,-1.0,1.2,0.2,a,35,3.4,1,KNN,3.4,7,3,a,5.0,0.0,a,5.0,
40,5.0,-1.0,1.3,0.3,a,40,3.4,1,KNN,3.4,7,3,a,5.0,0.0,a,5.0,
45,4.8,-1.0,1.4,0.3,a,45,3.4,1,KNN,3.4,24,2,a,4.8,0.0,a,4.8,
55,5.7,-1.0,4.5,1.3,b,55,2.8,1,KNN,2.8,99,2,b,5.7,0.0,b,5.7,
70,5.9,-1.0,4.8,1.8,b,70,2.2,1,KNN,2.2,62,2,b,6.0,0.047619,b,5.9,
80,5.5,-1.0,3.8,1.1,b,80,2.3,1,KNN,2.3,53,2,b,5.5,0.0,b,5.5,
85,6.0,-1.0,4.5,1.6,b,85,2.2,1,KNN,2.2,62,2,b,6.0,0.0,b,6.0,
90,5.5,-1.0,4.4,1.2,b,90,2.3,1,KNN,2.3,53,2,b,5.5,0.0,b,5.5,


# **Ratio/Trend Imputation**

See "Historical Imputation in RHS_forStatCan.docx" for ratio/trend imputation specification. Only work income (including bonus and net earnings/profits) and housing loan value will be subjected to this imputation approach.

**ASSUMPTIONS**

- The values -1, 888 and 999 are treated as missing; these are the values we want to impute.

In [26]:
# NOTE(review): hard-coded absolute network-share location; the notebook only runs
# where this share is reachable.
data_folder = r"\\fld6filer\icmic-cciim\CONSULTATION\IMPUDON\python-redesign\data"
w3_w4_subset_file = "W3_W4_combined_for_trend_imputation.xlsx"

# full path of the workbook inside the data folder
w3_w4_subset_path = Path(data_folder, w3_w4_subset_file)

In [27]:
# Read the Excel workbook at w3_w4_subset_path into a DataFrame.
tst = pd.read_excel(w3_w4_subset_path)

In [28]:
# Preview the first rows of the loaded data.
tst.head()

Unnamed: 0,userid,Couple_Tag,Q127,Q128_1,GO_Q130a_1,GO_Q131a_1,Q135_1,Q140a_1_1,Q140aCO_1_1,Q142a_1_1,Q142aCO_1_1,Q143a_1_1,Q143aCO_1_1,Whether_HDB_Housing,Status_Loan,Value_Hse_Loan,Total_Purchase_Price,Total_Prop,Couple_Tag_prev_wave,Q127_prev_wave,Q128_1_prev_wave,Q135_1_prev_wave,GO_Q130a_1_prev_wave,GO_Q131a_1_prev_wave,Q140a_1_1_prev_wave,Q140aCO_1_1_prev_wave,Q140a_1_1_c,Q142a_1_1_prev_wave,Q142aCO_1_1_prev_wave,Q142a_1_1_c,Q143a_1_1_prev_wave,Q143aCO_1_1_prev_wave,Q143a_1_1_c,Whether_HDB_Housing_prev_wave,Status_Loan_prev_wave,Value_Hse_Loan_prev_wave,Value_Hse_Loan_c,Total_Purchase_Price_prev_wave,Total_Prop_prev_wave
0,32823,,1,1.0,M,24.0,3.0,,,,,,21.0,,-2,0.0,,0,,1.0,1.0,3.0,M,24.0,,,,,,,4061.85,,4061.85,,-2.0,0.0,0.0,,0.0
1,32828,,3,,,,,,,,,,,HDB,2,0.0,14500.0,1,,3.0,,,,,,,,,,,,,,HDB,2.0,0.0,0.0,14500.0,1.0
2,32821,,1,1.0,S,51.0,3.0,,,,,,13.0,HDB,2,0.0,165000.0,1,,1.0,1.0,3.0,Q,51.0,,,,,,,1003.72,,1003.72,HDB,2.0,0.0,0.0,165000.0,1.0
3,32832,,3,,,,,,,,,,,HDB,2,0.0,14500.0,1,,1.0,1.0,3.0,G,52.0,,,,,,,1003.72,,1003.72,HDB,2.0,0.0,0.0,14500.0,1.0
4,32834,,3,,,,,,,,,,,,-2,0.0,,0,,3.0,,,,,,,,,,,,,,,-2.0,0.0,0.0,,0.0


## Utility Functions for Historical Imputation

In [29]:
def ratio_imputation(
    avg_current_wave: float, avg_previous_wave: float, val_past_wave: float
) -> float:
    """Scale a previous-wave value by the ratio of the current to previous wave averages.

    Parameters
    ----------

    avg_current_wave : float
        Average value in the current wave (numerator of the ratio).

    avg_previous_wave : float
        Average value in the previous wave (denominator of the ratio).

    val_past_wave : float
        Value in the previous wave that the ratio is applied to.

    Returns
    -------

    float
        Imputed value, computed as
        (avg_current_wave / avg_previous_wave) * val_past_wave.

    """
    # No guard on the denominator: a zero avg_previous_wave goes straight into the division.
    wave_ratio = avg_current_wave / avg_previous_wave
    return wave_ratio * val_past_wave

## Housing Loan Imputation

First, impute for housing loan value.

Segment the previous wave's data by housing type outside of the row-by-row imputation step.

In [30]:
# Codes passed as missing_values_to_impute to the housing-loan functions below.
values_to_impute = [-1, 888, 999]

In [31]:
def construct_house_loan_donor_pool_one_house(
    data: pd.DataFrame,
    var_housing_loan_current_wave: str,
    var_housing_loan_previous_wave: str,
    var_number_properties_current_wave: str,
    var_number_properties_previous_wave: str,
    var_hdb_indicator_current_wave: str,
    var_hdb_indicator_previous_wave: str,
    missing_values_to_impute: List[int],
) -> pd.DataFrame:
    """Build the one-house donor pool for historical imputation of house loan values.

    Donors are rows whose housing loan value is not a missing code in either wave
    and whose number of properties equals 1 in both waves. They are grouped by the
    previous-wave HDB indicator, and the count and mean of both loan variables are
    computed per group.

    The Pandas DataFrame output of this function feeds into the
    historical_imputation_housing function.

    Parameters
    ----------

    data : pd.Dataframe
        Input data for housing loan historical imputation.

    var_housing_loan_current_wave : str
        Name of housing loan variable in current wave to be imputed.

    var_housing_loan_previous_wave : str
        Name of housing loan variable in previous wave.

    var_number_properties_current_wave : str
        Name of number of properties variable in current wave.

    var_number_properties_previous_wave : str
        Name of number of properties variable in previous wave.

    var_hdb_indicator_current_wave : str
        Name of HDB housing indicator variable in current wave.
        Not referenced in the function body.

    var_hdb_indicator_previous_wave : str
        Name of HDB housing indicator variable in previous wave.
        This is the grouping (imputation class) variable.

    missing_values_to_impute : list[int]
        List of missing values to impute for.

    Returns
    -------

    house_loan_donor_pool_one_house : pd.DataFrame
        One row per value of the previous-wave HDB indicator, with flattened column
        names "<loan variable>_count" and "<loan variable>_mean" for the current and
        previous wave loan variables.
    """

    loan_vars = [var_housing_loan_current_wave, var_housing_loan_previous_wave]

    # loan value must be a real (non-missing-code) value in both waves
    loan_reported_both_waves = ~data[var_housing_loan_current_wave].isin(
        missing_values_to_impute
    ) & ~data[var_housing_loan_previous_wave].isin(missing_values_to_impute)

    # exactly one property in both waves
    one_property_both_waves = (data[var_number_properties_current_wave] == 1) & (
        data[var_number_properties_previous_wave] == 1
    )

    eligible_donors = data[loan_reported_both_waves & one_property_both_waves]

    # count and mean of each loan variable per previous-wave HDB class
    house_loan_donor_pool_one_house = (
        eligible_donors.groupby([var_hdb_indicator_previous_wave])[loan_vars]
        .agg(["count", "mean"])
        .reset_index()
    )

    # flatten the two-level column index to "<variable>_<statistic>"; the grouping
    # column has an empty second level, so its trailing underscore is stripped
    house_loan_donor_pool_one_house.columns = [
        f"{variable}_{statistic}".rstrip("_")
        for variable, statistic in house_loan_donor_pool_one_house.columns
    ]

    return house_loan_donor_pool_one_house

In [32]:
# build the one-house donor pool from the loaded data
house_loan_donor_pool_one_house = construct_house_loan_donor_pool_one_house(
    data=tst,
    var_housing_loan_current_wave="Value_Hse_Loan",
    var_housing_loan_previous_wave="Value_Hse_Loan_prev_wave",
    var_number_properties_current_wave="Total_Prop",
    var_number_properties_previous_wave="Total_Prop_prev_wave",
    var_hdb_indicator_current_wave="Whether_HDB_Housing",
    var_hdb_indicator_previous_wave="Whether_HDB_Housing_prev_wave",
    missing_values_to_impute=values_to_impute,
)

In [33]:
# Display the one-house donor pool (one row per previous-wave HDB class).
house_loan_donor_pool_one_house

Unnamed: 0,Whether_HDB_Housing_prev_wave,Value_Hse_Loan_count,Value_Hse_Loan_mean,Value_Hse_Loan_prev_wave_count,Value_Hse_Loan_prev_wave_mean
0,HDB,6728,18302.902176,6728,23596.465559
1,Non-HDB,555,171270.412613,555,189812.625225


In [34]:
# Convention: start a comment with "in the future" to flag anything that will have
# to be adjusted in the future.

In [35]:
def construct_house_loan_donor_pool_multiple_houses(
    data: pd.DataFrame,
    var_housing_loan_current_wave: str,
    var_housing_loan_previous_wave: str,
    var_number_properties_current_wave: str,
    var_number_properties_previous_wave: str,
    var_hdb_indicator_current_wave: str,
    var_hdb_indicator_previous_wave: str,
    missing_values_to_impute: List[int],
) -> pd.DataFrame:
    """Function for constructing donor pool for historical imputation of house loan values for multiple homeowners.

    A row is kept as a donor when all of the following hold:
        - its housing loan value is not a missing code in the current or previous wave;
        - its number of properties is not a missing code in the current or previous wave;
        - it has more than 1 property in the current wave;
        - it had at least as many properties in the previous wave as in the current wave.

    The Pandas DataFrame output of this function feeds into the
    historical_imputation_housing function.

    Parameters
    ----------

    data : pd.Dataframe
        Input data for housing loan historical imputation.

    var_housing_loan_current_wave : str
        Name of housing loan variable in current wave to be imputed.

    var_housing_loan_previous_wave : str
        Name of housing loan variable in previous wave.

    var_number_properties_current_wave : str
        Name of number of properties variable in current wave.

    var_number_properties_previous_wave : str
        Name of number of properties variable in previous wave.

    var_hdb_indicator_current_wave : str
        Name of HDB housing indicator variable in current wave.
        Not referenced in the function body.

    var_hdb_indicator_previous_wave : str
        Name of HDB housing indicator variable in previous wave.
        Not referenced in the function body.

    missing_values_to_impute : list[int]
        List of missing values to impute for.

    Returns
    -------

    house_loan_donor_pool_multiple : pd.DataFrame
        Pandas DataFrame that has housing loan values of respondents who meet the criteria for being
        in the donor pool for imputing housing loan values for individuals with multiple houses.
        It holds two columns (current and previous wave housing loan) and keeps the
        index labels of the selected rows of data.
    """

    current_properties = data[var_number_properties_current_wave]
    previous_properties = data[var_number_properties_previous_wave]

    is_donor = (
        ~data[var_housing_loan_current_wave].isin(missing_values_to_impute)
        & ~data[var_housing_loan_previous_wave].isin(missing_values_to_impute)
        # Missing codes such as 888 / 999 are numerically large, so they would pass
        # the "> 1" and ">=" comparisons below; exclude them in both waves.
        & ~current_properties.isin(missing_values_to_impute)
        & ~previous_properties.isin(missing_values_to_impute)
        & (current_properties > 1)
        & (previous_properties >= current_properties)
    )

    house_loan_donor_pool_multiple = data.loc[
        is_donor, [var_housing_loan_current_wave, var_housing_loan_previous_wave]
    ]

    return house_loan_donor_pool_multiple

In [36]:
# build the multiple-house donor pool from the loaded data
house_loan_donor_pool_multiple = construct_house_loan_donor_pool_multiple_houses(
    data=tst,
    var_housing_loan_current_wave="Value_Hse_Loan",
    var_housing_loan_previous_wave="Value_Hse_Loan_prev_wave",
    var_number_properties_current_wave="Total_Prop",
    var_number_properties_previous_wave="Total_Prop_prev_wave",
    var_hdb_indicator_current_wave="Whether_HDB_Housing",
    var_hdb_indicator_previous_wave="Whether_HDB_Housing_prev_wave",
    missing_values_to_impute=values_to_impute,
)

In [37]:
# column-wise mean of the current and previous wave loan values in the donor pool
house_loan_donor_pool_multiple.mean()

Value_Hse_Loan              266178.619447
Value_Hse_Loan_prev_wave    372203.990783
dtype: float64

In [38]:
# Preview the first rows of the multiple-house donor pool.
house_loan_donor_pool_multiple.head()

Unnamed: 0,Value_Hse_Loan,Value_Hse_Loan_prev_wave
271,0.0,0.0
382,600000.0,700000.0
493,0.0,0.0
540,246162.0,300000.0
574,0.0,0.0


In [39]:
def housing_historical_imputation_by_row(
    row: pd.DataFrame,
    var_current_wave: str,
    var_previous_wave: str,
    var_number_properties: str,
    var_hdb_indicator_current_wave: str,
    var_hdb_indicator_previous_wave: str,
    imputation_group_one_house: pd.DataFrame,
    imputation_group_multi_house: pd.DataFrame,
    missing_values_to_impute: list[int],
    min_records_req: int,
) -> pd.DataFrame:
    """Historical imputation function for housing loan value, applied to a single row.

    Row-wise function that will be applied to the entire column in the
    historical_imputation_housing function.

    The row is imputed only when all of the following hold:
        - the "<var_current_wave>_post_imputation" value of the row is NA;
        - the previous-wave value of the row is neither NA nor one of
          missing_values_to_impute;
        - the row has 1 property and its imputation class is present in
          imputation_group_one_house with more than min_records_req records for both
          waves, or the row has more than 1 property and imputation_group_multi_house
          has more than min_records_req rows.

    When imputed, the value is ratio_imputation(current wave average,
    previous wave average, previous-wave value of the row). It is written to the
    "<var_current_wave>_post_imputation" and "<var_current_wave>_imputed_value"
    entries, "<var_current_wave>_imputation_flag" is set to 1 and
    "<var_current_wave>_imputation_type" to "Historical". Otherwise an unchanged copy
    of the row is returned.

    Parameters
    ----------

    row : pd.Dataframe
        A single row of a Pandas DataFrame. Entries are read as scalars
        (row[column]), i.e. the row is handled like a Pandas Series.

    var_current_wave : str
        Name of variable in current wave to be imputed.

    var_previous_wave : str
        Name of variable in previous wave to be used for ratio imputation.

    var_number_properties : str
        Name of number of properties variable in current wave.

    var_hdb_indicator_current_wave : str
        Name of HDB housing indicator variable in current wave. The row's value of
        this variable selects the imputation class for respondents with 1 house.

    var_hdb_indicator_previous_wave : str
        Name of HDB housing indicator variable in previous wave. This is the column
        of imputation_group_one_house that holds the imputation classes.

    imputation_group_one_house : pd.DataFrame
        Pandas DataFrame containing the imputation groups for respondents with 1 house.
        Columns of this DataFrame are: HDB housing indicator, which defines the
        imputation classes, count of observations per imputation class, and
        mean housing loan values in the current and previous waves
        ("<variable>_count" and "<variable>_mean" for both wave variables).

    imputation_group_multi_house : pd.DataFrame
        Pandas DataFrame containing the donor records for respondents with multiple
        houses. The var_current_wave and var_previous_wave columns are averaged here.

    missing_values_to_impute : list[int]
        List of missing values to impute for. A previous-wave value found in this
        list is not used as the basis of the ratio imputation.

    min_records_req : int
        Number of records that the imputation class must exceed (strictly) for the
        previous and current waves.

    Returns
    -------

    row_post_imputation : pd.DataFrame
        Copy of the input row, with imputation applied to the housing loan value
        variable when the conditions above are met.
    """

    # make copy of row to apply imputation
    row_post_imputation = row.copy()

    post_imputation_col_name = f"{var_current_wave}_post_imputation"
    reported_val_previous_wave = row_post_imputation[var_previous_wave]

    # nothing to impute when the current wave value is already populated
    if not pd.isna(row_post_imputation[post_imputation_col_name]):
        return row_post_imputation

    # the previous wave value must be a real value: NA or a missing code (e.g. 888)
    # scaled by the ratio would not be a meaningful imputed value
    if (
        pd.isna(reported_val_previous_wave)
        or reported_val_previous_wave in missing_values_to_impute
    ):
        return row_post_imputation

    number_properties = row_post_imputation[var_number_properties]

    # if there is only 1 property
    if number_properties == 1:
        # statistics of the imputation class matching the row's HDB indicator
        class_stats = imputation_group_one_house[
            imputation_group_one_house[var_hdb_indicator_previous_wave]
            == row_post_imputation[var_hdb_indicator_current_wave]
        ]

        # class absent from the donor pool (e.g. NA or unseen indicator): without this
        # guard the count check below passes vacuously on an empty array and the
        # lookups of the means raise IndexError
        if class_stats.empty:
            return row_post_imputation

        # only impute if there are sufficient records based on imputation parameters
        record_counts = class_stats[
            [f"{var_current_wave}_count", f"{var_previous_wave}_count"]
        ].values
        if not (record_counts > min_records_req).all():
            return row_post_imputation

        avg_val_current_wave = class_stats[f"{var_current_wave}_mean"].values[0]
        avg_val_previous_wave = class_stats[f"{var_previous_wave}_mean"].values[0]

    # if there are multiple properties
    elif number_properties > 1:
        # only impute if there are sufficient records based on imputation parameters
        if not imputation_group_multi_house.shape[0] > min_records_req:
            return row_post_imputation

        avg_val_current_wave = np.mean(imputation_group_multi_house[var_current_wave])
        avg_val_previous_wave = np.mean(
            imputation_group_multi_house[var_previous_wave]
        )

    # number of properties is neither 1 nor greater than 1 (e.g. 0 or NA)
    else:
        return row_post_imputation

    # impute using historical ratio imputation
    imputed_val = ratio_imputation(
        avg_val_current_wave,
        avg_val_previous_wave,
        reported_val_previous_wave,
    )
    row_post_imputation[post_imputation_col_name] = imputed_val
    row_post_imputation[f"{var_current_wave}_imputed_value"] = imputed_val

    # change imputation flag to 1 and set imputation type to Historical
    row_post_imputation[f"{var_current_wave}_imputation_flag"] = 1
    row_post_imputation[f"{var_current_wave}_imputation_type"] = "Historical"

    return row_post_imputation

In [40]:
def historical_imputation_housing(
    data: pd.DataFrame,
    var_current_wave: str,
    var_previous_wave: str,
    var_number_properties: str,
    var_hdb_indicator_current_wave: str,
    var_hdb_indicator_previous_wave: str,
    imputation_group_one_house: pd.DataFrame,
    imputation_group_multi_house: pd.DataFrame,
    missing_values_to_impute: list[int],
    min_records_req: int = 30,
) -> pd.DataFrame:
    """Apply historical imputation of the housing loan value to a whole data set.

    Works on a copy of `data`: the missing-value codes in the post-imputation
    column and in the previous wave column are replaced with NA, and then
    housing_historical_imputation_by_row is applied to every row.

    Parameters
    ----------

    data : pd.DataFrame
        Pandas DataFrame of input data. Must already contain the
        f"{var_current_wave}_post_imputation" column.

    var_current_wave : str
        Name of variable in current wave to be imputed.

    var_previous_wave : str
        Name of variable in previous wave to be used for ratio imputation.

    var_number_properties : str
        Name of number of properties variable in current wave.

    var_hdb_indicator_current_wave : str
        Name of HDB housing indicator variable in current wave.

    var_hdb_indicator_previous_wave : str
        Name of HDB housing indicator variable in previous wave.

    imputation_group_one_house : pd.DataFrame
        Imputation groups for respondents with 1 house. Columns are: HDB housing
        indicator (defines the imputation classes), count of observations per
        imputation class, and mean housing loan values in the current and
        previous waves.

    imputation_group_multi_house : pd.DataFrame
        Imputation groups for respondents with multiple houses. Columns are:
        mean housing loan values in the current and previous waves.

    missing_values_to_impute : list[int]
        List of missing values to impute for.

    min_records_req : int (default = 30)
        Minimum number of non-missing housing loan values for previous and
        current waves in imputation class.

    Returns
    -------

    data_post_imputation : pd.DataFrame
        Pandas DataFrame of output data, with imputations applied to housing loan value.
    """

    # never touch the caller's frame
    imputed = data.copy()

    # missing-value codes become NA, first in the post-imputation column and
    # then in the previous wave column
    for column in (f"{var_current_wave}_post_imputation", var_previous_wave):
        imputed[column] = imputed[column].replace(missing_values_to_impute, pd.NA)

    # everything the row-wise function needs besides the row itself
    row_level_args = (
        var_current_wave,
        var_previous_wave,
        var_number_properties,
        var_hdb_indicator_current_wave,
        var_hdb_indicator_previous_wave,
        imputation_group_one_house,
        imputation_group_multi_house,
        missing_values_to_impute,
        min_records_req,
    )

    # run the housing imputation one row at a time
    return imputed.apply(
        housing_historical_imputation_by_row, axis=1, args=row_level_args
    )

In [41]:
# add the imputation bookkeeping columns for the housing loan variable
tst_post_imputation_housing = data_preprocessing_store_donor_info(
    data=tst,
    var_to_impute="Value_Hse_Loan",
    imputation_class_vars=["Total_Purchase_Price", "Q140a_1_1"],
)

# historical imputation of the housing loan value using both donor pools
tst_post_imputation_housing = historical_imputation_housing(
    data=tst_post_imputation_housing,
    var_current_wave="Value_Hse_Loan",
    var_previous_wave="Value_Hse_Loan_prev_wave",
    var_number_properties="Total_Prop",
    var_hdb_indicator_current_wave="Whether_HDB_Housing",
    var_hdb_indicator_previous_wave="Whether_HDB_Housing_prev_wave",
    imputation_group_one_house=house_loan_donor_pool_one_house,
    imputation_group_multi_house=house_loan_donor_pool_multiple,
    missing_values_to_impute=[-1, 88],
)

In [42]:
# rows where the housing loan value was imputed, housing-related columns only
tst_post_imputation_housing.loc[
    tst_post_imputation_housing["Value_Hse_Loan_imputation_flag"] == 1,
    [
        "Total_Purchase_Price",
        "Total_Purchase_Price_prev_wave",
        "Whether_HDB_Housing",
        "Whether_HDB_Housing_prev_wave",
        "Total_Prop",
        "Total_Prop_prev_wave",
        "Value_Hse_Loan",
        "Value_Hse_Loan_prev_wave",
        "Value_Hse_Loan_post_imputation",
        "Value_Hse_Loan_imputation_flag",
        "Value_Hse_Loan_imputation_type",
        "Value_Hse_Loan_imputed_value",
    ],
]

Unnamed: 0,Total_Purchase_Price,Total_Purchase_Price_prev_wave,Whether_HDB_Housing,Whether_HDB_Housing_prev_wave,Total_Prop,Total_Prop_prev_wave,Value_Hse_Loan,Value_Hse_Loan_prev_wave,Value_Hse_Loan_post_imputation,Value_Hse_Loan_imputation_flag,Value_Hse_Loan_imputation_type,Value_Hse_Loan_imputed_value
199,415000.0,,HDB,,1,0.0,-1.0,0.0,0.0,1,Historical,0.0
261,795000.0,250000.0,HDB,Non-HDB,1,1.0,-1.0,0.0,0.0,1,Historical,0.0
893,285000.0,285000.0,HDB,HDB,1,1.0,-1.0,41745.0,32380.046471,1,Historical,32380.046471
1077,700000.0,700000.0,HDB,HDB,1,1.0,-1.0,500000.0,387831.434559,1,Historical,387831.434559
1202,388500.0,388500.0,HDB,HDB,1,1.0,-1.0,225000.0,174524.145551,1,Historical,174524.145551
1526,461300.0,461300.0,HDB,HDB,1,1.0,-1.0,250000.0,193915.717279,1,Historical,193915.717279
1614,625000.0,445000.0,HDB,HDB,2,1.0,-1.0,150109.06,107349.258328,1,Historical,107349.258328
1655,168000.0,167000.0,HDB,HDB,1,1.0,-1.0,60000.0,46539.772147,1,Historical,46539.772147
2168,1058997.0,,Non-HDB,,1,0.0,-1.0,0.0,0.0,1,Historical,0.0
2355,200000.0,200000.0,HDB,HDB,1,1.0,-1.0,71437.46,55411.385186,1,Historical,55411.385186


In [43]:
# number of imputed rows by number of properties
tst_post_imputation_housing.loc[
    tst_post_imputation_housing["Value_Hse_Loan_imputation_flag"] == 1, "Total_Prop"
].value_counts()

Total_Prop
1    58
2     3
Name: count, dtype: int64

## Work income, bonuses, net earnings imputation

In [44]:
def construct_incomes_donor_pool(
    data: pd.DataFrame,
    var_current_wave: str,
    var_previous_wave: str,
    var_number_jobs_current_wave: str,
    var_number_jobs_previous_wave: str,
    var_ssic_current_wave: str,
    missing_values_to_impute: List[int],
) -> pd.DataFrame:
    """Function for constructing donor pool for historical imputation of income / earnings variables.

    Keeps only the records of `data` where the variable is not one of the
    missing-value codes in either wave and where the number of jobs is the same
    in both waves. Those records are grouped by SSIC, and the count and mean of
    the variable are computed per group for each wave.

    Pandas DataFrame output of this function feeds into the
    historical_imputation_income function.

    Parameters
    ----------

    data : pd.DataFrame
        Input data for income / earnings historical imputation.

    var_current_wave : str
        Name of variable to be imputed in the current wave.

    var_previous_wave : str
        Name of the same variable in the previous wave.

    var_number_jobs_current_wave : str
        Name of number of jobs variable in current wave.

    var_number_jobs_previous_wave : str
        Name of number of jobs variable in previous wave.

    var_ssic_current_wave : str
        Name of SSIC variable in current wave.

    missing_values_to_impute : list[int]
        List of missing values to impute for.

    Returns
    -------

    imputation_group : pd.DataFrame
        Pandas DataFrame with one row per SSIC and the columns
        f"{var_current_wave}_count", f"{var_current_wave}_mean",
        f"{var_previous_wave}_count" and f"{var_previous_wave}_mean",
        for the historical imputation of income / earnings related variables.
    """

    # records that can act as donors
    is_donor = (
        # no missing values in current wave
        ~data[var_current_wave].isin(missing_values_to_impute)
        # no missing values in previous wave
        & ~data[var_previous_wave].isin(missing_values_to_impute)
        # number of jobs equal in previous and current waves
        & (data[var_number_jobs_current_wave] == data[var_number_jobs_previous_wave])
    )

    # filter the frame that was passed in (not a notebook-level global)
    imputation_group = (
        data[is_donor]
        .groupby([var_ssic_current_wave])[[var_current_wave, var_previous_wave]]
        .agg(["count", "mean"])
        .reset_index()
    )

    # flatten the (variable, statistic) column MultiIndex to "variable_statistic";
    # the SSIC column has an empty second level, hence the rstrip
    imputation_group.columns = [
        "_".join(col).rstrip("_") for col in imputation_group.columns
    ]

    return imputation_group

In [45]:
# donor pool for work income, grouped by SSIC
work_income_imputation_group = construct_incomes_donor_pool(
    data=tst,
    var_current_wave="Q140a_1_1",
    var_previous_wave="Q140a_1_1_prev_wave",
    var_number_jobs_current_wave="Q128_1",
    var_number_jobs_previous_wave="Q128_1_prev_wave",
    var_ssic_current_wave="GO_Q130a_1",
    missing_values_to_impute=values_to_impute,
)

In [46]:
# preview the first rows of the work income donor pool
work_income_imputation_group.head(n=5)

Unnamed: 0,GO_Q130a_1,Q140a_1_1_count,Q140a_1_1_mean,Q140a_1_1_prev_wave_count,Q140a_1_1_prev_wave_mean
0,88,2,1194.0,4,5015.75
1,A,1,2108.0,1,1210.0
2,C,383,4140.775457,366,3799.718579
3,D,5,5773.0,5,5611.8
4,E,37,3021.540541,36,2720.027778


In [47]:
def income_historical_imputation_by_row(
    row: pd.DataFrame,
    var_current_wave: str,
    var_previous_wave: str,
    num_jobs_current_wave: str,
    num_jobs_previous_wave: str,
    ssic: str,
    imputation_group: pd.DataFrame,
    missing_values_to_impute: list[int],
    min_records_req: int,
) -> pd.DataFrame:
    """Historical imputation function for earnings-related variables, applied to a single row.

    Row-wise function that is applied to every row of the data set by the
    historical_imputation_income function.

    A value is imputed only when all of the following hold:
        - the post-imputation value of the current wave variable is NA
        - the previous wave value is not NA
        - the number of jobs is the same in the previous and current waves
        - the row's SSIC has an entry in imputation_group
        - that entry has more than min_records_req records in both waves

    In every other case a copy of the row is returned unchanged.

    Parameters
    ----------

    row : pd.DataFrame
        A single row of a Pandas DataFrame (a Pandas Series when this function
        is called through DataFrame.apply with axis=1).

    var_current_wave : str
        Name of variable in current wave to be imputed.

    var_previous_wave : str
        Name of variable in previous wave to be used for ratio imputation.

    num_jobs_current_wave : str
        Name of the variable holding the number of jobs / businesses held by
        respondent in the current wave.

    num_jobs_previous_wave : str
        Name of the variable holding the number of jobs / businesses held by
        respondent in the previous wave.

    ssic : str
        Name of the SSIC variable of respondent in current wave. The same name
        is used to look up the imputation class in imputation_group.

    imputation_group : pd.DataFrame
        Pandas DataFrame containing the imputation groups, which are defined by
        the job's / business' Singapore Standard Industrial Classification (SSIC).
        Columns of this DataFrame are: SSIC, which defines the imputation classes,
        count of observations per imputation class, and mean earnings values
        in the current and previous waves.

    missing_values_to_impute : list[int]
        List of missing values to impute for. Not used by this function itself;
        historical_imputation_income replaces these codes with NA before
        applying it.

    min_records_req : int
        Minimum number of non-missing earnings value for previous and current waves in
        imputation class.

    Returns
    -------

    row_post_imputation : pd.DataFrame
        Copy of the input row, with the post-imputation value, imputed value,
        imputation flag and imputation type filled in when an imputation was made.
    """

    # make copy of row to apply imputation
    row_post_imputation = row.copy()

    # get names of post imputation columns
    post_imputation_col_name = f"{var_current_wave}_post_imputation"
    imputed_val_col_name = f"{var_current_wave}_imputed_value"

    # get names of counts and mean / average columns
    record_count_current_wave = f"{var_current_wave}_count"
    record_count_previous_wave = f"{var_previous_wave}_count"
    avg_current_wave = f"{var_current_wave}_mean"
    avg_previous_wave = f"{var_previous_wave}_mean"

    # only impute if var_current_wave is missing for row
    if not pd.isna(row_post_imputation[post_imputation_col_name]):
        return row_post_imputation

    # only impute if var_previous_wave is available
    # (pd.isna on a scalar gives a plain bool, so use `not` rather than `~`)
    if pd.isna(row_post_imputation[var_previous_wave]):
        return row_post_imputation

    # only impute if same number of jobs in previous and current waves;
    # a comparison that evaluates to NA is treated as "not the same"
    same_number_of_jobs = (
        row_post_imputation[num_jobs_current_wave]
        == row_post_imputation[num_jobs_previous_wave]
    )
    if pd.isna(same_number_of_jobs) or not same_number_of_jobs:
        return row_post_imputation

    # imputation class of this row
    class_records = imputation_group[
        imputation_group[ssic] == row_post_imputation[ssic]
    ]

    # no imputation class for this SSIC: nothing to impute from.
    # Without this guard the `.all()` below is vacuously True on an empty
    # selection and the `.values[0]` lookup further down has nothing to index.
    if class_records.empty:
        return row_post_imputation

    # only impute if there are sufficient records based on imputation parameters
    record_counts = class_records[
        [record_count_current_wave, record_count_previous_wave]
    ].values
    if not (record_counts > min_records_req).all():
        return row_post_imputation

    # average value of current wave's imputation group
    avg_val_current_wave = class_records[avg_current_wave]

    # average value of previous wave's imputation group
    avg_val_previous_wave = class_records[avg_previous_wave]

    # reported value in previous wave
    reported_val_previous_wave = row_post_imputation[var_previous_wave]

    # impute using historical ratio imputation
    imputed_val = ratio_imputation(
        avg_val_current_wave, avg_val_previous_wave, reported_val_previous_wave
    ).values[0]
    row_post_imputation[post_imputation_col_name] = imputed_val
    row_post_imputation[imputed_val_col_name] = imputed_val

    # change imputation flag to 1
    # check with clients: do we want to name flag variable as f'{var_current_wave}_c'?
    imputation_flag_col_name = f"{var_current_wave}_imputation_flag"
    row_post_imputation[imputation_flag_col_name] = 1

    # set imputation type to Historical
    imputation_type_col_name = f"{var_current_wave}_imputation_type"
    row_post_imputation[imputation_type_col_name] = "Historical"

    return row_post_imputation

In [48]:
def historical_imputation_income(
    data: pd.DataFrame,
    var_current_wave: str,
    var_previous_wave: str,
    num_jobs_current_wave: str,
    num_jobs_previous_wave: str,
    ssic: str,
    imputation_group: pd.DataFrame,
    missing_values_to_impute: list[int],
    min_records_req: int = 30,
):
    """Apply historical imputation of an earnings-related variable to a whole data set.

    Works on a copy of `data`: the missing-value codes in the post-imputation
    column and in the previous wave column are replaced with NA, and then
    income_historical_imputation_by_row is applied to every row.

    Parameters
    ----------

    data : pd.DataFrame
        Pandas DataFrame of input data. Must already contain the
        f"{var_current_wave}_post_imputation" column.

    var_current_wave : str
        Name of variable in current wave to be imputed.

    var_previous_wave : str
        Name of variable in previous wave to be used for ratio imputation.

    num_jobs_current_wave : str
        Number of jobs / businesses held by respondent in the current wave.

    num_jobs_previous_wave : str
        Number of jobs / businesses held by respondent in the previous wave.

    ssic : str
        SSIC of respondent in current wave.

    imputation_group : pd.DataFrame
        Imputation groups, defined by the job's / business' Singapore Standard
        Industrial Classification (SSIC). Columns are: SSIC (defines the
        imputation classes), count of observations per imputation class, and
        mean earnings values in the current and previous waves.

    missing_values_to_impute : list[int]
        List of missing values to impute for.

    min_records_req : int (default = 30)
        Minimum number of non-missing earnings values for previous and current
        waves in imputation class.

    Returns
    -------

    data_post_imputation : pd.DataFrame
        Pandas DataFrame of output data, with imputations applied to earnings-related variable.
    """

    # TODO: check parameters for errors outside of function

    # never touch the caller's frame
    imputed = data.copy()

    # missing-value codes become NA, first in the post-imputation column and
    # then in the previous wave column
    for column in (f"{var_current_wave}_post_imputation", var_previous_wave):
        imputed[column] = imputed[column].replace(missing_values_to_impute, pd.NA)

    # everything the row-wise function needs besides the row itself
    row_level_args = (
        var_current_wave,
        var_previous_wave,
        num_jobs_current_wave,
        num_jobs_previous_wave,
        ssic,
        imputation_group,
        missing_values_to_impute,
        min_records_req,
    )

    # run the income imputation one row at a time
    return imputed.apply(
        income_historical_imputation_by_row, axis=1, args=row_level_args
    )

In [49]:
# add the imputation bookkeeping columns for work income
tst_post_imputation_work_income = data_preprocessing_store_donor_info(
    data=tst,
    var_to_impute="Q140a_1_1",
    imputation_class_vars=["GO_Q130a_1", "GO_Q131a_1"],
)

# historical imputation of work income using the SSIC donor pool
tst_post_imputation_work_income = historical_imputation_income(
    data=tst_post_imputation_work_income,
    var_current_wave="Q140a_1_1",
    var_previous_wave="Q140a_1_1_prev_wave",
    num_jobs_current_wave="Q128_1",
    num_jobs_previous_wave="Q128_1_prev_wave",
    ssic="GO_Q130a_1",
    imputation_group=work_income_imputation_group,
    missing_values_to_impute=values_to_impute,
)

In [50]:
# how many work income values were imputed (flag 1) versus left as is (flag 0)
tst_post_imputation_work_income["Q140a_1_1_imputation_flag"].value_counts()

Q140a_1_1_imputation_flag
0    14110
1      595
Name: count, dtype: int64

Look at the work-income-specific columns and their imputation class variables (number of jobs and SSIC) for the rows that were imputed.

In [51]:
# rows where work income was imputed, work-income-related columns only
tst_post_imputation_work_income.loc[
    tst_post_imputation_work_income["Q140a_1_1_imputation_flag"] == 1,
    [
        "Q128_1",
        "Q128_1_prev_wave",
        "GO_Q130a_1",
        "GO_Q130a_1_prev_wave",
        "Q140a_1_1",
        "Q140a_1_1_prev_wave",
        "Q140a_1_1_post_imputation",
        "Q140a_1_1_imputed_value",
        "Q140a_1_1_imputation_flag",
        "Q140a_1_1_imputation_type",
    ],
]

Unnamed: 0,Q128_1,Q128_1_prev_wave,GO_Q130a_1,GO_Q130a_1_prev_wave,Q140a_1_1,Q140a_1_1_prev_wave,Q140a_1_1_post_imputation,Q140a_1_1_imputed_value,Q140a_1_1_imputation_flag,Q140a_1_1_imputation_type
11,1.0,1.0,L,L,,3550.0,3391.009663,3391.009663,1,Historical
17,1.0,1.0,I,I,,592.0,604.525906,604.525906,1,Historical
19,1.0,1.0,N,N,,1210.0,1214.781938,1214.781938,1,Historical
21,1.0,1.0,S,S,,1877.0,1850.555786,1850.555786,1,Historical
27,1.0,1.0,E,E,,4283.0,4757.766903,4757.766903,1,Historical
...,...,...,...,...,...,...,...,...,...,...
11181,1.0,1.0,G,G,,1877.0,1929.710841,1929.710841,1,Historical
11192,1.0,1.0,G,G,,2796.0,2874.518653,2874.518653,1,Historical
11207,1.0,1.0,K,K,,5083.0,5785.172002,5785.172002,1,Historical
11226,1.0,1.0,G,G,,4060.0,4174.014926,4174.014926,1,Historical


We can also test historical imputation on bonuses and business profits.

In [52]:
# donor pool for bonuses, grouped by SSIC
bonus_imputation_group = construct_incomes_donor_pool(
    data=tst,
    var_current_wave="Q142a_1_1",
    var_previous_wave="Q142a_1_1_prev_wave",
    var_number_jobs_current_wave="Q128_1",
    var_number_jobs_previous_wave="Q128_1_prev_wave",
    var_ssic_current_wave="GO_Q130a_1",
    missing_values_to_impute=values_to_impute,
)

In [53]:
# preview the first rows of the bonus donor pool
bonus_imputation_group.head(n=5)

Unnamed: 0,GO_Q130a_1,Q142a_1_1_count,Q142a_1_1_mean,Q142a_1_1_prev_wave_count,Q142a_1_1_prev_wave_mean
0,88,2,1059.5,3,17732.0
1,A,1,2432.0,1,1283.0
2,C,300,8277.81,311,7795.858521
3,D,4,15922.25,4,17540.0
4,E,31,3523.548387,28,4362.214286


In [54]:
# add the imputation bookkeeping columns for bonuses
tst_post_imputation_bonus = data_preprocessing_store_donor_info(
    data=tst,
    var_to_impute="Q142a_1_1",
    imputation_class_vars=["GO_Q130a_1", "GO_Q131a_1"],
)

# historical imputation of bonuses using the SSIC donor pool
tst_post_imputation_bonus = historical_imputation_income(
    data=tst_post_imputation_bonus,
    var_current_wave="Q142a_1_1",
    var_previous_wave="Q142a_1_1_prev_wave",
    num_jobs_current_wave="Q128_1",
    num_jobs_previous_wave="Q128_1_prev_wave",
    ssic="GO_Q130a_1",
    imputation_group=bonus_imputation_group,
    missing_values_to_impute=values_to_impute,
)

In [55]:
# how many bonus values were imputed (flag 1) versus left as is (flag 0)
tst_post_imputation_bonus["Q142a_1_1_imputation_flag"].value_counts()

Q142a_1_1_imputation_flag
0    13993
1      712
Name: count, dtype: int64

In [56]:
# rows where the bonus was imputed, bonus-related columns only
tst_post_imputation_bonus.loc[
    tst_post_imputation_bonus["Q142a_1_1_imputation_flag"] == 1,
    [
        "Q128_1",
        "Q128_1_prev_wave",
        "GO_Q130a_1",
        "GO_Q130a_1_prev_wave",
        "Q142a_1_1",
        "Q142a_1_1_prev_wave",
        "Q142a_1_1_post_imputation",
        "Q142a_1_1_imputed_value",
        "Q142a_1_1_imputation_flag",
        "Q142a_1_1_imputation_type",
    ],
]

Unnamed: 0,Q128_1,Q128_1_prev_wave,GO_Q130a_1,GO_Q130a_1_prev_wave,Q142a_1_1,Q142a_1_1_prev_wave,Q142a_1_1_post_imputation,Q142a_1_1_imputed_value,Q142a_1_1_imputation_flag,Q142a_1_1_imputation_type
8,1.0,1.0,K,I,,1028.0,1126.666881,1126.666881,1,Historical
9,1.0,1.0,I,G,,850.0,793.834491,793.834491,1,Historical
11,1.0,1.0,L,L,,10035.0,5566.491768,5566.491768,1,Historical
19,1.0,1.0,N,N,,1611.0,1253.803385,1253.803385,1,Historical
21,1.0,1.0,S,S,,1883.0,2098.296539,2098.296539,1,Historical
...,...,...,...,...,...,...,...,...,...,...
11148,1.0,1.0,Q,P,,5085.0,4862.699485,4862.699485,1,Historical
11174,1.0,1.0,N,K,,7057.0,5492.297015,5492.297015,1,Historical
11175,1.0,1.0,C,C,,12070.0,12816.185213,12816.185213,1,Historical
11204,1.0,1.0,Q,O,,2573.0,2460.516377,2460.516377,1,Historical


In [57]:
# donor pool for business profits, grouped by SSIC
business_profits_imputation_group = construct_incomes_donor_pool(
    data=tst,
    var_current_wave="Q143a_1_1",
    var_previous_wave="Q143a_1_1_prev_wave",
    var_number_jobs_current_wave="Q128_1",
    var_number_jobs_previous_wave="Q128_1_prev_wave",
    var_ssic_current_wave="GO_Q130a_1",
    missing_values_to_impute=values_to_impute,
)

In [58]:
# add the imputation bookkeeping columns for business profits
tst_post_imputation_profit = data_preprocessing_store_donor_info(
    data=tst,
    var_to_impute="Q143a_1_1",
    imputation_class_vars=["GO_Q130a_1", "GO_Q131a_1"],
)

# historical imputation of business profits using the SSIC donor pool
tst_post_imputation_profit = historical_imputation_income(
    data=tst_post_imputation_profit,
    var_current_wave="Q143a_1_1",
    var_previous_wave="Q143a_1_1_prev_wave",
    num_jobs_current_wave="Q128_1",
    num_jobs_previous_wave="Q128_1_prev_wave",
    ssic="GO_Q130a_1",
    imputation_group=business_profits_imputation_group,
    missing_values_to_impute=values_to_impute,
)

In [59]:
# how many business profit values were imputed (flag 1) versus left as is (flag 0)
tst_post_imputation_profit["Q143a_1_1_imputation_flag"].value_counts()

Q143a_1_1_imputation_flag
0    14577
1      128
Name: count, dtype: int64

In [60]:
# rows where the business profit was imputed, profit-related columns only
tst_post_imputation_profit.loc[
    tst_post_imputation_profit["Q143a_1_1_imputation_flag"] == 1,
    [
        "Q128_1",
        "Q128_1_prev_wave",
        "GO_Q130a_1",
        "GO_Q130a_1_prev_wave",
        "Q143a_1_1",
        "Q143a_1_1_prev_wave",
        "Q143a_1_1_post_imputation",
        "Q143a_1_1_imputed_value",
        "Q143a_1_1_imputation_flag",
        "Q143a_1_1_imputation_type",
    ],
]

Unnamed: 0,Q128_1,Q128_1_prev_wave,GO_Q130a_1,GO_Q130a_1_prev_wave,Q143a_1_1,Q143a_1_1_prev_wave,Q143a_1_1_post_imputation,Q143a_1_1_imputed_value,Q143a_1_1_imputation_flag,Q143a_1_1_imputation_type
70,1.0,1.0,H,H,,1225.62,1045.958757,1045.958757,1,Historical
74,1.0,1.0,I,F,,2039.42,1459.658475,1459.658475,1,Historical
263,1.0,1.0,H,H,,8022.91,6846.847286,6846.847286,1,Historical
264,1.0,1.0,N,N,,8022.91,6182.427848,6182.427848,1,Historical
435,1.0,1.0,G,G,,1003.72,1532.890805,1532.890805,1,Historical
783,1.0,1.0,G,G,,5001.09,7637.712588,7637.712588,1,Historical
801,1.0,1.0,H,H,,1946.32,1661.012751,1661.012751,1,Historical
812,1.0,1.0,G,G,,1541.73,2354.544835,2354.544835,1,Historical
820,1.0,1.0,I,I,,1225.62,877.203626,877.203626,1,Historical
841,1.0,1.0,H,F,,1887.11,1610.482229,1610.482229,1,Historical
