This notebook shows how to add new attributes to the Transformer-generated synthetic population using the Synthetic Reconstruction (SR) method.

# Import libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import random
import numpy as np

# Util Functions

In [None]:
def plot_attribute_distribution(df, color=None):
    """Draw one bar chart per column showing how often each value occurs.

    Args:
        df: DataFrame whose columns are plotted one after another.
        color: Optional bar colour forwarded to ``plt.bar``. When ``None``,
            no ``color`` argument is passed at all.
    """
    # Only forward ``color`` when the caller actually supplied one.
    bar_options = {} if color is None else {"color": color}

    for column in df.columns:
        frequencies = df[column].value_counts()
        # One separate figure per column, titled with the column name.
        plt.figure()
        plt.bar(frequencies.index, frequencies.values, **bar_options)
        plt.title(column)
        plt.show()

In [None]:
def print_uniq_val(df):
    """Print the sorted unique values of every column, together with their count.

    Args:
        df: DataFrame to inspect.
    """
    # Collect everything before printing, so nothing is printed at all if
    # some column's values cannot be sorted.
    sorted_uniques = {
        name: sorted(df[name].unique().tolist()) for name in df.columns
    }

    # Report each column in the DataFrame's column order.
    for column_name, unique_values in sorted_uniques.items():
        print(f"Column name: {column_name}")
        print(f"Unique values: {unique_values}")
        print(f"Total unique values: {len(unique_values)}")
        print("\n")

In [None]:
def random_sample(df, n, random_state):
    """Split ``df`` into a random sample of ``n`` rows and the rows left over.

    Args:
        df: DataFrame to split.
        n: Number of rows to draw, without replacement.
        random_state: Seed forwarded to ``DataFrame.sample``.

    Returns:
        A ``(sample_df, remaining_df)`` tuple. ``remaining_df`` is ``df``
        with the index labels of the sampled rows dropped.
    """
    drawn = df.sample(n=n, random_state=random_state, replace=False)
    # Whatever was not drawn is identified by index label.
    leftover = df.drop(index=drawn.index)
    return drawn, leftover

In [None]:
def count_unique_elements(my_list):
    """Summarise the distinct elements of a list.

    Args:
        my_list: Elements to summarise. They must be mutually comparable so
            they can be sorted.

    Returns:
        tuple: ``(count, unique_list, max_vocab)`` where ``unique_list`` holds
        the distinct elements in ascending order, ``count`` is its length and
        ``max_vocab`` is its largest element. For an empty input the result is
        ``(0, [], None)``.
    """
    items = list(my_list)
    try:
        # dict.fromkeys de-duplicates in a single pass and keeps first-seen
        # order, avoiding a linear membership scan per element.
        distinct = list(dict.fromkeys(items))
    except TypeError:
        # Unhashable elements (e.g. lists): fall back to equality-based
        # de-duplication so such inputs keep working.
        distinct = []
        for element in items:
            if element not in distinct:
                distinct.append(element)

    # Sort once at the end instead of after every insertion.
    unique_list = sorted(distinct)
    # An empty input has no maximum; report None rather than failing.
    max_vocab = max(unique_list) if unique_list else None

    return len(unique_list), unique_list, max_vocab

# SR Functions

In [None]:
def get_cumulative_probabilities_grouped(df, column_name, group_by_columns):
    """Compute, for every group, the cumulative probability of each value of a column.

    Within each group the values of ``column_name`` are ordered from most to
    least frequent and their relative frequencies are accumulated, so the
    result can be used as a lookup table for sampling a value given a group.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column_name (str): The column whose values get cumulative probabilities.
        group_by_columns (list): Column names used to group the data.

    Returns:
        dict: Maps each group key (a tuple of group values when grouping by
        several columns) to an inner dict ``{value: cumulative_probability}``.
        Inner dicts are ordered by descending frequency and the probabilities
        are rounded to 3 decimal places.
    """

    def calculate_cumulative_probabilities(group):
        value_counts = group[column_name].value_counts().sort_values(ascending=False)
        # value_counts() leaves out missing values, so normalise by the number
        # of counted rows rather than len(group); otherwise a group containing
        # missing values would never reach a cumulative probability of 1.0.
        total_count = value_counts.sum()
        cumulative = value_counts.cumsum() / total_count
        return {
            value: round(probability, 3)
            for value, probability in cumulative.to_dict().items()
        }

    # apply() yields one dict per group; to_dict() keys them by group.
    grouped_df = df.groupby(group_by_columns)
    return grouped_df.apply(calculate_cumulative_probabilities).to_dict()


In [None]:
def add_new_attribute(known_attr_list, cumulative_prob):
    """Sample one new attribute value for every entry of ``known_attr_list``.

    For each group key a random number in [0, 1] is drawn (rounded to 3
    decimals) and the first value whose cumulative probability is at least
    that number is selected.

    Args:
        known_attr_list: Group keys, one per individual, in the same form as
            the keys of ``cumulative_prob``.
        cumulative_prob: Dict mapping a group key to an ordered dict
            ``{value: cumulative_probability}`` with non-decreasing
            probabilities.

    Returns:
        list: One sampled value per entry of ``known_attr_list``, in the same
        order, so the result always has the same length as the input.

    Raises:
        KeyError: If a group key has no entry in ``cumulative_prob``. Such a
            key used to silently reuse the previous individual's
            distribution.
        ValueError: If the distribution stored for a group is empty.
    """
    adding_attr = []
    for group_key in known_attr_list:
        # Direct lookup instead of scanning every key of the table.
        try:
            thresholds = cumulative_prob[group_key]
        except (KeyError, TypeError):
            raise KeyError(
                f"no cumulative probabilities available for group {group_key!r}"
            ) from None
        if not thresholds:
            raise ValueError(
                f"cumulative probabilities for group {group_key!r} are empty"
            )

        random_number = round(random.random(), 3)
        chosen = None
        for key, value in thresholds.items():
            # Remember the latest key so that a draw above every threshold
            # falls back to the last value instead of appending nothing,
            # which would misalign the output with the input.
            chosen = key
            if random_number <= value:
                break
        adding_attr.append(chosen)
    return adding_attr

# Import Data For SR

These data were obtained from IPUMS and were prepared beforehand for use in this notebook.

In [None]:
# Load the reference data; the cumulative probabilities used for SR are computed from it.
data_for_sr = pd.read_csv('data/data_for_SR.csv')

In [None]:
# Display the reference data.
data_for_sr

Unnamed: 0,sex,age_group,edattain,classwk,marst,geo2_th2000
0,202,21,502,401,301,10035
1,201,4,501,400,301,10035
2,201,15,504,401,301,10035
3,201,22,502,401,302,10035
4,202,19,501,403,302,10035
...,...,...,...,...,...,...
66879,201,13,502,402,302,10023
66880,202,13,502,402,302,10023
66881,202,1,500,400,301,10023
66882,202,13,502,401,302,10023


# Import Transformer Population

In [None]:
# Load the synthetic population that the new attributes will be added to.
transformer = pd.read_csv('results/tfm.csv')

In [None]:
# Preview the first rows of the loaded population.
transformer.head()

Unnamed: 0,sex,age,marst,classwk
0,202,24,301,400
1,202,36,309,401
2,201,43,302,401
3,202,44,302,402
4,202,8,301,400


In [None]:
# Age bands as (lowest age, highest age, age-group code), listed in ascending
# order with no gaps between consecutive bands.
# NOTE(review): the codes presumably follow the age_group coding of the
# reference data -- confirm against its codebook.
age_categories = [
    (0, 4, 1), (5, 9, 2), (10, 14, 3), (15, 19, 4), (20, 24, 12),
    (25, 29, 13), (30, 34, 14), (35, 39, 15), (40, 44, 16),
    (45, 49, 17), (50, 54, 18), (55, 59, 19), (60, 64, 20),
    (65, 69, 21), (70, 74, 22), (75, 79, 23), (80, 84, 24),
    (85, float('inf'), 25)
]


def get_age_code(age):
    """Map an age in years to its age-group code.

    Args:
        age: Age in years. Fractional ages are assigned to the band of their
            completed years (e.g. 4.5 belongs to the 0-4 band).

    Returns:
        int: The code of the band containing ``age``, or 98 (unknown) for
        negative or NaN ages.
    """
    # ``not age >= 0`` is true for negative ages and for NaN alike.
    if not age >= 0:
        return 98
    # Walk the bands from oldest to youngest: the first band whose lower bound
    # does not exceed the age is the right one. Matching on lower bounds only
    # means fractional ages between two integer bands (e.g. 4.5) are covered.
    for lower, _upper, code in reversed(age_categories):
        if age >= lower:
            return code
    return 98  # Only reachable if the table no longer starts at age 0

In [None]:
# Derive each person's age-group code from their age with get_age_code.
transformer['age_group'] = transformer['age'].apply(get_age_code)

In [None]:
# Preview the population with the new age_group column.
transformer.head()

Unnamed: 0,sex,age,marst,classwk,age_group
0,202,24,301,400,12
1,202,36,309,401,15
2,201,43,302,401,16
3,202,44,302,402,16
4,202,8,301,400,2


# Add attributes via SR

## add edattain

In [None]:
# Cumulative distribution of edattain within every (sex, age_group) group of
# the reference data.
cumulative_edattain = get_cumulative_probabilities_grouped(
    data_for_sr.copy(),
    column_name='edattain',
    group_by_columns=['sex', 'age_group'],
)
print(cumulative_edattain)

{(201, 1): {500: 1.0}, (201, 2): {501: 0.992, 509: 1.0}, (201, 3): {501: 0.504, 502: 0.998, 509: 1.0}, (201, 4): {502: 0.705, 503: 0.972, 501: 0.989, 509: 0.999, 504: 1.0}, (201, 12): {503: 0.546, 502: 0.832, 504: 0.965, 501: 0.986, 509: 1.0}, (201, 13): {503: 0.336, 502: 0.665, 504: 0.949, 501: 0.982, 509: 1.0}, (201, 14): {502: 0.327, 503: 0.631, 504: 0.905, 501: 0.981, 509: 1.0}, (201, 15): {502: 0.291, 503: 0.557, 504: 0.805, 501: 0.977, 509: 1.0}, (201, 16): {502: 0.271, 501: 0.531, 503: 0.758, 504: 0.969, 509: 1.0}, (201, 17): {501: 0.337, 502: 0.564, 504: 0.786, 503: 0.979, 509: 1.0}, (201, 18): {501: 0.363, 502: 0.606, 504: 0.794, 503: 0.98, 509: 1.0}, (201, 19): {501: 0.422, 502: 0.655, 503: 0.807, 504: 0.958, 509: 1.0}, (201, 20): {501: 0.538, 502: 0.709, 503: 0.841, 504: 0.953, 509: 1.0}, (201, 21): {501: 0.591, 502: 0.765, 504: 0.87, 503: 0.97, 509: 1.0}, (201, 22): {501: 0.689, 502: 0.829, 503: 0.902, 509: 0.951, 504: 1.0}, (201, 23): {501: 0.641, 502: 0.82, 509: 0.91, 504

In [None]:
# One (sex, age_group) tuple per synthetic person, used as the lookup key
# into the cumulative edattain table.
sex_age = transformer.loc[:, ['sex', 'age_group']]
sex_age_list = sex_age.to_records(index=False).tolist()
print(sex_age_list)

[(202, 12), (202, 15), (201, 16), (202, 16), (202, 2), (201, 19), (202, 18), (202, 1), (202, 14), (202, 16), (201, 2), (201, 3), (201, 14), (202, 12), (202, 15), (202, 3), (201, 2), (201, 1), (202, 4), (201, 16), (201, 17), (201, 2), (201, 4), (201, 4), (201, 14), (202, 15), (202, 17), (202, 23), (202, 14), (201, 17), (201, 3), (202, 1), (202, 12), (201, 15), (201, 18), (202, 22), (201, 12), (201, 2), (202, 19), (201, 17), (202, 3), (201, 16), (201, 19), (201, 13), (201, 19), (201, 17), (201, 25), (201, 12), (202, 14), (201, 12), (202, 14), (201, 2), (201, 12), (201, 15), (201, 13), (201, 14), (201, 14), (201, 17), (201, 17), (201, 4), (202, 15), (201, 1), (202, 14), (201, 13), (202, 19), (202, 3), (202, 21), (201, 1), (201, 2), (201, 15), (201, 15), (201, 23), (202, 14), (202, 17), (202, 4), (202, 4), (201, 17), (202, 1), (202, 12), (201, 12), (201, 13), (202, 12), (201, 3), (201, 1), (202, 16), (202, 19), (202, 15), (201, 13), (201, 4), (201, 2), (201, 12), (201, 17), (201, 2), (202,

In [None]:
# Sample one edattain value per person from the distribution of their (sex, age_group) group.
# NOTE(review): add_new_attribute draws from the global `random` state and no seed is set
# in the visible cells, so the sampled values differ between runs.
edattain_list = add_new_attribute(sex_age_list, cumulative_edattain)
print(edattain_list)

[503, 502, 502, 501, 501, 504, 502, 500, 502, 504, 501, 502, 504, 503, 503, 501, 501, 500, 503, 502, 501, 501, 502, 502, 504, 504, 504, 501, 504, 501, 501, 500, 502, 501, 502, 501, 502, 501, 501, 502, 501, 503, 509, 503, 501, 504, 501, 503, 503, 503, 501, 509, 503, 503, 502, 503, 503, 502, 509, 502, 501, 500, 504, 502, 502, 501, 501, 500, 501, 509, 504, 501, 502, 502, 502, 503, 504, 500, 503, 504, 504, 502, 502, 500, 501, 501, 503, 502, 502, 501, 503, 503, 501, 501, 502, 503, 501, 502, 501, 502, 502, 501, 501, 504, 503, 504, 501, 503, 502, 501, 501, 503, 503, 503, 504, 503, 503, 503, 503, 503, 503, 501, 503, 502, 504, 503, 502, 500, 502, 502, 503, 503, 501, 502, 503, 503, 502, 501, 503, 502, 502, 503, 502, 503, 503, 503, 502, 501, 503, 500, 501, 501, 501, 504, 501, 501, 504, 500, 504, 504, 502, 502, 503, 504, 501, 501, 503, 502, 502, 500, 501, 501, 504, 501, 502, 502, 503, 501, 501, 501, 502, 501, 502, 501, 504, 500, 502, 503, 501, 502, 501, 501, 501, 502, 503, 501, 502, 501, 502, 503,

In [None]:
# Attach the sampled values as a new column; the list must have one entry per row.
transformer['edattain'] = edattain_list

In [None]:
# Preview the population with the new edattain column.
transformer.head()

Unnamed: 0,sex,age,marst,classwk,age_group,edattain
0,202,24,301,400,12,503
1,202,36,309,401,15,502
2,201,43,302,401,16,502
3,202,44,302,402,16,501
4,202,8,301,400,2,501


## add current_district (geo2_th2000)

In [None]:
# One (marst, classwk) tuple per synthetic person, used as the lookup key
# into the cumulative geo2_th2000 table.
marst_work = transformer.loc[:, ['marst', 'classwk']]
marst_work_list = marst_work.to_records(index=False).tolist()
print(marst_work_list)

[(301, 400), (309, 401), (302, 401), (302, 402), (301, 400), (302, 401), (302, 402), (301, 400), (301, 403), (302, 401), (301, 400), (301, 400), (302, 402), (301, 402), (302, 402), (301, 400), (301, 400), (301, 400), (301, 400), (302, 401), (302, 401), (301, 400), (301, 403), (300, 400), (302, 401), (301, 402), (304, 401), (304, 400), (302, 403), (302, 401), (301, 400), (301, 400), (302, 403), (302, 402), (302, 402), (304, 400), (301, 402), (301, 400), (302, 403), (302, 401), (301, 402), (302, 401), (302, 401), (302, 402), (302, 401), (301, 400), (302, 401), (301, 400), (302, 403), (301, 400), (301, 402), (301, 400), (301, 402), (302, 403), (302, 409), (302, 401), (302, 402), (302, 402), (304, 400), (301, 400), (302, 400), (301, 400), (302, 402), (302, 403), (304, 402), (301, 400), (302, 403), (301, 400), (301, 400), (302, 409), (301, 400), (302, 402), (302, 402), (302, 401), (302, 403), (301, 402), (302, 401), (301, 400), (301, 402), (301, 400), (302, 403), (302, 403), (301, 400), (30

In [None]:
# Cumulative distribution of geo2_th2000 within every (marst, classwk) group
# of the reference data.
cumulative_geo2 = get_cumulative_probabilities_grouped(
    data_for_sr.copy(),
    column_name='geo2_th2000',
    group_by_columns=['marst', 'classwk'],
)
print(cumulative_geo2)

{(300, 400): {10016: 0.362, 10035: 0.678, 10038: 0.941, 10009: 1.0}, (301, 400): {10034: 0.035, 10006: 0.068, 10015: 0.099, 10009: 0.13, 10037: 0.161, 10021: 0.192, 10027: 0.221, 10019: 0.25, 10022: 0.277, 10014: 0.304, 10018: 0.331, 10016: 0.355, 10045: 0.378, 10026: 0.4, 10040: 0.422, 10017: 0.444, 10020: 0.466, 10003: 0.488, 10001: 0.509, 10041: 0.53, 10038: 0.551, 10031: 0.571, 10030: 0.591, 10039: 0.611, 10046: 0.631, 10024: 0.649, 10029: 0.668, 10012: 0.685, 10033: 0.703, 10036: 0.72, 10048: 0.738, 10042: 0.755, 10049: 0.771, 10050: 0.787, 10007: 0.803, 10010: 0.819, 10002: 0.835, 10043: 0.851, 10025: 0.866, 10023: 0.882, 10011: 0.896, 10035: 0.911, 10047: 0.925, 10004: 0.939, 10028: 0.952, 10032: 0.965, 10044: 0.977, 10008: 0.987, 10005: 0.995, 10013: 1.0}, (301, 401): {10006: 0.043, 10037: 0.083, 10035: 0.114, 10027: 0.145, 10034: 0.175, 10018: 0.206, 10014: 0.236, 10010: 0.265, 10041: 0.293, 10016: 0.321, 10021: 0.349, 10045: 0.375, 10049: 0.401, 10003: 0.427, 10019: 0.451, 10

In [None]:
# Sample one geo2_th2000 value per person from the distribution of their (marst, classwk) group.
# NOTE(review): add_new_attribute draws from the global `random` state and no seed is set
# in the visible cells, so the sampled values differ between runs.
geo2_list = add_new_attribute(marst_work_list, cumulative_geo2)
print(geo2_list)

[10014, 10026, 10009, 10001, 10029, 10002, 10016, 10045, 10043, 10026, 10022, 10036, 10018, 10026, 10034, 10019, 10026, 10031, 10014, 10026, 10033, 10029, 10015, 10035, 10032, 10015, 10021, 10041, 10032, 10046, 10022, 10014, 10029, 10031, 10001, 10014, 10006, 10006, 10030, 10014, 10018, 10026, 10035, 10023, 10013, 10017, 10037, 10050, 10018, 10019, 10016, 10009, 10009, 10017, 10027, 10035, 10005, 10027, 10029, 10003, 10010, 10039, 10038, 10035, 10016, 10036, 10045, 10037, 10018, 10006, 10003, 10020, 10011, 10021, 10026, 10007, 10050, 10034, 10029, 10020, 10047, 10046, 10003, 10044, 10035, 10032, 10006, 10044, 10027, 10049, 10039, 10046, 10009, 10042, 10015, 10015, 10017, 10036, 10015, 10041, 10042, 10020, 10027, 10015, 10015, 10001, 10023, 10040, 10050, 10003, 10050, 10028, 10038, 10018, 10019, 10001, 10031, 10001, 10050, 10002, 10017, 10034, 10009, 10014, 10037, 10001, 10010, 10017, 10024, 10002, 10021, 10035, 10021, 10018, 10023, 10041, 10019, 10025, 10014, 10015, 10018, 10031, 10020

In [None]:
# Attach the sampled values as a new column; the list must have one entry per row.
transformer['geo2_th2000'] = geo2_list
print(transformer)

     sex  age  marst  classwk  age_group  edattain  geo2_th2000
0    202   24    301      400         12       503        10014
1    202   36    309      401         15       502        10026
2    201   43    302      401         16       502        10009
3    202   44    302      402         16       501        10001
4    202    8    301      400          2       501        10029
..   ...  ...    ...      ...        ...       ...          ...
987  201   77    302      401         23       502        10017
988  201    2    301      400          1       500        10017
989  201   19    301      400          4       502        10026
990  201   23    301      400         12       503        10015
991  201   62    302      402         20       504        10026

[992 rows x 7 columns]


# Export final result

In [None]:
# Write the extended population to CSV with a header row and without the index column.
transformer.to_csv('results/tfm_sr.csv', header=True, index=False)