# IAS analysis

Calculate the median inflation expectation for each age bracket (later sections repeat this for other demographic breakdowns).

Inflation expectations are categorical data (answers are in buckets).

Therefore we need to do the following:
1. Split the responses into subsets by survey quarter and age group
2. For each subset, count the responses that fall in each answer bucket
3. Calculate the interpolated (grouped) median for each subset
4. Add it to our results table

In [1]:
import polars as pl
import pandas as pd
import plotly.graph_objects as go

from typing import Dict, Tuple, Optional

In [2]:
# Load the survey sheet, keeping only the columns needed for the age breakdown
ias_raw = pl.read_excel(
    '/Users/lukasalemu/Downloads/Inflation Attitudes Survey Feb 2025.xlsx',
    columns=['weight', 'yyyyqq', 'age', 'q2a_agg1', 'q2b_agg1', 'q2c_agg1'],
    sheet_name='Dataset',
)
ias_raw.head()

Could not determine dtype for column 172, falling back to string
Could not determine dtype for column 174, falling back to string
Could not determine dtype for column 176, falling back to string


weight,yyyyqq,age,q2a_agg1,q2b_agg1,q2c_agg1
f64,str,i64,str,str,str
0.741,"""200101""",5,,,
1.282,"""200101""",3,,,
1.731,"""200101""",1,,,
0.877,"""200101""",3,,,
0.947,"""200101""",6,,,


In [3]:
ias_clean = ias_raw.filter(~pl.col("q2a_agg1").is_null())  # keep only rows with a q2a_agg1 answer

In [4]:

# Answer code -> (lower_bound, width) of the bucket, in percent.
# The unit-width buckets are generated from their code ranges; the
# zero-width "not changed" bucket and the wide top bucket are spelled out.
q2_agg_ias_class_bounds = {
    # Codes 1-6: price falls. Code k starts at -k, e.g. "1" is "down by 1% or
    # less" and "6" approximates the open-ended "down by 5% or more".
    **{str(code): (-float(code), 1.0) for code in range(1, 7)},
    # Code 7: not changed (a point at zero, hence zero width).
    "7": (0.0, 0.0),
    # Codes 8-18: price rises starting at 0%..10%. Code 18 approximates the
    # open-ended "up by 10% or more".
    **{str(code): (float(code - 8), 1.0) for code in range(8, 19)},
    # Codes 20-24: rises starting at 10%..14% ("20" is "up by 10% to <11%").
    **{str(code): (float(code - 10), 1.0) for code in range(20, 25)},
    # Code 25: up by 15% or more (extended width).
    "25": (15.0, 5.0),
}


In [5]:
def clean_ias(df: pl.DataFrame): 
    """
    Drop rows with no q2a_agg1 answer and fold two age codes into others.

    Age code 8 is recoded to 1 and age code 7 to 6; every other age code is
    kept as it is. Returns the filtered, recoded DataFrame.
    """
    has_answer = pl.col("q2a_agg1").is_not_null()
    age = pl.col("age")
    recoded_age = (
        pl.when(age == 8).then(1)
        .when(age == 7).then(6)
        .otherwise(age)
    )
    return df.filter(has_answer).with_columns(recoded_age.alias("age"))

def convert_yyyyqq_to_datetime(yyyyqq: str) -> pd.Timestamp:
    """
    Convert a 'yyyyqq' period code into the first day of that quarter.

    The first four characters are read as the year and the remainder as the
    quarter number. A quarter outside 1-4 raises KeyError.
    """
    first_month_of_quarter = {1: 1, 2: 4, 3: 7, 4: 10}
    year_text, quarter_text = yyyyqq[:4], yyyyqq[4:]
    return pd.Timestamp(
        year=int(year_text),
        month=first_month_of_quarter[int(quarter_text)],
        day=1,
    )

def grouped_median_unequal_widths(counts: dict, class_bounds: Dict[str, Tuple[float, float]]) -> Optional[float]:
    """
    Interpolated grouped median with variable class widths.

    counts: dict of {category_label: count}. Labels that are not keys of
        class_bounds (including None) are ignored.
    class_bounds: dict of {category_label: (lower_bound, width)}
    Returns interpolated median as float, or None if no data.

    The result does not depend on the order of `counts`: labels that share
    the same (lower_bound, width) are pooled into one class, and classes are
    ordered by (lower_bound, width) so a zero-width class comes before an
    interval that starts at the same point.
    """
    if not counts:
        return None

    # Pool frequencies per distinct class interval.
    pooled = {}
    for label, count in counts.items():
        if label in class_bounds:
            lower, width = class_bounds[label]
            pooled[(lower, width)] = pooled.get((lower, width), 0) + count

    total = sum(pooled.values())
    if total == 0:
        return None

    median_pos = total / 2
    cum = 0
    # Keys are (lower_bound, width) tuples, so sorting on the key breaks
    # lower-bound ties by width.
    for (lower, width), freq in sorted(pooled.items(), key=lambda item: item[0]):
        prev_cum = cum
        cum += freq
        # Empty classes cannot contain the median (and would divide by zero).
        if freq > 0 and cum >= median_pos:
            return lower + ((median_pos - prev_cum) / freq) * width
    return None

def compute_grouped_medians_polars(df: pl.DataFrame, class_bounds: Dict[str, Tuple[float, float]]):
    """
    Compute interpolated grouped medians per quarter and age for each question.

    df: responses with 'yyyyqq', 'age' and the three question columns
        'q2a_agg1', 'q2b_agg1', 'q2c_agg1'.
    class_bounds: dict of {category_label: (lower_bound, width)}.

    Returns a tuple of three DataFrames (q2a, q2b, q2c), each with a
    'yyyyqq' datetime column followed by one column per age code, sorted by
    'yyyyqq'. Age columns are emitted in sorted order and every row carries
    every age column (None where a quarter has no responses for that age).

    NOTE(review): responses are counted unweighted; the 'weight' column is
    not used here - confirm this is intended.
    """
    questions = ['q2a_agg1', 'q2b_agg1', 'q2c_agg1']
    # medians[question][yyyyqq][age] -> interpolated median (or None)
    medians = {q: {} for q in questions}

    for (yyyyqq, age), group in df.group_by(["yyyyqq", "age"]):
        for question in questions:
            counts_df = group[question].value_counts()
            # An empty dict makes the helper return None, covering empty groups.
            counts = dict(zip(counts_df[question], counts_df["count"]))
            median = grouped_median_unequal_widths(counts, class_bounds)
            medians[question].setdefault(yyyyqq, {})[age] = median

    # Convert results into DataFrames
    output = {}
    for question, by_period in medians.items():
        # group_by iteration order is not guaranteed, so fix the column order
        # here (a null age code, if any, goes last).
        ages = sorted(
            {age for by_age in by_period.values() for age in by_age},
            key=lambda age: (age is None, age),
        )
        rows = []
        for yyyyqq, by_age in by_period.items():
            row = {"yyyyqq": convert_yyyyqq_to_datetime(str(yyyyqq))}
            row.update({str(age): by_age.get(age) for age in ages})
            rows.append(row)
        # Scan every row when inferring dtypes rather than only the first 100.
        output[question] = pl.DataFrame(rows, infer_schema_length=None).sort("yyyyqq")

    return output['q2a_agg1'], output['q2b_agg1'], output['q2c_agg1']


In [6]:
# Clean the raw responses, then build one medians table per question
ias_clean = clean_ias(df=ias_raw)

(
    q2a_median_by_age,
    q2b_median_by_age,
    q2c_median_by_age,
) = compute_grouped_medians_polars(df=ias_clean, class_bounds=q2_agg_ias_class_bounds)

In [7]:

def plot_question_medians(df: pl.DataFrame, label_mapping: dict = None, title: str = "Grouped Median by Age"):
    """
    Draw one line per group column of a medians table and show the figure.

    df: table with a 'yyyyqq' column plus one column per group.
    label_mapping: optional {column_name: display_label}; only keys that are
        actual columns are renamed.
    title: figure title.
    """
    # Plotly is fed from pandas objects
    wide = df.to_pandas()

    if label_mapping:
        present = {code: label for code, label in label_mapping.items() if code in wide.columns}
        wide = wide.rename(columns=present)

    # Long format: one (yyyyqq, Group, Median) row per point
    long = wide.melt(id_vars="yyyyqq", var_name="Group", value_name="Median")

    fig = go.Figure()
    for grp in long["Group"].unique():
        points = long.loc[long["Group"] == grp]
        trace = go.Scatter(
            x=points["yyyyqq"],
            y=points["Median"],
            mode='lines+markers',
            name=grp,
        )
        fig.add_trace(trace)

    layout = dict(
        title=title,
        xaxis_title="Quarter",
        yaxis_title="Grouped Median (Interpolated)",
        legend_title="Group",
        template="plotly_white",
        hovermode="x unified",
    )
    fig.update_layout(**layout)

    fig.show()


# Age code (as the column name string) -> age band shown in the legend
age_map = {
    str(code): band
    for code, band in enumerate(
        ("15-24", "25-34", "35-44", "45-54", "55-64", "65+"), start=1
    )
}

In [8]:
# Preview the q2a_agg1 medians table: one row per quarter, one column per age code
q2a_median_by_age

yyyyqq,6,5,3,2,1,4
datetime[μs],f64,f64,f64,f64,f64,f64
2009-01-01 00:00:00,2.580645,2.384615,1.767857,1.967213,2.313725,2.255556
2009-04-01 00:00:00,2.675439,2.679487,1.980769,2.1640625,2.512195,2.404762
2009-07-01 00:00:00,2.66129,2.769231,2.232394,2.109375,2.384615,2.53
2009-10-01 00:00:00,2.463768,2.606383,2.404412,2.0,2.309524,2.674603
2010-01-01 00:00:00,2.734043,2.797414,2.399329,2.112319,2.13253,2.780822
…,…,…,…,…,…,…
2024-01-01 00:00:00,3.053333,2.906015,3.318627,2.791367,3.12069,3.090517
2024-04-01 00:00:00,2.931034,2.785714,2.730769,2.802469,2.714286,2.935897
2024-07-01 00:00:00,3.016129,2.761538,2.705479,2.613924,2.776119,2.62963
2024-10-01 00:00:00,3.0,3.224359,2.992958,2.712644,3.209677,2.908537


In [9]:
plot_question_medians(
    df=q2a_median_by_age,
    label_mapping=age_map,
    title="Year ahead inflation expectations by age",
)

In [10]:
plot_question_medians(
    df=q2b_median_by_age,
    label_mapping=age_map,
    title="2-year ahead inflation expectations by age",
)

In [11]:
plot_question_medians(
    df=q2c_median_by_age,
    label_mapping=age_map,
    title="5-Year ahead inflation expectations by age",
)

## Any category

In [12]:
def clean_ias(df: pl.DataFrame, col: str = "age", mapping: Optional[dict] = None): 
    """
    Filters the DataFrame to rows where 'q2a_agg1' is not null, and optionally recodes
    the specified column (default "age") using the provided mapping.

    NOTE: this definition replaces the earlier age-only `clean_ias`; with no
    mapping it performs no recoding at all.

    Parameters:
      df (pl.DataFrame): Input DataFrame.
      col (str): The name of the column to recode.
      mapping (dict): A dictionary where keys are original values and values are new codes.
        None or an empty dict means "no recoding".

    Returns:
      pl.DataFrame: Cleaned DataFrame.
    """
    df = df.filter(pl.col("q2a_agg1").is_not_null())

    # Nothing to recode; an empty mapping would otherwise have no first key
    # to start the when/then chain from.
    if not mapping:
        return df

    target = pl.col(col)
    recode = None
    for old_value, new_value in mapping.items():
        start = pl.when if recode is None else recode.when
        # pl.lit keeps string replacements literal; a bare string passed to
        # .then() would be parsed as a column name.
        recode = start(target == old_value).then(pl.lit(new_value))

    # Values without a mapping entry are kept unchanged.
    return df.with_columns([recode.otherwise(target).alias(col)])

def conv_to_datetime(yyyyqq: str) -> pd.Timestamp:
    """
    Map a 'yyyyqq' code to the timestamp of the first day of that quarter.

    Characters 0-3 are the year; everything after is the quarter number,
    which must be 1-4 (anything else raises KeyError).
    """
    quarter_start_month = {1: 1, 2: 4, 3: 7, 4: 10}
    year = int(yyyyqq[:4])
    start_month = quarter_start_month[int(yyyyqq[4:])]
    return pd.Timestamp(year=year, month=start_month, day=1)

def grouped_median(counts: dict, class_bounds: Dict[str, Tuple[float, float]]) -> Optional[float]:
    """
    Interpolated grouped median with variable class widths.

    counts: dict of {category_label: count}; labels without an entry in
        class_bounds (including None) are skipped.
    class_bounds: dict of {category_label: (lower_bound, width)}
    Returns interpolated median as float, or None if no data.

    Labels with identical (lower_bound, width) are treated as one class and
    classes are ordered by (lower_bound, width), so the answer is the same
    whatever order `counts` arrives in (a zero-width class is placed before
    an interval starting at the same value).
    """
    if not counts:
        return None

    # Frequency per distinct (lower_bound, width) interval.
    freq_by_interval = {}
    for label, count in counts.items():
        if label not in class_bounds:
            continue
        interval = tuple(class_bounds[label])
        freq_by_interval[interval] = freq_by_interval.get(interval, 0) + count

    total = sum(freq_by_interval.values())
    if total == 0:
        return None

    median_pos = total / 2
    below = 0  # cumulative frequency of the classes already passed
    for interval in sorted(freq_by_interval):
        lower, width = interval
        freq = freq_by_interval[interval]
        # An empty class cannot hold the median (and would divide by zero).
        if freq > 0 and below + freq >= median_pos:
            return lower + ((median_pos - below) / freq) * width
        below += freq
    return None

def comp_grouped_medians(
    df: pl.DataFrame,
    class_bounds: Dict[str, Tuple[float, float]],
    disagg_col: str = "age"
):
    """
    Compute grouped medians for a set of question columns by timestamp and any disaggregation column.

    Uses the `grouped_median` and `conv_to_datetime` helpers defined
    alongside this function.

    Parameters:
      df (pl.DataFrame): Input DataFrame.
      class_bounds (dict): Mapping from category labels to (lower_bound, width) tuples.
      disagg_col (str): Column name to disaggregate by (default is "age").

    Returns:
      Tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]: DataFrames for 'q2a_agg1', 'q2b_agg1', and 'q2c_agg1'.
      Each has a 'yyyyqq' datetime column followed by one column per category
      (in sorted order), is sorted by 'yyyyqq', and holds None where a quarter
      has no responses for a category.

    NOTE(review): responses are counted unweighted; no weight column is
    used here - confirm this is intended.
    """
    questions = ['q2a_agg1', 'q2b_agg1', 'q2c_agg1']
    result_frames = {q: {} for q in questions}

    # Group by timestamp and the specified disaggregation column
    for (yyyyqq, category), group in df.group_by(["yyyyqq", disagg_col]):
        for question in questions:
            counts_df = group[question].value_counts()
            # category value -> count; an empty dict yields a None median
            counts = dict(zip(counts_df[question], counts_df["count"]))
            median = grouped_median(counts, class_bounds)
            result_frames[question].setdefault(yyyyqq, {})[category] = median

    # Convert results into DataFrames, one per question
    output = {}
    for question, data in result_frames.items():
        # group_by iteration order is not guaranteed, so fix the column order
        # here (a null category, if any, goes last).
        categories = sorted(
            {cat for cat_dict in data.values() for cat in cat_dict},
            key=lambda cat: (cat is None, cat),
        )
        rows = []
        for yyyyqq, cat_dict in data.items():
            row = {"yyyyqq": conv_to_datetime(str(yyyyqq))}
            # Each key in the row is a string representation of the
            # disaggregation category; every row carries every category.
            row.update({str(cat): cat_dict.get(cat) for cat in categories})
            rows.append(row)
        # Scan every row when inferring dtypes rather than only the first 100.
        output[question] = pl.DataFrame(rows, infer_schema_length=None).sort("yyyyqq")

    return output['q2a_agg1'], output['q2b_agg1'], output['q2c_agg1']


In [13]:
# Reload the sheet with the 'work' column in place of age
ias_raw = pl.read_excel(
    '/Users/lukasalemu/Downloads/Inflation Attitudes Survey Feb 2025.xlsx',
    columns=['weight', 'yyyyqq', 'work', 'q2a_agg1', 'q2b_agg1', 'q2c_agg1'],
    sheet_name='Dataset',
)
ias_raw.head()  # NOTE(review): not the cell's last expression, so this preview is discarded

# Keep answered rows (no recoding), then compute medians split by 'work'
df_clean = clean_ias(ias_raw, col='work')
median_dfs = comp_grouped_medians(
    df=df_clean,
    class_bounds=q2_agg_ias_class_bounds,
    disagg_col="work",
)


Could not determine dtype for column 172, falling back to string
Could not determine dtype for column 174, falling back to string
Could not determine dtype for column 176, falling back to string


In [14]:
# 'work' code (as the column name string) -> legend label
emp_map = dict(
    zip(("1", "2"), ("Full or Part Time", "Unemployed"))
)

plot_question_medians(
    df=median_dfs[0],
    label_mapping=emp_map,
    title='1 year ahead inflation expectations by employment status',
)

In [15]:
plot_question_medians(
    df=median_dfs[1],
    label_mapping=emp_map,
    title='2 year ahead inflation expectations by employment status',
)

In [16]:
plot_question_medians(
    df=median_dfs[2],
    label_mapping=emp_map,
    title='5 year ahead inflation expectations by employment status',
)

## Class

In [17]:
# Reload the sheet with the 'class' column as the breakdown
ias_raw = pl.read_excel(
    '/Users/lukasalemu/Downloads/Inflation Attitudes Survey Feb 2025.xlsx',
    columns=['weight', 'yyyyqq', 'class', 'q2a_agg1', 'q2b_agg1', 'q2c_agg1'],
    sheet_name='Dataset',
)
ias_raw.head()  # NOTE(review): not the cell's last expression, so this preview is discarded

# Keep answered rows (no recoding), then compute medians split by 'class'
df_clean = clean_ias(ias_raw, col='class')
median_dfs = comp_grouped_medians(
    df=df_clean,
    class_bounds=q2_agg_ias_class_bounds,
    disagg_col="class",
)


Could not determine dtype for column 172, falling back to string
Could not determine dtype for column 174, falling back to string
Could not determine dtype for column 176, falling back to string


In [18]:
# 'class' code (as the column name string) -> legend label
class_mapping = {
    str(code): label
    for code, label in enumerate(("AB", "C1", "C2", "DE"), start=1)
}

plot_question_medians(
    df=median_dfs[0],
    label_mapping=class_mapping,
    title='1 year ahead inflation expectations by class status',
)

In [19]:
plot_question_medians(
    df=median_dfs[1],
    label_mapping=class_mapping,
    title='2 year ahead inflation expectations by class status',
)

In [20]:
plot_question_medians(
    df=median_dfs[2],
    label_mapping=class_mapping,
    title='5 year ahead inflation expectations by class status',
)

## Housing / Tenure

In [21]:
# Reload the sheet with the 'tenure' column as the breakdown
ias_raw = pl.read_excel(
    '/Users/lukasalemu/Downloads/Inflation Attitudes Survey Feb 2025.xlsx',
    columns=['yyyyqq', 'tenure', 'q2a_agg1', 'q2b_agg1', 'q2c_agg1'],
    sheet_name='Dataset',
)
ias_raw.head()  # NOTE(review): not the cell's last expression, so this preview is discarded

# Keep answered rows (no recoding), then compute medians split by 'tenure'
df_clean = clean_ias(ias_raw, col='tenure')
median_dfs = comp_grouped_medians(
    df=df_clean,
    class_bounds=q2_agg_ias_class_bounds,
    disagg_col="tenure",
)


Could not determine dtype for column 172, falling back to string
Could not determine dtype for column 174, falling back to string
Could not determine dtype for column 176, falling back to string


In [22]:
# 'tenure' code (as the column name string) -> legend label
housing_mapping = {
    str(code): label
    for code, label in enumerate(
        ("Owned outright", "Mortgage", "Council Rent", "Other"), start=1
    )
}

plot_question_medians(
    df=median_dfs[0],
    label_mapping=housing_mapping,
    title='1 year ahead inflation expectations by housing status',
)

In [23]:
plot_question_medians(
    df=median_dfs[1],
    label_mapping=housing_mapping,
    title='2 year ahead inflation expectations by housing status',
)

In [24]:
plot_question_medians(
    df=median_dfs[2],
    label_mapping=housing_mapping,
    title='5 year ahead inflation expectations by housing status',
)

## Income

In [34]:
# Reload the sheet with the 'income' column as the breakdown
ias_raw = pl.read_excel(
    '/Users/lukasalemu/Downloads/Inflation Attitudes Survey Feb 2025.xlsx',
    columns=['yyyyqq', 'income', 'q2a_agg1', 'q2b_agg1', 'q2c_agg1'],
    sheet_name='Dataset',
)
ias_raw.head()  # NOTE(review): not the cell's last expression, so this preview is discarded

# Keep answered rows (no recoding), then compute medians split by 'income'
df_clean = clean_ias(ias_raw, col='income')
median_dfs = comp_grouped_medians(
    df=df_clean,
    class_bounds=q2_agg_ias_class_bounds,
    disagg_col="income",
)


Could not determine dtype for column 172, falling back to string
Could not determine dtype for column 174, falling back to string
Could not determine dtype for column 176, falling back to string


In [35]:
# 'income' code (as the column name string) -> legend label, listed in code order 1-12
_income_labels = (
    "<9500",
    "9500-17499",
    "17500-24999",
    ">25000",
    "25000-39999",
    ">40000",
    "<9999",
    "10000-19999",
    "20000-34999",
    "35000-44999",
    ">45000",
    "Prefer not to answer",
)
income_bands = {str(code): label for code, label in enumerate(_income_labels, start=1)}

plot_question_medians(
    df=median_dfs[0],
    label_mapping=income_bands,
    title='1 year ahead inflation expectations by income band',
)

In [33]:
plot_question_medians(
    df=median_dfs[1],
    label_mapping=income_bands,
    title='2 year ahead inflation expectations by income band',
)

In [36]:
# Label the series with the income bands, matching the 1- and 2-year income plots
plot_question_medians(median_dfs[2], income_bands, '5 year ahead inflation expectations by income band')

## Region

In [29]:
# Reload the sheet with the 'sreg' column as the breakdown
ias_raw = pl.read_excel(
    '/Users/lukasalemu/Downloads/Inflation Attitudes Survey Feb 2025.xlsx',
    columns=['yyyyqq', 'sreg', 'q2a_agg1', 'q2b_agg1', 'q2c_agg1'],
    sheet_name='Dataset',
)
ias_raw.head()  # NOTE(review): not the cell's last expression, so this preview is discarded

# Keep answered rows (no recoding), then compute medians split by 'sreg'
df_clean = clean_ias(ias_raw, col='sreg')
median_dfs = comp_grouped_medians(
    df=df_clean,
    class_bounds=q2_agg_ias_class_bounds,
    disagg_col="sreg",
)

Could not determine dtype for column 172, falling back to string
Could not determine dtype for column 174, falling back to string
Could not determine dtype for column 176, falling back to string


In [30]:
# 'sreg' code (as the column name string) -> legend label
sreg_bands = {
    str(code): label
    for code, label in enumerate(
        ("Scotland", "North & NI", "Midlands", "Wales and West", "South East"),
        start=1,
    )
}

plot_question_medians(
    df=median_dfs[0],
    label_mapping=sreg_bands,
    title='1 year ahead inflation expectations by region',
)

In [31]:
# Region breakdown: use the region labels and a region title, matching the 1-year plot
plot_question_medians(median_dfs[1], sreg_bands, '2 year ahead inflation expectations by region')

In [32]:
# Region breakdown: use the region labels and a region title, matching the 1-year plot
plot_question_medians(median_dfs[2], sreg_bands, '5 year ahead inflation expectations by region')