# Setup

In [16]:
import pandas as pd
from pathlib import Path

import numpy as np
from scipy.stats import entropy

In [6]:
input_data = pd.read_parquet('/data/eop/country_data/malawi/cleaned/test.parquet').reset_index(drop=True)
summary = pd.read_parquet('/data/eop/country_data/malawi/cleaned/summary.parquet')
all_continuous_gap = pd.read_csv(
    '/data/eop/simulation_results/malawi_2017/output_gt_continuous_gap.csv'
)
budget = all_continuous_gap[
    all_continuous_gap.post_transfer_poverty_rate <= 0.01
].sort_values('policy_cost_per_capita').budget.values[0]

continuous_gap = pd.read_csv(
    f'/data/eop/simulation_results/malawi_2017/output_gt_continuous_gap_budget={budget}.csv'
)
continuous_gap.columns = ['consumption', 'ev_transfer_continuous_gap']

merged = input_data.join(continuous_gap[['ev_transfer_continuous_gap']])

all_binary_gap = pd.read_csv(
    '/data/eop/simulation_results/malawi_2017/output_gt_binary_gap.csv'
)
budget = all_binary_gap[
    all_binary_gap.post_transfer_poverty_rate <= 0.01
].sort_values('policy_cost_per_capita').budget.values[0]

binary_gap = pd.read_csv(
    f'/data/eop/simulation_results/malawi_2017/output_gt_binary_gap_budget={budget}.csv'
)
binary_gap.columns = ['consumption', 'ev_transfer_binary_gap']
merged = merged.join(binary_gap[['ev_transfer_binary_gap']])

# Entropy

In [87]:
counts = merged.ev_transfer_continuous_gap.value_counts().to_frame().reset_index()
counts['probability'] = counts['count'] / counts['count'].sum()
entropy(counts.probability.values)

8.355579246339612

In [90]:
hypothetical = [1/len(merged)] * len(merged)
entropy(hypothetical)

8.428143374582728

In [89]:
counts = merged.ev_transfer_binary_gap.value_counts().to_frame().reset_index()
counts['probability'] = counts['count'] / counts['count'].sum()
entropy(counts.probability.values)

0.24554502412076645

In [82]:
merged.ev_transfer_binary_gap.value_counts()

ev_transfer_binary_gap
1.928621    4268
0.000000     306
Name: count, dtype: int64

# Variance decomposition 

In [29]:
results = []
for transfer_column in ['ev_transfer_continuous_gap', 'ev_transfer_binary_gap']:
    
    x = merged[transfer_column]
    g = merged['district']

    overall_mean = x.mean()

    # --- group-level stats ---
    group_stats = merged.groupby(g)[transfer_column].agg(['mean', 'count'])

    # --- sums of squares ---
    # within-group sum of squares
    ss_within = merged.groupby(g).apply(
        lambda df: np.sum((df[transfer_column] - df[transfer_column].mean())**2)
    ).sum()

    # between-group sum of squares
    ss_between = np.sum(group_stats['count'] * (group_stats['mean'] - overall_mean)**2)

    # total sum of squares
    ss_total = np.sum((x - overall_mean)**2)

    # --- convert to variances ---
    N = len(x)
    var_total = ss_total / (N - 1)
    var_within = ss_within / (N - 1)
    var_between = ss_between / (N - 1)

    results.append({
        'transfer_type': transfer_column,
        'var_within': var_within,
        'var_between': var_between,
        'var_total': var_total,
    })
results = pd.DataFrame(results)

  ss_within = merged.groupby(g).apply(
  ss_within = merged.groupby(g).apply(


In [34]:
merged.ev_transfer_continuous_gap.min()

0.9076091051101683