In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Read the homework data; the first CSV column is used as the row index
df = pd.read_csv('homework_6.1.csv', index_col=0)

# Orientation: overall size, a sample of rows, summary statistics, and the
# number of rows at each value of the treatment column X.
# sep="\n" puts each heading on its own line directly above its table.
print("Dataset shape:", df.shape)
print("\nFirst few rows:", df.head(10), sep="\n")
print("\nDataset summary:", df.describe(), sep="\n")
print("\nTreatment distribution:", df['X'].value_counts(), sep="\n")

Dataset shape: (1000, 3)

First few rows:
          Z  X         Y
0  0.548814  0 -0.823220
1  0.715189  1  0.842405
2  0.602763  1  0.898618
3  0.544883  0 -0.817325
4  0.423655  0 -0.635482
5  0.645894  0 -0.968841
6  0.437587  0 -0.656381
7  0.891773  1  0.754113
8  0.963663  1  0.718169
9  0.383442  1  1.008279

Dataset summary:
                 Z            X            Y
count  1000.000000  1000.000000  1000.000000
mean      0.495922     0.491000     0.163240
std       0.290744     0.500169     0.754302
min       0.000546     0.000000    -1.459379
25%       0.247613     0.000000    -0.457581
50%       0.481323     0.000000    -0.017813
75%       0.737260     1.000000     0.855020
max       0.999809     1.000000     1.190403

Treatment distribution:
X
0    509
1    491
Name: count, dtype: int64


In [2]:
# Partition the rows by the treatment indicator X (1 = treated, 0 = untreated).
# Explicit copies keep each group independent of df.
is_treated = df['X'] == 1
is_untreated = df['X'] == 0
treated = df.loc[is_treated].copy()
untreated = df.loc[is_untreated].copy()

# Group sizes first, then the distribution of Z inside each group
print(f"Number of treated units: {len(treated)}")
print(f"Number of untreated units: {len(untreated)}")
for heading, group in (
    ("\nTreated group - Z statistics:", treated),
    ("\nUntreated group - Z statistics:", untreated),
):
    print(heading)
    print(group['Z'].describe())

Number of treated units: 491
Number of untreated units: 509

Treated group - Z statistics:
count    491.000000
mean       0.647500
std        0.251813
min        0.019193
25%        0.453930
50%        0.681393
75%        0.878773
max        0.999809
Name: Z, dtype: float64

Untreated group - Z statistics:
count    509.000000
mean       0.349703
std        0.247878
min        0.000546
25%        0.138183
50%        0.301575
75%        0.526907
max        0.972919
Name: Z, dtype: float64


## 1. Average Treatment Effect (ATE)

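For each item in the dataset, find its nearest neighbor in the opposite group based on the confounder Z. The counterfactual outcome is the Y value of that nearest neighbor. The individual treatment effect is always the treated outcome minus the untreated outcome: actual minus counterfactual for treated units, and counterfactual minus actual for untreated units. The ATE is the average of these individual effects over all units.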

In [3]:
# 1. Average Treatment Effect (ATE)
# 1-nearest-neighbour matching on Z in both directions: every unit borrows
# the outcome Y of its closest unit in the opposite group as its counterfactual

# Index the untreated Z values, then look up the closest untreated unit for
# each treated unit (indices_t holds row positions into `untreated`)
nn_untreated = NearestNeighbors(n_neighbors=1).fit(untreated[['Z']])
distances_t, indices_t = nn_untreated.kneighbors(treated[['Z']])

# Same lookup in the other direction (indices_u holds row positions into `treated`)
nn_treated = NearestNeighbors(n_neighbors=1).fit(treated[['Z']])
distances_u, indices_u = nn_treated.kneighbors(untreated[['Z']])

# Outcomes as plain arrays so matched values can be picked by position
y_treated = treated['Y'].values
y_untreated = untreated['Y'].values

# Treated units: observed treated outcome minus matched untreated outcome
treated_effects = y_treated - y_untreated[indices_t.flatten()]

# Untreated units: matched treated outcome minus observed untreated outcome
untreated_effects = y_treated[indices_u.flatten()] - y_untreated

# ATE pools the unit-level effects of both groups and averages them
ate = np.concatenate([treated_effects, untreated_effects]).mean()

print(f"Average Treatment Effect (ATE): {ate:.6f}")
print("\nBreakdown:")
print(f"  Mean effect for treated units: {treated_effects.mean():.6f}")
print(f"  Mean effect for untreated units: {untreated_effects.mean():.6f}")

Average Treatment Effect (ATE): 1.695270

Breakdown:
  Mean effect for treated units: 1.846409
  Mean effect for untreated units: 1.549477


## 2. Average Treatment Effect on the Treated (ATT)

For each treated item, find its counterfactual from the untreated group. Only the treatment effects of the treated units are averaged; untreated units are used solely as matches and contribute no effects of their own.

In [4]:
# 2. Average Treatment Effect on the Treated (ATT)
# Average the matched effects of the treated units alone

att = treated_effects.mean()
n_treated = len(treated)

print(f"Average Treatment Effect on the Treated (ATT): {att:.6f}")
print(f"  Based on {n_treated} treated units")

Average Treatment Effect on the Treated (ATT): 1.846409
  Based on 491 treated units


## 3. Average Treatment Effect on the Untreated (ATU)

For each untreated item, find its counterfactual from the treated group. Only the treatment effects of the untreated units are averaged; treated units are used solely as matches and contribute no effects of their own.

In [5]:
# 3. Average Treatment Effect on the Untreated (ATU)
# Average the matched effects of the untreated units alone

atu = untreated_effects.mean()
n_untreated = len(untreated)

print(f"Average Treatment Effect on the Untreated (ATU): {atu:.6f}")
print(f"  Based on {n_untreated} untreated units")

Average Treatment Effect on the Untreated (ATU): 1.549477
  Based on 509 untreated units


## 4. Optimal Treatment Effect

Find the maximum treatment effect across all untreated items. This identifies the single untreated individual who would benefit most from receiving the treatment.

In [6]:
# 4. Optimal Treatment Effect
# Locate the untreated unit whose matched effect is the largest

optimal_te = untreated_effects.max()
optimal_idx = untreated_effects.argmax()

# Row of that untreated unit, and the treated row it was matched to
# (indices_u has one column: the position of the nearest treated unit)
optimal_unit = untreated.iloc[optimal_idx]
matched_position = indices_u[optimal_idx, 0]
matched_treated_unit = treated.iloc[matched_position]

print(f"Optimal Treatment Effect: {optimal_te:.6f}")

# The untreated unit itself
print(f"\nOptimal untreated unit (index {optimal_unit.name}):")
print(f"  Z = {optimal_unit['Z']:.6f}")
print(f"  Y (untreated) = {optimal_unit['Y']:.6f}")

# The treated unit that supplies its counterfactual outcome
print(f"\nMatched treated counterfactual (index {matched_treated_unit.name}):")
print(f"  Z = {matched_treated_unit['Z']:.6f}")
print(f"  Y (treated) = {matched_treated_unit['Y']:.6f}")

print(f"\nPotential gain from treatment: {optimal_te:.6f}")

Optimal Treatment Effect: 2.172470

Optimal untreated unit (index 298):
  Z = 0.972919
  Y (untreated) = -1.459379

Matched treated counterfactual (index 951):
  Z = 0.973819
  Y (treated) = 0.713091

Potential gain from treatment: 2.172470


## Summary of Results

In [7]:
# Recap of the four estimates computed above, framed by a 60-character rule
rule = "=" * 60

print(rule)
print("SUMMARY OF TREATMENT EFFECTS")
print(rule)

# Each estimate is followed by a one-line reminder of what it averages over
print(f"\n1. Average Treatment Effect (ATE): {ate:.6f}")
print("   - Average effect across all units (treated + untreated)")

print(f"\n2. Average Treatment Effect on Treated (ATT): {att:.6f}")
print("   - Average effect for treated units only")

print(f"\n3. Average Treatment Effect on Untreated (ATU): {atu:.6f}")
print("   - Average effect for untreated units only")

print(f"\n4. Optimal Treatment Effect: {optimal_te:.6f}")
print("   - Maximum potential treatment effect")

print(rule)

SUMMARY OF TREATMENT EFFECTS

1. Average Treatment Effect (ATE): 1.695270
   - Average effect across all units (treated + untreated)

2. Average Treatment Effect on Treated (ATT): 1.846409
   - Average effect for treated units only

3. Average Treatment Effect on Untreated (ATU): 1.549477
   - Average effect for untreated units only

4. Optimal Treatment Effect: 2.172470
   - Maximum potential treatment effect
