In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from scipy.spatial.distance import cdist
from scipy.spatial.distance import mahalanobis

# Load the first homework dataset from the current working directory.
df = pd.read_csv('homework_8.1.csv')

# Sanity-check the load: dimensions, the leading rows, then per-column
# summary statistics. sep="\n" reproduces one-print-per-item line breaks.
print("Dataset shape:", df.shape)
print(
    df.head(),
    "\nBasic statistics:",
    df.describe(),
    sep="\n",
)

Dataset shape: (1000, 4)
   Unnamed: 0  X         Y         Z
0           0  1  4.109218  1.764052
1           1  0  2.259504  0.400157
2           2  0 -0.647584  0.978738
3           3  0  2.106071  2.240893
4           4  1  3.583464  1.867558

Basic statistics:
        Unnamed: 0            X            Y            Z
count  1000.000000  1000.000000  1000.000000  1000.000000
mean    499.500000     0.481000     1.014397    -0.045257
std     288.819436     0.499889     1.998531     0.987527
min       0.000000     0.000000    -5.491184    -3.046143
25%     249.750000     0.000000    -0.509696    -0.698420
50%     499.500000     0.000000     1.013902    -0.058028
75%     749.250000     1.000000     2.567960     0.606951
max     999.000000     1.000000     6.150865     2.759355


In [11]:
# Outcome and treatment indicator, kept as Series for the weighting steps.
Y_outcome = df['Y']
X_treatment = df['X']

# Every column whose name begins with "Z" is used as a covariate.
Z_covariates = df.filter(regex='^Z')

# Propensity model: logistic regression of treatment on the covariates.
# NOTE(review): scikit-learn's LogisticRegression is L2-regularised by default
# (C=1.0), which shrinks the coefficients slightly -- confirm that is intended
# for a propensity model.
logreg = LogisticRegression(max_iter=1000, random_state=42).fit(
    Z_covariates, X_treatment
)

print(f"Model coefficients: {logreg.coef_}")
print(f"Model intercept: {logreg.intercept_}")

Model coefficients: [[0.96576278]]
Model intercept: [-0.04458169]


In [12]:
# Estimated propensity score for every row: the second column of
# predict_proba (the probability of the larger class label, i.e. X = 1 for a
# 0/1 treatment).
# Predict on the same DataFrame the model was fitted on. Passing a bare
# ndarray makes scikit-learn warn that feature names are missing, and the old
# reshape(-1, 1) would flatten several covariate columns into one.
propensity_scores = logreg.predict_proba(Z_covariates)[:, 1]





In [13]:
# Inverse-probability weights: the reciprocal of the probability of the arm
# each row actually landed in -- e(Z) where X == 1, otherwise 1 - e(Z).
weights = 1 / np.where(X_treatment == 1,
                       propensity_scores,
                       1 - propensity_scores)


In [14]:
# Weight-normalised IPW estimate: within each arm take the weighted mean of Y
# (weighted sum divided by the arm's total weight), then difference the arms.
treated_mask = X_treatment.eq(1)
control_mask = X_treatment.eq(0)

# Treated arm.
sum_weights_treated = weights[treated_mask].sum()
weighted_outcome_treated = (Y_outcome[treated_mask] * weights[treated_mask]).sum()
avg_outcome_treated = weighted_outcome_treated / sum_weights_treated

# Control arm.
sum_weights_control = weights[control_mask].sum()
weighted_outcome_control = (Y_outcome[control_mask] * weights[control_mask]).sum()
avg_outcome_control = weighted_outcome_control / sum_weights_control

ATE = avg_outcome_treated - avg_outcome_control

print(f"Average Treatment Effect: {ATE:.6f}")


Average Treatment Effect: 2.274341


In [15]:
# Show the propensity scores of the first three rows.
print(propensity_scores[0:3])

[0.84011371 0.58464597 0.71108245]


In [16]:
from scipy.spatial.distance import mahalanobis

# Load the second homework dataset, used for the matching exercise.
df2 = pd.read_csv('homework_8.2.csv')

# Sanity-check the load: dimensions, the first ten rows, summary statistics.
# sep="\n" reproduces the line breaks of printing each item separately.
print(f"Shape: {df2.shape}")
print(
    "\nFirst few rows:",
    df2.head(10),
    "\nBasic statistics:",
    df2.describe(),
    sep="\n",
)

Shape: (1000, 5)

First few rows:
   Unnamed: 0  X         Y        Z1        Z2
0           0  1  4.652085  1.764052  2.320015
1           1  1  2.590221  0.400157  1.292631
2           2  1  3.898981  0.978738  0.556423
3           3  1  5.857179  2.240893  2.345607
4           4  1  3.647489  1.867558  2.095611
5           5  1  2.813448 -0.977278 -0.775798
6           6  1  2.842384  0.950088  1.490862
7           7  0 -0.065011 -0.151357 -1.969435
8           8  0 -0.104002 -0.103219 -0.152543
9           9  1  4.003199  0.410599  0.649632

Basic statistics:
        Unnamed: 0            X            Y           Z1           Z2
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean    499.500000     0.483000     1.552274    -0.045257    -0.031640
std     288.819436     0.499961     2.650695     0.987527     1.361324
min       0.000000     0.000000    -5.797134    -3.046143    -4.024931
25%     249.750000     0.000000    -0.601263    -0.698420    -1.007259
50% 

In [17]:
# Treatment indicator and outcome for the matching exercise.
X2, Y2 = df2['X'], df2['Y']

# Covariates are the columns whose names start with "Z".
Z_cols_2 = list(df2.filter(regex='^Z'))
print(f"Z columns found: {Z_cols_2}")

# Independent copies of each arm; rows keep their original df2 index labels.
treated_df = df2.loc[X2 == 1].copy()
untreated_df = df2.loc[X2 == 0].copy()

# np.cov treats each row as one variable, hence the transpose. The covariance
# is estimated on all rows (both arms pooled), then inverted for use in the
# Mahalanobis distance.
Z_matrix = df2[Z_cols_2].to_numpy().T

cov_matrix = np.cov(Z_matrix)
inv_cov_matrix = np.linalg.inv(cov_matrix)


Z columns found: ['Z1', 'Z2']


In [18]:
# Nearest-neighbour matching with replacement: pair every treated row with the
# untreated row at the smallest Mahalanobis distance in covariate space.
treated_Z = treated_df[Z_cols_2].values
untreated_Z = untreated_df[Z_cols_2].values

if len(untreated_Z) == 0:
    # Fail with a clear message instead of an opaque indexing error.
    raise ValueError("Cannot match: there are no untreated rows.")

# All treated-by-untreated distances in one vectorised call, replacing a
# Python-level double loop that made one mahalanobis() call per pair.
distance_matrix = cdist(treated_Z, untreated_Z, metric='mahalanobis', VI=inv_cov_matrix)

# argmin returns the first minimum in each row, the same tie-break as scanning
# the untreated rows in order and updating only on a strictly smaller distance.
best_match_pos = distance_matrix.argmin(axis=1)

# One record per treated row. Covariate columns are named from Z_cols_2
# (e.g. treated_Z1, untreated_Z1) so any number of covariates is supported.
match_columns = {
    'treated_idx': treated_df.index.to_numpy(),
    'untreated_idx': untreated_df.index.to_numpy()[best_match_pos],
    'distance': distance_matrix[np.arange(len(treated_Z)), best_match_pos],
}
match_columns.update(
    {f'treated_{col}': treated_Z[:, k] for k, col in enumerate(Z_cols_2)}
)
match_columns.update(
    {f'untreated_{col}': untreated_Z[best_match_pos, k] for k, col in enumerate(Z_cols_2)}
)

matches_df = pd.DataFrame(match_columns)

# List-of-dicts view of the same records, kept for any cell that reads `matches`.
matches = matches_df.to_dict('records')


In [19]:
# Effect per matched pair: outcome of the treated row minus the outcome of the
# untreated row it was matched to.
# The outcomes are looked up column-wise from the integer label columns.
# Iterating with iterrows() turns each row into a float Series (the int labels
# are upcast alongside the float columns), so the lookup then relied on
# float-to-int label coercion in .loc.
matched_treated_Y = df2.loc[matches_df['treated_idx'], 'Y'].to_numpy()
matched_untreated_Y = df2.loc[matches_df['untreated_idx'], 'Y'].to_numpy()

treatment_effects = (matched_treated_Y - matched_untreated_Y).tolist()

# NOTE(review): only treated rows are matched, so this mean is taken over the
# treated units (often reported as the ATT). The printed label is unchanged.
ATE_matched = np.mean(treatment_effects)

print(f"Average Treatment Effect: {ATE_matched:.6f}")


Average Treatment Effect: 3.437679


In [20]:
# Find the matched pair with the largest Mahalanobis distance and report the
# covariates of its untreated member.
max_distance_idx = matches_df['distance'].idxmax()
max_distance_row = matches_df.loc[max_distance_idx]

# Read the label from the integer column itself. Taking it from the whole-row
# Series above would give a float, because that Series mixes int and float
# columns and is upcast.
untreated_idx = matches_df.loc[max_distance_idx, 'untreated_idx']
untreated_Z1 = df2.loc[untreated_idx, 'Z1']
untreated_Z2 = df2.loc[untreated_idx, 'Z2']

print(f"  Z1 = {untreated_Z1:.6f}")
print(f"  Z2 = {untreated_Z2:.6f}")


  Z1 = 1.519995
  Z2 = -1.282208
