In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy.stats import ttest_ind

In [2]:
hw11 = pd.read_csv('homework_1.1.csv', index_col=0)

In [3]:
hw11

Unnamed: 0,X1,X2,X3,Y
0,-0.440646,-0.390227,0.156718,-0.877671
1,-3.810099,-1.304665,-1.105117,-10.130388
2,-1.425451,-0.340049,1.115908,0.284068
3,-1.325750,0.161906,-0.254670,-1.994344
4,3.120263,1.487343,-1.164839,2.030030
...,...,...,...,...
995,1.281625,-0.062024,-0.109231,1.206058
996,-3.047558,-1.121675,0.678439,-3.229262
997,-0.028757,-0.191722,1.191790,3.160897
998,0.415266,0.373086,1.172636,4.033302


In [4]:
X = hw11.drop('Y', axis=1)
y = hw11['Y']

In [5]:
# Linear regression to predict Y from X1, X2, and X3
model = sm.OLS(y, X).fit()
model.params

X1    1.007095
X2    1.964732
X3    2.975624
dtype: float64

In [6]:
# Using t-statistics to determine which coefficient is most significant
model.tvalues
# X3 is the most significant since it has the largest T value

X1     61.018570
X2     53.332565
X3    197.035586
dtype: float64

In [7]:
# Which Xi has the greatest difference between the amoutn Y increases for each
# 1 unit of Xi (regressing Y on Xi alone) vs the amount that Y increases for 
# each 1 unit of Xi on the dataset, on average (regressing Y on all Xis)

X_vars = ['X1', 'X2', 'X3']
results = []

for var in X_vars:
    # performing linear regression on just 1 Xi
    X_uni = hw11[[var]]
    model_uni = LinearRegression().fit(X_uni, y)
    coef_uni = model_uni.coef_[0]

    # performing linear regression on all Xi and obtaining the corresponding coef
    X_multi = hw11[X_vars]
    model_multi = LinearRegression().fit(X_multi, y)
    coef_multi = model_multi.coef_[X_vars.index(var)]

    # finding the difference between the two coefs
    diff = abs(coef_uni - coef_multi)
    results.append({
        'Xi': var,
        'Univariate Coef': coef_uni,
        'Multiple Coef': coef_multi,
        'Absolute Difference': diff
    })

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Xi,Univariate Coef,Multiple Coef,Absolute Difference
0,X1,1.841761,1.007138,0.834623
1,X2,4.083613,1.964569,2.119044
2,X3,3.097041,2.975489,0.121553


In [8]:
hw12 = pd.read_csv('homework_1.2.csv', index_col=0)

In [9]:
hw12

Unnamed: 0,X,Y,Z
0,0,0.548814,0.548814
1,1,1.215189,0.715189
2,0,0.602763,0.602763
3,0,0.544883,0.544883
4,0,0.423655,0.423655
...,...,...,...
95,0,0.183191,0.183191
96,1,1.086513,0.586513
97,0,0.020108,0.020108
98,1,1.328940,0.828940


In [10]:
X0 = hw12[hw12['X'] == 0].reset_index(drop=True)
X1 = hw12[hw12['X'] == 1].reset_index(drop=True)

In [11]:
from sklearn.neighbors import NearestNeighbors

In [29]:
# Matching Z values with X=1 with Z values with X=0

neigh = NearestNeighbors(n_neighbors=1).fit(X0[['Z']])
distances, indices = neigh.kneighbors(X1[['Z']])

matches = X1.copy()
matches['Matched_Index_in_X0'] = indices.flatten()
matches['Matched_Z_in_X0'] = X0.loc[indices.flatten(), 'Z'].values
matches['Y_in_X0'] = X0.loc[indices.flatten(), 'Y'].values
matches['Distance'] = distances.flatten()
matches


Unnamed: 0,X,Y,Z,Matched_Index_in_X0,Matched_Z_in_X0,Y_in_X0,Distance
0,1,1.215189,0.715189,48,0.716327,0.716327,0.001138
1,1,1.145894,0.645894,25,0.653108,0.653108,0.007214
2,1,0.937587,0.437587,18,0.437032,0.437032,0.000555
3,1,1.391773,0.891773,9,0.778157,0.778157,0.113616
4,1,1.463663,0.963663,9,0.778157,0.778157,0.185506
5,1,0.883442,0.383442,13,0.414662,0.414662,0.03122
6,1,1.291725,0.791725,9,0.778157,0.778157,0.013568
7,1,1.425597,0.925597,9,0.778157,0.778157,0.14744
8,1,1.33262,0.83262,9,0.778157,0.778157,0.054463
9,1,1.370012,0.870012,9,0.778157,0.778157,0.091855


In [33]:
# Farthest distance 
max(matches['Distance'])

0.2102170871093757

In [14]:
# Finding the effect (difference between avg Y value for X=0 vs avg Y value for X=1)
abs(np.mean(matches['Y_in_X0']) - np.mean(matches['Y']))

np.float64(0.5433600651913855)

In [15]:
# Approach B: All matches in X=0 that are within a distance of 0.2 of each X=1
# Duplicates okay

neigh_b = NearestNeighbors(radius=0.2).fit(X0[['Z']])
distances_radius, indices_radius = neigh_b.radius_neighbors(X1[['Z']])

duplicate_count = sum(len(match_list) - 1 for match_list in indices_radius if len(match_list) > 1)
duplicate_count

691

In [16]:
# Computing effect

neighbor_group_means = []

for i, match_indices in enumerate(indices_radius):
    if len(match_indices) > 0:
        matched_Y_values = X0.loc[match_indices, 'Y'].values
        neighbor_group_means.append(np.mean(matched_Y_values))
        
overall_effect = np.mean(neighbor_group_means)
overall_effect

np.float64(0.5411846604540438)