# Estimation of CATE conditional on Diversity of Neighborhood 

### (measured as proportion African American)

In [2]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
import sklearn
import os
from sklearn.linear_model import LogisticRegression

In [3]:
RANDOM_SEED=42
np.random.seed(RANDOM_SEED)

## Load and Format the Data

In [4]:
main_data = pd.read_csv("../data/clean_data/merged_with_hosts.csv")
main_data_cleaned = main_data.dropna(subset=["yes", "host_race_black", "host_gender_M", "multiple_listings", "shared_property", "ten_reviews", "log_price"])

In [5]:
main_data_cleaned.head()

Unnamed: 0,host_response,response_date,number_of_messages,automated_coding,latitude,longitude,bed_type,property_type,cancellation_policy,number_guests,...,baltimore,dallas,los_angeles,sl,dc,total_guests,raw_black,prop_black,any_black,past_guest_merge
0,1,2015-07-19 08:26:17,2.0,1,34.0815,-118.27,Real Bed,House,Flexible,3.0,...,0,0,1,0,0,11.0,0.0,0.0,0.0,matched (3)
1,0,2015-07-14 14:13:39,,1,38.9107,-77.0198,,House,Moderate,2.0,...,0,0,0,0,1,167.0,0.0,0.0,0.0,matched (3)
2,2,2015-07-20 16:24:08,2.0,0,34.0047,-118.481,Pull-out Sofa,Apartment,Strict,1.0,...,0,0,1,0,0,19.0,0.0,0.0,0.0,matched (3)
3,10,2015-07-20 06:47:38,,0,34.0917,-118.282,,House,Strict,8.0,...,0,0,1,0,0,41.0,0.0,0.0,0.0,matched (3)
5,4,2015-07-18 18:07:19,,0,34.0809,-118.367,,Apartment,Strict,3.0,...,0,0,1,0,0,263.0,1.0,0.003802,1.0,matched (3)


In [11]:
confounders = main_data_cleaned[["host_race_black", "host_gender_M", "multiple_listings", "shared_property", "ten_reviews", "log_price"]]
outcome = main_data_cleaned["yes"]
treatment = main_data_cleaned["guest_black"]
condition = main_data_cleaned["black_proportion"]

## Specify Nuisance Function Models

In [12]:
# specify a model for the conditional expected outcome

# make a function that returns a sklearn model for later use in k-folding
def make_Q_model():
    """Create outcome model for conditional expected outcome"""
    return RandomForestClassifier(n_estimators=100, max_depth=5, random_state=RANDOM_SEED)
Q_model = make_Q_model()

# Sanity check that chosen model actually improves test error
# A real analysis should give substantial attention to model selection and validation 

X_w_treatment = confounders.copy()
X_w_treatment["treatment"] = treatment

X_train, X_test, y_train, y_test = train_test_split(X_w_treatment, outcome, test_size=0.2)
Q_model.fit(X_train, y_train)
y_pred = Q_model.predict_proba(X_test)[:,1]

test_ce=log_loss(y_test, y_pred)
print(f"Test CE of fit model {test_ce}") 
baseline_ce=log_loss(y_test, y_train.mean()*np.ones_like(y_test))
print(f"Test CE of no-covariate model {baseline_ce}")

Test CE of fit model 0.680695717079893
Test CE of no-covariate model 0.6917462955382059


Because it is a randomized experiment. Treatment is randomly assigned and not confounded by X. Therefore, we can estimate the propensity score as g(x) = 0.5

## Use cross fitting to get get predicted outcomes and propensity scores for each unit

In [13]:
# helper functions to implement the cross fitting

def outcome_k_fold_fit_and_predict(make_model, X:pd.DataFrame, y:np.array, A:np.array, n_splits:int, output_type:str):
    """
    Implements K fold cross-fitting for the model predicting the outcome Y. 
    That is, 
    1. Split data into K folds
    2. For each fold j, the model is fit on the other K-1 folds
    3. The fitted model is used to make predictions for each data point in fold j
    Returns two arrays containing the predictions for all units untreated, all units treated  

    Args:
    model: function that returns sklearn model (that implements fit and either predict_prob or predict)
    X: dataframe of variables to adjust for
    y: array of outcomes
    A: array of treatments
    n_splits: number of splits to use
    output_type: type of outcome, "binary" or "continuous"

    """
    predictions0 = np.full_like(A, np.nan, dtype=float)
    predictions1 = np.full_like(y, np.nan, dtype=float)
    if output_type == 'binary':
      kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    elif output_type == 'continuous':
      kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

    # include the treatment as input feature
    X_w_treatment = X.copy()
    X_w_treatment["A"] = A

    # for predicting effect under treatment / control status for each data point 
    X0 = X_w_treatment.copy()
    X0["A"] = 0
    X1 = X_w_treatment.copy()
    X1["A"] = 1

    
    for train_index, test_index in kf.split(X_w_treatment, y):
      X_train = X_w_treatment.iloc[train_index]
      if not isinstance(y, pd.Series):
        y = pd.Series(y, index=X.index)
      y_train = y.iloc[train_index]
      q = make_model()
      q.fit(X_train, y_train)

      if output_type =='binary':
        predictions0[test_index] = q.predict_proba(X0.iloc[test_index])[:, 1]
        predictions1[test_index] = q.predict_proba(X1.iloc[test_index])[:, 1]
      elif output_type == 'continuous':
        predictions0[test_index] = q.predict(X0.iloc[test_index])
        predictions1[test_index] = q.predict(X1.iloc[test_index])

    assert np.isnan(predictions0).sum() == 0
    assert np.isnan(predictions1).sum() == 0
    return predictions0, predictions1

In [14]:
# Because it is a randomized experiment. Treatment is randomly assigned and not confounded by X. Therefore, we can estimate the propensity score as g(x) = 0.5
g = 0.5

## Estimate CATEs

In [17]:
def cate_aiptw(Q0, Q1, g, A, Y):
    """
    AIPTW estimator for CATE (same as ATE but applied to subgroups)
    """
    tau_hat = (Q1 - Q0 + A*(Y-Q1)/g - (1-A)*(Y-Q0)/(1-g)).mean()
    scores = Q1 - Q0 + A*(Y-Q1)/g - (1-A)*(Y-Q0)/(1-g) - tau_hat
    n = Y.shape[0]
    std_hat = np.std(scores) / np.sqrt(n)
    return tau_hat, std_hat

In [18]:
def estimate_cate_by_continuous_condition(main_data_cleaned, confounders, outcome, treatment, condition, 
                                        condition_name="black_proportion", method="quantiles", n_groups=3):
    """
    Estimate CATE conditioned on a continuous variable (neighborhood diversity)
    
    Parameters:
    -----------
    method : str
        - "quantiles": Split into quantile-based groups
        - "threshold": Split based on specific thresholds
        - "bins": Split into equal-width bins
    n_groups : int
        Number of groups to create (for quantiles/bins methods)
    """
    results = {}
    
    print(f"=== CATE Estimation by {condition_name} ===\n")
    print(f"Continuous variable summary:")
    print(f"Mean: {condition.mean():.4f}")
    print(f"Std: {condition.std():.4f}")
    print(f"Min: {condition.min():.4f}, Max: {condition.max():.4f}")
    print(f"Quantiles: {condition.quantile([0.25, 0.5, 0.75]).values}\n")
    
    # Create groups based on the continuous condition
    if method == "quantiles":
        # Split into quantile-based groups
        condition_groups = pd.qcut(condition, q=n_groups, labels=False, duplicates='drop')
        group_labels = {}
        for i in range(n_groups):
            mask = condition_groups == i
            if mask.sum() > 0:
                min_val = condition[mask].min()
                max_val = condition[mask].max()
                group_labels[i] = f"Q{i+1}: [{min_val:.3f}, {max_val:.3f}]"
                
    elif method == "threshold":
        # Define meaningful thresholds for black proportion
        # Low: < 0.1, Medium: 0.1-0.3, High: > 0.3
        thresholds = [0.1, 0.3]
        condition_groups = pd.cut(condition, bins=[-np.inf] + thresholds + [np.inf], 
                                labels=False, include_lowest=True)
        group_labels = {
            0: f"Low Diversity: < {thresholds[0]}",
            1: f"Medium Diversity: {thresholds[0]}-{thresholds[1]}",
            2: f"High Diversity: > {thresholds[1]}"
        }
        
    elif method == "bins":
        # Equal-width bins
        condition_groups = pd.cut(condition, bins=n_groups, labels=False, include_lowest=True)
        group_labels = {}
        for i in range(n_groups):
            mask = condition_groups == i
            if mask.sum() > 0:
                min_val = condition[mask].min()
                max_val = condition[mask].max()
                group_labels[i] = f"Bin {i+1}: [{min_val:.3f}, {max_val:.3f}]"

                # Estimate CATE for each group
    unique_groups = np.unique(condition_groups[~pd.isna(condition_groups)])
    
    for group in unique_groups:
        group_label = group_labels.get(group, f'Group {group}')
        print(f"--- {group_label} ---")
        
        # Filter data for this group
        group_mask = condition_groups == group
        group_confounders = confounders[group_mask].copy()
        group_outcome = outcome[group_mask].copy()
        group_treatment = treatment[group_mask].copy()
        group_condition_values = condition[group_mask].copy()
        
        # Reset indices to avoid issues
        group_confounders = group_confounders.reset_index(drop=True)
        group_outcome = group_outcome.reset_index(drop=True)
        group_treatment = group_treatment.reset_index(drop=True)
        
        print(f"Sample size: {len(group_outcome)}")
        print(f"Treatment rate: {group_treatment.mean():.3f}")
        print(f"Outcome rate: {group_outcome.mean():.3f}")
        print(f"Mean {condition_name}: {group_condition_values.mean():.3f}")
        print(f"{condition_name} range: [{group_condition_values.min():.3f}, {group_condition_values.max():.3f}]")
        
        # Check if we have enough data
        if len(group_outcome) < 50:
            print(f"Warning: Small sample size for {group_label}")
        
        # Check treatment variation
        if group_treatment.var() == 0:
            print(f"No treatment variation for {group_label}")
            continue
            
        try:
            # Step 1: Estimate Q (outcome model) using cross-fitting
            print("Estimating outcome model (Q)...")
            Q0, Q1 = outcome_k_fold_fit_and_predict(
                make_Q_model, 
                X=group_confounders, 
                y=group_outcome, 
                A=group_treatment,
                n_splits=10,
                output_type='binary'
            )
            
            # Step 2: Define g (propensity score)
            # Since this is a randomized experiment, g = 0.5
            g = 0.5  
            print(f"Using propensity score g = {g} (randomized experiment)")
            
            # Alternative: Estimate propensity score if needed
            # g_estimated = group_treatment.mean()  # Empirical treatment probability
            
            # Step 3: Estimate CATE using AIPTW
            print("Estimating CATE...")
            tau_hat, std_hat = cate_aiptw(Q0, Q1, g, group_treatment, group_outcome)
            
            # Calculate confidence interval
            ci_lower = tau_hat - 1.96 * std_hat
            ci_upper = tau_hat + 1.96 * std_hat
            
            # Store results
            results[group] = {
                'group_label': group_label,
                'n': len(group_outcome),
                'treatment_rate': group_treatment.mean(),
                'outcome_rate': group_outcome.mean(),
                'mean_condition': group_condition_values.mean(),
                'condition_range': (group_condition_values.min(), group_condition_values.max()),
                'cate_estimate': tau_hat,
                'std_error': std_hat,
                'ci_lower': ci_lower,
                'ci_upper': ci_upper
            }
            
            print(f"CATE Estimate: {tau_hat:.4f}")
            print(f"Standard Error: {std_hat:.4f}")
            print(f"95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]")
            
            # Interpretation
            if ci_lower > 0:
                print("Significant positive effect of black-sounding names")
            elif ci_upper < 0:
                print("Significant negative effect of black-sounding names")
            else:
                print("No significant effect")
                
        except Exception as e:
            print(f"Error estimating CATE for {group_label}: {str(e)}")
            
        print("\n")
    
    return results

In [19]:
results_quantile = estimate_cate_by_continuous_condition(main_data_cleaned, confounders, outcome, treatment, 
                                                condition, condition_name="black_proportion", 
                                                method="quantiles", n_groups=3)
results_quantile

=== CATE Estimation by black_proportion ===

Continuous variable summary:
Mean: 0.1412
Std: 0.2042
Min: 0.0000, Max: 0.9835
Quantiles: [0.0297899  0.04958272 0.14394372]

--- Q1: [0.000, 0.034] ---
Sample size: 2064
Treatment rate: 0.492
Outcome rate: 0.459
Mean black_proportion: 0.022
black_proportion range: [0.000, 0.034]
Estimating outcome model (Q)...
Using propensity score g = 0.5 (randomized experiment)
Estimating CATE...
CATE Estimate: -0.0976
Standard Error: 0.0215
95% CI: [-0.1397, -0.0555]
Significant negative effect of black-sounding names


--- Q2: [0.034, 0.094] ---
Sample size: 2061
Treatment rate: 0.498
Outcome rate: 0.443
Mean black_proportion: 0.055
black_proportion range: [0.034, 0.094]
Estimating outcome model (Q)...
Using propensity score g = 0.5 (randomized experiment)
Estimating CATE...
CATE Estimate: -0.0694
Standard Error: 0.0216
95% CI: [-0.1118, -0.0270]
Significant negative effect of black-sounding names


--- Q3: [0.095, 0.984] ---
Sample size: 2036
Treatmen

{np.float64(0.0): {'group_label': 'Q1: [0.000, 0.034]',
  'n': 2064,
  'treatment_rate': np.float64(0.49176356589147285),
  'outcome_rate': np.float64(0.45930232558139533),
  'mean_condition': np.float64(0.021865520484275498),
  'condition_range': (np.float64(0.0), np.float64(0.0340476667334267)),
  'cate_estimate': np.float64(-0.09760216042001296),
  'std_error': np.float64(0.021500742208702586),
  'ci_lower': np.float64(-0.13974361514907002),
  'ci_upper': np.float64(-0.055460705690955896)},
 np.float64(1.0): {'group_label': 'Q2: [0.034, 0.094]',
  'n': 2061,
  'treatment_rate': np.float64(0.4978165938864629),
  'outcome_rate': np.float64(0.44347404172731686),
  'mean_condition': np.float64(0.05484460225331116),
  'condition_range': (np.float64(0.0340860673199829),
   np.float64(0.0944857496902106)),
  'cate_estimate': np.float64(-0.06939628968758131),
  'std_error': np.float64(0.02162625746849639),
  'ci_lower': np.float64(-0.11178375432583423),
  'ci_upper': np.float64(-0.027008825

In [20]:
results_threshold = estimate_cate_by_continuous_condition(main_data_cleaned, confounders, outcome, treatment, 
                                                condition, condition_name="black_proportion", 
                                                method="threshold", n_groups=3)
results_threshold

=== CATE Estimation by black_proportion ===

Continuous variable summary:
Mean: 0.1412
Std: 0.2042
Min: 0.0000, Max: 0.9835
Quantiles: [0.0297899  0.04958272 0.14394372]

--- Low Diversity: < 0.1 ---
Sample size: 4195
Treatment rate: 0.495
Outcome rate: 0.451
Mean black_proportion: 0.039
black_proportion range: [0.000, 0.100]
Estimating outcome model (Q)...
Using propensity score g = 0.5 (randomized experiment)
Estimating CATE...
CATE Estimate: -0.0831
Standard Error: 0.0151
95% CI: [-0.1127, -0.0535]
Significant negative effect of black-sounding names


--- Medium Diversity: 0.1-0.3 ---
Sample size: 1062
Treatment rate: 0.502
Outcome rate: 0.444
Mean black_proportion: 0.176
black_proportion range: [0.100, 0.298]
Estimating outcome model (Q)...
Using propensity score g = 0.5 (randomized experiment)
Estimating CATE...
CATE Estimate: -0.1290
Standard Error: 0.0299
95% CI: [-0.1876, -0.0703]
Significant negative effect of black-sounding names


--- High Diversity: > 0.3 ---
Sample size: 9

{np.float64(0.0): {'group_label': 'Low Diversity: < 0.1',
  'n': 4195,
  'treatment_rate': np.float64(0.49535160905840286),
  'outcome_rate': np.float64(0.4512514898688915),
  'mean_condition': np.float64(0.039307727972410424),
  'condition_range': (np.float64(0.0), np.float64(0.0996316758747698)),
  'cate_estimate': np.float64(-0.08309351340362497),
  'std_error': np.float64(0.015119547973875198),
  'ci_lower': np.float64(-0.11272782743242037),
  'ci_upper': np.float64(-0.05345919937482958)},
 np.float64(1.0): {'group_label': 'Medium Diversity: 0.1-0.3',
  'n': 1062,
  'treatment_rate': np.float64(0.5018832391713748),
  'outcome_rate': np.float64(0.4444444444444444),
  'mean_condition': np.float64(0.17568376830386453),
  'condition_range': (np.float64(0.1002004008016032),
   np.float64(0.2976942282273461)),
  'cate_estimate': np.float64(-0.12897705924612102),
  'std_error': np.float64(0.02991524895320026),
  'ci_lower': np.float64(-0.18761094719439353),
  'ci_upper': np.float64(-0.070

In [21]:
results_bins = estimate_cate_by_continuous_condition(main_data_cleaned, confounders, outcome, treatment, 
                                                condition, condition_name="black_proportion", 
                                                method="bins", n_groups=3)
results_bins

=== CATE Estimation by black_proportion ===

Continuous variable summary:
Mean: 0.1412
Std: 0.2042
Min: 0.0000, Max: 0.9835
Quantiles: [0.0297899  0.04958272 0.14394372]

--- Bin 1: [0.000, 0.326] ---
Sample size: 5344
Treatment rate: 0.497
Outcome rate: 0.451
Mean black_proportion: 0.071
black_proportion range: [0.000, 0.326]
Estimating outcome model (Q)...
Using propensity score g = 0.5 (randomized experiment)
Estimating CATE...
CATE Estimate: -0.0920
Standard Error: 0.0134
95% CI: [-0.1182, -0.0658]
Significant negative effect of black-sounding names


--- Bin 2: [0.330, 0.651] ---
Sample size: 511
Treatment rate: 0.499
Outcome rate: 0.438
Mean black_proportion: 0.468
black_proportion range: [0.330, 0.651]
Estimating outcome model (Q)...
Using propensity score g = 0.5 (randomized experiment)
Estimating CATE...
CATE Estimate: -0.0518
Standard Error: 0.0425
95% CI: [-0.1352, 0.0315]
No significant effect


--- Bin 3: [0.656, 0.984] ---
Sample size: 306
Treatment rate: 0.484
Outcome ra

{np.float64(0.0): {'group_label': 'Bin 1: [0.000, 0.326]',
  'n': 5344,
  'treatment_rate': np.float64(0.49719311377245506),
  'outcome_rate': np.float64(0.4505988023952096),
  'mean_condition': np.float64(0.07085605918464628),
  'condition_range': (np.float64(0.0), np.float64(0.3259127337488869)),
  'cate_estimate': np.float64(-0.09197959048605067),
  'std_error': np.float64(0.013377955332726867),
  'ci_lower': np.float64(-0.11820038293819533),
  'ci_upper': np.float64(-0.06575879803390601)},
 np.float64(1.0): {'group_label': 'Bin 2: [0.330, 0.651]',
  'n': 511,
  'treatment_rate': np.float64(0.49902152641878667),
  'outcome_rate': np.float64(0.4383561643835616),
  'mean_condition': np.float64(0.4682469768223934),
  'condition_range': (np.float64(0.3302966101694915),
   np.float64(0.6514781665310551)),
  'cate_estimate': np.float64(-0.05183022282798179),
  'std_error': np.float64(0.042538011171397196),
  'ci_lower': np.float64(-0.1352047247239203),
  'ci_upper': np.float64(0.031544279

It is interesting to observe that the selection of grouping method affects whether the effect is seen as significant or not. With quantile-based groups, all groups see a significant effect. With threshold-based groups, only the highest proportion group doesn't see a significant effect. With equal-size bins, only the lowest proportion group sees a significant effect. This assumes three groups (low, mid, high) which can be changed.