# The 2017 Equifax Data Breach: A Causal Impact Analysis

## 1. Introduction
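This notebook quantifies the causal impact of the 2017 Equifax data breach announcement on the company's stock returns. On September 7, 2017, Equifax disclosed a massive data breach that exposed the personal information of 147 million people. Because the disclosure was made after the market had closed, the first trading day that could reflect the news was September 8, 2017, which is the event date used throughout this analysis. This analysis uses quasi-experimental methods to estimate the financial repercussions of this event on Equifax's market value.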

The primary research question is: **What was the causal effect of the data breach announcement on Equifax's daily stock returns?**

To answer this, two robust causal inference techniques are employed:
1.  **Difference-in-Differences (DiD)**
2.  **Synthetic Control Method (SCM)**

Both methods are validated using a comprehensive placebo testing framework to ensure the reliability of the findings and to compare model performance.

## 2. Setup and Configuration

In [None]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import sys

# Assuming src folder is in the same directory
sys.path.append('src')
from data_collection import fetch_stock_data, prepare_analysis_data
from causal_methods import DifferenceInDifferences, SyntheticControl, run_placebo_test

# Plot appearance: seaborn grid theme first, then the matplotlib defaults
# shared by every figure that does not pass its own figsize.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

# Analysis configuration
# Ticker universe; TREATED_UNIT below names the treated ticker.
# NOTE(review): presumably every other ticker serves as a control/donor —
# confirm against prepare_analysis_data.
TICKERS = ['EFX', 'MCO', 'TRU', 'SPY', 'VTI', 'EXPGY', 'BAH']

# Date range handed to fetch_stock_data.
START_DATE = '2017-01-01'
END_DATE = '2017-09-30'

# NOTE(review): the introduction dates the disclosure to 2017-09-07;
# presumably 2017-09-08 is used because it is the first trading session
# after the announcement — confirm.
EVENT_DATE = '2017-09-08'
TREATED_UNIT = 'EFX'

# Passed to prepare_analysis_data together with EVENT_DATE.
# NOTE(review): units (calendar vs. trading days) are not visible here.
WINDOW_DAYS = 180

## 3. Data Collection and Preparation

In [None]:
# Step 1: pull the raw data for every configured ticker and date range.
print("Fetching stock data...")
df = fetch_stock_data(TICKERS, START_DATE, END_DATE)
n_records, n_tickers = len(df), len(TICKERS)
print(f"Fetched {n_records} records for {n_tickers} tickers")

# Step 2: build the analysis frame. The code below relies on it exposing
# a 'date' column and a 0/1 'post' indicator.
print("\nPreparing analysis data...")
df_prepared = prepare_analysis_data(df, EVENT_DATE, TREATED_UNIT, WINDOW_DAYS)

window_start = df_prepared['date'].min().date()
window_end = df_prepared['date'].max().date()
print(f"Analysis window: {window_start} to {window_end}")

# Row counts on each side of the event (rows, not distinct dates).
post_flag = df_prepared['post']
print(f"Pre-treatment: {(post_flag == 0).sum()}")
print(f"Post-treatment: {(post_flag == 1).sum()}")

# Last expression of the cell: shows a preview of the prepared frame.
df_prepared.head()

## 4. Causal Analysis

### 4.1 Difference-in-Differences (DiD)

In [None]:
# Fit the Difference-in-Differences estimator on the prepared panel.
print("Running DiD...")
did = DifferenceInDifferences(df_prepared)
did_results = did.estimate()

# Report the point estimate with its uncertainty in one go.
ci_low = did_results['conf_int'][0]
ci_high = did_results['conf_int'][1]
did_report = [
    "\nDiD Results:",
    f"  Treatment Effect: {did_results['treatment_effect']:.4f}",
    f"  Std Error: {did_results['std_error']:.4f}",
    f"  P-value: {did_results['p_value']:.4f}",
    f"  95% CI: [{ci_low:.4f}, {ci_high:.4f}]",
]
print("\n".join(did_report))

# Check the identifying assumption and print the estimator's own verdict.
parallel_trends = did.test_parallel_trends()
trend_p = parallel_trends['p_value']
trend_note = parallel_trends['interpretation']
print(f"\nParallel Trends Test: p = {trend_p:.4f}")
print(f"  {trend_note}")

### 4.2 Synthetic Control Method (SCM)

In [None]:
# Fit the Synthetic Control estimator for the treated unit.
print("Running SCM...")
scm = SyntheticControl(df_prepared, TREATED_UNIT)
scm_results = scm.estimate()

scm_effect_value = scm_results['treatment_effect']
scm_pre_rmse = scm_results['pre_treatment_rmse']
print("\nSCM Results:")
print(f"  Treatment Effect: {scm_effect_value:.4f}")
print(f"  Pre-treatment RMSE: {scm_pre_rmse:.6f}")

# List donors from largest to smallest weight, hiding weights of 0.01 or less.
print("\nDonor Weights:")
ranked_weights = sorted(
    scm_results['weights'].items(),
    key=lambda pair: pair[1],
    reverse=True,
)
for donor, share in (pair for pair in ranked_weights if pair[1] > 0.01):
    print(f"  {donor}: {share:.3f}")

### 4.3 Model Validation: Placebo Tests

In [None]:
# Placebo validation: run both estimators on pre-treatment data only, once
# per candidate offset, and summarise how large the resulting "effects" are.
# NOTE(review): presumably `days` shifts a fake event that many days before
# the real one — confirm against run_placebo_test.
print("Running placebo tests...")
pre_data = df_prepared[df_prepared['post'] == 0].copy()
event_datetime = pd.to_datetime(EVENT_DATE).tz_localize('America/New_York')
n_placebo = 100
candidate_days = np.linspace(30, 150, n_placebo).astype(int)

did_placebo_effects = []
scm_placebo_effects = []
completed_days = []
failed_days = []

for days in candidate_days:
    # Run BOTH estimators before recording either result, so the two effect
    # lists always stay the same length and refer to the same offsets.
    try:
        did_effect = run_placebo_test(pre_data, 'did', event_datetime, TREATED_UNIT, days)
        scm_effect = run_placebo_test(pre_data, 'scm', event_datetime, TREATED_UNIT, days)
    except Exception as exc:
        # Best-effort: skip offsets an estimator cannot handle, but remember
        # them so the skip is reported instead of silently swallowed.
        failed_days.append((int(days), repr(exc)))
        continue
    did_placebo_effects.append(did_effect)
    scm_placebo_effects.append(scm_effect)
    completed_days.append(int(days))

# Keep only the offsets that produced results. Code that later pairs
# `placebo_days[:len(did_placebo_effects)]` with the effect lists therefore
# lines each effect up with the offset that actually generated it.
placebo_days = np.array(completed_days, dtype=int)

if failed_days:
    first_day, first_error = failed_days[0]
    print(f"  Skipped {len(failed_days)} of {n_placebo} placebo offsets "
          f"(first failure at {first_day} days: {first_error})")


def _placebo_rmse(effects):
    """Return the root-mean-square of the placebo effects, or NaN if there are none."""
    if len(effects) == 0:
        # Avoid numpy's empty-mean RuntimeWarning; NaN still formats cleanly.
        return np.nan
    return np.sqrt(np.mean(np.array(effects)**2))


did_placebo_rmse = _placebo_rmse(did_placebo_effects)
scm_placebo_rmse = _placebo_rmse(scm_placebo_effects)

print("\nPlacebo Test Results:")
print(f"  DiD Placebo RMSE: {did_placebo_rmse:.6f}")
print(f"  SCM Placebo RMSE: {scm_placebo_rmse:.6f}")

## 5. Visualizations

In [None]:
# Tabulate the placebo runs: one row per completed placebo offset.
n_completed = len(did_placebo_effects)
placebo_df = pd.DataFrame(
    {
        'days_before': placebo_days[:n_completed],
        'did_effect': did_placebo_effects,
        'scm_effect': scm_placebo_effects,
    }
)

# Gather the headline numbers that the figures below read from.
results = dict(
    did_treatment_effect=did_results['treatment_effect'],
    scm_treatment_effect=scm_results['treatment_effect'],
    did_placebo_rmse=did_placebo_rmse,
    scm_placebo_rmse=scm_placebo_rmse,
    parallel_trends_p_value=parallel_trends['p_value'],
    scm_weights=scm_results['weights'],
)

In [None]:
# Figure 1: daily returns of the treated unit against the control average.
fig, ax = plt.subplots(figsize=(14, 7))

is_control = df_prepared['treated'] == 0
is_treated = df_prepared['treated'] == 1

# Controls are collapsed to one cross-sectional mean return per date.
control_avg = (
    df_prepared.loc[is_control]
    .groupby('date')['returns']
    .mean()
    .reset_index()
)
treated_data = df_prepared.loc[is_treated, ['date', 'returns']]

ax.plot(
    control_avg['date'],
    control_avg['returns'],
    label='Control Group Average',
    linewidth=2,
    alpha=0.7,
    color='#2E86AB',
)
ax.plot(
    treated_data['date'],
    treated_data['returns'],
    label='Equifax (EFX)',
    linewidth=2.5,
    color='#A23B72',
)

# Vertical marker at the configured event date.
event_marker = pd.to_datetime(EVENT_DATE)
ax.axvline(
    event_marker,
    color='red',
    linestyle='--',
    linewidth=2,
    label='Breach Announcement',
    alpha=0.7,
)

ax.set_title('Figure 1: Equifax Stock Returns vs Control Group')
ax.legend()
plt.show()

In [None]:
# Figure 2: placebo-effect histograms, one panel per method, each with the
# actual estimated effect overlaid as a vertical line.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# (axis, placebo column, effect key, rmse key, title stem, bar colour)
panel_specs = [
    (ax1, 'did_effect', 'did_treatment_effect', 'did_placebo_rmse',
     'Figure 2a: DiD Placebo Distribution', '#2E86AB'),
    (ax2, 'scm_effect', 'scm_treatment_effect', 'scm_placebo_rmse',
     'Figure 2b: SCM Placebo Distribution', '#A23B72'),
]

for axis, column, effect_key, rmse_key, title_stem, colour in panel_specs:
    actual_effect = results[effect_key]
    axis.hist(placebo_df[column], bins=30, alpha=0.7, color=colour, edgecolor='black')
    axis.axvline(
        actual_effect,
        color='darkred',
        linestyle='-',
        linewidth=2.5,
        label=f"Actual Effect: {actual_effect:.4f}",
    )
    axis.set_title(f'{title_stem} (RMSE: {results[rmse_key]:.6f})')
    axis.legend()

plt.show()

In [None]:
# Figure 3: absolute size of each method's estimated effect, in percent.
fig, ax = plt.subplots(figsize=(10, 7))

methods = ['DiD', 'SCM']
effects = [results[key] for key in ('did_treatment_effect', 'scm_treatment_effect')]
# NOTE(review): placebo_rmse is collected here but not drawn on this figure —
# confirm whether error bars were intended.
placebo_rmse = [results[key] for key in ('did_placebo_rmse', 'scm_placebo_rmse')]

# Bars show magnitudes only; the sign of each effect is dropped by np.abs.
effect_magnitude_pct = np.abs(effects) * 100
ax.bar(methods, effect_magnitude_pct, color=['#2E86AB', '#A23B72'])

ax.set_ylabel('Treatment Effect (% decline)')
ax.set_title('Figure 3: Method Comparison')
plt.show()

In [None]:
# Figure 4: horizontal bars of the synthetic-control donor weights.
fig, ax = plt.subplots(figsize=(10, 6))

# One (unit, weight) row per donor, sorted ascending by weight for barh.
donor_rows = list(results['scm_weights'].items())
weights_df = pd.DataFrame(donor_rows, columns=['Unit', 'Weight'])
weights_df = weights_df.sort_values('Weight', ascending=True)

ax.barh(weights_df['Unit'], weights_df['Weight'])
ax.set_xlabel('Weight')
ax.set_title('Figure 4: Synthetic Control Donor Weights')
plt.show()

In [None]:
# Figure 5: cumulative pre-treatment returns of the treated unit and the
# control average, titled with the parallel-trends p-value.
fig, ax = plt.subplots(figsize=(14, 7))

pre_data = df_prepared[df_prepared['post']==0]
pre_is_treated = pre_data['treated'] == 1
pre_is_control = pre_data['treated'] == 0


def _compound(daily_returns):
    """Compound a series of simple daily returns into a cumulative return."""
    return (1 + daily_returns).cumprod() - 1


# Treated unit: its own return series in date order.
treated_pre = pre_data[pre_is_treated].sort_values('date')
# Controls: cross-sectional mean return per date, in date order.
control_pre = (
    pre_data[pre_is_control]
    .groupby('date')['returns']
    .mean()
    .reset_index()
    .sort_values('date')
)

treated_pre = treated_pre.assign(cumulative_return=_compound(treated_pre['returns']))
control_pre = control_pre.assign(cumulative_return=_compound(control_pre['returns']))

# Control line is drawn first, then the treated unit; values are in percent.
for series_frame, series_label in ((control_pre, 'Control Group'), (treated_pre, 'Equifax (EFX)')):
    ax.plot(series_frame['date'], series_frame['cumulative_return']*100, label=series_label)

trend_p_value = results["parallel_trends_p_value"]
ax.set_title(f'Figure 5: Parallel Trends Test (p-value = {trend_p_value:.4f})')
ax.legend()
plt.show()

In [None]:
# Figure 6: treated-minus-control mean return for each day offset from 30
# before the event through 7 after it.
fig, ax = plt.subplots(figsize=(14, 7))

event_study = df_prepared.copy()
event_study['days_to_event'] = event_study['days_to_event'].astype(int)


def _treated_minus_control(day_rows):
    """Return the mean treated return minus the mean control return for one day."""
    group_flag = day_rows['treated']
    treated_mean = day_rows[group_flag == 1]['returns'].mean()
    control_mean = day_rows[group_flag == 0]['returns'].mean()
    return treated_mean - control_mean


days_range = range(-30, 8)
rows_by_offset = (
    (offset, event_study[event_study['days_to_event'] == offset])
    for offset in days_range
)
# Offsets with no rows at all are left out of the plot.
treatment_effects_by_day = [
    {'day': offset, 'effect': _treated_minus_control(day_rows)}
    for offset, day_rows in rows_by_offset
    if len(day_rows) > 0
]
effect_df = pd.DataFrame(treatment_effects_by_day)

# Effects are plotted in percent; day 0 is marked as the event.
ax.plot(effect_df['day'], effect_df['effect']*100, marker='o')
ax.axvline(0, color='red', linestyle='--', label='Breach Announcement')
ax.set_title('Figure 6: Event Study Plot')
ax.legend()
plt.show()