In [15]:
import pandas as pd
import statsmodels.api as sm

# Read the three input tables; the paths are relative to the working directory.
people_df, exposures_df, site_visits_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/people.csv',
        '../data/exposures.csv',
        '../data/site_visits.csv',
    )
)

# Both event tables have a 'time' column; parse it to datetime in place.
for event_df in (exposures_df, site_visits_df):
    event_df['time'] = pd.to_datetime(event_df['time'])

# Calculate exposure frequency: number of rows in exposures_df per person.
# NOTE(review): people who never appear in exposures_df (zero exposures) get no
# row here and are therefore absent from Model 1 -- confirm that is intended.
exposure_frequency = exposures_df['person'].value_counts().reset_index()
exposure_frequency.columns = ['person', 'exposure_count']

# Merge exposure frequency with people_df to analyze demographics influence
demographics_analysis_df = exposure_frequency.merge(people_df, on='person', how='left')

# Ensure all relevant columns are numeric (unparseable values become NaN)
predictor_source_cols = ['income_bin', 'has_dog']
for col in predictor_source_cols:
    demographics_analysis_df[col] = pd.to_numeric(demographics_analysis_df[col], errors='coerce')

# Drop rows with a missing predictor BEFORE dummy encoding. get_dummies encodes
# NaN as all-zero dummies, which is indistinguishable from the reference
# category, so a dropna() after the encoding can no longer find those rows.
demographics_analysis_df = demographics_analysis_df.dropna(subset=predictor_source_cols)

# A NaN forces the column to float, which would turn the dummy labels into
# 'income_bin_2.0' / 'has_dog_1.0'. Casting back keeps the labels stable.
# Assumes income_bin holds integer bin labels and has_dog is a boolean flag,
# which is what the 'income_bin_2' / 'has_dog_True' labels imply -- TODO confirm.
demographics_analysis_df = demographics_analysis_df.astype({'income_bin': int, 'has_dog': bool})

# Create dummy variables (the first level of each variable is the reference category)
demographics_analysis_df = pd.get_dummies(demographics_analysis_df, columns=predictor_source_cols, drop_first=True)

# Convert boolean columns to integers
bool_cols = demographics_analysis_df.select_dtypes(include='bool').columns
demographics_analysis_df = demographics_analysis_df.astype({col: int for col in bool_cols})

# Check data types and the first few rows
print("Data types in demographics_analysis_df:")
print(demographics_analysis_df.dtypes)
print("\nFirst few rows of demographics_analysis_df:")
print(demographics_analysis_df.head())

# Ensure there are no NaNs (this also drops rows that have a missing value in
# any other column, including columns the model does not use)
demographics_analysis_df = demographics_analysis_df.dropna()

# Model 1: Demographics driving exposure
# Pick up every dummy column that was actually generated instead of assuming
# exactly bins 2..5 exist; an unlisted bin would silently join the baseline.
predictor_cols = [
    col for col in demographics_analysis_df.columns
    if isinstance(col, str) and col.startswith(('income_bin_', 'has_dog_'))
]
if demographics_analysis_df.empty or not predictor_cols:
    raise ValueError("Model 1: no complete rows or no dummy predictors left to fit.")

X1 = demographics_analysis_df[predictor_cols]
y1 = demographics_analysis_df['exposure_count']
X1 = sm.add_constant(X1)
model1 = sm.OLS(y1, X1).fit()

print("Model 1: Demographics driving exposure")
print(model1.summary())

# Model 2: Does it vary by ad?
# Count exposures per (person, ad_id) and pivot to one column per ad_id;
# person/ad pairs with no exposure rows are filled with 0.
ad_exposure_df = exposures_df.groupby(['person', 'ad_id']).size().unstack(fill_value=0)
ad_exposure_df = ad_exposure_df.merge(people_df, on='person', how='left')

# Ensure all relevant columns are numeric (unparseable values become NaN)
predictor_source_cols = ['income_bin', 'has_dog']
for col in predictor_source_cols:
    ad_exposure_df[col] = pd.to_numeric(ad_exposure_df[col], errors='coerce')

# Drop rows with a missing predictor BEFORE dummy encoding. get_dummies encodes
# NaN as all-zero dummies, which is indistinguishable from the reference
# category, so a dropna() after the encoding can no longer find those rows.
ad_exposure_df = ad_exposure_df.dropna(subset=predictor_source_cols)

# A NaN forces the column to float, which would turn the dummy labels into
# 'income_bin_2.0' / 'has_dog_1.0'. Casting back keeps the labels stable.
# Assumes income_bin holds integer bin labels and has_dog is a boolean flag,
# which is what the 'income_bin_2' / 'has_dog_True' labels imply -- TODO confirm.
ad_exposure_df = ad_exposure_df.astype({'income_bin': int, 'has_dog': bool})

ad_exposure_df = pd.get_dummies(ad_exposure_df, columns=predictor_source_cols, drop_first=True)

# Convert boolean columns to integers
bool_cols = ad_exposure_df.select_dtypes(include='bool').columns
ad_exposure_df = ad_exposure_df.astype({col: int for col in bool_cols})

# Also drops rows with a missing value in any column the model does not use
ad_exposure_df = ad_exposure_df.dropna()

# Pick up every dummy column that was actually generated. The per-ad count
# columns have non-string labels, hence the isinstance guard.
predictor_cols = [
    col for col in ad_exposure_df.columns
    if isinstance(col, str) and col.startswith(('income_bin_', 'has_dog_'))
]
if ad_exposure_df.empty or not predictor_cols:
    raise ValueError("Model 2: no complete rows or no dummy predictors left to fit.")

X2 = ad_exposure_df[predictor_cols]
y2_ad1 = ad_exposure_df[1]
y2_ad2 = ad_exposure_df[2]
X2 = sm.add_constant(X2)
model2_ad1 = sm.OLS(y2_ad1, X2).fit()
model2_ad2 = sm.OLS(y2_ad2, X2).fit()

print("\nModel 2: Does it vary by ad?")
print("Ad 1 exposure model summary:")
print(model2_ad1.summary())
print("\nAd 2 exposure model summary:")
print(model2_ad2.summary())

# Model 3: Does it vary by channel?
# Count exposures per (person, channel_id) and pivot to one column per
# channel_id; person/channel pairs with no exposure rows are filled with 0.
channel_exposure_df = exposures_df.groupby(['person', 'channel_id']).size().unstack(fill_value=0)
channel_exposure_df = channel_exposure_df.merge(people_df, on='person', how='left')

# Ensure all relevant columns are numeric (unparseable values become NaN)
predictor_source_cols = ['income_bin', 'has_dog']
for col in predictor_source_cols:
    channel_exposure_df[col] = pd.to_numeric(channel_exposure_df[col], errors='coerce')

# Drop rows with a missing predictor BEFORE dummy encoding. get_dummies encodes
# NaN as all-zero dummies, which is indistinguishable from the reference
# category, so a dropna() after the encoding can no longer find those rows.
channel_exposure_df = channel_exposure_df.dropna(subset=predictor_source_cols)

# A NaN forces the column to float, which would turn the dummy labels into
# 'income_bin_2.0' / 'has_dog_1.0'. Casting back keeps the labels stable.
# Assumes income_bin holds integer bin labels and has_dog is a boolean flag,
# which is what the 'income_bin_2' / 'has_dog_True' labels imply -- TODO confirm.
channel_exposure_df = channel_exposure_df.astype({'income_bin': int, 'has_dog': bool})

channel_exposure_df = pd.get_dummies(channel_exposure_df, columns=predictor_source_cols, drop_first=True)

# Convert boolean columns to integers
bool_cols = channel_exposure_df.select_dtypes(include='bool').columns
channel_exposure_df = channel_exposure_df.astype({col: int for col in bool_cols})

# Also drops rows with a missing value in any column the model does not use
channel_exposure_df = channel_exposure_df.dropna()

# Pick up every dummy column that was actually generated. The per-channel
# count columns have non-string labels, hence the isinstance guard.
predictor_cols = [
    col for col in channel_exposure_df.columns
    if isinstance(col, str) and col.startswith(('income_bin_', 'has_dog_'))
]
if channel_exposure_df.empty or not predictor_cols:
    raise ValueError("Model 3: no complete rows or no dummy predictors left to fit.")

X3 = channel_exposure_df[predictor_cols]
y3_ch1 = channel_exposure_df[1]
y3_ch2 = channel_exposure_df[2]
y3_ch3 = channel_exposure_df[3]
X3 = sm.add_constant(X3)
model3_ch1 = sm.OLS(y3_ch1, X3).fit()
model3_ch2 = sm.OLS(y3_ch2, X3).fit()
model3_ch3 = sm.OLS(y3_ch3, X3).fit()

print("\nModel 3: Does it vary by channel?")
print("Channel 1 exposure model summary:")
print(model3_ch1.summary())
print("\nChannel 2 exposure model summary:")
print(model3_ch2.summary())
print("\nChannel 3 exposure model summary:")
print(model3_ch3.summary())

# Summary of findings
# Significance statements are computed from each fitted model's coefficients
# and p-values instead of being typed in by hand, so the text cannot drift out
# of sync with the regressions when the data or the models change.
SIGNIFICANCE_LEVEL = 0.05


def _describe_terms(model, prefix, alpha=SIGNIFICANCE_LEVEL):
    """Build one report line per model coefficient whose name starts with ``prefix``.

    Parameters
    ----------
    model
        Fitted results object exposing ``params`` and ``pvalues`` as
        label-indexed pandas Series (what statsmodels returns when the
        model was fitted on a DataFrame).
    prefix : str
        Leading part of the coefficient names to report, e.g. ``'income_bin_'``.
    alpha : float
        Threshold below which a p-value is reported as significant.

    Returns
    -------
    list of str
        Indented bullet lines, or a single placeholder line when no
        coefficient name matches ``prefix``.
    """
    lines = []
    for term, coef in model.params.items():
        if not str(term).startswith(prefix):
            continue
        p_value = model.pvalues[term]
        if coef > 0:
            direction = "positive"
        elif coef < 0:
            direction = "negative"
        else:
            direction = "zero"
        verdict = "significant" if p_value < alpha else "not significant"
        lines.append(
            f"   - {term}: {direction} coefficient ({coef:.3f}), "
            f"{verdict} at the {alpha} level (p = {p_value:.3f})"
        )
    return lines or [f"   - no '{prefix}*' terms in this model"]


def _print_model_section(label, fitted_model):
    """Print the R-squared line and the per-term findings for one labelled model."""
    print(f"   - For {label}, R-squared is {fitted_model.rsquared:.3f}, suggesting that {fitted_model.rsquared*100:.2f}% of the variance in exposure count is explained by the model.")
    for prefix in ('income_bin_', 'has_dog_'):
        for line in _describe_terms(fitted_model, prefix):
            # Indent term lines one level below the model's own bullet.
            print("   " + line)


print("\nSummary of findings:")
print("1. What demographics drive exposure?")
for line in _describe_terms(model1, 'income_bin_'):
    print(line)
print(f"   - R-squared for the model is {model1.rsquared:.3f}, suggesting that {model1.rsquared*100:.2f}% of the variance in exposure count is explained by these demographics.")
for line in _describe_terms(model1, 'has_dog_'):
    print(line)

print("\n2. Does it vary by ad?")
for label, fitted_model in (("Ad 1", model2_ad1), ("Ad 2", model2_ad2)):
    _print_model_section(label, fitted_model)

print("\n3. Does it vary by channel?")
for label, fitted_model in (
    ("Channel 1", model3_ch1),
    ("Channel 2", model3_ch2),
    ("Channel 3", model3_ch3),
):
    _print_model_section(label, fitted_model)


Data types in demographics_analysis_df:
person              int64
exposure_count      int64
demog3            float64
demog4            float64
income_bin_2        int64
income_bin_3        int64
income_bin_4        int64
income_bin_5        int64
has_dog_True        int64
dtype: object

First few rows of demographics_analysis_df:
   person  exposure_count    demog3    demog4  income_bin_2  income_bin_3  \
0    2648             523  2.186928 -2.490602             0             0   
1    2155             446 -0.299305 -0.549698             0             0   
2    4006             416  0.158717  0.219885             0             0   
3    5870             329  1.706551  0.180851             0             0   
4      50             227  2.239418 -1.423470             0             0   

   income_bin_4  income_bin_5  has_dog_True  
0             0             1             0  
1             0             1             1  
2             0             1             0  
3             0     