In [None]:
## imports
import contextlib
import io

import numpy as np
import pandas as pd
from chow_test import chow_test

In [None]:
## load data
# Read the positive- and negative-word tables and stack them into one long frame.
pos_words = pd.read_csv("FIGS/VIS_DATA/positive_words_analysis_data.csv")
neg_words = pd.read_csv("FIGS/VIS_DATA/negative_words_analysis_data.csv")
all_words = pd.concat([pos_words, neg_words])

# Long -> wide: one row per year, one column per context word.
# If a (year, word) pair occurs more than once, pivot_table averages the values
# (its default aggregation). reset_index() turns 'year' back into a regular
# column, and rename_axis(None, ...) drops the leftover 'word' label from the
# column axis.
all_words_wide = (
    pd.pivot_table(
        all_words,
        values='mean_across_iterations',
        index='year',
        columns='word',
    )
    .reset_index()
    .rename_axis(None, axis="columns")
)

# quick look at the wide table
print(all_words_wide.head())

# last expression of the cell: displays the full list of years covered
all_words_wide['year']

   year   empower      harm    invest  legislation  liability  litigation  \
0  2010  0.217307  0.368224  0.102083     0.366008   0.168271    0.191940   
1  2011  0.157787  0.342905  0.058923     0.351480   0.116545    0.161949   
2  2012  0.140549  0.383963  0.105659     0.355607   0.144250    0.188393   
3  2013  0.179980  0.394876  0.142112     0.358269   0.131456    0.185068   
4  2014  0.151991  0.386000  0.144131     0.390296   0.148784    0.220432   

   obligation   protect  reputation   respect     trust    uphold  
0    0.033588  0.377598    0.364798  0.192478  0.076005  0.166343  
1    0.027249  0.379028    0.368427  0.193478  0.077115  0.167386  
2    0.003484  0.373725    0.397669  0.210489  0.079162  0.090933  
3    0.016065  0.377045    0.405079  0.232730  0.095468  0.120221  
4    0.030264  0.386584    0.402952  0.216187  0.069196  0.186740  


0     2010
1     2011
2     2012
3     2013
4     2014
5     2015
6     2016
7     2017
8     2018
9     2019
10    2020
11    2021
12    2022
Name: year, dtype: int64

In [None]:
"""
Structural-break analysis of the context-word series for "privacy".

- Uses the Chow test to check several candidate breakpoints where regulatory
  shocks (GDPR & CCPA) may have had an effect on the semantic relationships
  with the word "privacy".
- Candidate break years: 2018 (GDPR enforcement), 2019, and 2020 (CCPA enforcement).
- Gen AI was used to help with the logic of bootstrapping and generating
  robust p-values.

For every (word, break year) pair the cell:
  1. runs the Chow test on the observed yearly series,
  2. re-runs it on n_bootstrap row-resampled copies of the table, split at the
     same row position, and
  3. reports the share of bootstrap statistics >= the observed statistic as
     the bootstrap p-value.
Results are collected in `results` / `results_df` and written to
'chow_test_bootstrap_results.csv'.
"""

# --- configuration -----------------------------------------------------------
BREAK_YEARS = [2018, 2019, 2020]
SIGNIFICANCE = 0.05
SEED = 42

#1000 bootstrap iterations
n_bootstrap = 1000

# Seeded generator so the bootstrap p-values are reproducible on Restart & Run All.
rng = np.random.default_rng(SEED)


def _run_chow(frame, column, first_index):
    """Run chow_test on frame[column] vs frame['year'], splitting before row `first_index`.

    `frame` must have a 0..n-1 index (as produced by reset_index), because the
    split is passed to chow_test as index values `first_index - 1` (last row of
    the first period) and `first_index` (first row of the second period).
    Returns chow_test's result unchanged; this cell reads element 0 as the Chow
    statistic and element 1 as the p-value.
    """
    return chow_test(
        y_series=frame[column],
        X_series=frame['year'],
        last_index=first_index - 1,
        first_index=first_index,
        significance=SIGNIFICANCE,
    )


# --- locate each break year in the table ------------------------------------
years = all_words_wide['year'].to_numpy()
n_rows = len(all_words_wide)

# Counting the rows before a break year only yields a row position if the
# years are strictly increasing, so check that instead of assuming it.
if np.any(np.diff(years) <= 0):
    raise ValueError("all_words_wide['year'] must be strictly increasing")

# Row position of the first observation at/after each break year. Derived from
# the data rather than hard-coding the first year (equals break_year - 2010 for
# a contiguous table starting in 2010).
break_positions = {year: int((years < year).sum()) for year in BREAK_YEARS}
for year, position in break_positions.items():
    if not 0 < position < n_rows:
        raise ValueError(
            f"break year {year} leaves no observations on one side of the split"
        )

# --- run the tests -----------------------------------------------------------
#store results: list of dictionaries
results = []

#run chow test for each word, year pair
for word_pair in all_words['word'].unique():
    for break_year in BREAK_YEARS:
        first_index = break_positions[break_year]

        #calculate original Chow-stat (chow_test prints its verdict for this call)
        original_result = _run_chow(all_words_wide, word_pair, first_index)
        original_chow = original_result[0]

        #store bootstrap data
        bootstrap_chow_stats = []

        for _ in range(n_bootstrap):
            # Resample whole rows with replacement. Rows land in random order,
            # so splitting at the same position compares two arbitrary subsets
            # of years; that is what serves as the "no breakpoint" reference.
            boot_indices = rng.integers(0, n_rows, size=n_rows)
            boot_sample = all_words_wide.iloc[boot_indices].reset_index(drop=True)

            try:
                # chow_test prints two lines per call; silence them here so the
                # cell output is not flooded with thousands of lines.
                with contextlib.redirect_stdout(io.StringIO()):
                    boot_result = _run_chow(boot_sample, word_pair, first_index)
            except Exception:
                # A resample that chow_test cannot handle is skipped; the number
                # of usable resamples is recorded in 'n_bootstrap_samples'.
                # (Exception, not a bare except, so Ctrl-C still interrupts.)
                continue
            bootstrap_chow_stats.append(boot_result[0])

        # bootstrapped p-value: share of bootstrap statistics at least as large
        # as the statistic observed on the original data
        if len(bootstrap_chow_stats) > 0:
            bootstrap_p = (np.array(bootstrap_chow_stats) >= original_chow).mean()
        else:
            bootstrap_p = np.nan

        # Store results
        results.append({
            'word_pair': word_pair,
            'break_year': break_year,
            'original_chow_stat': original_chow,
            'original_p_value': original_result[1],
            'bootstrap_p_value': bootstrap_p,
            'n_bootstrap_samples': len(bootstrap_chow_stats)
        })

#save to csv
results_df = pd.DataFrame(results)
results_df.to_csv('chow_test_bootstrap_results.csv', index=False)

Reject the null hypothesis of equality of regression coefficients in the two periods.
Chow Statistic: 9.909712106999413, P_value: 0.00683860311568063
Fail to reject the null hypothesis of equality of regression coefficients in the two periods.
Chow Statistic: 4.000332476147229, P_value: 0.06248961119980534
Fail to reject the null hypothesis of equality of regression coefficients in the two periods.
Chow Statistic: 0.7764577053464626, P_value: 0.49183136302432917
Fail to reject the null hypothesis of equality of regression coefficients in the two periods.
Chow Statistic: 1.652090015323875, P_value: 0.2508439818388246
Fail to reject the null hypothesis of equality of regression coefficients in the two periods.
Chow Statistic: 1.0925526397352436, P_value: 0.3806254802976141
Fail to reject the null hypothesis of equality of regression coefficients in the two periods.
Chow Statistic: 2.4417859941600746, P_value: 0.148667074057795
Fail to reject the null hypothesis of equality of regression 