# 0. Libraries and setup

In [1]:
from sklearn.model_selection import KFold, TimeSeriesSplit
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from panelsplit.cross_validation import PanelSplit
from functools import reduce
from hw1_library import feature_engineering, target_engineer, text_preprocessing

# 1. Loading the data that have been previously cleaned and merged

In [2]:
# Read the previously cleaned and merged protest-event table.
# low_memory=False is passed so pandas does not parse the file in internal
# chunks, which avoids mixed-dtype inference within a column.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably a
# saved index from an earlier to_csv call -- consider index_col=0 if unused.
df = pd.read_csv(filepath_or_buffer='clean_data/ccc_combined.csv', low_memory=False)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,date,year,month,state,type,issues,actors,claims,valence,size_low,size_high,size_mean,arrests_any,injuries_crowd_any,injuries_police_any,property_damage_any,notes
0,2017-01-01,2017,1.0,DC,vigil,military,,for banning nuclear weapons; for peace,0.0,,,,0.0,0.0,0.0,0.0,White House Peace Vigil continuous since June ...
1,2017-01-01,2017,1.0,MN,vigil,military,Peace Vigil Mankato,for peace,0.0,,,,0.0,0.0,0.0,0.0,every Sunday since 2001
2,2017-01-01,2017,1.0,MN,protest; banner drop,banking and finance; economy; energy; environm...,general protestors,against the Dakota Access Pipeline; for indige...,1.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,hung banner from stadium roof during NFL game
3,2017-01-01,2017,1.0,RI,vigil,environment; guns; military,Sakonnet Peace Alliance,for peace; for gun control; for climate action,1.0,,,,0.0,0.0,0.0,0.0,every Sunday since 2003
4,2017-01-01,2017,1.0,TN,vigil,military,Oak Ridge Environmental Peace Alliance,for abolishing nuclear weapons,0.0,,,,0.0,0.0,0.0,0.0,every Sunday since the late 1990s


# 2. Feature engineering 

In [3]:
# Collapse the four incident indicators into a single 0/1 'violence' flag:
# an event counts as violent when at least one indicator is truthy
# (missing indicators are skipped by pandas' default skipna behaviour).
cols = [
    'arrests_any',
    'injuries_crowd_any',
    'injuries_police_any',
    'property_damage_any',
]
df['violence'] = df.loc[:, cols].any(axis='columns').astype(int)

# The individual indicators are no longer needed once the flag exists.
df = df.drop(columns=cols)

In [4]:
# Drop events with a missing date or a missing state: both are required to
# build the (state, period) keys used by the aggregation further down.
# (Previously only 'date' was checked; rows without a state were then dropped
# silently by the groupby instead of explicitly here.)
df = df.dropna(subset=['date', 'state'])

In [5]:
# Parse 'date' into datetimes (unparseable values become NaT) and derive a
# 'YYYYMM' string 'period' from it, so events are keyed by year and month only.
df = df.assign(
    date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'),
    period=lambda frame: frame['date'].dt.strftime('%Y%m'),
)


In [6]:
# Monitored protest categories: category label -> lowercase keywords that are
# searched for (as substrings) in each event's 'issues' text.
protest_categories = dict(
    healthcare=['healthcare'],
    racism_immigration=['racism', 'immigration'],
    housing=['housing'],
    guns_criminal_justice=['guns', 'criminal justice'],
)

In [7]:
# create target for each category
def create_target_df(df, category, keywords):
    """Build a (state, period) panel flagging protests about one issue category.

    An event matches when its ``issues`` text, lowercased, contains at least
    one of ``keywords`` as a plain substring. Events are then collapsed to one
    row per (state, period) by taking the maximum of the match flag and of
    ``violence``.

    Parameters
    ----------
    df : pandas.DataFrame
        Event-level data with ``state``, ``period``, ``issues`` and
        ``violence`` columns. It is not modified.
    category : str
        Label of the category. It is not used in the computation and is kept
        only so existing call sites keep working.
    keywords : iterable of str
        Substrings to search for. They are compared as given against the
        lowercased text, so they should be lowercase.

    Returns
    -------
    pandas.DataFrame
        Indexed and sorted by (state, period), with columns ``target``
        (1 if any event in the cell matched, otherwise 0) and ``violence``
        (maximum over the cell). Rows whose state or period is missing are
        left out by pandas' default groupby behaviour.
    """
    # Materialise once so a one-shot iterable can be reused for every row.
    keywords = tuple(keywords)

    # Row-wise match computed positionally (no index labels involved), so a
    # non-unique index cannot flag unrelated rows. Missing issues never match;
    # non-string values are stringified instead of raising on .lower().
    matched = np.array(
        [
            bool(pd.notna(text)) and any(keyword in str(text).lower() for keyword in keywords)
            for text in df['issues']
        ],
        dtype='int64',
    )

    # Only the columns needed for the aggregation are carried along; the
    # caller's frame is left untouched.
    flagged = df[['state', 'period', 'violence']].assign(target=matched)

    # max() turns event-level flags into "at least one such event in the cell".
    # The groupby result is already indexed by (state, period).
    return (
        flagged.groupby(['state', 'period'])
        .agg({'target': 'max', 'violence': 'max'})
        .sort_index()
    )

# One aggregated (state, period) panel per monitored category.
# Note: df_violence holds the 'guns_criminal_justice' category panel.
df_healthcare, df_racism, df_housing, df_violence = (
    create_target_df(df, label, protest_categories[label])
    for label in ('healthcare', 'racism_immigration', 'housing', 'guns_criminal_justice')
)

In [8]:
# fill missing dates with zeros
def fill_missing_dates(df, target_col='target'):
    """Expand a (state, period) panel to every state-month combination.

    The covered range runs from the earliest to the latest parseable period
    found anywhere in ``df`` (month-start frequency), and every state present
    in ``df`` gets a row for every month in that range. Combinations absent
    from ``df`` receive 0 in ``target_col`` and, when present, in ``violence``.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel carrying ``state`` and ``period`` (as index levels or columns),
        with ``period`` formatted as 'YYYYMM'.
    target_col : str, default 'target'
        Name of the target column to carry over and zero-fill.

    Returns
    -------
    pandas.DataFrame
        Indexed and sorted by (state, period), with ``target_col`` and, if it
        was present in the input, ``violence``. Filled columns typically come
        back as floats because the left merge introduces NaN before filling.
        When no period can be parsed (including empty input) an empty frame
        with the same index names and columns is returned.
    """
    # Work on a flat copy so 'state' and 'period' are ordinary columns.
    df = df.reset_index()

    # Parsed only to find the covered range; unparseable labels become NaT.
    df['period_dt'] = pd.to_datetime(df['period'].astype(str), format='%Y%m', errors='coerce')

    columns_to_merge = ['state', 'period', target_col]
    if 'violence' in df.columns:
        columns_to_merge.append('violence')

    global_min_date = df['period_dt'].min()
    global_max_date = df['period_dt'].max()

    # No usable period at all: pd.date_range would raise on NaT bounds, so
    # return an empty panel with the expected layout instead.
    if pd.isna(global_min_date) or pd.isna(global_max_date):
        return df.iloc[0:0][columns_to_merge].set_index(['state', 'period'])

    # Every month in range, rendered with the same 'YYYYMM' labels as the input.
    all_periods = pd.date_range(
        start=global_min_date, end=global_max_date, freq='MS'
    ).strftime('%Y%m')

    # Cartesian product of states and months, built without Python-level loops.
    complete_grid = pd.MultiIndex.from_product(
        [df['state'].unique(), all_periods], names=['state', 'period']
    ).to_frame(index=False)

    # The left merge keeps every grid cell; cells absent from the data get NaN.
    merged = pd.merge(complete_grid, df[columns_to_merge], on=['state', 'period'], how='left')

    merged[target_col] = merged[target_col].fillna(0)
    if 'violence' in merged.columns:
        merged['violence'] = merged['violence'].fillna(0)

    return merged.set_index(['state', 'period']).sort_index()

# Expand every category panel to the full state x month grid (zeros where
# a state had no recorded events in a month).
df_housing, df_violence, df_racism, df_healthcare = map(
    fill_missing_dates,
    (df_housing, df_violence, df_racism, df_healthcare),
)

# 3. Define incidence variable

In [9]:
# Build one TargetEngineer per category panel, all keyed by state and period
# and reading the binary 'target' column.
te_housing, te_violence, te_racism, te_healthcare = (
    target_engineer.TargetEngineer(df=panel, unit='state', time='period', y_col="target")
    for panel in (df_housing, df_violence, df_racism, df_healthcare)
)

# Settings shared by all categories and passed to TargetEngineer.incidence.
threshold = 0
horizon = 6

# Compute the incidence variable for every category with identical settings.
incidence_housing, incidence_violence, incidence_racism, incidence_healthcare = (
    engineer.incidence(threshold=threshold, horizon=horizon)
    for engineer in (te_housing, te_violence, te_racism, te_healthcare)
)

# Show a sample of the result for one category.
print("Dataset for housing-related protests:")
print(incidence_housing.head(15))

Dataset for housing-related protests:
              target  anytarget_th0  inc_anytarget_th0_h6
state period                                             
AK    201701     0.0              0                   0.0
      201702     0.0              0                   0.0
      201703     0.0              0                   0.0
      201704     0.0              0                   0.0
      201705     0.0              0                   1.0
      201706     0.0              0                   1.0
      201707     0.0              0                   1.0
      201708     0.0              0                   1.0
      201709     0.0              0                   1.0
      201710     0.0              0                   1.0
      201711     1.0              1                   0.0
      201712     0.0              0                   0.0
      201801     0.0              0                   0.0
      201802     0.0              0                   0.0
      201803     0.0              

In [10]:
# Persist each incidence panel under clean_data/.
# Fix: the violence file was previously written to
# 'clean_dataincidence_violence.csv' (missing '/'), i.e. outside clean_data/.
# Note: incidence_violence is derived from the guns / criminal-justice panel.
incidence_housing.to_csv('clean_data/incidence_housing.csv')
incidence_violence.to_csv('clean_data/incidence_violence.csv')
incidence_racism.to_csv('clean_data/incidence_racism.csv')
incidence_healthcare.to_csv('clean_data/incidence_healthcare.csv')