# 0. Libraries and setup

In [1]:
from sklearn.model_selection import KFold, TimeSeriesSplit
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from panelsplit.cross_validation import PanelSplit
from functools import reduce
from hw1_library import feature_engineering, target_engineer, text_preprocessing

# 1. Loading the data that have been previously cleaned and merged

In [2]:
# Read the cleaned, merged protest events; low_memory=False lets pandas infer
# each column's dtype from the whole file rather than chunk by chunk
df = pd.read_csv(
    'clean_data/ccc_combined.csv',
    low_memory=False,
)
df.head()

Unnamed: 0,date,year,month,state,type,issues,actors,claims,valence,size_low,size_high,size_mean,arrests_any,injuries_crowd_any,injuries_police_any,property_damage_any,notes
0,2017-01-01,2017,1.0,DC,vigil,military,,for banning nuclear weapons; for peace,0.0,,,,0.0,0.0,0.0,0.0,White House Peace Vigil continuous since June ...
1,2017-01-01,2017,1.0,MN,vigil,military,Peace Vigil Mankato,for peace,0.0,,,,0.0,0.0,0.0,0.0,every Sunday since 2001
2,2017-01-01,2017,1.0,MN,protest; banner drop,banking and finance; economy; energy; environm...,general protestors,against the Dakota Access Pipeline; for indige...,1.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,hung banner from stadium roof during NFL game
3,2017-01-01,2017,1.0,RI,vigil,environment; guns; military,Sakonnet Peace Alliance,for peace; for gun control; for climate action,1.0,,,,0.0,0.0,0.0,0.0,every Sunday since 2003
4,2017-01-01,2017,1.0,TN,vigil,military,Oak Ridge Environmental Peace Alliance,for abolishing nuclear weapons,0.0,,,,0.0,0.0,0.0,0.0,every Sunday since the late 1990s


# 2. Feature engineering 
In our dataset, we identify four columns that capture different forms of violence: arrests, injuries among crowd members, injuries involving the police, and property damage. We construct a new binary variable, `violence`, which is equal to 1 if any of these four indicators has a value greater than 0. This variable may offer useful insights, as there could be underlying patterns linking the nature of a protest’s claim to its likelihood of becoming violent. For instance, protests that occur more frequently — often driven by issues perceived as highly relevant or urgent by the population — might be relatively more prone to violence than protests that are rare and centered around less pressing concerns.

In [3]:
# indicator columns that together define whether a protest was violent
cols = [
    'arrests_any',
    'injuries_crowd_any',
    'injuries_police_any',
    'property_damage_any',
]
# 'violence' is 1 when at least one indicator is truthy for the row; the four
# source columns are dropped afterwards because 'violence' now summarises them
df = df.assign(violence=df[cols].any(axis=1).astype(int)).drop(columns=cols)
df.head()

Unnamed: 0,date,year,month,state,type,issues,actors,claims,valence,size_low,size_high,size_mean,notes,violence
0,2017-01-01,2017,1.0,DC,vigil,military,,for banning nuclear weapons; for peace,0.0,,,,White House Peace Vigil continuous since June ...,0
1,2017-01-01,2017,1.0,MN,vigil,military,Peace Vigil Mankato,for peace,0.0,,,,every Sunday since 2001,0
2,2017-01-01,2017,1.0,MN,protest; banner drop,banking and finance; economy; energy; environm...,general protestors,against the Dakota Access Pipeline; for indige...,1.0,2.0,2.0,2.0,hung banner from stadium roof during NFL game,1
3,2017-01-01,2017,1.0,RI,vigil,environment; guns; military,Sakonnet Peace Alliance,for peace; for gun control; for climate action,1.0,,,,every Sunday since 2003,0
4,2017-01-01,2017,1.0,TN,vigil,military,Oak Ridge Environmental Peace Alliance,for abolishing nuclear weapons,0.0,,,,every Sunday since the late 1990s,0


We already removed NaNs from the state and date columns, but we double-check whether any remain and, if so, remove them.

In [4]:
# number of missing values left in each of the two key columns
for key_column in ('state', 'date'):
    print(df[key_column].isna().sum())

0
2


In [5]:
# keep only the rows that have a date, then confirm that none are missing
df = df.loc[df['date'].notna()].copy()
print(df['date'].isna().sum())

0


We convert the date column to datetime and derive from it a `period` string in `YYYYMM` format, since we only need the year and month (not the day) for our application.

In [6]:
# parse 'date' as datetime (values that cannot be parsed become NaT), then
# derive a 'YYYYMM' string in 'period' so that only year and month are kept
df = df.assign(date=pd.to_datetime(df['date'], errors='coerce'))
df = df.assign(period=df['date'].dt.strftime('%Y%m'))
df.head()

Unnamed: 0,date,year,month,state,type,issues,actors,claims,valence,size_low,size_high,size_mean,notes,violence,period
0,2017-01-01,2017,1.0,DC,vigil,military,,for banning nuclear weapons; for peace,0.0,,,,White House Peace Vigil continuous since June ...,0,201701
1,2017-01-01,2017,1.0,MN,vigil,military,Peace Vigil Mankato,for peace,0.0,,,,every Sunday since 2001,0,201701
2,2017-01-01,2017,1.0,MN,protest; banner drop,banking and finance; economy; energy; environm...,general protestors,against the Dakota Access Pipeline; for indige...,1.0,2.0,2.0,2.0,hung banner from stadium roof during NFL game,1,201701
3,2017-01-01,2017,1.0,RI,vigil,environment; guns; military,Sakonnet Peace Alliance,for peace; for gun control; for climate action,1.0,,,,every Sunday since 2003,0,201701
4,2017-01-01,2017,1.0,TN,vigil,military,Oak Ridge Environmental Peace Alliance,for abolishing nuclear weapons,0.0,,,,every Sunday since the late 1990s,0,201701


Since our dataset includes only actual protest events (i.e., there are no explicit zero entries indicating the absence of protests), and given that multiple protests can occur within the same month and state, we adopt an aggregation strategy to construct the target variable.

We focus on four protest categories particularly relevant within the U.S. context: healthcare, racism/immigration, housing, and guns/criminal justice. Each category is identified through a set of representative keywords in the protest's 'issues' field.

The process implemented in our code involves:

1. **Categorization**: For each of the four selected categories, we identify protests containing relevant keywords in their 'issues' field.

2. **Aggregation**: We group the data by state and period (month-year), calculating two key metrics:
  - The total count of protests for each category in that state-month
  - Whether any of those protests involved violence

3. **Handling of missing state-month combinations**: Some states have months with no protests at all, so the groupby operation by state and period produces no row for them, which would leave NaNs in the `protest_count` column (instead of proper zeros). By adding the missing dates (i.e., the combinations of month and state that did not experience any protest) and filling them with zeros, we then obtain a `protest_count` column that contains only zeros or actual counts of protests.

4. **Threshold-based target creation**: Unlike a simple binary indicator, we create a more sophisticated target variable that captures unusual levels of protest activity:
  - For each state, we calculate the average number of protests per month for each category over the entire time period and we use it as state-specific threshold
  - A state-month is assigned a target of 1 if the number of protests exceeds this threshold
  - This approach adjusts for the baseline level of protest activity in each state, making the target more meaningful

This dynamic threshold approach offers several advantages:
- It accounts for varying baseline protest levels across states (e.g., New York naturally has more protests than Wyoming)
- It helps identify unusual spikes in protest activity rather than just the presence of protests
- It creates a more balanced target variable, improving the potential predictive power of forecasting models

By tracking these unusual protest levels over time and across states, we can identify patterns that might indicate emerging social tensions before they escalate into more widespread unrest.

In [None]:
# protest categories to monitor: each category name maps to the list of
# keywords that is passed to create_target_df for that category
protest_categories = {
    "healthcare": ["healthcare"],
    "racism_immigration": ["racism", "immigration"],
    "housing": ["housing"],
    "guns_criminal_justice": ["guns", "criminal justice"],
}

# count protests for each category
def create_target_df(df, category, keywords):
    """Build a complete state-by-month panel of protest counts for one category.

    An event belongs to the category when its 'issues' text contains at least
    one of ``keywords`` (case-insensitive substring match). Events are then
    aggregated per (state, period), and every state-month between the first
    and last observed period is added with zeros when it has no events.

    Args:
        df: Event-level frame with 'state', 'period', 'issues' and 'violence'
            columns. 'period' must be parseable with the '%Y%m' format.
        category: Name of the protest category. Kept for interface
            compatibility; it is no longer used as a temporary column name,
            so it cannot clash with an existing column of ``df``.
        keywords: Iterable of strings searched for in the 'issues' text.

    Returns:
        DataFrame indexed by (state, period), sorted, with two columns:
        'protest_count' (number of matching events) and 'violence' (maximum of
        'violence' over the events of that state-month, 0 when there are none).
    """
    lowered_keywords = [keyword.lower() for keyword in keywords]

    def _mentions_category(issue_text):
        # a missing 'issues' value never matches
        if pd.isna(issue_text):
            return False
        lowered_text = issue_text.lower()
        return any(keyword in lowered_text for keyword in lowered_keywords)

    # Flag matching events positionally (to_numpy) instead of via
    # iterrows()/.loc[idx]: label-based assignment marks every row sharing an
    # index label, which over-counts when the index contains duplicates.
    events = df[['state', 'period', 'violence']].assign(
        protest_count=df['issues'].map(_mentions_category).astype(int).to_numpy()
    )

    # NOTE(review): 'violence' is the max over ALL events of the state-month,
    # not only those matching the category -- confirm this is intended.
    aggregated = (
        events.groupby(['state', 'period'])
        .agg({'protest_count': 'sum', 'violence': 'max'})
        .reset_index()
    )

    # every month between the first and the last observed period, as 'YYYYMM'
    period_dt = pd.to_datetime(aggregated['period'], format='%Y%m')
    all_periods = pd.date_range(
        start=period_dt.min(), end=period_dt.max(), freq='MS'
    ).strftime('%Y%m')
    all_states = aggregated['state'].unique()

    # complete grid of state-period combinations
    complete_df = pd.DataFrame(
        [(state, period) for state in all_states for period in all_periods],
        columns=['state', 'period'],
    )

    # the left merge leaves NaNs for state-months without events; these mean
    # "no protests / no violence", so they are filled with 0
    merged = pd.merge(complete_df, aggregated, on=['state', 'period'], how='left')
    merged = merged.fillna({'protest_count': 0, 'violence': 0})

    return merged.set_index(['state', 'period']).sort_index()

# build one aggregated state-month panel per protest category
df_healthcare, df_racism, df_housing, df_violence = (
    create_target_df(df, category_key, protest_categories[category_key])
    for category_key in (
        'healthcare',
        'racism_immigration',
        'housing',
        'guns_criminal_justice',
    )
)

Our aggregated protest data naturally contains gaps—time periods without recorded protests for certain states. 
To build a robust forecasting model, we need a complete and balanced panel dataset without missing state-month 
combinations.

The `fill_missing_dates` function does the following:

- Generates all possible combinations of states and months between the earliest and latest dates in our dataset

- Fills gaps by merging this complete grid with our actual protest data and filling missing protest counts with zeros (assuming no relevant protests occurred).

The resulting balanced panel dataset provides a consistent foundation for our subsequent threshold-based target
creation and forecasting models. For each protest category (housing, racism/immigration, healthcare, and guns/criminal
justice), we now have a complete record that includes both periods with and without protest activity.

In [10]:
# fill missing dates with zeros
def fill_missing_dates(df, count_col='protest_count'):
    """Return a balanced state-by-month panel with gaps filled with zeros.

    Every combination of the states present in ``df`` and the months between
    the earliest and latest parseable period is generated. Combinations absent
    from ``df`` get 0 in ``count_col`` and, when present, in 'violence'.

    Args:
        df: Frame with 'state' and 'period' either as index levels or as
            columns, a ``count_col`` column and optionally a 'violence'
            column. Periods are read as 'YYYYMM'.
        count_col: Name of the column holding the protest counts.

    Returns:
        DataFrame indexed by (state, period) and sorted, containing
        ``count_col`` and, if it was in ``df``, 'violence'. Period labels are
        'YYYYMM' strings. Any other column of ``df`` is not carried over.
    """
    panel = df.reset_index()

    # the grid below uses 'YYYYMM' strings, so the merge key must be text too
    panel['period'] = panel['period'].astype(str)

    # periods that cannot be parsed become NaT and do not affect the range
    period_dt = pd.to_datetime(panel['period'], format='%Y%m', errors='coerce')

    all_states = panel['state'].unique()
    all_periods = pd.date_range(
        start=period_dt.min(), end=period_dt.max(), freq='MS'
    ).strftime('%Y%m')

    # all state-period combinations
    complete_grid = pd.DataFrame(
        [(state, period) for state in all_states for period in all_periods],
        columns=['state', 'period'],
    )

    columns_to_merge = ['state', 'period', count_col]
    if 'violence' in panel.columns:
        columns_to_merge.append('violence')

    merged = pd.merge(complete_grid, panel[columns_to_merge], on=['state', 'period'], how='left')

    # A state-month that was missing had no recorded protests: fill the count
    # as well as 'violence' with 0 (previously only 'violence' was filled,
    # leaving NaN counts for the added rows).
    merged = merged.fillna({column: 0 for column in columns_to_merge[2:]})

    return merged.set_index(['state', 'period']).sort_index()

# complete each category panel so that every state-month is present
df_housing, df_violence, df_racism, df_healthcare = (
    fill_missing_dates(category_panel, count_col='protest_count')
    for category_panel in (df_housing, df_violence, df_racism, df_healthcare)
)

After counting protests by category for each state-month combination, we now need to define what constitutes an 
"unusual" or "significant" level of protest activity worth forecasting. Rather than using an arbitrary fixed 
threshold that would apply equally to all states, we implement a dynamic threshold approach that accounts for 
state-specific baseline activity.

Our `create_dynamic_threshold_target` function:

- For each state, we calculate the average monthly number of protests for each category over the entire time period. This captures the "normal" level of protest activity for that state and category.
- A state-month is assigned a target value of 1 if the number of protests exceeds the state's average for that category, and 0 otherwise. This effectively identifies months with above-average protest activity.

This approach accounts for state heterogeneity and is therefore able to identify anomalies: we are interested in detecting unusual spikes in protest activity relative to what is normal for each state. Lastly, by focusing on above-average activity rather than just the presence of protests, we create a more balanced target variable. If instead we assigned 1 to the target variable whenever at least one protest occurred, we would have a highly unbalanced dataset with basically no zeros.


In [11]:
# create target based on dynamic threshold (monthly state average, divided by the topic)
def create_dynamic_threshold_target(df, category_name):
    """Flag state-months whose protest count exceeds the state's own average.

    Args:
        df: Panel with 'state' and 'period' as index levels (or columns) and
            'protest_count' and 'violence' columns.
        category_name: Label of the protest category. It is not used in the
            computation and is kept only for interface compatibility.

    Returns:
        DataFrame indexed by (state, period), sorted, with the columns
        'protest_count', 'violence', 'target' and 'avg_protest_count', where
        'target' is 1 when 'protest_count' is strictly greater than the mean
        'protest_count' of that state and 0 otherwise.
    """
    panel = df.reset_index()

    # NOTE(review): the mean is taken over all periods of the state, i.e. it
    # also uses months that come after the month being labelled.
    state_means = panel.groupby('state')['protest_count'].mean()
    panel['avg_protest_count'] = panel['state'].map(state_means)

    # Strict comparison: a state whose average is 0 gets target 1 for any
    # month with a positive count, so no separate zero-average case is needed.
    panel['target'] = (panel['protest_count'] > panel['avg_protest_count']).astype(int)

    result = panel[['state', 'period', 'protest_count', 'violence', 'target', 'avg_protest_count']]
    return result.set_index(['state', 'period']).sort_index()

# derive the above-average target for every category panel
housing_target, violence_target, racism_target, healthcare_target = (
    create_dynamic_threshold_target(category_panel, category_label)
    for category_panel, category_label in (
        (df_housing, 'housing'),
        (df_violence, 'guns_criminal_justice'),
        (df_racism, 'racism_immigration'),
        (df_healthcare, 'healthcare'),
    )
)

## 3. Define incidence variable

Our target variable is the **incidence** of protests by category. Specifically, we aim to forecast whether an unusual number of protests related to a given claim will occur within the next 6 months (`horizon = 6`) for each `(state, period)` pair.

This setup allows us to identify early signals of emerging unrest tied to specific themes, rather than focusing on isolated events.

If all models predict a `0`, it may indicate either the absence of protest activity or the occurrence of a protest unrelated to our selected categories. However, this ambiguity is not critical for our purpose: the goal is to detect protests that reflect urgent, actionable issues on which the state could realistically intervene in the short term.

Therefore, the inability to distinguish between "no protest" and "other type of protest" is an acceptable trade-off for maintaining thematic focus and policy relevance.

In [12]:
# one TargetEngineer per protest category, built from its thresholded panel
te_housing, te_violence, te_racism, te_healthcare = (
    target_engineer.TargetEngineer(df=category_target, unit='state', time='period', y_col="target")
    for category_target in (housing_target, violence_target, racism_target, healthcare_target)
)

# settings shared by all categories
threshold = 0
horizon = 6

# incidence variable for each category (semantics are defined by
# TargetEngineer.incidence in hw1_library)
incidence_housing, incidence_violence, incidence_racism, incidence_healthcare = (
    engineer.incidence(threshold=threshold, horizon=horizon)
    for engineer in (te_housing, te_violence, te_racism, te_healthcare)
)

# attach protest_count, avg_protest_count and violence from the matching target frame
incidence_housing, incidence_violence, incidence_racism, incidence_healthcare = (
    incidence_frame.join(category_target[['protest_count', 'avg_protest_count', 'violence']])
    for incidence_frame, category_target in zip(
        (incidence_housing, incidence_violence, incidence_racism, incidence_healthcare),
        (housing_target, violence_target, racism_target, healthcare_target),
    )
)

# summary statistics: share of state-months with target == 1, per category
summary_lines = (
    ("Percentage of state-months with above-threshold housing protests:", housing_target),
    ("\nPercentage of state-months with above-threshold criminal justice protests:", violence_target),
    ("\nPercentage of state-months with above-threshold racism/immigration protests:", racism_target),
    ("\nPercentage of state-months with above-threshold healthcare protests:", healthcare_target),
)
for header, category_target in summary_lines:
    print(header)
    print(f"{category_target['target'].mean() * 100:.2f}%")

# save the incidence dataframes
for incidence_frame, csv_path in (
    (incidence_housing, 'clean_data/incidence_housing.csv'),
    (incidence_violence, 'clean_data/incidence_violence.csv'),
    (incidence_racism, 'clean_data/incidence_racism.csv'),
    (incidence_healthcare, 'clean_data/incidence_healthcare.csv'),
):
    incidence_frame.to_csv(csv_path)

Percentage of state-months with above-threshold housing protests:
20.89%

Percentage of state-months with above-threshold criminal justice protests:
17.23%

Percentage of state-months with above-threshold racism/immigration protests:
28.75%

Percentage of state-months with above-threshold healthcare protests:
24.52%
