# 0. Libraries and setup

In [1]:
from sklearn.model_selection import KFold, TimeSeriesSplit
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from panelsplit.cross_validation import PanelSplit
from functools import reduce
from hw1_library import feature_engineering, target_engineer, text_preprocessing

# 1. Loading the data that have been previously cleaned and merged

In [2]:
df = pd.read_csv('clean_data/ccc_combined.csv', low_memory=False)

In [3]:
df.head()

Unnamed: 0,date,year,month,state,type,issues,actors,claims,valence,size_low,size_high,size_mean,arrests_any,injuries_crowd_any,injuries_police_any,property_damage_any,notes
0,2017-01-01,2017,1.0,DC,vigil,military,,for banning nuclear weapons; for peace,0.0,,,,0.0,0.0,0.0,0.0,White House Peace Vigil continuous since June ...
1,2017-01-01,2017,1.0,MN,vigil,military,Peace Vigil Mankato,for peace,0.0,,,,0.0,0.0,0.0,0.0,every Sunday since 2001
2,2017-01-01,2017,1.0,MN,protest; banner drop,banking and finance; economy; energy; environm...,general protestors,against the Dakota Access Pipeline; for indige...,1.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,hung banner from stadium roof during NFL game
3,2017-01-01,2017,1.0,RI,vigil,environment; guns; military,Sakonnet Peace Alliance,for peace; for gun control; for climate action,1.0,,,,0.0,0.0,0.0,0.0,every Sunday since 2003
4,2017-01-01,2017,1.0,TN,vigil,military,Oak Ridge Environmental Peace Alliance,for abolishing nuclear weapons,0.0,,,,0.0,0.0,0.0,0.0,every Sunday since the late 1990s


# 2. Feature engineering 
In our dataset, we identify four columns that capture different forms of violence: arrests, injuries among crowd members, injuries involving the police, and property damage. We construct a new binary variable, `violence`, which is equal to 1 if any of these four indicators has a value greater than 0. This variable may offer useful insights, as there could be underlying patterns linking the nature of a protest’s claim to its likelihood of becoming violent. For instance, protests that occur more frequently — often driven by issues perceived as highly relevant or urgent by the population — might be relatively more prone to violence than protests that are rare and centered around less pressing concerns.

In [4]:
# define the columns to use to define the 'violence' column and create this new feature 
cols = ['arrests_any', 'injuries_crowd_any', 'injuries_police_any', 'property_damage_any']
df['violence'] = df[cols].any(axis=1).astype(int)
# drop the initial columns which are now redundant 
df.drop(columns=cols, inplace=True)

In [5]:
df.head()

Unnamed: 0,date,year,month,state,type,issues,actors,claims,valence,size_low,size_high,size_mean,notes,violence
0,2017-01-01,2017,1.0,DC,vigil,military,,for banning nuclear weapons; for peace,0.0,,,,White House Peace Vigil continuous since June ...,0
1,2017-01-01,2017,1.0,MN,vigil,military,Peace Vigil Mankato,for peace,0.0,,,,every Sunday since 2001,0
2,2017-01-01,2017,1.0,MN,protest; banner drop,banking and finance; economy; energy; environm...,general protestors,against the Dakota Access Pipeline; for indige...,1.0,2.0,2.0,2.0,hung banner from stadium roof during NFL game,1
3,2017-01-01,2017,1.0,RI,vigil,environment; guns; military,Sakonnet Peace Alliance,for peace; for gun control; for climate action,1.0,,,,every Sunday since 2003,0
4,2017-01-01,2017,1.0,TN,vigil,military,Oak Ridge Environmental Peace Alliance,for abolishing nuclear weapons,0.0,,,,every Sunday since the late 1990s,0


In [6]:
df['violence'].value_counts()

violence
0    218031
1      5353
Name: count, dtype: int64

We already removed NaNs in state and date columns, but we double check if there are still some and we eventually remove them.

In [7]:
print(df['state'].isna().sum())
print(df['date'].isna().sum())

0
2


In [8]:
df = df.dropna(subset=['date'])
print(df['date'].isna().sum())

0


We handle the date column by transforming it to datetime and then to a string to only keep month and year as we don't need the day for our application. 

In [9]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['period'] = df['date'].dt.strftime('%Y%m')

In [10]:
df.head()

Unnamed: 0,date,year,month,state,type,issues,actors,claims,valence,size_low,size_high,size_mean,notes,violence,period
0,2017-01-01,2017,1.0,DC,vigil,military,,for banning nuclear weapons; for peace,0.0,,,,White House Peace Vigil continuous since June ...,0,201701
1,2017-01-01,2017,1.0,MN,vigil,military,Peace Vigil Mankato,for peace,0.0,,,,every Sunday since 2001,0,201701
2,2017-01-01,2017,1.0,MN,protest; banner drop,banking and finance; economy; energy; environm...,general protestors,against the Dakota Access Pipeline; for indige...,1.0,2.0,2.0,2.0,hung banner from stadium roof during NFL game,1,201701
3,2017-01-01,2017,1.0,RI,vigil,environment; guns; military,Sakonnet Peace Alliance,for peace; for gun control; for climate action,1.0,,,,every Sunday since 2003,0,201701
4,2017-01-01,2017,1.0,TN,vigil,military,Oak Ridge Environmental Peace Alliance,for abolishing nuclear weapons,0.0,,,,every Sunday since the late 1990s,0,201701


Since our dataset includes only actual protest events (i.e., there are no explicit zero entries indicating the absence of protests), and given that multiple protests can occur within the same month and state, we adopt an aggregation strategy to construct the target variable.

As previously mentioned, we decided to focus on a subset of four protest types that we consider particularly relevant within the specific context of our analysis. Consequently, any protest that does not fall into one of these four categories is classified as a zero (i.e., not relevant for the target).

The logic implemented in the following code is the following:

- We define a dictionary that maps each of the four selected protest categories to a set of representative keywords. For each protest, we check whether its `issues` field contains any of these keywords. If no match is found, the protest is assigned label `0` (non-relevant).
- If a protest matches multiple categories, it is **exploded** into multiple rows - one for each relevant theme - so that each category is treated independently during aggregation.
- We group the data by `state` and `period` (month-year). For each group, we count the number of protests per category and assign the **target label** as the most frequent category in that group. If no protest matches any of the four target categories, the group is assigned label `0`.

This approach is more informative than forecasting individual protests, as it captures broader and potentially recurring patterns of social unrest. A single isolated protest may not reflect a systemic issue, whereas repeated protests related to the same claim over a short period of time could indicate more deeply rooted societal tensions that deserve attention.

Finally, while we focus on these four protest categories because of their relevance to the U.S. context, this methodology can be adapted to other regions by selecting different categories that reflect the local socio-political landscape.

In [11]:
def assign_majority_label(df):
    # define keywords for each protest category
    categories = {
        'healthcare': ['healthcare'],
        'racism_immigration': ['racism', 'immigration'],
        'housing': ['housing'],
        'guns_criminal_justice': ['guns', 'criminal justice']
    }

    # identify which categories match a given 'issues' text
    def categorize_targets(issue_text):
        if pd.isna(issue_text):
            
            return []
        issue_text = issue_text.lower()
        matches = []
        for category, keywords in categories.items():
            if any(keyword in issue_text for keyword in keywords):
                matches.append(category)
        return matches if matches else [0]

    # apply the categorization to the 'issues' column
    df['target_list'] = df['issues'].apply(categorize_targets)

    # create a new row for each matched category
    df_exploded = df.explode('target_list').rename(columns={'target_list': 'target'})

    # remove entries that didn't match any relevant category (i.e., target = 0)
    df_exploded = df_exploded[df_exploded['target'] != 0]

    # group by state and period 
    grouped = df_exploded.groupby(['state', 'period'])

    results = []

    for (state, period), group in grouped:
        # count how many times each category appears
        counts = group['target'].value_counts().to_dict()

        # select the category with the highest frequency
        majority_class = max(counts, key=counts.get) if counts else 0

        # handle the 'violence' column: if at least one protest was violent in this group, set violence = 1
        aggregated_violence = 1 if (group['violence'] == 1).any() else 0

        results.append({
            'state': state,
            'period': period,
            'target': majority_class,
            'violence': aggregated_violence
        })

    # create a df with the aggregated results and sort it
    result_df = pd.DataFrame(results)
    result_df = result_df.sort_values(by=['state', 'period']).reset_index(drop=True)

    return result_df

df = assign_majority_label(df)

In [12]:
df['target'].value_counts()

target
racism_immigration       3004
guns_criminal_justice     734
healthcare                715
0                         164
housing                   120
Name: count, dtype: int64

In [13]:
df.head()

Unnamed: 0,state,period,target,violence
0,AK,201701,racism_immigration,0
1,AK,201702,healthcare,0
2,AK,201704,0,0
3,AK,201706,guns_criminal_justice,0
4,AK,201707,healthcare,0


We observe that most protest events are classified under one of the four targeted categories. The largest share is `racism_immigration` (≈ 3000 instances), followed by `guns_criminal_justice` and `healthcare`. However, a non-negligible number of state-month pairs (164) fall into the `0` category - indicating protest activity that does **not** match any of our priority themes.

Key insights:

- As expected, although the selected topics are highly relevant for our analysis, they can't cover the entire spectrum of protest motivations.
  
- The `0` category acts as a residual class, grouping together diverse protests that do not match our predefined categories. While these events may be thematically important, they are intentionally excluded from the primary classification task to retain thematic focus. 

As follows, we aggregate all rows with `target == 0` that occur within the same `(state, period)` combination. Since they do not belong to our core categories, it would not be useful to treat them as separate events.

In [14]:
# filter for rows with target == 0 and group them by state and period
target_zero = df[df['target'] == 0]
target_zero_grouped = (
    target_zero.groupby(['state', 'period'])
    .size()
    .reset_index(name='count') 
)
target_zero_grouped['target'] = 0

# filter rows where target != 0
target_non_zero = df[df['target'] != 0].copy()

# get combinations of state and period where target != 0
non_zero_combinations = target_non_zero[['state', 'period']].drop_duplicates()

# removes rows where target == 0 that have the same combination of state and period of target != 0
target_zero_filtered = target_zero_grouped[~(target_zero_grouped[['state', 'period']]
                                            .apply(tuple, axis=1)
                                            .isin(non_zero_combinations.apply(tuple, axis=1)))]

# combine rows in a dataframe
df = pd.concat([target_non_zero, target_zero_filtered[['state', 'period', 'target']]], 
               ignore_index=True)

# set the index to state and period and sort 
df = df.sort_values(by=['state', 'period']).set_index(['state', 'period'])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,target,violence
state,period,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,201701,racism_immigration,0.0
AK,201702,healthcare,0.0
AK,201704,0,
AK,201706,guns_criminal_justice,0.0
AK,201707,healthcare,0.0


Here we are creating different dataframes depending on the target variable. Since our purpose will be forecasting if a specific set of demonstrations will occur in the next h months, we need to transform our multiclass classification problem into multiple binary classification problems.

For each protest category (housing, guns_criminal_justice, racism_immigration, healthcare), we create a separate dataframe containing:
1) All observations where the target equals the specific category (set to 1)
2) All observations where the target equals 0 (kept as 0)

This approach allows us to train separate models for each protest type, effectively transforming our multiclass problem into four independent binary classification tasks. For each target category:
- Target = 1: A protest of that specific category occurred
- Target = 0: No protest or a protest of another category occurred

By structuring our data this way, we can better analyze patterns specific to each protest type and potentially improve forecast accuracy by focusing on category-specific signals.

In [15]:
# filter for observations where target == 0 or target == 'housing'
df_housing = df[(df['target'] == 0) | (df['target'] == 'housing')].copy()
df_housing['target'] = df_housing['target'].apply(lambda x: 1 if x != 0 else 0)

# filter for observations where target == 0 or target == 'guns_criminal_justice'
df_violence = df[(df['target'] == 0) | (df['target'] == 'guns_criminal_justice')].copy()
df_violence['target'] = df_violence['target'].apply(lambda x: 1 if x != 0 else 0)

# filter for observations where target == 0 or target == 'racism_immigration'
df_racism = df[(df['target'] == 0) | (df['target'] == 'racism_immigration')].copy()
df_racism['target'] = df_racism['target'].apply(lambda x: 1 if x != 0 else 0)

# filter for observations where target == 0 or target == 'healthcare'
df_healthcare = df[(df['target'] == 0) | (df['target'] == 'healthcare')].copy()
df_healthcare['target'] = df_healthcare['target'].apply(lambda x: 1 if x != 0 else 0)

By construction, all these dataframes will contain missing dates, so we are checking for those and imputing them with zeros, as we know that those missing values simply represent another type of protest.

To ensure consistency across time and states, we define a function that fills in these missing entries by generating the full set of `(state, period)` combinations and merging them with the original data. Missing target values are then imputed with zero, under the assumption that no event of interest occurred.

We validate the output using a separate function that checks whether all expected months are present for each state.

Together, these two functions ensure a complete and uniform dataset.

In [16]:
def fill_missing_dates(df, target_col='target'):
    # reset index 
    df = df.reset_index()
    
    # convert periods to datetime
    df['period_dt'] = pd.to_datetime(df['period'].astype(str), format='%Y%m', errors='coerce')
    
    # get all states
    all_states = df['state'].unique()
    
    # find global min and max period
    global_min_date = df['period_dt'].min()
    global_max_date = df['period_dt'].max()
    
    # create all possible dates in the range with monthly frequency
    all_dates = pd.date_range(start=global_min_date, end=global_max_date, freq='MS')
    
    # create all state-date combinations
    all_combinations = []
    for state in all_states:
        for date in all_dates:
            period = date.strftime('%Y%m')
            all_combinations.append((state, period))
    
    # convert to df
    complete_grid = pd.DataFrame(all_combinations, columns=['state', 'period'])
    
    # merge with original data
    merged = pd.merge(complete_grid, df[['state', 'period', target_col, 'violence'] if 'violence' in df.columns else ['state', 'period', target_col]], 
                     on=['state', 'period'], how='left')
    
    # fill NaNs in target column with 0
    merged[target_col] = merged[target_col].fillna(0)
    
    # fil violence with 0 NaN
    if 'violence' in merged.columns:
        merged['violence'] = merged['violence'].fillna(0)
    
    # set again the index 
    result = merged.set_index(['state', 'period']).sort_index()
    
    return result

df_housing = fill_missing_dates(df_housing)
df_violence = fill_missing_dates(df_violence)
df_racism = fill_missing_dates(df_racism)
df_healthcare = fill_missing_dates(df_healthcare)

In [17]:
def check_missing_dates(df):
    missing_dates = {}

    # make sure period is a string
    df = df.copy()
    df.index = df.index.set_levels(df.index.levels[1].astype(str), level='period')

    for state in df.index.get_level_values('state').unique():
        # get all periods for the current state
        state_periods = sorted(df.loc[state].index.get_level_values('period').unique())

        if not state_periods:
            continue

        # convert to datetime
        state_periods_dt = [pd.to_datetime(period, format='%Y%m') for period in state_periods]
        
        # extract first and last periods 
        start_dt = min(state_periods_dt)
        end_dt = max(state_periods_dt)
        
        # generate a complete range of months 
        full_range_dt = pd.date_range(start=start_dt, end=end_dt, freq='MS')
        
        # convert back to string format (initial format)
        full_range = [dt.strftime('%Y%m') for dt in full_range_dt]
        
        # take missing periods 
        actual_periods = set(state_periods)
        expected_periods = set(full_range)
        missing = sorted(expected_periods - actual_periods)

        if missing:
            missing_dates[state] = missing

    return missing_dates

missing_housing = check_missing_dates(df_housing)
missing_violence = check_missing_dates(df_violence)
missing_racism = check_missing_dates(df_racism)
missing_healthcare = check_missing_dates(df_healthcare)

print("Missing dates in df_housing:")
for state, dates in missing_housing.items():
    print(f"State: {state}, Missing dates: {list(dates)}")

print("\nMissing dates in df_violence:")
for state, dates in missing_violence.items():
    print(f"State: {state}, Missing dates: {list(dates)}")

print("\nMissing dates in df_racism:")
for state, dates in missing_racism.items():
    print(f"State: {state}, Missing dates: {list(dates)}")

print("\nMissing dates in df_healthcare:")
for state, dates in missing_healthcare.items():
    print(f"State: {state}, Missing dates: {list(dates)}")

Missing dates in df_housing:

Missing dates in df_violence:

Missing dates in df_racism:

Missing dates in df_healthcare:


## 3. Define incidence variable

Our target variable is the **incidence** of protests by category. Specifically, we aim to forecast whether protests related to a given claim will occur within the next 6 months (`horizon = 6`) for each `(state, period)` pair.

This setup allows us to identify early signals of emerging unrest tied to specific themes, rather than focusing on isolated events.

If all models predict a `0`, it may indicate either the absence of protest activity or the occurrence of a protest unrelated to our selected categories. However, this ambiguity is not critical for our purpose: the goal is to detect protests that reflect urgent, actionable issues on which the state could realistically intervene in the short term.

Therefore, the inability to distinguish between "no protest" and "other type of protest" is an acceptable trade-off for maintaining thematic focus and policy relevance.


In [18]:
te_housing = target_engineer.TargetEngineer(df=df_housing, unit='state', time='period', y_col="target")
te_violence = target_engineer.TargetEngineer(df=df_violence, unit='state', time='period', y_col="target")
te_racism = target_engineer.TargetEngineer(df=df_racism, unit='state', time='period', y_col="target")
te_healthcare = target_engineer.TargetEngineer(df=df_healthcare, unit='state', time='period', y_col="target")

threshold = 0
horizon = 6
incidence_housing = te_housing.incidence(threshold=threshold, horizon=horizon)
incidence_violence = te_violence.incidence(threshold=threshold, horizon=horizon)
incidence_racism = te_racism.incidence(threshold=threshold, horizon=horizon)
incidence_healthcare = te_healthcare.incidence(threshold=threshold, horizon=horizon)

In [19]:
incidence_housing.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,target,anytarget_th0,inc_anytarget_th0_h6
state,period,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,201701,0.0,0,0.0
AK,201702,0.0,0,0.0
AK,201703,0.0,0,0.0
AK,201704,0.0,0,0.0
AK,201705,0.0,0,1.0
AK,201706,0.0,0,1.0
AK,201707,0.0,0,1.0
AK,201708,0.0,0,1.0
AK,201709,0.0,0,1.0
AK,201710,0.0,0,1.0
