# 0. Libraries and setup

In [1]:
from sklearn.model_selection import KFold, TimeSeriesSplit
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from panelsplit.cross_validation import PanelSplit
from functools import reduce
from hw1_library import feature_engineering, target_engineer, text_preprocessing

# 1. Loading the data that have been previously cleaned and merged

In [2]:
df = pd.read_csv('clean_data/ccc_combined.csv', low_memory=False)

In [3]:
df.head()

Unnamed: 0,date,year,month,state,type,issues,actors,claims,valence,size_low,size_high,size_mean,arrests_any,injuries_crowd_any,injuries_police_any,property_damage_any,notes
0,2017-01-01,2017,1.0,DC,vigil,military,,for banning nuclear weapons; for peace,0.0,,,,0.0,0.0,0.0,0.0,White House Peace Vigil continuous since June ...
1,2017-01-01,2017,1.0,MN,vigil,military,Peace Vigil Mankato,for peace,0.0,,,,0.0,0.0,0.0,0.0,every Sunday since 2001
2,2017-01-01,2017,1.0,MN,protest; banner drop,banking and finance; economy; energy; environm...,general protestors,against the Dakota Access Pipeline; for indige...,1.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,hung banner from stadium roof during NFL game
3,2017-01-01,2017,1.0,RI,vigil,environment; guns; military,Sakonnet Peace Alliance,for peace; for gun control; for climate action,1.0,,,,0.0,0.0,0.0,0.0,every Sunday since 2003
4,2017-01-01,2017,1.0,TN,vigil,military,Oak Ridge Environmental Peace Alliance,for abolishing nuclear weapons,0.0,,,,0.0,0.0,0.0,0.0,every Sunday since the late 1990s


# 2. Feature engineering 
In our dataset, we identify four columns that capture different forms of violence: arrests, injuries among crowd members, injuries involving the police, and property damage. We construct a new binary variable, `violence`, which is equal to 1 if any of these four indicators has a value greater than 0. This variable may offer useful insights, as there could be underlying patterns linking the nature of a protest’s claim to its likelihood of becoming violent. For instance, protests that occur more frequently — often driven by issues perceived as highly relevant or urgent by the population — might be relatively more prone to violence than protests that are rare and centered around less pressing concerns.

In [4]:
cols = ['arrests_any', 'injuries_crowd_any', 'injuries_police_any', 'property_damage_any']
df['violence'] = df[cols].any(axis=1).astype(int)
df.drop(columns=cols, inplace=True)

In [5]:
df.head()

Unnamed: 0,date,year,month,state,type,issues,actors,claims,valence,size_low,size_high,size_mean,notes,violence
0,2017-01-01,2017,1.0,DC,vigil,military,,for banning nuclear weapons; for peace,0.0,,,,White House Peace Vigil continuous since June ...,0
1,2017-01-01,2017,1.0,MN,vigil,military,Peace Vigil Mankato,for peace,0.0,,,,every Sunday since 2001,0
2,2017-01-01,2017,1.0,MN,protest; banner drop,banking and finance; economy; energy; environm...,general protestors,against the Dakota Access Pipeline; for indige...,1.0,2.0,2.0,2.0,hung banner from stadium roof during NFL game,1
3,2017-01-01,2017,1.0,RI,vigil,environment; guns; military,Sakonnet Peace Alliance,for peace; for gun control; for climate action,1.0,,,,every Sunday since 2003,0
4,2017-01-01,2017,1.0,TN,vigil,military,Oak Ridge Environmental Peace Alliance,for abolishing nuclear weapons,0.0,,,,every Sunday since the late 1990s,0


In [6]:
df['violence'].value_counts()

violence
0    218031
1      5353
Name: count, dtype: int64

We already removed NaNs in state and date columns, but we double check if there are still some and we eventually remove them.

In [7]:
print(df['state'].isna().sum())
print(df['date'].isna().sum())

0
2


In [8]:
df = df.dropna(subset=['date'])
print(df['date'].isna().sum())

0


Since our dataset includes only actual protest events (i.e., there are no explicit zero entries indicating the absence of protests), and given that multiple protests can occur within the same month and state, we adopt an aggregation strategy to construct the target variable.

As previously mentioned, we decided to focus on a subset of four protest types that we consider particularly relevant within the specific context of our analysis. Consequently, any protest that does not fall into one of these four categories is classified as a zero (i.e., not relevant for the target).

The logic implemented in the following code is as follows:
- We aggregate protests by topic, grouping together all protests that occur within the same state and month and share the same claim.
- We then define the target label by examining the count of protests per category. In cases where multiple topics are present in the same time and location, the label is assigned based on the claim with the highest number of occurrences.

This approach is arguably more informative than forecasting individual protests, as it captures broader and potentially recurring patterns of social unrest. A single isolated protest may not reflect a systemic issue, whereas repeated protests related to the same claim over a short period of time could indicate more deeply rooted societal tensions that deserve attention.

Finally, while we focus on these four protest categories because of their relevance to the U.S. context, this methodology can be adapted to other regions by selecting different categories that reflect the local socio-political landscape.

In [10]:
def assign_majority_label(df):
    # categorize protests basing on the 'issues' column
    def categorize_target(claim):
        if pd.isna(claim): 
            return 0
        elif 'healthcare' in claim:
            return 'healthcare'
        elif 'racism' in claim or 'immigration' in claim:
            return 'racism_immigration'
        elif 'housing' in claim:
            return 'housing'
        elif 'guns' in claim or 'criminal justice' in claim:
            return 'guns_criminal_justice'
        else:
            return 0  # 0 if no category matches

    df['target'] = df['issues'].apply(categorize_target)

    # group by state and month
    grouped = df.groupby(['state', df['date'].dt.to_period('M')])

    results = []

    for (state, period), group in grouped:
        # count the occurrences of each target category
        counts = {
            'housing': (group['target'] == 'housing').sum(),
            'guns_criminal_justice': (group['target'] == 'guns_criminal_justice').sum(),
            'racism_immigration': (group['target'] == 'racism_immigration').sum(),
            'healthcare': (group['target'] == 'healthcare').sum()
        }

        # find the majority class
        majority_class = max(counts, key=counts.get) if max(counts.values()) > 0 else 0

        # handle the 'violence' column: when aggregating, if at least 1 protest was violent, set to 1
        aggregated_violence = 1 if (group['violence'] == 1).any() else 0

        results.append({
            'state': state,
            'date': period.start_time,  
            'target': majority_class,
            'violence': aggregated_violence
        })

    result_df = pd.DataFrame(results)

    result_df = result_df.sort_values(by=['state', 'date']).reset_index(drop=True)

    return result_df

df_aggregated = assign_majority_label(df)

print(df_aggregated.head())

  state       date              target  violence
0    AK 2017-01-01  racism_immigration         0
1    AK 2017-02-01          healthcare         0
2    AK 2017-03-01                   0         0
3    AK 2017-04-01                   0         0
4    AK 2017-05-01                   0         0


In [11]:
df['target'].value_counts()

target
0                        132193
racism_immigration        51261
healthcare                18632
guns_criminal_justice     16933
housing                    4363
Name: count, dtype: int64

The majority of protests in our dataset fall under the 0 class, which reflects the challenge of forecasting a target variable in the presence of highly unbalanced data. Although the selected issues have demonstrated relevance within the context of our analysis, it is reasonable to assume that they represent a minority of all recorded events, as the 'general' category encompasses a wide range of other types of protests.

However, a lower frequency does not imply lesser importance. On the contrary, these less common but thematically focused protest types may signal particularly urgent or sensitive societal issues. For this reason, we proceed with our analysis, recognizing that identifying patterns within underrepresented but meaningful categories can offer valuable insights.


In [None]:
# drop non relevant columns
df = df.drop(columns=['year', 'month', 'type', 'issues', 'actors', 'claims', 'valence',
                      'size_low', 'size_high', 'size_mean', 'notes'])

We group together all rows with `target == 0` that share the same combination of date and state. Since these rows are not part of the target categories we aim to forecast, and they all occurred within the same time and location, keeping them separate would not provide additional insight. Aggregating them simplifies the dataset without losing relevant information for the analysis.

In [None]:
# filter rows where target == 0
target_zero = df[df['target'] == 0]

# group by state and month, keeping only one row per combination
target_zero_grouped = (
    target_zero.groupby(['state', df['date'].dt.to_period('M')])
    .size()
    .reset_index(name='count')  # Add a 'count' column to verify the number of combined rows
)

# convert the 'date' column to datetime format to ensure consistency
target_zero_grouped['date'] = target_zero_grouped['date'].dt.to_timestamp()

# keep only the required columns: state, date, target
target_zero_grouped['target'] = 0

# filter rows where target != 0
target_non_zero = df.loc[df['target'] != 0].copy()  # Use .loc and .copy() to avoid SettingWithCopyWarning

# ensure the 'date' column in target_non_zero is in datetime format
target_non_zero['date'] = pd.to_datetime(target_non_zero['date'])

# combine rows with target == 0 and rows with target != 0
df = pd.concat([target_non_zero, target_zero_grouped], ignore_index=True)

# sort the dataset by state and date
df = df.sort_values(by=['state', 'date']).reset_index(drop=True)
print(df.head())

        date state  violence              target
0 2017-01-01    AK       NaN                   0
1 2017-01-29    AK       0.0  racism_immigration
2 2017-01-30    AK       0.0  racism_immigration
3 2017-02-01    AK       NaN                   0
4 2017-02-22    AK       0.0          healthcare


In [15]:
# trasform the 'date' column to ignore the day of the month
df['date'] = df['date'].dt.to_period('M').dt.to_timestamp()

# identify combinations of state and date where target != 0
non_zero_combinations = df[df['target'] != 0][['state', 'date']].drop_duplicates()

# filter out rows with target == 0 if the same state and date exist in non_zero_combinations
df = df[~((df['target'] == 0) & (df[['state', 'date']].apply(tuple, axis=1).isin(non_zero_combinations.apply(tuple, axis=1))))]

In [16]:
df = df.set_index(['state', 'date']).sort_index() 
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,violence,target
state,date,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,2017-01-01,0.0,racism_immigration
AK,2017-01-01,0.0,racism_immigration
AK,2017-02-01,0.0,healthcare
AK,2017-03-01,,0
AK,2017-04-01,,0


In [17]:
# Filter for observations where target == 0 or target == 'housing'
df_housing = df[(df['target'] == 0) | (df['target'] == 'housing')].copy()
df_housing['target'] = df_housing['target'].apply(lambda x: 1 if x != 0 else 0)

# Filter for observations where target == 0 or target == 'guns_criminal_justice'
df_violence = df[(df['target'] == 0) | (df['target'] == 'guns_criminal_justice')].copy()
df_violence['target'] = df_violence['target'].apply(lambda x: 1 if x != 0 else 0)

# Filter for observations where target == 0 or target == 'racism_immigration'
df_racism = df[(df['target'] == 0) | (df['target'] == 'racism_immigration')].copy()
df_racism['target'] = df_racism['target'].apply(lambda x: 1 if x != 0 else 0)

# Filter for observations where target == 0 or target == 'healthcare'
df_healthcare = df[(df['target'] == 0) | (df['target'] == 'healthcare')].copy()
df_healthcare['target'] = df_healthcare['target'].apply(lambda x: 1 if x != 0 else 0)

# 3. Definition of incidence 
Our target variable is the incidence, and we are doing a two-steps forecast:
- forecast whether there will be a protest in the next h months
- forecast the claim of the protest

With regards to the second point, we will focus on 4 main claims that we identified (housing, guns and violence, racism and immigration and healthcare) and, if the forecasted protest does not fall in any of these categories, then it will be labelled as 'other'.
 

In [18]:
te_housing = target_engineer.TargetEngineer(df=df_housing, unit='state', time='date', y_col="target")
te_violence = target_engineer.TargetEngineer(df=df_violence, unit='state', time='date', y_col="target")
te_racism = target_engineer.TargetEngineer(df=df_racism, unit='state', time='date', y_col="target")
te_healthcare = target_engineer.TargetEngineer(df=df_healthcare, unit='state', time='date', y_col="target")

In [19]:
threshold = 0
horizon = 6
incidence_housing = te_housing.incidence(threshold=threshold, horizon=horizon)
incidence_violence = te_violence.incidence(threshold=threshold, horizon=horizon)
incidence_racism = te_racism.incidence(threshold=threshold, horizon=horizon)
incidence_healthcare = te_healthcare.incidence(threshold=threshold, horizon=horizon)

In [20]:
# fill missing dates for each DataFrame
def fill_missing_dates(df, target_col='target'):
    missing_rows = []

    # iterate on each state 
    for state in df.index.get_level_values('state').unique():
        # extract states 
        state_dates = df.loc[state].index.get_level_values('date')
        
        # generate a complete interval of time
        full_range = pd.date_range(start=state_dates.min(), end=state_dates.max(), freq='MS')
        
        # find missing dates 
        missing = full_range.difference(state_dates)
        
        # create artifical rows for each missing date
        for date in missing:
            missing_rows.append({'state': state, 'date': date, target_col: 0})

    # create a DataFrame for the missing rows
    missing_df = pd.DataFrame(missing_rows)

    # add the missing rows to the original DataFrame
    df = df.reset_index()  
    df = pd.concat([df, missing_df], ignore_index=True)
    df = df.set_index(['state', 'date']).sort_index()

    return df

df_housing = fill_missing_dates(df_housing)
df_violence = fill_missing_dates(df_violence)
df_racism = fill_missing_dates(df_racism)
df_healthcare = fill_missing_dates(df_healthcare)

In [21]:
# check whether there are no longer missing dates in the 4 dataframes 
def check_missing_dates(df):
    missing_dates = {}

    # iterate on each state 
    for state in df.index.get_level_values('state').unique():
        # extract dates
        state_dates = df.loc[state].index.get_level_values('date')
        
        # generate a complete interval of time
        full_range = pd.date_range(start=state_dates.min(), end=state_dates.max(), freq='MS')
        
        # find missing dates
        missing = full_range.difference(state_dates)
        
        # if missing dates exist, add them to the dictionary
        if len(missing) > 0:
            missing_dates[state] = missing

    return missing_dates

missing_housing = check_missing_dates(df_housing)
missing_violence = check_missing_dates(df_violence)
missing_racism = check_missing_dates(df_racism)
missing_healthcare = check_missing_dates(df_healthcare)

print("Missing dates in df_housing:")
for state, dates in missing_housing.items():
    print(f"State: {state}, Missing dates: {list(dates)}")

print("\nMissing dates in df_violence:")
for state, dates in missing_violence.items():
    print(f"State: {state}, Missing dates: {list(dates)}")

print("\nMissing dates in df_racism:")
for state, dates in missing_racism.items():
    print(f"State: {state}, Missing dates: {list(dates)}")

print("\nMissing dates in df_healthcare:")
for state, dates in missing_healthcare.items():
    print(f"State: {state}, Missing dates: {list(dates)}")

Missing dates in df_housing:

Missing dates in df_violence:

Missing dates in df_racism:

Missing dates in df_healthcare:
