# 0. Libraries and setup

In [1]:
import pandas as pd
import numpy as np
import os
import chardet

In [2]:
# Set path to the directory containing the CSV files
data_path = '../data/'

# 1. Loading the data

Source of the data: https://dataverse.harvard.edu/dataverse/crowdcountingconsortium

The encoding of the data files isn't the standard one (UTF-8), so we must detect the type of encoding. To do so, we can use the library [`chardet`](https://chardet.readthedocs.io/en/latest/).

In [None]:
# Detect the encoding of the CSV file
# with open(os.path.join(data_path, 'dataverse_files_2017-2020', 'ccc_compiled_20172020.csv'), 'rb') as f:
#     result = chardet.detect(f.read()) 
# print(result)

{'encoding': 'MacRoman', 'confidence': 0.7292219720640579, 'language': ''}


In [3]:
encoding='ISO-8859-1'  # Adjust the encoding type of the CSV files if necessary ('ISO-8859-1' is common for Latin-1 encoded files)

df1 = pd.read_csv(
    os.path.join(data_path, 'dataverse_files_2017-2020', 'ccc_compiled_20172020.csv'), # From Jan 2017 to Dec 2020
    encoding='MacRoman',
    low_memory=False
)
df2 = pd.read_csv(
    os.path.join(data_path, 'dataverse_files_2021-2024', 'ccc_compiled_20212024.csv'), # From Jan 2021 to Dec 2024
    encoding='MacRoman',
    low_memory=False
)  
df3 = pd.read_csv(
    os.path.join(data_path, 'dataverse_files_2025_05_08-', 'ccc-phase3-public.csv'),  # From Jan 2025 to May 2025
    encoding='MacRoman',
    low_memory=False
)

# Save the dataframes in a dictionary
dfs = {
    '2017-2020': df1,
    '2021-2024': df2,
    '2025-': df3
}

In [4]:
df1.head()

Unnamed: 0,date,locality,state,location_detail,online,type,macroevent,actors,claims,valence,...,source_28,source_29,source_30,notes,lat,lon,resolved_locality,resolved_county,resolved_state,fips_code
0,2017-01-01,Washington,DC,Lafayette Square Park,0.0,vigil,,,"for banning nuclear weapons, for peace",0.0,...,,,,White House Peace Vigil continuous since June ...,38.907192,-77.036871,Washington,District of Columbia,DC,11001.0
1,2017-01-01,Mankato,MN,,0.0,vigil,,Peace Vigil Mankato,for peace,0.0,...,,,,every Sunday since 2001,44.163578,-93.9994,Mankato,Blue Earth County,MN,27013.0
2,2017-01-01,Minneapolis,MN,U.S. Bank Stadium,0.0,protest; banner drop,,general protestors,"against the Dakota Access Pipeline, for indige...",1.0,...,,,,hung banner from stadium roof during NFL game,44.977753,-93.265011,Minneapolis,Hennepin County,MN,27053.0
3,2017-01-01,Little Compton,RI,Town Green,0.0,vigil,,Sakonnet Peace Alliance,"for peace, for gun control, for climate action",1.0,...,,,,every Sunday since 2003,41.510103,-71.171156,Little Compton,Newport County,RI,44005.0
4,2017-01-01,Oak Ridge,TN,Y-12 National Security Complex,0.0,vigil,,Oak Ridge Environmental Peace Alliance,for abolishing nuclear weapons,0.0,...,,,,every Sunday since the late 1990s,36.010356,-84.269645,Oak Ridge,Anderson County,TN,47001.0


In [5]:
df2.head()

Unnamed: 0,date,locality,state,location_detail,online,type,title,macroevent,organizations,participants,...,source_28,source_29,source_30,notes,lat,lon,resolved_locality,resolved_county,resolved_state,fips_code
0,2021-01-01,Montgomery,AL,statewide,0.0,strike; boycott,,,Free Alabama Movement,prisoners,...,,,,Scheduled to run 30 days.,32.379223,-86.307737,Montgomery,Montgomery County,AL,1101.0
1,2021-01-01,Tucson,AZ,E Speedway Blvd and N Euclid Ave,0.0,vigil,,,Women in Black,,...,,,,every Friday since at least 2001; organizers c...,32.253979,-110.974177,Tucson,Pima County,AZ,4019.0
2,2021-01-01,Lafayette,CA,El Curtola Blvd & Highway 24,0.0,demonstration,,,Contra Costa County Patriots,,...,,,,,37.885758,-122.11802,Lafayette,Contra Costa County,CA,6013.0
3,2021-01-01,Palo Alto,CA,El Camino Real and Embarcadero Rd,0.0,vigil,,,WILPF,,...,,,,"every Friday; started in 1922, some gaps since",37.441883,-122.143019,Palo Alto,Santa Clara County,CA,6085.0
4,2021-01-01,Pasadena,CA,Rose Bowl Stadium,0.0,rally; parade,Patriots' Rose Parade,,Trump Unity Bridge SO CAL Events,Trump supporters,...,,,,Patriots' Rose Parade,34.147785,-118.144515,Pasadena,Los Angeles County,CA,6037.0


In [6]:
df3.head()

Unnamed: 0,date,locality,state,resolved_locality,resolved_state,resolved_county,fips_code,lat,lon,location,...,source21,source22,source23,source24,source25,source26,source27,source28,source29,source30
0,1/1/2025,Albany,NY,Albany,NY,Albany County,36001.0,42.652579,-73.756232,Albany Medical Center,...,,,,,,,,,,
1,1/1/2025,Baltimore,MD,Baltimore,MD,,24510.0,39.290385,-76.612189,online,...,,,,,,,,,,
2,1/1/2025,Bangor,ME,Bangor,ME,Penobscot County,23019.0,44.801613,-68.771226,Harlow St & Central St,...,,,,,,,,,,
3,1/1/2025,Bellingham,WA,Bellingham,WA,Whatcom County,53073.0,48.751911,-122.478685,Railroad Trail and I-5,...,,,,,,,,,,
4,1/1/2025,Bloomington,IN,Bloomington,IN,Monroe County,18105.0,39.165325,-86.526386,Dimension Mill,...,,,,,,,,,,


In [7]:
for key, df in dfs.items():
    print(f"DataFrame for {key}:\n")
    print(df.info())
    print("\n")

DataFrame for 2017-2020:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72181 entries, 0 to 72180
Data columns (total 62 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 72181 non-null  object 
 1   locality             72162 non-null  object 
 2   state                72143 non-null  object 
 3   location_detail      63413 non-null  object 
 4   online               72142 non-null  float64
 5   type                 71754 non-null  object 
 6   macroevent           1879 non-null   object 
 7   actors               66775 non-null  object 
 8   claims               72165 non-null  object 
 9   valence              72152 non-null  float64
 10  issues               68594 non-null  object 
 11  size_text            29583 non-null  object 
 12  size_low             36073 non-null  float64
 13  size_high            36050 non-null  float64
 14  size_mean            36073 non-null  float64
 15  size_cat  

# 2. Columns of interest

Information extracted from the documentation of the data. Check out as well the [coding guidelines](https://docs.google.com/document/d/1oaOf9s72FQnzQA8sbE8h0PwMIZLP6p0EDUV2ya065is/edit?tab=t.0#heading=h.86aoh6qprspd) of the authors that they use whenever they need to do a judgement call (e.g., calling a demonstration pro-Trump or anti-Trump).

**Columns in 2017-2020 (1st edition)**:

- `date`: Date of event in YYYY-MM-DD format. When an event spans multiple days, the start date is used.
- `locality`: Name of the locality in which the event took place. Labeled CityTown in the Google Sheets.
- `state`: Two-letter U.S. postal abbreviation for the state or U.S. territory in which the event took place. 
- `location_detail`: Where available, text giving additional details on the location(s) within the city or town where the action took place.
- `online`: Binary indicator for online-only events. 1 = online, 0 = in-person. Generated from location and event type information in the source data.
- `type`: Type(s) of protest action (e.g. march, protest, demonstration, strike, counter-protest, sit-in), separated with semicolons or commas when more than one. EventType in source data.
- `macroevent`: A unique id that associates a counter-protest with the protest event it countered. These strings are composed of a date, a location, and something about the nature of the event, all separated by hyphens (e.g., "20220624-phoenix-abortion"). In most cases, these ids will uniquely identify pairs of events. In cases where the counter-protest is itself countered, however—e.g., a community defense action in response to a protest targeting an LGBTQ+ pride festival—the additional events are given the same id, so these clusters will sometimes include three or more events.
  - NOTE: This field was created in 2021, and it was only retroactively generated for events associated with the Black Lives Matter protest wave that began in May 2020. So, for CCC Phase 1 data, it is only useful for analysis of that specific subset.
- `actors`: The organization(s) that organized the protest event (e.g. Women's March, Greenpeace, etc.), and/or the type of people participating (e.g., students, nurses). Usually separated by semicolons, sometimes by commas.
- `claims`: A phrase or phrases describing what the event was about, based on the claims or demands the participants made (e.g. for women's rights, anti-Muslim Ban, against racism, etc.), as summarized by coders. Phrases are separated by semicolons in some records, commas in others.
- `valence`: Political valence of the event. Labeled Pro(2)/Anti(1) in the source data.
  - 2 = pro-Trump
  - 1 = anti-Trump
  - 0 = other/neither
- `issues`: String of semicolon-separated tags identifying political issues (or themes) associated with the event (e.g., "democracy; women's rights" for events associated with the 2017 Women's March). These are generated after data compilation by running a series of regular expressions over the claims description text.
- `size_text`: Words or phrases that journalists or eyewitnesses used to describe the size of the crowd at the protest event (e.g., “more than 100”, “dozens”, “about 50”)—or, in cases where no text descriptions were found but other sources gave information about crowd size, the alternative source used: “count pic” for photograph(s), “count vid” for video(s), “count FB” for Facebook, or “eyewitness” for eyewitness.
- `size_low`: Lowest crowd size reported in, or estimated from, size_text. To convert vague text counts to numbers, the following three rules are always used for both `size_low` and `size_high` (see below).
  1. For phrases with fudge words (e.g., “about”, “nealy” “approximately”, “maybe”), ignore the fudge word and treat the number that follows it as the count.
  2. When unspecific multiples are given, assume the multiple is 2  (e.g., “hundreds” becomes 200, “thousands” becomes 2,000).
  3. Assume “several” means 3 (e.g., “several dozen” becomes 36, “several thousand” becomes 3,000).
- `size_high`: Highest crowd size reported in, or estimated from, size_text. 
  - When vague text counts are converted to numbers, the high and low estimates are assumed to be equivalent. For example, if the only information available about crowd size is a single report describing it as “hundreds”, 200 would be assigned to both size_low and size_high. 
  - If, however, one source described the crowd size as “hundreds” and another as “about 500”, then size_low would still be 200, but size_high would be 500.
- `size_mean`: The mathematical average of size_low and size_high, rounded up to the nearest integer.
- `size_cat`: Ordered categorical indicator of crowd size, representing orders of magnitude and derived from size_mean.
  - 0 = unknown
  - 1 = 1-99 (tens)
  - 2 = 100-999 (hundreds)
  - 3 = 1,000-9,999 (thousands)
  - 4 = 10,000+ (tens of thousands)
- `arrests`: Text, sometimes specifying the count of protesters reportedly arrested (e.g., “5”), sometimes a phrase indicating ambiguity about that count (e.g., "more than 5", "unclear", “unspecified”).
- `arrests_any`: Binary indicator for whether or not any arrests occurred, derived from arrests. 1 = yes, 0 = no.
- `injuries_crowd`: Text, sometimes giving a count of protesters reportedly injured (e.g., “5”), sometimes a phrase indicating ambiguity about that count (e.g., "more than 5", "unclear", “unspecified”).
- `injuries_crowd_any`: Binary indicator for whether or not any protesters were reportedly injured, derived from injuries_crowd. 1 = yes, 0 = no.
- `injuries_police`: Text, sometimes giving a count of police officers reportedly injured (e.g., “5”), sometimes a phrase indicating ambiguity about that count (e.g., "more than 5", "unclear", “unspecified”). 
- `injuries_police_any`: Binary indicator for whether or not any police officers were reportedly injured, derived from injuries_police. 1 = yes, 0 = no.
- `property_damage`: Text, usually a binary indicator for whether or not any property damage occurred, sometimes a count or other number of unclear meaning, sometimes something else. 
- `property_damage_any`: Binary indicator for whether or not protesters reportedly caused any property damage, derived from property_damage. 1 = yes, 0 = no.
- `chemical_agents`: Binary indicator for whether or not police or other state security forces used tear gas or other chemical irritants, such as pepper spray or pepper balls, on protesters. Labeled TearGas in the source data, but only available for May–December 2020.
- `source_n`: URL of nth source, or description where the source is not a web page (e.g., “eyewitness”, “correspondence”).
- `notes`: Miscellaneous additional information about the event as noted by the coder. 
- `lat`: Latitude of locality in which the event took place, as resolved by Google Maps Geocoding API. Note that this is not based on address or landmark-level information where that is given, only on the name of the city or town.
- `lon`: Longitude of locality in which the event took place, as resolved by Google Maps Geocoding API. Note that this is not based on address or landmark-level information where that is given, only on the name of the city or town.
- `resolved_locality`: Name of the locality in which the event occurred, as resolved by running the city or town name and state abbreviation through the Google Maps Geocoding API.
- `resolved_state`: Postal abbreviation of the state or territory in which the event occurred, as resolved by the Google Maps Geocoding API.
- `resolved_county`: Name of the county in which the event occurred, as resolved by running resolved_locality and resolved_state through the Google Maps Geocoding API.
- `fips_code`: Five-digit FIPS code for the county (or LA parish or AK borough or independent city or DC or U.S. territory) given in resolved_county. See 'data-compilation/fips_for_county_function.r' for details on how these are generated using the 'tigris' package and some custom code to handle various exceptions.
  - NOTE: When you load the data from the stored .csv, you will probably need to add leading zeros back to FIPS codes that have them, because your software will probably read that column as integers instead of strings. In R, you could do this with `ifelse(nchar(fips_code) == 4, paste0("0", fips_code), fips_code)`.

**Columns and modifications in the 2nd edition**:
- `title`: Title of action when one is given, usually on a flyer announcing the event.
- `organizations`: Semicolon-separated names of organization(s) that participated in the event, including but not limited to organizers and endorsers (.e.g, “Fridays for Future New York; Extinction Rebellion New York; Greenfaith”). 
- `participants`: Semicolon-separated text descriptors of participants in the event, usually from press reports, sometimes recorded by coders based on photos or video (e.g., “students; faculty; staff; community members”).
- `claims`: Comma-separated text phrases describing what the event was about. 
- `claims_summary`: Text phrase or comma-separated text phrases giving coder’s summary of the action’s main claims or demands. Each phrase should begin with either “for”, “against”, or “in [action word] [of/with/for]” (e.g., “in commemoration of…”, “in solidarity with…”, “in remembrance of…”). Generated programmatically from claims field, so there will be occasional errors of omission and commission due to typos and such.
- `claims_verbatim`: Comma-separated verbatim text captures of claims made by event participants on signs, banners, t-shirts, and flags or as chants or shouts. Generated programmatically from claims field, so there will be occasional errors of omission and commission due to typos and such.
  - Phrases that end with an exclamation point (e.g., “no justice no peace!”) are chants or shouts.
  - Text inside square brackets (e.g., “[wire hanger ]”, “[watermelon]”) is a coder’s description of an image or symbol seen on a sign or banner.
  - Flags are recorded as phrases ending in “flag” (e.g., “Confederate flag”, Gadsden flag”). National flags other than the U.S. flag are recorded as the name of the country followed by “flag” (e.g., “Israel flag”, “Mexico flag”). The American flag is usually recorded as “American flag”, sometimes as “U.S. flag”, “US flag”, or “USA flag”.
  - When the same claim is seen more than once, only one instance of it is recorded here.
- `issue_tags_summary`: String of semicolon-separated tags identifying political issues (or themes) associated with the event, based only on the coder-generated summary descriptions of participants’ demands in claims_summary.
- `issue_tags_verbatim`: String of semicolon-separated tags identifying political issues (or themes) associated with the event, based only on the verbatim text captures of protester claims or demands found in claims_verbatim.
- `issue_tags`: String of semicolon-separated tags identifying political issues (or themes) associated with the event, as generated from either the coder summary or verbatim capture—i.e., the concatenation of issue_tags_summary and issue_tags_verbatim.
- `participant_measures`: Text field describing notable actions taken by protest participants in the course of the event, including but not limited to interactions with police and/or counter-protesters. See the Coding Guidelines for details on what is recorded and how.
- `police_measures`: Text field describing presence of, and notable actions taken by, police in response to the protest event, including interactions with protesters and/or counter-protesters. See the Coding Guidelines for details on what is recorded and how.
- `participant_deaths`: Reported count of protest participants who died as a result of their participation in the protest. This includes deaths by natural causes during a protest event as well as deaths reportedly caused by the actions of police or counter-protesters.
- `police_deaths`: Reported count of police officers who died while policing the protest. This includes deaths by natural causes during a protest event as well as deaths reportedly caused by the actions of protesters or counter-protesters.

In [None]:
# Count the number of observations with injuries in the crowd, police and property damage (riots?)
df1_riot = df1[(df1['injuries_crowd_any'] == 1) & (df1['injuries_police_any'] == 1 & (df1['property_damage_any'] == 1))]

df1_riot

Unnamed: 0,date,locality,state,location_detail,online,type,macroevent,actors,claims,valence,...,source_28,source_29,source_30,notes,lat,lon,resolved_locality,resolved_county,resolved_state,fips_code
107,2017-01-18,Washington,DC,Trump International Hotel,0.0,protest,,general protestor,against election of Donald Trump to presidency,1.0,...,,,,,38.907192,-77.036871,Washington,District of Columbia,DC,11001.0
119,2017-01-19,Washington,DC,National Press Club,0.0,counter-protest,,general protestors,in support of Donald Trump,2.0,...,,,,,38.907192,-77.036871,Washington,District of Columbia,DC,11001.0
130,2017-01-20,Washington,DC,National Mall,0.0,counter-protest,,general protestors,for election of Donald Trump to presidency,2.0,...,,,,,38.907192,-77.036871,Washington,District of Columbia,DC,11001.0
1005,2017-01-29,Portland,ME,Portland International Jetport,0.0,demonstration,,general protestors,"end the ""Muslim ban""; immigration; anti-Trump",1.0,...,,,,,43.659099,-70.256819,Portland,Cumberland County,ME,23005.0
1041,2017-01-29,Portland,OR,Portland International Airport,0.0,demonstration,,general protestors,Protest immigration ban; anti-Trump,1.0,...,,,,,45.515232,-122.678385,Portland,Multnomah County,OR,41051.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71378,2020-12-12,Santa Fe,NM,The Roundhouse,0.0,counter-protest,20201212-santafe-stopthesteal,general protesters,"against President Trump, against fascism, agai...",1.0,...,,,,verbal clashes and at least one physical fight...,35.686975,-105.937799,Santa Fe,Santa Fe County,NM,35049.0
71777,2020-12-19,Buffalo,NY,Niagara Square,0.0,counter-protest,20201219-buffalo-covid,general protesters,"in memory of victims of COVID-19, for free hea...",1.0,...,,,,placed trash bags as mock body bags during dem...,42.886447,-78.878369,Buffalo,Erie County,NY,36029.0
71807,2020-12-20,Rocklin,CA,Destiny Christian Church,0.0,protest,20201220-rocklin-covidchurch,Allies for Black Lives-Placer County,against church's defiance of COVID-related pub...,1.0,...,,,,,38.790734,-121.235783,Rocklin,Placer County,CA,6061.0
71873,2020-12-21,McAllen,TX,The Monitor,0.0,rally,20201221-mcallen-trump,general protesters,"for President Trump, against election fraud",2.0,...,,,,verbal and physical confrontations and physica...,26.203407,-98.230012,McAllen,Hidalgo County,TX,48215.0


# 3. Unique issues

In [33]:
# Unique issues in the 2017-2020 dataset
unique_issues_1 = df1['issues'].unique()

issues_1 = set()  # Initialize an empty set to store unique issues

# Store unique issues in the set, ignoring NaN values and separating by semicolons
for issue in unique_issues_1:
    if pd.notna(issue):  # Check if the issue is not NaN
        issues = issue.split(';')  # Split the issues by semicolon
        for i in issues:
            issues_1.add(i.strip())  # Add each issue to the set, stripping whitespace

# Do the same for the 2021-2024 dataset
unique_issues_2 = df2['issue_tags'].unique()

issues_2 = set()  # Initialize an empty set to store unique issues

# Store unique issues in the set, ignoring NaN values and separating by semicolons
for issue in unique_issues_2:
    if pd.notna(issue):  # Check if the issue is not NaN
        issues = issue.split(';')  # Split the issues by semicolon
        for i in issues:
            issues_2.add(i.strip())  # Add each issue to the set, stripping whitespace

# Do the same for the 2025 dataset
unique_issues_3 = df3['issues'].unique()

issues_3 = set()  # Initialize an empty set to store unique issues

# Store unique issues in the set, ignoring NaN values and separating by semicolons
for issue in unique_issues_3:
    if pd.notna(issue):  # Check if the issue is not NaN
        issues = issue.split(';')  # Split the issues by semicolon
        for i in issues:
            issues_3.add(i.strip())  # Add each issue to the set, stripping whitespace

print(issues_1)
print(issues_2)
print(issues_3)

{'policing', 'democracy', 'healthcare', "women's rights", 'immigration', 'patriotism', 'environment', 'education', 'reproductive rights', 'foreign affairs', 'disability rights', 'civil rights', 'covid', 'lgbtqia', 'labor', 'indigenous peoples', 'free speech', 'housing', 'transportation', 'racism', 'corruption', 'economy', 'banking and finance', 'guns', 'religion', 'science', 'sports', 'animal rights', 'legislative', 'judiciary', 'military', 'development', 'criminal justice', 'sexual violence', 'presidency', 'taxes', 'drugs', 'energy'}
{'policing', 'democracy', 'healthcare', "women's rights", 'immigration', 'patriotism', 'environment', 'education', 'reproductive rights', 'foreign affairs', 'disability rights', 'civil rights', 'covid', 'lgbtqia', 'labor', 'indigenous peoples', 'housing', 'transportation', 'racism', 'corruption', 'economy', 'banking and finance', 'religion', 'guns', 'science', 'sports', 'drugs', 'animal rights', 'legislative', 'judiciary', 'military', 'criminal justice', 

In [34]:
# Find the intersection of issues between the three datasets
intersection_issues = issues_1.intersection(issues_2).intersection(issues_3)

print("Intersection of issues between the three datasets:")
for issue in intersection_issues:
    print(issue)

Intersection of issues between the three datasets:
policing
democracy
healthcare
women's rights
patriotism
immigration
environment
education
reproductive rights
foreign affairs
disability rights
civil rights
covid
lgbtqia
indigenous peoples
free speech
housing
transportation
racism
corruption
economy
banking and finance
religion
science
guns
sports
animal rights
legislative
judiciary
criminal justice
sexual violence
presidency
taxes
drugs
energy


Topics:
- healthcare
- racism + immigration
- housing
- guns + criminal justice?

# 4. Merging the datasets and keeping information of interest

In [8]:
# Print the names of the columns of all data frames
for key, df in dfs.items():
    print(f"Columns in {key} dataset:")
    print(df.columns.tolist())
    print("\n")

Columns in 2017-2020 dataset:
['date', 'locality', 'state', 'location_detail', 'online', 'type', 'macroevent', 'actors', 'claims', 'valence', 'issues', 'size_text', 'size_low', 'size_high', 'size_mean', 'size_cat', 'arrests', 'arrests_any', 'injuries_crowd', 'injuries_crowd_any', 'injuries_police', 'injuries_police_any', 'property_damage', 'property_damage_any', 'chemical_agents', 'source_1', 'source_2', 'source_3', 'source_4', 'source_5', 'source_6', 'source_7', 'source_8', 'source_9', 'source_10', 'source_11', 'source_12', 'source_13', 'source_14', 'source_15', 'source_16', 'source_17', 'source_18', 'source_19', 'source_20', 'source_21', 'source_22', 'source_23', 'source_24', 'source_25', 'source_26', 'source_27', 'source_28', 'source_29', 'source_30', 'notes', 'lat', 'lon', 'resolved_locality', 'resolved_county', 'resolved_state', 'fips_code']


Columns in 2021-2024 dataset:
['date', 'locality', 'state', 'location_detail', 'online', 'type', 'title', 'macroevent', 'organizations', 'p

Columns that are (or could be) of interest in our analysis (and that are shared across all editions, so that we have enough information). Note that we re mainly interested in the content that motivated the protests:
1. `date`.
2. `state` (not `resolved_state`, where both have similar characteristics).
3. `type` (`event_type` in the 3rd edition).
4. `actors` (`organizations` + `participants` in the 2nd and 3rd edition).
5. `claims` (1st edition, claims separated by commas or semicolons); `claims_summary` (2nd and 3rd editions, separated by semicolons). Add also `title` for 2nd and 3rd editions (title of action when one is given).
6. `valence` (pro-trump or anti-trump).
7. `issues` (1st and 3rd editions, separated by semicolons) and `issue_tags` (2nd edition, separated by semicolons).
8. `size_low`, `size_high` and `size_mean`, for the estimated minimum, maximum and mean crowd, respectively. Note that there might be relevant noise in this variable (as the estimations are made according to the criteria of the researchers).
9. `arrests_any`: instead of just `arrests` because the latter (though more diverse) is noisier.
10. `injuries_crowd_any` (1st and 2nd editions), `participant_casualties_any` (3rd edition, inferred from the `participant_injuries` column).
11. `injuries_police_any` (1st and 2nd editions), `police_casualties_any` (3rd edition, inferred from the `police_injuries` column).
12. `property_damage_any`.
13. `notes`. Could contain useful text-based information not contained in the previous columns.

In [42]:
# Select the relevant columns from all dataframes

df1_sel = df1[['date', 'state', 'type', 'actors', 'claims', 'valence', 'issues',
               'size_low', 'size_high', 'size_mean', 'arrests_any', 'injuries_crowd_any',
               'injuries_police_any', 'property_damage_any', 'notes']]

df2_sel = df2[['date', 'state', 'type', 'organizations', 'participants', 'claims_summary', 
               'title', 'valence', 'issue_tags', 'size_low', 'size_high', 'size_mean', 
               'arrests_any', 'injuries_crowd_any', 'injuries_police_any', 'property_damage_any', 
               'notes']]

df3_sel = df3[['date', 'state', 'event_type', 'organizations', 'participants', 'claims_summary', 
               'title', 'valence', 'issues', 'size_low', 'size_high', 'size_mean', 
               'arrests_any', 'participant_casualties_any', 'police_casualties_any', 'property_damage_any', 
               'notes']]

# Rename the columns (for those with similar content) to have consistent names 
# across all dataframes

df2_sel = df2_sel.rename(columns={
    'issue_tags': 'issues'
})

df3_sel = df3_sel.rename(columns={
    'event_type': 'type',
    'participant_casualties_any': 'injuries_crowd_any',
    'police_casualties_any': 'injuries_police_any'
})

To unify the features in all dataframes, however, there are still some operations left to do:
1. 1st edition: change separator of `claims` (from commas or semicolons to semicolons).
2. 1st edition: change separator of `types` (from commas or semicolons to semicolons).
3. 1st edition: note that the `actors` field in the 1st edition is "usually separated by semicolons, sometimes by commas", while `organizations` and `participants` in the 2nd and 3rd editions are separated by semicolons in each cell. This should be unified.
4. 2nd and 3rd editions: create an `actors` column which combines the `organizations` and the `participants`. 
5. 2nd and 3rd editions: concatenate the `title` to the `claims_summary`, with a semicolon as the separator. Create `claims` from the result.
6. 3rd edition: convert date format to YYYY-MM-DD.

In [44]:
# In the 1st edition, change all commas to semicolons in the 'claims' column
df1_sel['claims'] = df1_sel['claims'].str.replace(',', ';', regex=False)

# In the 1st edition, change all commas to semicolons in the 'types' column
df1_sel['type'] = df1_sel['type'].str.replace(',', ';', regex=False)

# In the 1st edition, change all commas to semicolons in the 'actors' column
df1_sel['actors'] = df1_sel['actors'].str.replace(',', ';', regex=False)

# # In the 2nd and 3rd editions, create an `actors` column by combining the 
# # `organizations` and `participants` columns, separating them with a semicolon
# # if they are not NaN
df2_sel['actors'] = df2_sel.apply(
    lambda row: f"{row['organizations']};{row['participants']}" if pd.notna(row['organizations']) and pd.notna(row['participants']) 
    else row['organizations'] if pd.notna(row['organizations']) 
    else row['participants'] if pd.notna(row['participants']) 
    else np.nan, 
    axis=1  # Apply the function row-wise
)
df3_sel['actors'] = df3_sel.apply(
    lambda row: f"{row['organizations']};{row['participants']}" if pd.notna(row['organizations']) and pd.notna(row['participants']) 
    else row['organizations'] if pd.notna(row['organizations']) 
    else row['participants'] if pd.notna(row['participants']) 
    else np.nan, 
    axis=1  # Apply the function row-wise
)


# # In the 2nd and 3rd editions, create a 'claims' column by combining the 'claims_summary' 
# # and 'title' columns, separating them with a semicolon if they are not NaN
df2_sel['claims'] = df2_sel.apply(
    lambda row: f"{row['claims_summary']};{row['title']}" if pd.notna(row['claims_summary']) and pd.notna(row['title']) 
    else row['claims_summary'] if pd.notna(row['claims_summary']) 
    else row['title'] if pd.notna(row['title']) 
    else np.nan, 
    axis=1  # Apply the function row-wise
)
df3_sel['claims'] = df3_sel.apply(
    lambda row: f"{row['claims_summary']};{row['title']}" if pd.notna(row['claims_summary']) and pd.notna(row['title']) 
    else row['claims_summary'] if pd.notna(row['claims_summary']) 
    else row['title'] if pd.notna(row['title']) 
    else np.nan, 
    axis=1  # Apply the function row-wise
)

# In the 3rd edition, change date from MM/DD/YYYY to YYYY-MM-DD
df3_sel['date'] = pd.to_datetime(df3_sel['date'], format='%m/%d/%Y').dt.strftime('%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_sel['claims'] = df1_sel['claims'].str.replace(',', ';', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_sel['type'] = df1_sel['type'].str.replace(',', ';', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_sel['actors'] = df1_sel['actors'].str.replace(',', ';', reg

In [46]:
df1_sel.head()

Unnamed: 0,date,state,type,actors,claims,valence,issues,size_low,size_high,size_mean,arrests_any,injuries_crowd_any,injuries_police_any,property_damage_any,notes
0,2017-01-01,DC,vigil,,for banning nuclear weapons; for peace,0.0,military,,,,0,0,0,0,White House Peace Vigil continuous since June ...
1,2017-01-01,MN,vigil,Peace Vigil Mankato,for peace,0.0,military,,,,0,0,0,0,every Sunday since 2001
2,2017-01-01,MN,protest; banner drop,general protestors,against the Dakota Access Pipeline; for indige...,1.0,banking and finance; economy; energy; environm...,2.0,2.0,2.0,1,0,0,0,hung banner from stadium roof during NFL game
3,2017-01-01,RI,vigil,Sakonnet Peace Alliance,for peace; for gun control; for climate action,1.0,environment; guns; military,,,,0,0,0,0,every Sunday since 2003
4,2017-01-01,TN,vigil,Oak Ridge Environmental Peace Alliance,for abolishing nuclear weapons,0.0,military,,,,0,0,0,0,every Sunday since the late 1990s


In [47]:
df2_sel.head()

Unnamed: 0,date,state,type,organizations,participants,claims_summary,title,valence,issues,size_low,size_high,size_mean,arrests_any,injuries_crowd_any,injuries_police_any,property_damage_any,notes,actors,claims
0,2021-01-01,AL,strike; boycott,Free Alabama Movement,prisoners,against prison labor;for safer conditions in A...,,1.0,covid;criminal justice;labor,,,,0,0,0,0,Scheduled to run 30 days.,Free Alabama Movement;prisoners,against prison labor;for safer conditions in A...
1,2021-01-01,AZ,vigil,Women in Black,,for ending Israel's occupation of Palestine;fo...,,0.0,foreign affairs,,,,0,0,0,0,every Friday since at least 2001; organizers c...,Women in Black,for ending Israel's occupation of Palestine;fo...
2,2021-01-01,CA,demonstration,Contra Costa County Patriots,,for President Trump,,2.0,presidency,,,,0,0,0,0,,Contra Costa County Patriots,for President Trump
3,2021-01-01,CA,vigil,WILPF,,for peace;against war;for banning nuclear weapons,,0.0,military,,,,0,0,0,0,"every Friday; started in 1922, some gaps since",WILPF,for peace;against war;for banning nuclear weapons
4,2021-01-01,CA,rally; parade,Trump Unity Bridge SO CAL Events,Trump supporters,for President Trump;against election fraud;aga...,Patriots' Rose Parade,2.0,covid;democracy;presidency,1000.0,1000.0,1000.0,0,0,0,0,Patriots' Rose Parade,Trump Unity Bridge SO CAL Events;Trump supporters,for President Trump;against election fraud;aga...


In [48]:
df3_sel.head()

Unnamed: 0,date,state,type,organizations,participants,claims_summary,title,valence,issues,size_low,size_high,size_mean,arrests_any,injuries_crowd_any,injuries_police_any,property_damage_any,notes,actors,claims
0,2025-01-01,NY,demonstration,Healthcare Workers for Palestine Albany; Pales...,,for ceasefire in Gaza; for ending all U.S. wea...,,0.0,banking and finance;economy;foreign affairs;he...,,,,0,0,0,0,every Wednesday,Healthcare Workers for Palestine Albany; Pales...,for ceasefire in Gaza; for ending all U.S. wea...
1,2025-01-01,MD,rally,West Coalition,,for justice for Tyrone West and all victims of...,West Wednesday,1.0,policing;war and peace,,,,0,0,0,0,every Wednesday,West Coalition,for justice for Tyrone West and all victims of...
2,2025-01-01,ME,demonstration,Maine Coalition for Palestine,,for peace; for Palestinian liberation,,0.0,foreign affairs;war and peace,,,,0,0,0,0,every Wednesday,Maine Coalition for Palestine,for peace; for Palestinian liberation
3,2025-01-01,WA,demonstration,Bellingham Action 4 Palestine,,for Palestinian liberation,Banner Drop in Solidarity With Palestine,0.0,foreign affairs;war and peace,,,,0,0,0,0,every Wednesday,Bellingham Action 4 Palestine,for Palestinian liberation;Banner Drop in Soli...
4,2025-01-01,IN,demonstration,Bloomington for Palestine,,against presence in Bloomington of businesses ...,,0.0,economy;foreign affairs;war and peace,,,,0,0,0,0,every Wednesday,Bloomington for Palestine,against presence in Bloomington of businesses ...


In [49]:
# Drop the columns that are not needed in the final dataset (2nd and 3rd editions)
df2_sel = df2_sel.drop(columns=['organizations', 'participants', 'claims_summary', 'title'])
df3_sel = df3_sel.drop(columns=['organizations', 'participants', 'claims_summary', 'title'])

In [50]:
# Print the names of the columns of all data frames
for df in [df1_sel, df2_sel, df3_sel]:
    print(f"Columns in dataset:")
    print(sorted(df.columns.tolist()))
    print(len(df.columns.tolist()))
    print("\n")

Columns in dataset:
['actors', 'arrests_any', 'claims', 'date', 'injuries_crowd_any', 'injuries_police_any', 'issues', 'notes', 'property_damage_any', 'size_high', 'size_low', 'size_mean', 'state', 'type', 'valence']
15


Columns in dataset:
['actors', 'arrests_any', 'claims', 'date', 'injuries_crowd_any', 'injuries_police_any', 'issues', 'notes', 'property_damage_any', 'size_high', 'size_low', 'size_mean', 'state', 'type', 'valence']
15


Columns in dataset:
['actors', 'arrests_any', 'claims', 'date', 'injuries_crowd_any', 'injuries_police_any', 'issues', 'notes', 'property_damage_any', 'size_high', 'size_low', 'size_mean', 'state', 'type', 'valence']
15




In [51]:
# Concatenate the three dataframes along the rows into one
df_combined = pd.concat([df1_sel, df2_sel, df3_sel], ignore_index=True)

df_combined.head()

Unnamed: 0,date,state,type,actors,claims,valence,issues,size_low,size_high,size_mean,arrests_any,injuries_crowd_any,injuries_police_any,property_damage_any,notes
0,2017-01-01,DC,vigil,,for banning nuclear weapons; for peace,0.0,military,,,,0,0,0,0,White House Peace Vigil continuous since June ...
1,2017-01-01,MN,vigil,Peace Vigil Mankato,for peace,0.0,military,,,,0,0,0,0,every Sunday since 2001
2,2017-01-01,MN,protest; banner drop,general protestors,against the Dakota Access Pipeline; for indige...,1.0,banking and finance; economy; energy; environm...,2.0,2.0,2.0,1,0,0,0,hung banner from stadium roof during NFL game
3,2017-01-01,RI,vigil,Sakonnet Peace Alliance,for peace; for gun control; for climate action,1.0,environment; guns; military,,,,0,0,0,0,every Sunday since 2003
4,2017-01-01,TN,vigil,Oak Ridge Environmental Peace Alliance,for abolishing nuclear weapons,0.0,military,,,,0,0,0,0,every Sunday since the late 1990s


In [53]:
# We order the columns in the final dataset
df_combined = df_combined[['date', 'state', 'type', 'issues',  # Basic protest data
                           'actors', 'claims', 'valence',  # Claims and actors
                           'size_low', 'size_high', 'size_mean',  # Size of the protest
                           'arrests_any', 'injuries_crowd_any', 'injuries_police_any', 'property_damage_any',  # Violence and damage
                           'notes']]

# Save the combined dataframe to a CSV file
df_combined.to_csv(
    'clean_data/ccc_combined.csv',
    index=False,
    encoding='utf-8'
)