# 03 – Text Cleaning & Feature Enrichment

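In this notebook, we:
 - Preprocess event descriptions to reduce noise (stop-word removal and lemmatization with spaCy)
 - Add structured features (e.g. fatalities, description length, actor type / interaction)
 - Save the enriched dataset as input for modeling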

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import spacy
import string

In [28]:
# Read the sampled ACLED events from disk (alternatively, re-query BigQuery).
# Rows with missing values are kept on purpose; gaps are filled further down.
# NOTE(review): the CSV seems to carry a saved index ('Unnamed: 0' column) —
# confirm whether downstream notebooks need it before dropping it.
df = pd.read_csv("acled_sample.csv")

# Encode the target: event_type becomes a categorical column and `label`
# stores its integer category code (pandas uses -1 for a missing category).
df['event_type'] = pd.Categorical(df['event_type'])
df['label'] = df['event_type'].cat.codes

In [29]:
# Quick look at the first rows of the freshly loaded frame
df.head(5)


Unnamed: 0.1,Unnamed: 0,event_id_cnty,event_type,event_date,country,admin1,admin2,actor1,actor2,inter1,interaction,event_description,fatalities,latitude,longitude,label
0,0,SOM45530,Explosions/Remote violence,2024-08-25,Somalia,Banadir,Banadir,Al Shabaab,Military Forces of Somalia (2022-),,,"On 25 August 2024, an IED planted by Al Shabaa...",2,2.0611,45.2589,1
1,1,SOM45523,Violence against civilians,2024-08-24,Somalia,Banadir,Banadir,Al Shabaab,Civilians (Somalia),,,"On 24 August 2024, Al Shabaab shot and killed ...",2,2.0576,45.2853,5
2,2,MUS523,Protests,2024-08-27,Mauritius,Moka,,Protesters (Mauritius),Police Forces of Mauritius (2018-) Special Sup...,Protesters,,"On 27 August 2024, at the call of the Rann Nou...",0,-20.219,57.5108,2
3,3,IND165800,Strategic developments,2024-08-27,India,Assam,Nagaon,Civilians (India),,Civilians,Civilians only,"Other: On 27 August 2024, a section of fish tr...",0,26.3469,92.6851,4
4,4,IRQ58463,Strategic developments,2024-08-27,Iraq,Ninewa,Al Mosul,Civilians (Iraq),,Civilians,Civilians only,"Other: On 27 August 2024, 517 displaced people...",0,36.335,43.1189,4


In [30]:
# List the distinct values of the two actor-coding columns before encoding them
for heading, column in (("inter1 unique:", 'inter1'), ("interaction unique:", 'interaction')):
    print(heading, df[column].unique())


inter1 unique: [nan 'Protesters' 'Civilians' 'External/Other forces' 'Identity militia'
 'Political militia' 'Rioters' 'Rebel group' 'State forces']
interaction unique: [nan 'Civilians only' 'Civilians-Civilians' 'External/Other forces only'
 'External/Other forces-Civilians'
 'External/Other forces-External/Other forces' 'Identity militia only'
 'Identity militia-Civilians' 'Identity militia-Identity militia'
 'Political militia only' 'Political militia-Civilians'
 'Political militia-External/Other forces'
 'Political militia-Identity militia'
 'Political militia-Political militia' 'Political militia-Protesters'
 'Political militia-Rioters' 'Protesters only'
 'Protesters-External/Other forces' 'Protesters-Protesters'
 'Rebel group only' 'Rebel group-Civilians'
 'Rebel group-External/Other forces' 'Rebel group-Identity militia'
 'Rebel group-Political militia' 'Rebel group-Rebel group' 'Rioters only'
 'Rioters-Civilians' 'Rioters-External/Other forces' 'Rioters-Protesters'
 'Rioters-Ri

In [31]:
# Load the small English spaCy pipeline. The cleaning step in this notebook only
# reads token.lemma_ and token.is_stop, so the dependency parser and the named
# entity recognizer are disabled to avoid paying for them on every document.
# They stay loaded and can be switched back on with nlp.enable_pipe(...) if needed.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def clean_and_lemmatize_pipe(docs, batch_size=1000, n_process=1):
    """Lemmatize texts with spaCy, dropping stop words and non-alphabetic tokens.

    The texts are streamed through the notebook-level ``nlp`` pipeline with
    ``nlp.pipe`` and a tqdm progress bar is shown while they are processed.
    Lemmas are kept exactly as spaCy returns them (no lowercasing is applied).

    Args:
        docs: Iterable of strings to clean. Sized inputs (list, Series) give a
            percentage progress bar; unsized ones (generators) give a plain counter.
        batch_size: Number of texts buffered per batch, forwarded to ``nlp.pipe``.
        n_process: Number of worker processes, forwarded to ``nlp.pipe``.

    Returns:
        list[str]: One string per input text, made of the kept lemmas joined
        by single spaces (an empty string when no token survives the filter).
    """
    # tqdm can only draw a percentage bar when it knows the total; generators
    # have no len(), so fall back to an open-ended counter instead of raising.
    total = len(docs) if hasattr(docs, "__len__") else None

    cleaned = []
    parsed_docs = nlp.pipe(docs, batch_size=batch_size, n_process=n_process)
    for doc in tqdm(parsed_docs, total=total):
        # Keep a token's lemma only if the token is not a stop word and the
        # lemma is purely alphabetic (this drops numbers and punctuation).
        tokens = [token.lemma_ for token in doc if not token.is_stop and token.lemma_.isalpha()]
        cleaned.append(" ".join(tokens))
    return cleaned

# Lemmatize every event description (missing ones are treated as empty strings)
# and store the result; the helper shows a tqdm progress bar while it runs.
df['clean_description'] = clean_and_lemmatize_pipe(
    df['event_description'].fillna(value="")
)


100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [02:56<00:00, 113.23it/s]


In [32]:

# Compare the raw text with its cleaned counterpart for the first rows
df.loc[:, ['event_description', 'clean_description']].head()




Unnamed: 0,event_description,clean_description
0,"On 25 August 2024, an IED planted by Al Shabaa...",August ied plant Al Shabaab detonate target go...
1,"On 24 August 2024, Al Shabaab shot and killed ...",August Al Shabaab shoot kill government soldie...
2,"On 27 August 2024, at the call of the Rann Nou...",August Rann Nou La Terre movement Indo Mauriti...
3,"Other: On 27 August 2024, a section of fish tr...",August section fish trader announce halt expor...
4,"Other: On 27 August 2024, 517 displaced people...",August displace people return Dibaga Camp Erbi...


## Feature extraction

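We derive three groups of structured features: fatalities (raw and clipped at 50), description length (number of tokens in the cleaned text), and one-hot encoded actor type / interaction.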

In [33]:
# Coerce fatalities to a numeric column; unparseable or missing entries become 0
df['fatalities'] = df['fatalities'].pipe(pd.to_numeric, errors='coerce').fillna(0)

# Capped copy of the count (upper bound 50) — presumably to limit the
# influence of extreme outliers; the uncapped column is kept alongside it.
df['fatalities_clipped'] = df['fatalities'].clip(upper=50)

# Description length = number of whitespace-separated tokens in the cleaned text
df['desc_length'] = df['clean_description'].map(lambda text: len(text.split()))





In [34]:
# Show every column currently in the frame, including the new features
print(list(df.columns))

['Unnamed: 0', 'event_id_cnty', 'event_type', 'event_date', 'country', 'admin1', 'admin2', 'actor1', 'actor2', 'inter1', 'interaction', 'event_description', 'fatalities', 'latitude', 'longitude', 'label', 'clean_description', 'fatalities_clipped', 'desc_length']


In [35]:
# Actor type / interaction features
# Give missing codes an explicit 'Unknown' level so they receive their own
# dummy column instead of being encoded as all-False rows.
df[['inter1', 'interaction']] = df[['inter1', 'interaction']].fillna('Unknown')

# One-hot encode both columns; drop_first=True drops the first (sorted) level
# of each column, which then acts as the implicit reference category.
# The source columns are replaced by the dummies, so this cell is not re-runnable
# on the same frame without reloading the data.
df = pd.get_dummies(
    df,
    columns=['inter1', 'interaction'],
    prefix={'inter1': 'actor_type', 'interaction': 'interaction'},
    drop_first=True,
)


[]

In [36]:
# Preview the frame after one-hot encoding
df.head(5)

Unnamed: 0.1,Unnamed: 0,event_id_cnty,event_type,event_date,country,admin1,admin2,actor1,actor2,event_description,...,interaction_State forces only,interaction_State forces-Civilians,interaction_State forces-External/Other forces,interaction_State forces-Identity militia,interaction_State forces-Political militia,interaction_State forces-Protesters,interaction_State forces-Rebel group,interaction_State forces-Rioters,interaction_State forces-State forces,interaction_Unknown
0,0,SOM45530,Explosions/Remote violence,2024-08-25,Somalia,Banadir,Banadir,Al Shabaab,Military Forces of Somalia (2022-),"On 25 August 2024, an IED planted by Al Shabaa...",...,False,False,False,False,False,False,False,False,False,True
1,1,SOM45523,Violence against civilians,2024-08-24,Somalia,Banadir,Banadir,Al Shabaab,Civilians (Somalia),"On 24 August 2024, Al Shabaab shot and killed ...",...,False,False,False,False,False,False,False,False,False,True
2,2,MUS523,Protests,2024-08-27,Mauritius,Moka,,Protesters (Mauritius),Police Forces of Mauritius (2018-) Special Sup...,"On 27 August 2024, at the call of the Rann Nou...",...,False,False,False,False,False,False,False,False,False,True
3,3,IND165800,Strategic developments,2024-08-27,India,Assam,Nagaon,Civilians (India),,"Other: On 27 August 2024, a section of fish tr...",...,False,False,False,False,False,False,False,False,False,False
4,4,IRQ58463,Strategic developments,2024-08-27,Iraq,Ninewa,Al Mosul,Civilians (Iraq),,"Other: On 27 August 2024, 517 displaced people...",...,False,False,False,False,False,False,False,False,False,False


In [37]:
# Persist the enriched frame for the modeling notebook (pickle keeps dtypes such
# as the categorical target). Only unpickle files from a trusted source.
pd.to_pickle(df, "acled_cleaned.pkl")