# 03 – Text Cleaning & Feature Enrichment

In this notebook, we:
 - Preprocess event descriptions to reduce noise
 - Add structured features (e.g. country, fatalities)
 - Prepare input for modeling

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import spacy
import string

In [8]:
# Load the ACLED sample from disk (the same data could instead be re-queried from BigQuery).
# No column subsetting or dropna is applied at this point; an earlier variant kept only
# event_description, event_type, fatalities, country and actor1 and dropped incomplete rows.
#
# The target is stored as a pandas categorical and its integer codes become `label`.
# Note: pandas assigns code -1 to rows whose event_type is missing.
df = (
    pd.read_csv("acled_sample.csv")
    .assign(event_type=lambda frame: frame['event_type'].astype('category'))
    .assign(label=lambda frame: frame['event_type'].cat.codes)
)

In [9]:
# Preview the first rows to sanity-check the load and the derived `label` column
df.head()


Unnamed: 0.1,Unnamed: 0,event_id_cnty,event_type,event_date,country,admin1,admin2,actor1,actor2,inter1,interaction,event_description,fatalities,latitude,longitude,label
0,0,SOM45530,Explosions/Remote violence,2024-08-25,Somalia,Banadir,Banadir,Al Shabaab,Military Forces of Somalia (2022-),,,"On 25 August 2024, an IED planted by Al Shabaa...",2,2.0611,45.2589,1
1,1,SOM45523,Violence against civilians,2024-08-24,Somalia,Banadir,Banadir,Al Shabaab,Civilians (Somalia),,,"On 24 August 2024, Al Shabaab shot and killed ...",2,2.0576,45.2853,5
2,2,MUS523,Protests,2024-08-27,Mauritius,Moka,,Protesters (Mauritius),Police Forces of Mauritius (2018-) Special Sup...,Protesters,,"On 27 August 2024, at the call of the Rann Nou...",0,-20.219,57.5108,2
3,3,IND165800,Strategic developments,2024-08-27,India,Assam,Nagaon,Civilians (India),,Civilians,Civilians only,"Other: On 27 August 2024, a section of fish tr...",0,26.3469,92.6851,4
4,4,IRQ58463,Strategic developments,2024-08-27,Iraq,Ninewa,Al Mosul,Civilians (Iraq),,Civilians,Civilians only,"Other: On 27 August 2024, 517 displaced people...",0,36.335,43.1189,4


In [10]:
# Load spaCy's small English pipeline once; clean_and_lemmatize_pipe reads this module-level `nlp`
nlp = spacy.load("en_core_web_sm")

def clean_and_lemmatize_pipe(docs):
    """Lemmatize texts with spaCy, dropping stop words and non-alphabetic tokens.

    Parameters
    ----------
    docs : iterable of str
        Raw texts to clean (list, pandas Series, generator, ...). Missing
        values must already be replaced by strings.

    Returns
    -------
    list of str
        One entry per input text, in input order: the surviving lemmas joined
        by single spaces. A text with no surviving token yields "".

    Notes
    -----
    Uses the module-level ``nlp`` pipeline and shows a tqdm progress bar.
    """
    # Materialize first so that generators also have a length for the progress bar.
    texts = list(docs)

    # Only lemmas and stop-word flags are read below, so the NER component is skipped.
    # The parser is deliberately kept: spaCy's English tag-to-POS mapping can use
    # dependency parses, so disabling it could change the resulting lemmas.
    parsed_docs = nlp.pipe(texts, batch_size=1000, n_process=1, disable=["ner"])

    cleaned = []
    for doc in tqdm(parsed_docs, total=len(texts)):
        # Keep the lemma of every non-stop-word token whose lemma is purely alphabetic
        # (this discards numbers, punctuation and mixed tokens such as dates).
        tokens = [token.lemma_ for token in doc if not token.is_stop and token.lemma_.isalpha()]
        cleaned.append(" ".join(tokens))
    return cleaned

# Clean every description (missing ones are replaced by "" first); a tqdm progress bar is shown while this runs
df['clean_description'] = clean_and_lemmatize_pipe(df['event_description'].fillna(""))


100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [02:51<00:00, 116.72it/s]


In [11]:

# Compare raw and cleaned text side by side for the first rows
df[['event_description', 'clean_description']].head()




Unnamed: 0,event_description,clean_description
0,"On 25 August 2024, an IED planted by Al Shabaa...",August ied plant Al Shabaab detonate target go...
1,"On 24 August 2024, Al Shabaab shot and killed ...",August Al Shabaab shoot kill government soldie...
2,"On 27 August 2024, at the call of the Rann Nou...",August Rann Nou La Terre movement Indo Mauriti...
3,"Other: On 27 August 2024, a section of fish tr...",August section fish trader announce halt expor...
4,"Other: On 27 August 2024, 517 displaced people...",August displace people return Dibaga Camp Erbi...


## Feature extraction

We derive structured features from fatality counts, the length of the cleaned description, and the actor / interaction type.

In [12]:
# Fatalities: unparsable or missing values become 0; a capped copy is kept next to the raw count.
FATALITIES_CAP = 50
df['fatalities'] = pd.to_numeric(df['fatalities'], errors='coerce').fillna(0)
df['fatalities_clipped'] = df['fatalities'].clip(upper=FATALITIES_CAP)

# Description length, counted in whitespace-separated tokens of the cleaned text
df['desc_length'] = df['clean_description'].str.split().str.len()


def _actor_category(series):
    """Return `series` as category values that are safe to one-hot encode.

    If every non-missing value parses as a number, the column is treated as
    integer codes and missing values become 0. Otherwise the values are text
    labels (e.g. "Protesters", "Civilians only"): they are kept as stripped
    strings and missing values become "Unknown".
    """
    as_number = pd.to_numeric(series, errors='coerce')
    if as_number.notna().sum() == series.notna().sum():
        return as_number.fillna(0).astype(int)
    # Coercing text labels to numbers would turn every row into 0, and
    # get_dummies(drop_first=True) would then drop the feature entirely.
    return series.fillna("Unknown").astype(str).str.strip()


# Interactions: works for both numeric-coded and label-coded ACLED exports
df['inter1'] = _actor_category(df['inter1'])
df['interaction'] = _actor_category(df['interaction'])

# One-hot encode both; drop_first removes one reference level per column
df = pd.get_dummies(df, columns=['inter1', 'interaction'], drop_first=True)

In [13]:
# Persist the enriched frame as a pickle, which preserves dtypes such as the categorical event_type.
# NOTE(review): presumably read by the modeling notebook — confirm the consumer.
df.to_pickle("acled_cleaned.pkl")