# 03 – Text Cleaning & Feature Enrichment

In this notebook, we:
 - Preprocess event descriptions to reduce noise (lowercase, strip punctuation and digits, drop stop words, lemmatize)
 - Add structured features (e.g. country, fatalities)
 - Prepare input for modeling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
import spacy
import string

In [4]:
# Read the ACLED sample (or re-query BigQuery), keep only the columns used
# in this notebook, and discard every row with a missing value in any of them.
df = pd.read_csv("acled_sample.csv")[
    ['event_description', 'event_type', 'fatalities', 'country', 'actor1']
].dropna()

# Encode the target: categorical dtype first, then its integer category codes.
df = df.assign(event_type=df['event_type'].astype('category'))
df = df.assign(label=df['event_type'].cat.codes)

In [5]:
# NLTK distributes its stop-word lists as a separately downloaded corpus; on a
# fresh environment the lookup raises LookupError. Fetch the corpus once and
# retry so the notebook survives Restart & Run All on a clean machine.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Small English spaCy pipeline; provides tokenization and lemmas for cleaning.
nlp = spacy.load("en_core_web_sm")

def clean_and_lemmatize(text):
    """Normalize a free-text description into a string of lemmas.

    Steps: lowercase, replace punctuation with spaces, strip digits, run the
    module-level spaCy pipeline ``nlp``, then keep only purely alphabetic
    lemmas that are not in the module-level ``stop_words`` set.

    Parameters
    ----------
    text : str
        Raw description. Non-string values (e.g. ``None`` or a float ``NaN``
        from a missing cell) are treated as empty text.

    Returns
    -------
    str
        The retained lemmas joined by single spaces; ``""`` when nothing
        remains or the input is not a string.
    """
    # Missing values reach this function as None/NaN when it is applied to a
    # column that has not been null-filtered; there is nothing to clean.
    if not isinstance(text, str):
        return ""

    # Replace punctuation (and underscores, which \w would otherwise keep)
    # with a space rather than deleting it, so joined words such as
    # "remote-controlled" or "2020,Al" are split instead of being fused.
    text = re.sub(r'[^\w\s]|_', ' ', text.lower())
    # Digits carry no signal for this step; drop them outright.
    text = re.sub(r'\d+', '', text)

    # Tokenize and lemmatize with spaCy
    doc = nlp(text)
    tokens = [
        token.lemma_ for token in doc
        # isalpha() also discards whitespace tokens left by the substitutions.
        if token.lemma_ not in stop_words and token.lemma_.isalpha()
    ]

    return " ".join(tokens)

# Run the cleaner over every description and store the result as a new column
df['clean_description'] = df['event_description'].map(clean_and_lemmatize)

# Show the first rows, raw text next to its cleaned form
df.head()[['event_description', 'clean_description']]


Unnamed: 0,event_description,clean_description
0,"On 31 December 2020, Al Shabaab threw a hand g...",on december al shabaab threw a hand grenade at...
1,"On 30 December 2020, Al Shabaab militants thre...",on december al shabaab militants threw a hand ...
2,"On 29 December 2020, a remote controlled IED b...",on december a remote controlled ied by al shab...
3,"On 26 December 2020, Al Shabaab militants shot...",on december al shabaab militants shot and kill...
4,"On 26 December 2020, suspected Al Shabaab mili...",on december suspected al shabaab militants thr...
