 # Logistic Regression with cleaned text + structured features


In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load dataset with cleaned text and structured features
df = pd.read_pickle("acled_cleaned.pkl")
df.head()


Unnamed: 0.1,Unnamed: 0,event_id_cnty,event_type,event_date,country,admin1,admin2,actor1,actor2,event_description,fatalities,latitude,longitude,label,clean_description,fatalities_clipped,desc_length
0,0,SOM45530,Explosions/Remote violence,2024-08-25,Somalia,Banadir,Banadir,Al Shabaab,Military Forces of Somalia (2022-),"On 25 August 2024, an IED planted by Al Shabaa...",2,2.0611,45.2589,1,August ied plant Al Shabaab detonate target go...,2,26
1,1,SOM45523,Violence against civilians,2024-08-24,Somalia,Banadir,Banadir,Al Shabaab,Civilians (Somalia),"On 24 August 2024, Al Shabaab shot and killed ...",2,2.0576,45.2853,5,August Al Shabaab shoot kill government soldie...,2,25
2,2,MUS523,Protests,2024-08-27,Mauritius,Moka,,Protesters (Mauritius),Police Forces of Mauritius (2018-) Special Sup...,"On 27 August 2024, at the call of the Rann Nou...",0,-20.219,57.5108,2,August Rann Nou La Terre movement Indo Mauriti...,0,66
3,3,IND165800,Strategic developments,2024-08-27,India,Assam,Nagaon,Civilians (India),,"Other: On 27 August 2024, a section of fish tr...",0,26.3469,92.6851,4,August section fish trader announce halt expor...,0,35
4,4,IRQ58463,Strategic developments,2024-08-27,Iraq,Ninewa,Al Mosul,Civilians (Iraq),,"Other: On 27 August 2024, 517 displaced people...",0,36.335,43.1189,4,August displace people return Dibaga Camp Erbi...,0,29


In [3]:
# Preprocessing structured features
df['fatalities'] = pd.to_numeric(df['fatalities'], errors='coerce').fillna(0)
df['fatalities_clipped'] = df['fatalities'].clip(upper=50)
df['desc_length'] = df['clean_description'].apply(lambda x: len(x.split()))
df['inter1'] = pd.to_numeric(df['inter1'], errors='coerce').fillna(0).astype(int)
df['interaction'] = pd.to_numeric(df['interaction'], errors='coerce').fillna(0).astype(int)



KeyError: 'inter1'

In [None]:
# One-hot encode categorical structured features
df = pd.get_dummies(df, columns=['inter1', 'interaction'], prefix=['actor_type', 'interaction'], drop_first=True)

# Text vectorization (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)
X_text = vectorizer.fit_transform(df['clean_description'])

# Combine text features with structured ones

structured_features = df[['fatalities_clipped', 'desc_length'] + 
    [col for col in df.columns if col.startswith('actor_type_') or col.startswith('interaction_')]
]

X_structured = structured_features.reset_index(drop=True)



In [None]:
# Combine sparse matrix with dense DataFrame
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_structured_scaled = scaler.fit_transform(X_structured)
X_combined = hstack([X_text, X_structured_scaled])

# Target variable
y = df['event_type']

# Train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, stratify=y, random_state=42)



In [None]:
# Train logistic regression
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)



In [None]:
# Evaluate
y_pred = clf.predict(X_test)



In [None]:
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=y.cat.categories))

# Normalized confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=y.cat.categories, normalize='true')
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues',
            xticklabels=y.cat.categories,
            yticklabels=y.cat.categories)
plt.title("Normalized Confusion Matrix - Logistic Regression (Text + Structured Features)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()