In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

# Load your dataset
df = pd.read_csv("security_incidents_2025-03-12.csv")

# Define independent variables and target variables
target_vars = ['Total affected', 'Total killed', 'Total wounded', 'Total kidnapped']
independent_vars = ['Country', 'Region', 'Means of attack', 'Attack context', 'Motive']

# Drop rows where target variables contain non-numeric values
for col in target_vars:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, forcing errors to NaN

df = df.dropna(subset=target_vars)  # Remove rows with NaN target values

# Separate features and targets
X = df[independent_vars]
y = df[target_vars]

# Identify categorical features for encoding
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features)
], remainder='passthrough')

# Define model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
pipeline.fit(X_train, y_train)

# Predictions
y_pred = pipeline.predict(X_test)

# Evaluation
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("R² Score:", r2)


Mean Absolute Error: 0.6318506514742374
Root Mean Squared Error: 1.5033364368199829
R² Score: 0.13473507736930268


On average, the model’s predictions are off by about 0.63 people when predicting the number of affected, killed, wounded, or kidnapped.It suggests relatively good performance

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load your dataset
df = pd.read_csv("security_incidents_2025-03-12.csv")

# Define independent variables and target variables
target_vars = ['Total affected', 'Total killed', 'Total wounded', 'Total kidnapped']
independent_vars = ['Country', 'Region', 'Means of attack', 'Attack context', 'Motive']

# Drop rows where target variables contain non-numeric values
for col in target_vars:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, forcing errors to NaN

df = df.dropna(subset=target_vars)  # Remove rows with NaN target values

# Separate features and targets
X = df[independent_vars]
y = df[target_vars]

# Identify categorical features for encoding
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features)
], remainder='passthrough')

# Define model using Gradient Boosting
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
pipeline.fit(X_train, y_train)

# Predictions
y_pred = pipeline.predict(X_test)

# Evaluation
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("R² Score:", r2)


ValueError: y should be a 1d array, got an array of shape (3468, 4) instead.

In [4]:
pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/5e/03/15cd49e855c62226ecf1831bbe4c8e73a4324856077a23c495538a36e557/xgboost-3.0.0-py3-none-win_amd64.whl.metadata
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/150.0 MB 1.2 MB/s eta 0:02:10
   ---------------------------------------- 0.1/150.0 MB 1.0 MB/s eta 0:02:26
   ---------------------------------------- 0.1/150.0 MB 950.9 kB/s eta 0:02:38
   ---------------------------------------- 0.3/150.0 MB 1.3 MB/s eta 0:01:58
   ---------------------------------------- 0.4/150.0 MB 1.4 MB/s eta 0:01:45
   ---------------------------------------- 0.5/150.0 MB 1.4 MB/s eta 0:01:47
   --------------------------------

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# Load your dataset
df = pd.read_csv("security_incidents_2025-03-12.csv")

# Define independent and target variables
target_vars = ['Total affected', 'Total killed', 'Total wounded', 'Total kidnapped']
independent_vars = ['Country', 'Region', 'Means of attack', 'Attack context', 'Motive']

# Drop rows where target variables contain non-numeric values
for col in target_vars:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric, forcing errors to NaN

df = df.dropna(subset=target_vars)  # Remove rows with NaN target values

# Separate features and targets
X = df[independent_vars]
y = df[target_vars]

# Identify categorical features for encoding
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features)
], remainder='passthrough')

# Define the base model with hyperparameter tuning
xgb_model = XGBRegressor(
    n_estimators=200, 
    learning_rate=0.1, 
    max_depth=6, 
    random_state=42
)

# Wrap the base model in MultiOutputRegressor to handle multiple targets
multi_model = MultiOutputRegressor(xgb_model)

# Create the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', multi_model)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
pipeline.fit(X_train, y_train)

# Predictions
y_pred = pipeline.predict(X_test)

# Evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)
print("R² Score:", r2)


Mean Absolute Error: 0.6382941827031475
Root Mean Squared Error: 1.4711584319949158
R² Score: 0.16483898923599793
