In [1]:
#!/usr/bin/env python
# coding: utf-8

# Import necessary libraries for data manipulation, visualization, and modeling
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pickle

# Importing machine learning libraries and evaluation metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    PowerTransformer, StandardScaler, OneHotEncoder, 
    OrdinalEncoder
)
from sklearn.metrics import (
    mean_absolute_error, 
    r2_score
)
from xgboost import XGBRegressor
from sklearn.cluster import DBSCAN
from sklearn.impute import SimpleImputer

# Suppress warnings
warnings.filterwarnings("ignore")

In [2]:
# Load data
excel_file_path = "../data/Nigeria_1997-2024_Sep20.csv"
df = pd.read_csv(excel_file_path, encoding="latin-1")

In [3]:
# Apply DBSCAN clustering on latitude and longitude and visualize clusters
def perform_dbscan_and_plot(df, eps=0.17, min_samples=4):
    """
    Perform DBSCAN clustering on latitude and longitude data and plot the resulting clusters.

    Parameters:
    - df (DataFrame): The DataFrame containing 'latitude' and 'longitude' columns.
    - eps (float): The maximum distance between two points for them to be considered as in the same neighborhood.
    - min_samples (int): The minimum number of points to form a cluster.

    Returns:
    - df (DataFrame): The DataFrame with an additional 'cluster' column containing cluster labels.
    """
    # Prepare data
    coords = df[['latitude', 'longitude']].to_numpy()
    # Apply DBSCAN for clustering
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(coords)
    # Add cluster labels to DataFrame
    df['cluster'] = db.labels_
    # Plot clusters
    plt.figure(figsize=(10, 6))
    # Plot each cluster
    for cluster_label in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_label]
        plt.scatter(cluster_data['longitude'], cluster_data['latitude'], label=f'Cluster {cluster_label}', s=20)
    # Customize plot
    plt.title("DBSCAN Clusters of Crimes")
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.legend(loc='best')
    plt.savefig("eda_images/dbscan_clusters.png", format='png', dpi=300)
    plt.close()
    return df

# Encoding specific columns with custom integer replacements
def encode_source_scale(df):
    int_replacements = { "National": 1, "International": 2, "Local partner-International": 3, "New media": 4, "National-Regional": 5, "Regional": 6, "Subnational": 7, "National-International": 8, "New media-National": 9, "Other": 10, "Local partner-Other": 11, "Subnational-National": 12, "Other-National": 13, "Other-New media": 14, "New media-Regional": 15, "Other-International": 16, "New media-International": 17, "Other-Subnational": 18, "New media-Subnational": 19, "Subnational-Regional": 20, "Subnational-International": 21, "Other-Regional": 22, "Regional-International": 23,}
    df['source_scale'] = df['source_scale'].replace(int_replacements)
    int_replacements = { "Civilian targeting": 1}
    df['civilian_targeting'] = df['civilian_targeting'].replace(int_replacements)
    df['civilian_targeting']=df['civilian_targeting'].fillna(0)
    return df

# Function to impute missing values in specific actor columns
def imputing(df):
    # Define the keywords to search for in 'actor1' column
    keywords = {'militia':['militia', 'boko', 'ipob', 'islam', 'unidentified'], 'state':['police', 'military']}
    for x,y in keywords.items():
        # Create a mask for rows where 'actor1' contains any of the keyword
        mask = df['actor1'].str.contains('|'.join(y), case=False, na=False)
        # # Check for rows where 'assoc_actor_1' is empty (NaN) and apply the mas
        df.loc[mask & df['assoc_actor_1'].isnull(), 'assoc_actor_1'] = x
        # Create a mask for rows where 'actor1' contains any of the keyword
        mask = df['actor2'].str.contains('|'.join(y), case=False, na=False)
        # # Check for rows where 'assoc_actor_1' is empty (NaN) and apply the mas
        df.loc[mask & df['assoc_actor_2'].isnull(), 'assoc_actor_2'] = x
        break
    return df

# Extract year, month, day, and week from the event date column
def date_handling(df):
    # Create separate columns for month, day, and year
    df['event_date'] = pd.to_datetime(df['event_date'])
    df['month'] = df['event_date'].dt.month
    df['day'] = df['event_date'].dt.day
    df['year'] = df['event_date'].dt.year
    df['week'] = df['event_date'].dt.to_period('W').dt.start_time
    return df

# Preprocessing pipeline to apply all transformations
def pre_process(df):
    df = encode_source_scale(df)
    df = imputing(df)
    df = date_handling(df)
    df['assoc_actor_1'] = df['assoc_actor_1'].fillna('Unknown')
    df = perform_dbscan_and_plot(df, eps=0.17, min_samples=4)
    return df


df = pre_process(df)

In [4]:
df.head()

Unnamed: 0,event_id_cnty,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,assoc_actor_1,inter1,...,source,source_scale,notes,fatalities,tags,timestamp,month,day,week,cluster
0,NIG38575,2024-09-20,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),Refugees/IDPs (Nigeria),6,...,Whatsapp,4,"On 20 September 2024, IDPs (flood victims) fro...",0,crowd size=no report,1727134598,9,20,2024-09-16,0
1,NIG38585,2024-09-20,2024,2,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),Muslim Group (Nigeria),6,...,Daily Trust (Nigeria),1,"Around 20 September 2024 (as reported), hundre...",0,crowd size=hundreds,1727134598,9,20,2024-09-16,1
2,NIG38581,2024-09-19,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),Unknown,6,...,Daily Post (Nigeria); Guardian (Nigeria); Saha...,5,"On 19 September 2024, for a second consecutive...",0,crowd size=hundreds,1727134598,9,19,2024-09-16,1
3,NIG38588,2024-09-19,2024,1,Strategic developments,Strategic developments,Disrupted weapons use,Police Forces of Nigeria (2023-),Unknown,1,...,Daily Post (Nigeria); Nigeria Punch,1,"Weapons seizure: On 19 September 2024, Police ...",0,,1727134598,9,19,2024-09-16,2
4,NIG38591,2024-09-19,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),PDP: People's Democratic Party,6,...,Vanguard (Nigeria),1,"On 19 September 2024, PDP youths protested at ...",0,crowd size=no report,1727134598,9,19,2024-09-16,1


In [5]:
# Drop duplicate rows and save the cleaned dataset
df = df.drop_duplicates()
df.to_csv("data_after_pre_processing.csv", index=False)

In [6]:
# Define features (X) and target (Y) for model training
def get_X_Y(df):
    X = df.drop(columns=["event_id_cnty", "fatalities", "iso", "region", "country", "admin3", "tags", "notes", "event_date"])
    Y = df["fatalities"]
    return X, Y


X, Y = get_X_Y(df)
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=5
)

print(X_train.shape)

(30460, 26)


In [7]:
# Define categorical and numerical features
categories_order = {
    "civilian_targeting": sorted(list(df["civilian_targeting"].unique())),
    "year": sorted(list(df["year"].unique())),
    "time_precision": sorted(list(df["time_precision"].unique())),
    "source_scale": sorted(list(df["source_scale"].unique())),
    "geo_precision": sorted(list(df["geo_precision"].unique())),
    "month": sorted(list(df["month"].unique())),
    "year": sorted(list(df["year"].unique())),
    "day": sorted(list(df["day"].unique())),
}
categorical_feat_ord = list(categories_order.keys())
categorical_feat_nom = [ "disorder_type", "event_type", "sub_event_type", "actor1", "actor2", "admin1", "admin2", "timestamp", "location", "source", "assoc_actor_1", "assoc_actor_2", "cluster", "week"]
numerical_features_1 = ["inter1", "inter2", "latitude", "longitude"]
numerical_features_2 = ["interaction"]

In [8]:
# Visualize histograms for numerical features
def draw_histograms(df):
    plt.figure(figsize=(12, 8))  # Adjust size based on number of columns
    arr = numerical_features_1+numerical_features_2
    for i, col in enumerate(arr, 1):
        plt.subplot(3, 3, i)  # Adjust grid size based on the number of columns
        sns.histplot(df[col], kde=True, bins=30)
        plt.title(f'Distribution of {col}')
    plt.tight_layout()
    plt.savefig("eda_images/num_data_dist.png", format='png', dpi=300)
    plt.close()


# Example usage:
draw_histograms(df)


In [9]:
# Separate transformers for categorical and numerical features

trf = StandardScaler()
trf1 = PowerTransformer()
trf2 = FunctionTransformer(np.sqrt, validate=True)

numerical_transformer_1 = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("stdscal", trf),
        ("pwr", trf1),
    ]
)
numerical_transformer_2 = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("sqrt", trf2),
    ]
)
categorical_transformer_onehot = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
categorical_transformer_ordinal = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "ordinal",
            OrdinalEncoder(
                categories=[categories_order[col] for col in categorical_feat_ord],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
    ]
)

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer_onehot, categorical_feat_nom),
        ("cat_1", categorical_transformer_ordinal, categorical_feat_ord),
        ("num", numerical_transformer_1, numerical_features_1),
        ("num_1", numerical_transformer_2, numerical_features_2),
    ]
)

model = XGBRegressor(learning_rate=0.22, n_estimators=500, subsample=1)

# Define the pipeline
pipeline = Pipeline([("preprocessor", preprocessor),("model", model)])

# Fit the pipeline on the training data
pipeline.fit(X_train, Y_train)

In [11]:
# Save the fitted pipeline as a .pkl file
filename_pkl = "model.pkl"
pickle.dump(pipeline, open(filename_pkl, "wb"))
print(f"Model saved as {filename_pkl}")
# Evaluate the model
y_pred = pipeline.predict(X_test)
print(f"Mean absolute Error: {mean_absolute_error(Y_test, y_pred)}")
r2 = r2_score(Y_test, y_pred)
print(f"R² score: {r2}")
n = len(Y_test)
p = len(X_test.columns)
adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)
print(f"Adjusted R² score: {adj_r2}")

Model saved as model.pkl
Mean absolute Error: 3.076121096540979
R² score: 0.3551264423727535
Adjusted R² score: 0.352917098256492
