MODEL DEVELOPMENT AND EVALUATION

In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('Nigeria_1997-2024_Sep20_cleaned.csv')

In [3]:
df.head()

Unnamed: 0,event_date,year,disorder_type,event_type,sub_event_type,actor1,inter1,civilian_targeting,admin1,admin2,location,latitude,longitude,fatalities
0,2024-09-20,2024,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),6,Unknown,Borno,Maiduguri Metro,Bolori,11.8826,13.089,0
1,2024-09-20,2024,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),6,Unknown,Osun,Ife Central,Ile-Ife,7.4824,4.5603,0
2,2024-09-19,2024,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),6,Unknown,Oyo,Ibadan North,Agodi,7.4035,3.9132,0
3,2024-09-19,2024,Strategic developments,Strategic developments,Disrupted weapons use,Police Forces of Nigeria (2023-),1,Unknown,Kaduna,Chikun,Buruku,10.6179,7.2331,0
4,2024-09-19,2024,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),6,Unknown,Edo,Etsako East,Wanno,7.1389,6.5724,0


In [4]:
#Confirming that there are no NaN values in the dataframe
df.isna().sum()

event_date            0
year                  0
disorder_type         0
event_type            0
sub_event_type        0
actor1                0
inter1                0
civilian_targeting    0
admin1                0
admin2                0
location              0
latitude              0
longitude             0
fatalities            0
dtype: int64

In [5]:
import numpy as np
df.shape

(37794, 14)

FEATURE ENGINEERING

In [6]:
df['event_date'] = pd.to_datetime(df['event_date'])

# Extract year, month, and day of week
df['month'] = df['event_date'].dt.month
df['day_of_week'] = df['event_date'].dt.dayofweek  # Monday=0, Sunday=6
df['day_of_month'] = df['event_date'].dt.day
df = df.drop(columns=['event_date'])

In [7]:
#Converting categorical data to numeric data as ML models can not be applied upon strings.
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply label encoding to the 'admin2' column
df['admin2_encoded'] = label_encoder.fit_transform(df['admin2'])

df = df.drop('admin2', axis=1)

# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Apply Label Encoding to each categorical column
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])

# Check the updated DataFrame
print(df)

       year  disorder_type  event_type  sub_event_type  actor1  inter1  \
0      2024              0           2              17     737       6   
1      2024              0           2              17     737       6   
2      2024              0           2              17     737       6   
3      2024              3           4               7     723       1   
4      2024              0           2              17     737       6   
...     ...            ...         ...             ...     ...     ...   
37789  1997              1           0               3     335       4   
37790  1997              1           0               3     335       4   
37791  1997              1           0               3     214       4   
37792  1997              1           0               3     214       4   
37793  1997              1           1              19     831       3   

       civilian_targeting  admin1  location  latitude  longitude  fatalities  \
0                       1      

In [8]:
# Encode inter1 (interaction type) if it's not numeric
df['inter1'] = df['inter1'].astype(int)

# Encode civilian_targeting (turn 'Unknown' into 0, or others into 1 if applicable)
df['civilian_targeting'] = df['civilian_targeting'].apply(lambda x: 0 if x == 'Unknown' else 1)

In [9]:
df.head()

Unnamed: 0,year,disorder_type,event_type,sub_event_type,actor1,inter1,civilian_targeting,admin1,location,latitude,longitude,fatalities,month,day_of_week,day_of_month,admin2_encoded
0,2024,0,2,17,737,6,1,7,775,11.8826,13.089,0,9,4,20,458
1,2024,0,2,17,737,6,1,29,2095,7.4824,4.5603,0,9,4,20,286
2,2024,0,2,17,737,6,1,30,153,7.4035,3.9132,0,9,3,19,263
3,2024,3,4,7,723,1,1,18,867,10.6179,7.2331,0,9,3,19,131
4,2024,0,2,17,737,6,1,11,4918,7.1389,6.5724,0,9,3,19,200


ML MODELLING

In [10]:
# For splitting data into train data and test data
from sklearn.model_selection import train_test_split

In [11]:
# Define features and target variable
X = df.drop('fatalities', axis=1)  # Feature set
y = df['fatalities']  # Target variable

# Split the data into training and testing sets (60% train, 40% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [12]:
#Import libraries for different ML models
from sklearn.linear_model import LogisticRegression #Logistic regression
from sklearn.neighbors import KNeighborsClassifier # kNN
from sklearn.naive_bayes import GaussianNB # Naive Bayes
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier #Random forest, bagging, boosting respectively

from sklearn.metrics import accuracy_score

In [13]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "Bagging": BaggingClassifier(n_estimators=10),
    "Boosting": GradientBoostingClassifier()
}

In [14]:
# Dictionary to store accuracy results
accuracy_results = {}

In [15]:
# Train and evaluate each model
for model_name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, predictions)

    # Store accuracy in the results dictionary
    accuracy_results[model_name] = accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
# Display the accuracy results
for model_name, accuracy in accuracy_results.items():
    print(f"{model_name}: {accuracy:.4f}")

# Identify the best model
best_model = max(accuracy_results, key=accuracy_results.get)
print(f"The best model is: {best_model} with accuracy: {accuracy_results[best_model]:.4f}")

Logistic Regression: 0.5437
KNN: 0.5128
Naive Bayes: 0.4283
Random Forest: 0.5847
Bagging: 0.5654
Boosting: 0.5152
The best model is: Random Forest with accuracy: 0.5847


In [19]:
from sklearn.metrics import f1_score

In [20]:
# Dictionary to store F1 score results
f1_score_results = {}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Make predictions
    predictions = model.predict(X_test)
    
    # Calculate F1 score
    f1 = f1_score(y_test, predictions, average='weighted')
    
    # Store F1 score in the results dictionary
    f1_score_results[model_name] = f1
    print(f"{model_name}: F1 Score = {f1:.4f}")

# Identify the best model based on F1 score
best_model = max(f1_score_results, key=f1_score_results.get)
print(f"The best model is: {best_model} with F1 Score = {f1_score_results[best_model]:.4f}")

Training Logistic Regression...
Logistic Regression: F1 Score = 0.3842
Training KNN...
KNN: F1 Score = 0.4415
Training Naive Bayes...
Naive Bayes: F1 Score = 0.4508
Training Random Forest...
Random Forest: F1 Score = 0.5459
Training Bagging...
Bagging: F1 Score = 0.5315
Training Boosting...
Boosting: F1 Score = 0.4700
The best model is: Random Forest with F1 Score = 0.5459
