In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder,MinMaxScaler,LabelEncoder
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error,mean_absolute_percentage_error
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

In [2]:
df=pd.read_csv('/kaggle/input/nigeria-crime/Nigeria_1997-2024_Sep20.csv')
df.head()

Unnamed: 0,event_id_cnty,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,assoc_actor_1,inter1,...,location,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,tags,timestamp
0,NIG38575,2024-09-20,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),Refugees/IDPs (Nigeria),6,...,Bolori,11.8826,13.089,1,Whatsapp,New media,"On 20 September 2024, IDPs (flood victims) fro...",0,crowd size=no report,1727134598
1,NIG38585,2024-09-20,2024,2,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),Muslim Group (Nigeria),6,...,Ile-Ife,7.4824,4.5603,1,Daily Trust (Nigeria),National,"Around 20 September 2024 (as reported), hundre...",0,crowd size=hundreds,1727134598
2,NIG38581,2024-09-19,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),,6,...,Agodi,7.4035,3.9132,1,Daily Post (Nigeria); Guardian (Nigeria); Saha...,National-Regional,"On 19 September 2024, for a second consecutive...",0,crowd size=hundreds,1727134598
3,NIG38588,2024-09-19,2024,1,Strategic developments,Strategic developments,Disrupted weapons use,Police Forces of Nigeria (2023-),,1,...,Buruku,10.6179,7.2331,1,Daily Post (Nigeria); Nigeria Punch,National,"Weapons seizure: On 19 September 2024, Police ...",0,,1727134598
4,NIG38591,2024-09-19,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),PDP: People's Democratic Party,6,...,Wanno,7.1389,6.5724,1,Vanguard (Nigeria),National,"On 19 September 2024, PDP youths protested at ...",0,crowd size=no report,1727134598


In [3]:
for l in df.columns:
    print(l,"=",df[l].unique().shape)

event_id_cnty = (38076,)
event_date = (6513,)
year = (28,)
time_precision = (3,)
disorder_type = (4,)
event_type = (6,)
sub_event_type = (24,)
actor1 = (881,)
assoc_actor_1 = (1219,)
inter1 = (8,)
actor2 = (884,)
assoc_actor_2 = (1382,)
inter2 = (9,)
interaction = (43,)
civilian_targeting = (2,)
iso = (2,)
region = (1,)
country = (1,)
admin1 = (38,)
admin2 = (752,)
admin3 = (1,)
location = (5116,)
latitude = (4935,)
longitude = (5013,)
geo_precision = (3,)
source = (2824,)
source_scale = (23,)
notes = (36356,)
fatalities = (134,)
tags = (222,)
timestamp = (2107,)


Dropping unnecessary columns


In [4]:
df.drop(columns=['timestamp','admin3','country','region','iso','event_id_cnty','latitude','longitude','assoc_actor_1','assoc_actor_2','tags'], axis=1, inplace=True)

In [5]:
df['event_date'] = pd.to_datetime(df['event_date'])
df['month'] = df['event_date'].dt.month

In [7]:
df['month'].value_counts()

month
3     3490
5     3487
1     3482
4     3362
2     3341
6     3275
7     3248
8     3238
10    2855
12    2809
11    2767
9     2722
Name: count, dtype: int64

In [8]:
df.drop(columns=['event_date'], axis=1, inplace=True)

Label encoding the categorical columns


In [9]:
cat = df.select_dtypes(exclude=np.number).columns.tolist()
encoders={}

for col in cat:
    if col=='notes': 
        continue
    l = LabelEncoder()
    df[col] = l.fit_transform(df[col])
    encoders[col] = l

In [10]:
df.isna().sum()

year                  0
time_precision        0
disorder_type         0
event_type            0
sub_event_type        0
actor1                0
inter1                0
actor2                0
inter2                0
interaction           0
civilian_targeting    0
admin1                0
admin2                0
location              0
geo_precision         0
source                0
source_scale          0
notes                 0
fatalities            0
month                 0
dtype: int64

In [23]:
df.dropna(inplace=True)

Fine-tuning DistilBert on the given dataset

In [12]:
import os
os.environ["WANDB_MODE"] = "disabled"


In [13]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=512)

tokenized_texts = df['notes'].apply(tokenize_function).tolist()
input_ids = torch.tensor([text['input_ids'] for text in tokenized_texts])
attention_masks = torch.tensor([text['attention_mask'] for text in tokenized_texts])
labels = torch.tensor(df['fatalities'].values, dtype=torch.float)

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }
    
train_input_ids, val_input_ids, train_attention_masks, val_attention_masks, train_labels, val_labels = train_test_split(input_ids, attention_masks, labels, test_size=0.2, random_state=42)
train = CustomDataset(train_input_ids, train_attention_masks, train_labels)
val = CustomDataset(val_input_ids, val_attention_masks, val_labels)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

args = TrainingArguments(
    output_dir='./results',  
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy='epoch',
    save_strategy='epoch',  
    save_total_limit=2,  
    load_best_model_at_end=True,  
    metric_for_best_model='eval_loss',  
    logging_dir='./logs',
    logging_steps=10,
    weight_decay=0.01,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train,
    eval_dataset=val,
)
trainer.train()
results = trainer.evaluate()
print(results)
trainer.save_model('./best_model')  


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,422.964,86.759254
2,10.5536,75.773514
3,5.3683,67.315468
4,35.2642,63.41777
5,52.5021,60.544857


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 60.544857025146484, 'eval_runtime': 68.943, 'eval_samples_per_second': 110.468, 'eval_steps_per_second': 6.904, 'epoch': 5.0}


predicting values from bert and adding it to the dataframe

In [14]:
model.eval()  
def process_text(text):
    inp = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
    inp = {key: value.to(device) for key, value in inp.items()}
    with torch.no_grad():
        out = model(**inp)
    logits = out.logits
    pred = logits.item()  
    return pred
df['bert_predictions'] = df['notes'].apply(process_text)

In [15]:
df.drop(columns=['notes'], axis=1, inplace=True)

Scaling the columns with standard scaler


In [16]:
scaler = StandardScaler()
df1 = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

Processing the data for training by splitting into train/test set

In [17]:
X=df1.drop(columns=['fatalities'],axis=1)
y=df1['fatalities']
X1=X.copy()

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Creating stacked models manually by passing BERT predictions as an input feature to traditional regression models


In [19]:
lg=LinearRegression()
lg.fit(X_train,y_train)
y_pred=lg.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"R2 Score: {r2}")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"Mean Absolute Percentage Error: {mape}")


R2 Score: 0.5741366670169694
Mean Squared Error: 0.28302845004131677
Mean Absolute Error: 0.10531858494614235
Mean Absolute Percentage Error: 0.7545913435302664


In [20]:
rf=RandomForestRegressor()
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"R2 Score: {r2}")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"Mean Absolute Percentage Error: {mape}")


R2 Score: 0.5309251777639868
Mean Squared Error: 0.31174677322162214
Mean Absolute Error: 0.04937705256277586
Mean Absolute Percentage Error: 0.3165029471768125


In [21]:
gb=GradientBoostingRegressor()
gb.fit(X_train,y_train)
y_pred=gb.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"R2 Score: {r2}")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"Mean Absolute Percentage Error: {mape}")

R2 Score: 0.6470960130292498
Mean Squared Error: 0.23453972368574982
Mean Absolute Error: 0.048015844977241234
Mean Absolute Percentage Error: 0.29490963295914463


Feature importance based on the Random forest model


In [22]:

feature_importances = rf.feature_importances_
feature_names = X1.columns

# Create a DataFrame for better visualization
importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importances_df = importances_df.sort_values('Importance', ascending=False)

print(importances_df)


               Feature  Importance
18    bert_predictions    0.713394
13            location    0.051269
0                 year    0.037605
9          interaction    0.031034
5               actor1    0.030018
15              source    0.025995
17               month    0.021884
12              admin2    0.017637
11              admin1    0.013544
3           event_type    0.013377
16        source_scale    0.010756
1       time_precision    0.007502
4       sub_event_type    0.006593
7               actor2    0.006439
8               inter2    0.004495
6               inter1    0.002915
14       geo_precision    0.002779
10  civilian_targeting    0.002652
2        disorder_type    0.000113
