## Nigeria-Crime-Trends Project
Date: October 2024

In [1]:
# Import libraries
import pandas as pd
from datetime import datetime

# Configure pandas display so frames are rendered without row/column limits
pd.set_option('display.max_rows', None)   # No row cap: displaying a frame prints every row
pd.set_option('display.max_columns', None) # No column cap: every column is shown
pd.set_option('display.width', None)      # None lets pandas auto-detect the display width

# Suppress all warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the raw event export (path is relative to the notebook's folder)
df = pd.read_csv(filepath_or_buffer="../data/Nigeria_1997-2024_Sep20.csv")

In [3]:
# Preview the first 5 rows of the raw dataset (head() defaults to 5 rows)
df.head()

Unnamed: 0,event_id_cnty,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,assoc_actor_1,inter1,actor2,assoc_actor_2,inter2,interaction,civilian_targeting,iso,region,country,admin1,admin2,admin3,location,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,tags,timestamp
0,NIG38575,2024-09-20,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),Refugees/IDPs (Nigeria),6,,,0,60,,566,Western Africa,Nigeria,Borno,Maiduguri Metro,,Bolori,11.8826,13.089,1,Whatsapp,New media,"On 20 September 2024, IDPs (flood victims) fro...",0,crowd size=no report,1727134598
1,NIG38585,2024-09-20,2024,2,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),Muslim Group (Nigeria),6,,,0,60,,566,Western Africa,Nigeria,Osun,Ife Central,,Ile-Ife,7.4824,4.5603,1,Daily Trust (Nigeria),National,"Around 20 September 2024 (as reported), hundre...",0,crowd size=hundreds,1727134598
2,NIG38581,2024-09-19,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),,6,,,0,60,,566,Western Africa,Nigeria,Oyo,Ibadan North,,Agodi,7.4035,3.9132,1,Daily Post (Nigeria); Guardian (Nigeria); Saha...,National-Regional,"On 19 September 2024, for a second consecutive...",0,crowd size=hundreds,1727134598
3,NIG38588,2024-09-19,2024,1,Strategic developments,Strategic developments,Disrupted weapons use,Police Forces of Nigeria (2023-),,1,Kaduna Communal Militia (Nigeria),,4,14,,566,Western Africa,Nigeria,Kaduna,Chikun,,Buruku,10.6179,7.2331,1,Daily Post (Nigeria); Nigeria Punch,National,"Weapons seizure: On 19 September 2024, Police ...",0,,1727134598
4,NIG38591,2024-09-19,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),PDP: People's Democratic Party,6,,,0,60,,566,Western Africa,Nigeria,Edo,Etsako East,,Wanno,7.1389,6.5724,1,Vanguard (Nigeria),National,"On 19 September 2024, PDP youths protested at ...",0,crowd size=no report,1727134598


In [4]:
# Summarise every column: non-null count and dtype, plus the overall row range
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38076 entries, 0 to 38075
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   event_id_cnty       38076 non-null  object 
 1   event_date          38076 non-null  object 
 2   year                38076 non-null  int64  
 3   time_precision      38076 non-null  int64  
 4   disorder_type       38076 non-null  object 
 5   event_type          38076 non-null  object 
 6   sub_event_type      38076 non-null  object 
 7   actor1              38076 non-null  object 
 8   assoc_actor_1       10199 non-null  object 
 9   inter1              38076 non-null  int64  
 10  actor2              29525 non-null  object 
 11  assoc_actor_2       8803 non-null   object 
 12  inter2              38076 non-null  int64  
 13  interaction         38076 non-null  int64  
 14  civilian_targeting  14964 non-null  object 
 15  iso                 38076 non-null  int64  
 16  regi

### Data Cleaning

**Steps:**
1. Handling missing values
2. Removing duplicates
3. Correcting data types (e.g., converting strings to dates)
4. Standardizing formats (e.g., date formats, categorical variable formats)

In [5]:
# Summarise missing data per column as an absolute count and a share of all rows

# number of NaN entries in each column
null_count = df.isnull().sum()

# the same counts expressed as a percentage of the total number of rows
percent_null = null_count.div(len(df)).mul(100)

# assemble both measures into one table (indexed by column name)
null_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'missing_value_count': null_count,
        'percent_missing': percent_null,
    }
)

# show the summary table
null_df

Unnamed: 0,column_name,missing_value_count,percent_missing
event_id_cnty,event_id_cnty,0,0.0
event_date,event_date,0,0.0
year,year,0,0.0
time_precision,time_precision,0,0.0
disorder_type,disorder_type,0,0.0
event_type,event_type,0,0.0
sub_event_type,sub_event_type,0,0.0
actor1,actor1,0,0.0
assoc_actor_1,assoc_actor_1,27877,73.214098
inter1,inter1,0,0.0


In [6]:
# 'civilian_targeting' and 'actor2' were recorded as fields that can be blank,
# so their NaN values are replaced with an explicit placeholder label instead of being dropped
placeholder_by_column = {
    'civilian_targeting': 'Not Civilian targeting',
    'actor2': 'Not available',
}
for column, placeholder in placeholder_by_column.items():
    df[column] = df[column].fillna(placeholder)

# One row was missing 'admin2' / 'admin1' because its location was the Gulf of Guinea
# (international waters); remove any row lacking either value
df = df.dropna(subset=['admin2', 'admin1'], how='any')

# Remove columns that are more than 70% missing or hold irrelevant data
columns_to_drop = ['assoc_actor_1', 'assoc_actor_2', 'admin3', 'tags']
df = df.drop(columns_to_drop, axis='columns')

In [7]:
# Remove fully duplicated rows, retaining the first occurrence of each
df = df.drop_duplicates(keep='first')

In [8]:
# Report the number of distinct (non-null) values held by each column
for col, distinct_count in df.nunique().items():
    print(col, distinct_count)

event_id_cnty 38075
event_date 6513
year 28
time_precision 3
disorder_type 4
event_type 6
sub_event_type 24
actor1 881
inter1 8
actor2 884
inter2 9
interaction 43
civilian_targeting 2
iso 2
region 1
country 1
admin1 37
admin2 751
location 5115
latitude 4934
longitude 5012
geo_precision 3
source 2824
source_scale 23
notes 36355
fatalities 134
timestamp 2107


In [9]:
# Remove columns that carry a single unique value or no relevant information:
#   - event_id_cnty: an identifier with 38075 unique values
#   - iso, region, country: not relevant to the analysis
#   - notes: a trimmed, brief text summary of each event

columns_to_drop = ['event_id_cnty', 'iso', 'region', 'country', 'notes']
df = df.drop(columns_to_drop, axis='columns')

In [10]:
# Convert 'event_date' from string to datetime
df['event_date'] = pd.to_datetime(df['event_date'])

# Convert 'timestamp' (epoch seconds) to datetime and truncate it to the calendar date.
# normalize() sets the time-of-day to midnight while keeping a datetime dtype, so the
# column still displays as 'yyyy-mm-dd' but supports date arithmetic without re-parsing.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s').dt.normalize()

In [11]:
# Preview the first rows after cleaning and date conversion
df.head()

Unnamed: 0,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,inter1,actor2,inter2,interaction,civilian_targeting,admin1,admin2,location,latitude,longitude,geo_precision,source,source_scale,fatalities,timestamp
0,2024-09-20,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),6,Not available,0,60,Not Civilian targeting,Borno,Maiduguri Metro,Bolori,11.8826,13.089,1,Whatsapp,New media,0,2024-09-23
1,2024-09-20,2024,2,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),6,Not available,0,60,Not Civilian targeting,Osun,Ife Central,Ile-Ife,7.4824,4.5603,1,Daily Trust (Nigeria),National,0,2024-09-23
2,2024-09-19,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),6,Not available,0,60,Not Civilian targeting,Oyo,Ibadan North,Agodi,7.4035,3.9132,1,Daily Post (Nigeria); Guardian (Nigeria); Saha...,National-Regional,0,2024-09-23
3,2024-09-19,2024,1,Strategic developments,Strategic developments,Disrupted weapons use,Police Forces of Nigeria (2023-),1,Kaduna Communal Militia (Nigeria),4,14,Not Civilian targeting,Kaduna,Chikun,Buruku,10.6179,7.2331,1,Daily Post (Nigeria); Nigeria Punch,National,0,2024-09-23
4,2024-09-19,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),6,Not available,0,60,Not Civilian targeting,Edo,Etsako East,Wanno,7.1389,6.5724,1,Vanguard (Nigeria),National,0,2024-09-23


### Feature Engineering

**Steps:**
1. Extract `quarter_of_year`, `week_of_year`, `month`, `day`, `day_of_week` from `event_date`
2. Derive `date_difference`, i.e. the number of days between the date the event occurred and the date it was uploaded to the ACLED database (`timestamp` - `event_date`)

In [12]:
# Extract quarter of the year (1-4)
df['quarter_of_year'] = df['event_date'].dt.quarter

# Extract the ISO week of the year. isocalendar() returns a nullable UInt32 column,
# so cast to int64 to keep the integer features (see 'day' below) on one dtype.
# NOTE: ISO weeks can straddle calendar years (early January may fall in week 52/53,
# late December in week 1), so this is not always aligned with the 'year' column.
df['week_of_year'] = df['event_date'].dt.isocalendar().week.astype('int64')

# Extract the full month name (e.g. 'September')
df['month'] = df['event_date'].dt.strftime('%B')

# Extract day of the month and convert to int64
df['day'] = df['event_date'].dt.day.astype('int64')

# Extract the day-of-week name (e.g. 'Friday')
df['day_of_week'] = df['event_date'].dt.day_name()

# Make sure 'timestamp' is datetime before subtracting (it may be stored as a date string)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Number of whole days between the event date and the 'timestamp' date
df['date_difference'] = (df['timestamp'] - df['event_date']).dt.days

In [13]:
# The raw date columns are no longer needed once the derived features exist
columns_to_drop = ['event_date', 'timestamp']
df = df.drop(columns_to_drop, axis='columns')

In [14]:
# Preview the cleaned dataset, including the engineered date features
df.head()

Unnamed: 0,year,time_precision,disorder_type,event_type,sub_event_type,actor1,inter1,actor2,inter2,interaction,civilian_targeting,admin1,admin2,location,latitude,longitude,geo_precision,source,source_scale,fatalities,quarter_of_year,week_of_year,month,day,day_of_week,date_difference
0,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),6,Not available,0,60,Not Civilian targeting,Borno,Maiduguri Metro,Bolori,11.8826,13.089,1,Whatsapp,New media,0,3,38,September,20,Friday,3
1,2024,2,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),6,Not available,0,60,Not Civilian targeting,Osun,Ife Central,Ile-Ife,7.4824,4.5603,1,Daily Trust (Nigeria),National,0,3,38,September,20,Friday,3
2,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),6,Not available,0,60,Not Civilian targeting,Oyo,Ibadan North,Agodi,7.4035,3.9132,1,Daily Post (Nigeria); Guardian (Nigeria); Saha...,National-Regional,0,3,38,September,19,Thursday,4
3,2024,1,Strategic developments,Strategic developments,Disrupted weapons use,Police Forces of Nigeria (2023-),1,Kaduna Communal Militia (Nigeria),4,14,Not Civilian targeting,Kaduna,Chikun,Buruku,10.6179,7.2331,1,Daily Post (Nigeria); Nigeria Punch,National,0,3,38,September,19,Thursday,4
4,2024,1,Demonstrations,Protests,Peaceful protest,Protesters (Nigeria),6,Not available,0,60,Not Civilian targeting,Edo,Etsako East,Wanno,7.1389,6.5724,1,Vanguard (Nigeria),National,0,3,38,September,19,Thursday,4


In [15]:
# Dimensions of the cleaned frame as (rows, columns)
df.shape

(38075, 26)

In [16]:
# Write the cleaned frame to CSV in the current working directory, omitting the index
df.to_csv(path_or_buf='Nigeria_1997-2024_Sep20_cleaned.csv', index=False)