In [1]:
import pandas as pd
import os
import time

# work with directories
from pathlib import Path 

# datetime
import datetime as dt

# work with data
import numpy as np

# visualize data
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# create bag of word vector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sktime.forecasting.arima import ARIMA, AutoARIMA

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score, precision_recall_curve, auc
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate

import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

## Import data

In [68]:
path = os.getcwd()
parent_path = os.path.abspath(os.path.join(path, os.pardir))

merged_og = pd.read_csv(os.path.abspath(parent_path + '/data/merged.csv'))
wb_pop_og = pd.read_csv(os.path.abspath(parent_path + '/data/WorldBank_pop/wb_pop.csv'), sep=';')

print('countires with pop data: ',wb_pop_og['Country Code'].nunique())
print('countires in merged data: ',merged_og['isocode'].nunique())

countires with pop data:  266
countires in merged data:  240


In [120]:
merged = merged_og.copy()
merged = merged[merged['year'] >= 1989]

# Because of issue with MonthYear column: create new column with year and month
merged['year'] = merged['year'].astype(str)
merged['month'] = merged['month'].astype(str)

# Pad the month column with leading zeros if needed (e.g., convert '1' to '01')
merged['month'] = merged['month'].str.zfill(2)

# Concatenate the year and month columns with a '-' &  convert to datetime object
merged['month_year'] = merged['year'] + '-' + merged['month']
merged['month_year'] = pd.to_datetime(merged['month_year'], format='%Y-%m')


# compare month_year and MonthYear
merged.MonthYear = pd.to_datetime(merged.MonthYear)

print('entires merged:', len(merged))
print('date range:', merged.month_year.min(), 'to', merged.month_year.max())
print('date range:', merged.MonthYear.min(), 'to', merged.MonthYear.max())

merged[['month_year', 'MonthYear']].head(3)

entires merged: 92637
date range: 1989-01-01 00:00:00 to 2023-05-01 00:00:00
date range: 1970-01-01 00:00:00.000198901 to 1970-01-01 00:00:00.000202305


Unnamed: 0,month_year,MonthYear
23666,1989-01-01,1970-01-01 00:00:00.000198901
23667,1989-01-01,1970-01-01 00:00:00.000198901
23668,1989-01-01,1970-01-01 00:00:00.000198901


In [121]:
pop = wb_pop_og.copy()
pop.drop(columns=['Unnamed: 67', '2022', 'Indicator Name', 'Indicator Code'], inplace=True)

# fill missing years
pop['2022'] = pop['2021']
pop['2023'] = pop['2021']

# melt population data
pop = pd.melt(pop, id_vars=('Country Code', 'Country Name'), var_name='Year', value_name='wb_pop')

pop.rename(columns={'Country Code': 'isocode'}, inplace=True)
pop.rename(columns={'Year': 'year'}, inplace=True)
pop = pop[pop['year'] >= '1989']

In [131]:
df_og = pd.merge(merged, pop, on=['isocode', 'year'], how='left')


# fill missing deaths with 0
death_columns = ['deaths', 'state_deaths', 'nonstate_deaths', 'onesided_deaths', 'civilian_deaths']
df_og[death_columns] = df[death_columns].fillna(0)

# fill missing events with with 0
event_share_columns= df.filter(like='event_share').columns.tolist()
df_og[event_share_columns] = df[event_share_columns].fillna(0)

In [141]:
df_og['deaths_pc'] = df['deaths'] /  df['wb_pop'] *1000000

In [133]:
df_og

Unnamed: 0,MonthYear,isocode,month,year,count_events_1,count_events_2,count_events_3,count_events_4,count_events_5,count_events_6,...,country,deaths,state_deaths,nonstate_deaths,onesided_deaths,civilian_deaths,month_year,Country Name,wb_pop,rel_deaths
0,1970-01-01 00:00:00.000198901,AFG,01,1989,462.0,256.0,338.0,2024.0,264.0,88.0,...,Afghanistan,693.0,693.0,0.0,0.0,0.0,1989-01-01,Afghanistan,10673168.0,64.929176
1,1970-01-01 00:00:00.000198901,AGO,01,1989,110.0,72.0,152.0,430.0,242.0,24.0,...,Angola,249.0,249.0,0.0,0.0,0.0,1989-01-01,Angola,11439498.0,21.766689
2,1970-01-01 00:00:00.000198901,ALB,01,1989,64.0,44.0,116.0,192.0,18.0,0.0,...,,0.0,0.0,0.0,0.0,0.0,1989-01-01,Albania,3227943.0,0.000000
3,1970-01-01 00:00:00.000198901,ARE,01,1989,10.0,10.0,16.0,24.0,2.0,2.0,...,,0.0,0.0,0.0,0.0,0.0,1989-01-01,United Arab Emirates,1791840.0,0.000000
4,1970-01-01 00:00:00.000198901,ARG,01,1989,146.0,36.0,10.0,200.0,58.0,18.0,...,,0.0,0.0,0.0,0.0,0.0,1989-01-01,Argentina,32165766.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92632,1970-01-01 00:00:00.000202305,XKX,05,2023,60.0,50.0,58.0,200.0,124.0,2.0,...,,0.0,0.0,0.0,0.0,0.0,2023-05-01,Kosovo,1786038.0,0.000000
92633,1970-01-01 00:00:00.000202305,YEM,05,2023,276.0,156.0,182.0,1364.0,362.0,38.0,...,,0.0,0.0,0.0,0.0,0.0,2023-05-01,"Yemen, Rep.",32981641.0,0.000000
92634,1970-01-01 00:00:00.000202305,ZAF,05,2023,6568.0,2976.0,2874.0,10818.0,3262.0,1188.0,...,,0.0,0.0,0.0,0.0,0.0,2023-05-01,South Africa,59392255.0,0.000000
92635,1970-01-01 00:00:00.000202305,ZMB,05,2023,386.0,264.0,176.0,724.0,374.0,48.0,...,,0.0,0.0,0.0,0.0,0.0,2023-05-01,Zambia,19473125.0,0.000000


## Model preparation

In [142]:
df = df_og.copy()

### Generate classifier option

In [125]:
deaths_cutoff = 20
shifter = 2
TARGET = 'conflict'

unit_of_analyis = 'isocode'

In [137]:
def make_target(df: pd.DataFrame, shifter: int, target: str): # t is the number of periods for the shift

    # loop through each period and generate the shift variables
    for i in range(1, shifter+1):
        col_name = f'{target}{i}' # name doesn't matter, just interimset str(target) + f'{i}' # 

        df[col_name] = df.groupby(unit_of_analyis)[str(target)].shift(-i)

    # take the maximum for t periods forward and create the new variable
    df['target_clsf_f{}'.format(shifter)] = df[[f'{target}{i}' for i in range(1, shifter+1)]].max(axis=1, skipna=False)

    # drop the shift variables
    df = df.drop(columns=[f'{target}{i}' for i in range(1, shifter+1)])
    
    return(df)

In [143]:
# generate conflict variable
df['conflict'] = (df['deaths_pc'] >= deaths_cutoff).astype(int)

# generate binary classification  target
df = make_target(df, shifter, TARGET)

# generate regression target
df[f'target_regr_f{shifter}'] = df.groupby(unit_of_analyis)['rel_deaths'].shift(-shifter)

# check it worked
df.loc[df['isocode'] == 'BFA', ['isocode','year', 'deaths', 'wb_pop', 'deaths_pc', 'conflict', f'target_clsf_f{shifter}', f'target_regr_f{shifter}']][-8:]

Unnamed: 0,isocode,year,deaths,wb_pop,deaths_pc,conflict,target_clsf_f2,target_regr_f2
90764,BFA,2022,105.0,22100683.0,4.750984,0,0.0,8.823257
91001,BFA,2022,203.0,22100683.0,9.185236,0,0.0,0.0
91238,BFA,2022,195.0,22100683.0,8.823257,0,1.0,21.492548
91475,BFA,2023,0.0,22100683.0,0.0,0,1.0,25.338583
91710,BFA,2023,475.0,22100683.0,21.492548,1,1.0,0.0
91947,BFA,2023,560.0,22100683.0,25.338583,1,0.0,0.0
92184,BFA,2023,0.0,22100683.0,0.0,0,,
92421,BFA,2023,0.0,22100683.0,0.0,0,,
