This is my attempt to replicate and improve the predictive models in Blair & Sambanis (2020), *Forecasting Civil Wars: Theory and Structure in an Age of “Big Data” and Machine Learning*.

In [1]:
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score,  roc_auc_score
import os
from statistics import mean

Set the working directory and load the 1-month data from Blair & Sambanis (2020).

In [2]:
# Root of the Blair & Sambanis (2020) replication archive. Set the
# BS2020_REPLICATION_DIR environment variable to point at your own copy; when
# it is unset, the original author's local path is used as a fallback.
replication_dir = os.environ.get(
    "BS2020_REPLICATION_DIR",
    "/home/christopher/Desktop/blair&sambanis2020_rn/b&s_replication"
    "/sj-zip-1-jcr-10.1177_0022002720918923/replication-3",
)
# The relative data path below is resolved against this directory.
os.chdir(replication_dir)

file_path = 'data/1mo_data.dta'
df = pd.read_stata(file_path)

Deleting NaN for the target feature. These are observations in which a conflict is ongoing in a country, and therefore a conflict cannot break out (at the country level).

Separating the training and testing data for the base model


In [9]:
# Model inputs (ten escalation predictors) plus the target column.
all_variables = ["gov_opp_low_level", "gov_reb_low_level", "opp_gov_low_level",
    "reb_gov_low_level", "gov_opp_nonviol_repression", "gov_reb_nonviol_repression",
    "gov_opp_accommodations", "gov_reb_accommodations", "reb_gov_demands",
    "opp_gov_demands", "incidence_civil_ns_plus1"]

# The split below filters on these columns, so they must survive the column
# selection even though they are not model inputs. Selecting only
# `all_variables` would drop them and make the lookups below raise KeyError.
time_columns = ["year", "month", "period"]

# Drop rows that are incomplete on the model variables only; missing values
# in the time columns do not by themselves remove a row.
df = df[all_variables + time_columns].dropna(subset=all_variables)

# Period index of the last training month (December 2007) and of the last
# test month (December 2015). `statistics.mean` raises if no row matches,
# which surfaces a bad cutoff instead of silently producing an empty split.
train_period = mean(df.loc[(df['month'] == 12) & (df['year'] == 2007), 'period'])
end_period = mean(df.loc[(df['month'] == 12) & (df['year'] == 2015), 'period'])

# Train on everything up to and including the training cutoff; test on the
# periods strictly after it, up to and including the end cutoff.
train = df.loc[(df['period'] <= train_period)]
test = df.loc[(df['period'] > train_period) & (df['period'] <= end_period)]

Create the target

## Escalation model
Select the features.

In [14]:
# Predictor columns used by the escalation model, one per line.
features_escalation = [
    "gov_opp_low_level",
    "gov_reb_low_level",
    "opp_gov_low_level",
    "reb_gov_low_level",
    "gov_opp_nonviol_repression",
    "gov_reb_nonviol_repression",
    "gov_opp_accommodations",
    "gov_reb_accommodations",
    "reb_gov_demands",
    "opp_gov_demands",
]

# Restrict each split to the escalation predictors.
X_train_escalation = train.loc[:, features_escalation]
X_test_escalation = test.loc[:, features_escalation]

In [15]:
# Pull the target column out of the training and test splits, in that order.
y_train_base, y_test_base = (
    split['incidence_civil_ns_plus1'] for split in (train, test)
)

In [16]:
# Count missing values per predictor in the test split.
X_test_escalation.isna().sum()

gov_opp_low_level             0
gov_reb_low_level             0
opp_gov_low_level             0
reb_gov_low_level             0
gov_opp_nonviol_repression    0
gov_reb_nonviol_repression    0
gov_opp_accommodations        0
gov_reb_accommodations        0
reb_gov_demands               0
opp_gov_demands               0
dtype: int64

Define the random forest model.

In [18]:
# A very large forest of shallow trees (at most 5 leaves each).
rf_reg = RandomForestRegressor(
    n_estimators=100000,
    max_leaf_nodes=5,
    n_jobs=-1,       # trees are fitted independently, so use every core
    random_state=0,  # fixed seed so a re-run reproduces the same scores
)
rf_reg.fit(X_train_escalation, y_train_base)
preds = rf_reg.predict(X_test_escalation)

# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test_base, preds))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test_base, preds))

Mean squared error: 0.00
Coefficient of determination: -0.02


# Adding my improvements

In order to try to make a more accurate model, I use imputation of missing values, scaling, and gradient boosting.

In [None]:
# NOTE(review): this cell originally read `X_train` / `X_test`, which are not
# defined anywhere in the visible notebook. The escalation feature frames are
# used instead so the cell runs from a fresh kernel; confirm this is the
# intended input.

# Fill missing values (SimpleImputer's default strategy is the column mean).
# The imputer is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
my_imputer = SimpleImputer()
X_train_imputed = pd.DataFrame(
    my_imputer.fit_transform(X_train_escalation),
    columns=X_train_escalation.columns,
    index=X_train_escalation.index,  # keep rows aligned with the target
)
X_test_imputed = pd.DataFrame(
    my_imputer.transform(X_test_escalation),
    columns=X_test_escalation.columns,
    index=X_test_escalation.index,
)

# feature scaling (fitted on the training split, reused for the test split)
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train_imputed)
X_test_sc = sc.transform(X_test_imputed)

Initial random forest model

In [None]:
# NOTE(review): this cell originally read `y_train` / `y_test`, which are not
# defined anywhere in the visible notebook. The base-model targets are used
# instead so the cell runs from a fresh kernel; confirm this is the intended
# target.

# Smaller forest (100 shallow trees) fitted on the imputed and scaled
# features, with a fixed seed for reproducibility.
rf_reg = RandomForestRegressor(n_estimators=100, max_leaf_nodes=5,
    n_jobs=1, random_state=0)
rf_reg.fit(X_train_sc, y_train_base)
preds = rf_reg.predict(X_test_sc)

print('\nRandom Forest\n')
print('Mean absolute error: %.2f'
        % mean_absolute_error(y_test_base, preds))

# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test_base, preds))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test_base, preds))
