In [None]:
import pandas as pd
import pycountry
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from pathlib import Path
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ProgbarLogger
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from keras import regularizers
from collections import Counter

In [None]:
# Folder prefix (relative path, with trailing slash) that is prepended to the data file name.
dfolder = 'data/'

In [None]:
# Load the merged dataset from the data folder.
df = pd.read_csv(f'{dfolder}merged.csv')

In [None]:
# Render the loaded frame as the cell output for a quick look at its contents.
df

In [None]:
# Keep observations from 1989 onwards. `.copy()` makes `df` an independent frame, so the
# column assignments made in later cells modify it directly rather than a filtered slice
# of the originally loaded frame (which triggers SettingWithCopyWarning).
df = df[df['year'] >= 1989].copy()

In [None]:
# List the column names available in the frame.
df.columns

In [None]:
# Report the percentage of missing values for the country label and each death count.
cols = ['country', 'deaths', 'state_deaths', 'nonstate_deaths', 'onesided_deaths', 'civilian_deaths']
n_rows = len(df)
for col in cols:
    n_missing = df[col].isna().sum()
    pct_missing = n_missing * 100 / n_rows
    print(f'{col}: {pct_missing}%')


In [None]:
# Rows that carry a country label (rows with a missing 'country' are left out).
df_no_nan_country = df.dropna(subset=['country'])
df_no_nan_country

In [None]:
# Replace missing death counts with 0.
# The fill is done on the frame itself and re-assigned: calling
# `df[col].fillna(0, inplace=True)` operates on an intermediate Series, which is
# deprecated chained assignment and leaves `df` unchanged under pandas copy-on-write.
death_cols = ['nonstate_deaths', 'onesided_deaths', 'civilian_deaths', 'deaths', 'state_deaths']
df = df.fillna({col: 0 for col in death_cols})


In [None]:
# Show the dtypes of the three time columns together. Written as three separate bare
# expressions, only the last one would be rendered as the cell output.
df[['MonthYear', 'year', 'month']].dtypes

Preprocessing

In [None]:
# Build a first-of-month timestamp from the 'year' and 'month' columns.
df['Date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))

# Columns to forward-fill across inserted months. The substring test on its own would
# also match 'country' ("count" is a prefix of it), so that column is excluded explicitly.
fill_cols = [col for col in df.columns if 'count' in col and col != 'country']

dfs = []

# One frame per isocode. groupby(sort=False) visits the isocodes in order of first
# appearance (like .unique()) without re-scanning the whole frame for every isocode;
# rows whose isocode is missing are skipped by groupby.
for isocode, df_isocode in df.groupby('isocode', sort=False):
    # Every month start between this isocode's first and last observed month.
    all_dates = pd.DataFrame(
        {'Date': pd.date_range(df_isocode['Date'].min(), df_isocode['Date'].max(), freq='MS')}
    )

    # Left-merge so that months without an observation become rows of NaNs.
    df_isocode = pd.merge(all_dates, df_isocode, on='Date', how='left')
    df_isocode['isocode'] = isocode

    # Forward-fill the selected columns, then use 0 for leading gaps.
    df_isocode[fill_cols] = df_isocode[fill_cols].ffill().fillna(0)

    # Re-derive 'year' and 'month' so the inserted rows have them too.
    df_isocode['year'] = df_isocode['Date'].dt.year
    df_isocode['month'] = df_isocode['Date'].dt.month

    dfs.append(df_isocode)

# Combine the per-isocode frames and order the rows by 'year', 'month', then 'isocode'.
df_final = pd.concat(dfs)
df_final = df_final.drop(columns=['Date'])
df_final = df_final.sort_values(['year', 'month', 'isocode'])

# fillna() returns a new frame; assign it so the remaining NaNs are actually replaced.
df_final = df_final.fillna(0)
df_final.head()


In [None]:
# Drop the 'MonthYear' column; the cells that follow work with 'year' and 'month'.
# Re-assigning (instead of inplace=True) together with errors='ignore' lets this cell be
# re-run without raising once the column is already gone.
df_final = df_final.drop(columns=['MonthYear'], errors='ignore')

In [None]:
def get_country_name(iso_code):
    """Return the pycountry name for an ISO alpha-3 code, or None if it cannot be resolved.

    An AttributeError raised during the lookup (for instance when the lookup yields
    None, which has no ``name`` attribute) is translated into a None result.
    """
    try:
        country = pycountry.countries.get(alpha_3=iso_code)
        return country.name
    except AttributeError:
        return None

# Derive the country name of every row from its isocode.
df_final['country'] = df_final['isocode'].map(get_country_name)

In [None]:
# Spot-check the rows of a single country.
is_afghanistan = df_final['country'].eq('Afghanistan')
filtered_df = df_final.loc[is_afghanistan]
filtered_df

After the monthly reindexing above, every isocode should have exactly one row for each month between its first and last observed month, with no months missing.
Within each isocode the rows should also be in ascending (year, month) order. The cells below check the month coverage for every isocode.


In [None]:
# Render the frame as the cell output before running the month-coverage check.
df_final

In [None]:
def check_month_year_sequence(group):
    """Return True if `group` covers every month between its first and last month exactly once.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one isocode with integer-valued 'year' and 'month' columns. Row order
        does not matter: the (year, month) pairs are sorted before they are compared.

    Returns
    -------
    bool
        True when the sorted (year, month) pairs equal the gap-free monthly sequence
        running from the earliest to the latest pair; False when a month is missing or
        duplicated. An empty group yields True.

    On a mismatch the expected and the actual sequence are printed for inspection.
    """
    actual_month_years = sorted((int(y), int(m)) for y, m in zip(group['year'], group['month']))
    if not actual_month_years:
        return True

    # Take the bounds from the sorted pairs rather than from the first/last row, so a
    # complete but unsorted group is not reported as broken.
    min_year, min_month = actual_month_years[0]
    max_year, max_month = actual_month_years[-1]

    # Walk a single month counter (year * 12 + zero-based month) from the first to the
    # last month; this covers the single-year and the multi-year case alike.
    first = min_year * 12 + (min_month - 1)
    last = max_year * 12 + (max_month - 1)
    expected_month_years = [(idx // 12, idx % 12 + 1) for idx in range(first, last + 1)]

    is_complete = actual_month_years == expected_month_years
    if not is_complete:
        # The grouping column may not be part of the frame handed over by groupby.apply;
        # fall back to the group's name in that case.
        if 'isocode' in group.columns:
            isocode = group['isocode'].iloc[0]
        else:
            isocode = getattr(group, 'name', None)
        print(f"Incorrect sequence for isocode: {isocode}")
        print(f"Expected: {expected_month_years}")
        print(f"Actual: {actual_month_years}")

    return is_complete


In [None]:
# Run the month-coverage check once per isocode; the result holds one boolean per isocode.
groups_by_isocode = df_final.groupby('isocode')
is_sequence_correct = groups_by_isocode.apply(check_month_year_sequence)

# Stop here if any isocode failed the check.
if not is_sequence_correct.all():
    raise AssertionError("The sequence of month-years is not correct for some isocodes")


In [None]:
# Render the frame as the cell output after the month-coverage check.
df_final

In [None]:
# Assemble a first-of-month timestamp from 'year' and 'month'.
date_parts = df_final[['year', 'month']].copy()
date_parts['day'] = 1
df_final['date'] = pd.to_datetime(date_parts)

In [None]:
import matplotlib.pyplot as plt

# Line plot of 'state_deaths' against 'date' on an explicitly created axes.
# NOTE(review): the frame holds one row per isocode and month, so the single line runs
# through the rows of all isocodes -- confirm whether a per-date aggregate was intended.
fig, ax = plt.subplots(figsize=(15, 6))
df_final.set_index('date')['state_deaths'].plot(ax=ax)
ax.set_title('State Deaths over Time')
ax.set_xlabel('Date')
ax.set_ylabel('State Deaths')
plt.show()


In [None]:

# df_final['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))
# df_final.set_index('date', inplace=True)

# # list of all countries
# countries = df_final['country'].unique()

# # store the lagged features
# df_lagged = pd.DataFrame()

# # Loop over each country
# for country in countries:
#     df_country = df[df['country'] == country].copy()
#     df_country['deaths_lag'] = df_country['deaths'].shift(1) 
#     df_lagged = pd.concat([df_lagged, df_country])

# # Drop rows with missing values
# df_lagged.dropna(subset=['deaths_lag'], inplace=True)


In [None]:
# Order rows by time first and by isocode within each month.
sort_keys = ['year', 'month', 'isocode']
df_final = df_final.sort_values(by=sort_keys)


In [None]:
# Render the sorted frame as the cell output.
df_final

In [None]:
# Names of the per-type event count columns: overall, government ('_gov') and
# opposition ('_opp'), for event types 1 through 20.
event_types = range(1, 21)
event_cols = [f'count_events_{i}' for i in event_types]
event_cols_gov = [f'count_events_{i}_gov' for i in event_types]
event_cols_opp = [f'count_events_{i}_opp' for i in event_types]

# Each total column is the row-wise sum of one group of count columns.
count_groups = {
    'total_events': event_cols,
    'total_events_gov': event_cols_gov,
    'total_events_opp': event_cols_opp,
}
for total_col, count_cols in count_groups.items():
    df_final[total_col] = df_final[count_cols].sum(axis=1)

# Share of each event type within its group; the new column is named
# 'share_events_' + the original count column name.
for total_col, count_cols in count_groups.items():
    for col in count_cols:
        df_final[f'share_events_{col}'] = df_final[col] / df_final[total_col]

# Remove the raw counts; rows whose total is 0 produce NaN shares, which become 0 here.
df_final = df_final.drop(columns=event_cols + event_cols_gov + event_cols_opp).fillna(0)



In [None]:
# Render the frame with the share columns as the cell output.
df_final

In [None]:
# Columns removed before modelling: the event totals, the death breakdown columns, and
# the 'date' / 'country' columns.
total_cols = ['total_events', 'total_events_gov', 'total_events_opp']
death_breakdown_cols = ['nonstate_deaths', 'onesided_deaths', 'civilian_deaths', 'state_deaths']
cols_to_drop = total_cols + death_breakdown_cols + ['date', 'country']

df_final = df_final.drop(columns=cols_to_drop)


In [None]:
# Renumber the rows from 0; the previous index is kept as a regular column.
df_final = df_final.reset_index()

In [None]:
# Render the re-indexed frame as the cell output.
df_final

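No explicit target shift is applied at this stage: the sequence-creation step used for the model takes each label from a later row than its input window, so the shift happens there.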

In [None]:
# Binary target: 1 where 'deaths' is greater than 100, otherwise 0.
df_final['conflict'] = (df_final['deaths'] > 100).astype('int64')

# Discard rows that still contain missing values.
df_final = df_final.dropna()

# 80/20 split without shuffling, i.e. the existing row order is preserved.
train, test = train_test_split(df_final, test_size=0.2, shuffle=False)

# Set the target aside and renumber the feature rows for the column-wise concat below.
y_train, y_test = train['conflict'], test['conflict']
train = train.drop(columns=['conflict']).reset_index(drop=True)
test = test.drop(columns=['conflict']).reset_index(drop=True)

# One-hot encode 'isocode'. The encoder is fitted on the training rows only; isocodes
# that appear only in the test rows encode as all zeros (handle_unknown='ignore').
enc = OneHotEncoder(handle_unknown='ignore')
train_encoded = enc.fit_transform(train[['isocode']].to_numpy()).toarray()
test_encoded = enc.transform(test[['isocode']].to_numpy()).toarray()

# Name the encoded columns after the categories learned by the encoder.
categories = enc.categories_[0]
onehot_columns = [f"isocode_{c}" for c in categories]
dfOneHot_train = pd.DataFrame(train_encoded, columns=onehot_columns)
dfOneHot_test = pd.DataFrame(test_encoded, columns=onehot_columns)

# Append the one-hot columns to the right of the original features.
train = pd.concat([train, dfOneHot_train], axis=1)
test = pd.concat([test, dfOneHot_test], axis=1)

In [None]:
# Columns to scale: everything except columns whose name contains 'isocode' (the raw
# column and its one-hot expansion) and 'year', 'month', 'conflict', 'deaths'.
not_scaled = {'year', 'month', 'conflict', 'deaths'}
scale_cols = [col for col in train.columns if 'isocode' not in col and col not in not_scaled]

# Learn the min/max of each column from the training rows only, then apply the same
# transformation to both splits.
scaler = MinMaxScaler().fit(train[scale_cols])
train[scale_cols] = scaler.transform(train[scale_cols])
test[scale_cols] = scaler.transform(test[scale_cols])

In [None]:
# Render the scaled test split as the cell output.
test

PCA might be worth adding, but the implementation below does not work correctly yet, so it is left commented out.

In [None]:
# # # Apply PCA on the features
# pca = PCA(n_components=0.99)
# train_pca = pca.fit_transform(train)
# test_pca = pca.transform(test)

# # Explained variance ratios
# explained_variances = pca.explained_variance_ratio_

# # Selected components
# component_names = ["PC" + str(i) for i in range(1, len(explained_variances)+1)]

# # DataFrames with the transformed data and original column names
# train = pd.DataFrame(train_pca, columns=component_names)
# train['conflict'] = y_train  # append y_train back to the dataframe

# test = pd.DataFrame(test_pca, columns=component_names)
# test['conflict'] = y_test  # append y_test back to the dataframe


In [None]:
# Put the target back next to the features and remove the 'index' helper column.
# Re-assigning instead of dropping in place keeps `train` / `test` consistent if the
# cell raises part-way through. No intermediate `train.values` arrays are built here:
# the model inputs are created from these frames by the sequence step.
train = train.assign(conflict=y_train.values).drop(columns=['index'])
test = test.assign(conflict=y_test.values).drop(columns=['index'])

# Number of distinct isocodes per split; `n_countries` is used as the size of one
# monthly batch when iterating over the test data.
n_countries = test['isocode'].nunique()
n_countries_train = train['isocode'].nunique()

# Order rows by time, then isocode, and drop the raw string column (its one-hot
# expansion stays in the frame).
train = train.sort_values(['year', 'month', 'isocode']).drop(columns=['isocode'])
test = test.sort_values(['year', 'month', 'isocode']).drop(columns=['isocode'])


In [None]:
# Number of rows in the training split whose 'conflict' flag is 1.
train['conflict'].sum()

In [None]:
# Number of rows in the test split whose 'conflict' flag is 1.
test['conflict'].sum()

In [None]:
def create_sequences(input_data, tw, target_column='conflict'):
    """Slice a frame into (window, label) pairs.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame containing the feature columns and `target_column`.
    tw : int
        Number of consecutive rows in each input window.
    target_column : str
        Column the labels are read from; it is removed from the window features.

    Returns
    -------
    list of (numpy.ndarray, scalar)
        For every start row ``s`` in ``range(len(input_data) - tw - 1)``: the feature
        values of rows ``s .. s + tw - 1`` and the target value of row ``s + tw + 1``.
    """
    assert target_column in input_data.columns, "The target column must be in the dataframe."
    n_rows = len(input_data)
    # NOTE(review): the label row is s + tw + 1, so the row directly after the window
    # (s + tw) is skipped -- confirm that this offset is the intended target shift.
    return [
        (
            input_data.iloc[start:start + tw].drop(columns=[target_column]).values,
            input_data.iloc[start + tw + 1][target_column],
        )
        for start in range(n_rows - tw - 1)
    ]

sequence_length = 1

# Build the (window, label) pairs for both splits; the labels come out of the same call.
train_sequences = create_sequences(train, sequence_length)
test_sequences = create_sequences(test, sequence_length)

# Stack the windows into (samples, timesteps, features) arrays for the LSTM; the feature
# count is the number of frame columns minus the target column.
X_train = np.array([window for window, _ in train_sequences])
X_train = X_train.reshape((-1, sequence_length, train.shape[1] - 1))
y_train = np.array([label for _, label in train_sequences]).flatten()

X_test = np.array([window for window, _ in test_sequences])
X_test = X_test.reshape((-1, sequence_length, test.shape[1] - 1))
y_test = np.array([label for _, label in test_sequences]).flatten()

# One LSTM layer with 32 units feeding a single sigmoid output for the binary target.
model = Sequential([
    LSTM(32, dropout=0.2, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
#progbar_logger = ProgbarLogger()

The starting point of the training data stays constant (all_data[:end_of_train]), while the ending point moves forward with each iteration (all_data[:end_of_train+i]). Thus, with each iteration, the model is trained on all the previous data plus some new data, which is the definition of an expanding window approach.

In [None]:
# Stack the train and test windows (NumPy arrays) and their labels end to end, so the
# expanding window can be taken as a simple prefix slice.
all_data = np.concatenate((X_train, X_test), axis=0)
all_targets = np.concatenate((y_train, y_test), axis=0)

# Buffers for the predicted probabilities. The float dtype is set explicitly because
# zeros_like() otherwise inherits the dtype of the labels, and an integer buffer would
# truncate every stored probability to 0.
train_preds = np.zeros_like(y_train, dtype=float)
test_preds = np.zeros_like(y_test, dtype=float)

def get_class_weights(y):
    """Map each distinct label in `y` to ``(1 / count) * len(y) / 2.0``.

    Labels that occur less often receive a larger weight. An empty `y` gives an empty
    dict.
    """
    n_samples = len(y)
    weights = {}
    for label, n_label in Counter(y).items():
        weights[label] = (1 / n_label) * n_samples / 2.0
    return weights

# Expanding-window evaluation: the training slice always starts at row 0 and grows by
# one batch of `n_countries` rows per iteration; after each fit the model predicts the
# batch that immediately follows the slice.
end_of_train = len(X_train)

# The range() below yields ceil(len(X_test) / n_countries) batches (the last one may be
# shorter), so the progress message uses that ceiling as its total.
n_windows = -(-len(X_test) // n_countries)

for window_idx, i in enumerate(range(0, len(X_test), n_countries), start=1):
    print(f"Training on window {window_idx}/{n_windows}")

    # Everything up to (but excluding) the batch that is about to be predicted.
    current_data = all_data[:end_of_train+i]
    current_targets = all_targets[:end_of_train+i]

    # Leave out samples whose label is NaN.
    nan_mask = np.isnan(current_targets)
    current_data = current_data[~nan_mask]
    current_targets = current_targets[~nan_mask]

    # Re-derive the class weights from the labels seen so far.
    class_weights = get_class_weights(current_targets)

    # The same `model` object is fit again on every iteration; it is not re-created
    # inside the loop.
    history = model.fit(current_data, current_targets, class_weight=class_weights,
              epochs=5, batch_size=64, verbose=0, shuffle=False)

    # Predict the next, not yet seen batch and store it at the matching positions.
    next_predictions = model.predict(all_data[end_of_train+i:end_of_train+i+n_countries])
    test_preds[i:i+n_countries] = next_predictions.flatten()

# Now the expanding window training and prediction is complete
print(f"Final predictions: {test_preds}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc

# Flat views of the test labels and of the predicted probabilities.
y_true = y_test.flatten()
y_pred = test_preds.flatten()

# Threshold-free metrics computed from the predicted probabilities.
roc_auc = roc_auc_score(y_true, y_pred)
fpr, tpr, _ = roc_curve(y_true, y_pred)
precision, recall, thresholds = precision_recall_curve(y_true, y_pred)

# ROC curve with the diagonal of a random classifier for reference.
fig_roc, ax_roc = plt.subplots(figsize=(10, 6))
ax_roc.plot(fpr, tpr, label='LSTM Model (AUC = {:.2f})'.format(roc_auc))
ax_roc.plot([0, 1], [0, 1], 'k--')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC AUC Curve')
ax_roc.legend(loc='lower right')
ax_roc.grid(True)
plt.show()

# Precision-recall curve.
fig_pr, ax_pr = plt.subplots(figsize=(10, 6))
ax_pr.plot(recall, precision, label='LSTM Model')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.legend(loc='lower left')
ax_pr.grid(True)
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Turn probabilities into hard labels: 1 where the probability is above 0.5, else 0.
y_pred_classes = np.where(test_preds > 0.5, 1, 0)

conf_matrix = confusion_matrix(y_test, y_pred_classes)
print("Confusion Matrix:\n", conf_matrix)

report = classification_report(y_test, y_pred_classes)
print("\nClassification Report:\n", report)

# ROC AUC with the label 1 treated as the positive class.
roc_auc_minority = roc_auc_score(y_test == 1, test_preds)
print("\nROC AUC for minority class:", roc_auc_minority)


In [None]:
# def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
#     n_vars = 1 if type(data) is list else data.shape[1]
#     df = pd.DataFrame(data)
#     cols, names = list(), list()
#     # Input sequence (t-n, ... t-1)
#     for i in range(n_in, 0, -1):
#         cols.append(df.shift(i))
#         names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
#     # Current timestep (t=0)
#     cols.append(df)
#     names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
#     # Put it all together
#     agg = pd.concat(cols, axis=1)
#     agg.columns = names
#     # Drop rows with missing values
#     if dropnan:
#         agg.dropna(inplace=True)
#     return agg

# # Preprocessing the data
# train_values = train.drop(columns=['conflict']).values
# test_values = test.drop(columns=['conflict']).values

# # Define LSTM model
# months = 1  
# n_features = train_values.shape[1]

# train_transformed = series_to_supervised(train_values)
# test_transformed = series_to_supervised(test_values)

# model = Sequential()
# model.add(LSTM(60, return_sequences=True, input_shape=(months, n_features), dropout=0.2))
# model.add(LSTM(30, dropout=0.2))
# model.add(Dense(1, activation='sigmoid'))
# model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])

# # Define EarlyStopping callback
# early_stop = EarlyStopping(monitor='loss', patience=3)


In [None]:
# predictions = []

# previous_year, previous_month = None, None  # Initialize to None

# for i in range(len(test)):

#     train = series_to_supervised(train)
    
#     # split into input and output variables
#     train_X, train_y = train.iloc[0, :-1], train.iloc[0, -1]
#     test_X, test_y = test.iloc[0, :-1], test.iloc[0, -1]

#     # Check if year or month has changed
#     current_year, current_month = test.iloc[0]['year'], test.iloc[0]['month']
#     if current_year != previous_year or current_month != previous_month:
#         print(f'Year: {current_year}  Month: {current_month}')
#         previous_year, previous_month = current_year, current_month

#     # reshape input to be 3D [samples, timesteps, features]
#     train_X = train_X.values.reshape((1, months, n_features))
#     test_X = test_X.values.reshape((1, months, n_features))  # Corrected line

#     # Convert targets to float32
#     train_y = np.array([train_y]).astype('float32')
#     test_y = np.array([test_y]).astype('float32')

#     # fit model
#     history = model.fit(train_X, train_y, epochs=5, batch_size=32, verbose=0, callbacks=[early_stop])

#     # make a one-step prediction
#     yhat = model.predict(test_X)

#     # store predictions
#     predictions.append(yhat[0,0])

#     # append the first row of test set to the end of the training set
#     train = train.append(test.iloc[0], ignore_index=True)

#     # then remove the first row of test set
#     test = test.iloc[1:]
