In [1]:
# loading libraries 

import matplotlib as plt
import pandas as pd
import os
import pycountry
import string
from unidecode import unidecode
import category_encoders as ce

path = os.getcwd()

# loading the data

full = pd.read_csv(path + '/data/merged.csv')

# Dropping columns before 1989
full.drop(full.loc[full['year']<1989].index, inplace=True)

# Filling missing UCDP data with 0
full[["deaths","state_deaths","nonstate_deaths","onesided_deaths","civilian_deaths","avg_sources","conflict_counts",
     "conflict_freq","dyad_counts","dyad_freq"]] = full[["deaths","state_deaths","nonstate_deaths","onesided_deaths",
                                                         "civilian_deaths","avg_sources","conflict_counts",
                                                         "conflict_freq","dyad_counts","dyad_freq"]].fillna(0)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [24]:
full.head(10)

Unnamed: 0,MonthYear,isocode,month,year,count_events_1,count_events_2,count_events_3,count_events_4,count_events_5,count_events_6,...,deaths,state_deaths,nonstate_deaths,onesided_deaths,civilian_deaths,avg_sources,conflict_counts,conflict_freq,dyad_counts,dyad_freq
23666,198901,AFG,1,1989,462.0,256.0,338.0,2024.0,264.0,88.0,...,693.0,693.0,0.0,0.0,0.0,0.0,7.0,333.0,8.0,727.0
23667,198901,AGO,1,1989,110.0,72.0,152.0,430.0,242.0,24.0,...,249.0,249.0,0.0,0.0,0.0,0.0,12.0,327.0,12.0,714.0
23668,198901,ALB,1,1989,64.0,44.0,116.0,192.0,18.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23669,198901,ARE,1,1989,10.0,10.0,16.0,24.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23670,198901,ARG,1,1989,146.0,36.0,10.0,200.0,58.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23671,198901,ARM,1,1989,202.0,62.0,54.0,214.0,84.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23672,198901,ATA,1,1989,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23673,198901,ATG,1,1989,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23674,198901,AUS,1,1989,142.0,26.0,116.0,280.0,72.0,38.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23675,198901,AUT,1,1989,126.0,54.0,114.0,314.0,106.0,20.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Create a copy of the original DataFrame
full_copy = full.copy()
full_copy['deaths'] = full_copy['deaths'].astype(int)

# Define the window size for the trend calculation
window_size = 3

# Calculate the lagged deaths column
full_copy['lag_deaths'] = full_copy.groupby('isocode')['deaths'].shift()

# Fill missing values with appropriate values (e.g., 0) in 'full_copy' DataFrame
full_copy['lag_deaths'] = full_copy.groupby('isocode')['lag_deaths'].fillna(0).astype(int)
full_copy['deaths'] = full_copy['deaths'].fillna(0)

# Create a dictionary to store the beta coefficients for each isocode
beta_coefficients_dict = {}

# Create the 'this_month_beta' column with default NaN values
full_copy['this_month_beta'] = np.nan

# Iterate over each unique isocode
for isocode in full_copy['isocode'].unique():
    # Filter the data for the current isocode
    mask = (full_copy['isocode'] == isocode)

    # Get the lagged deaths and deaths for the current isocode
    lag_deaths = full_copy.loc[mask, 'lag_deaths']
    deaths = full_copy.loc[mask, 'deaths']

    # Reshape the lag_deaths and deaths to 2D arrays
    X = lag_deaths.values.reshape(-1, 1)
    y = deaths.values.reshape(-1, 1)

    # Fit the linear regression model separately for each month
    beta_coefficients = []
    for i in range(window_size, len(X)):
        # Select the window of data for the regression
        X_window = X[i-window_size:i]
        y_window = y[i-window_size:i]

        # Check if all values in the window are 0
        if np.all((X_window == 0) & (y_window == 0)):
            # Set this_month_beta to 0 for the corresponding rows
            full_copy.loc[mask, 'this_month_beta'].iloc[i] = 0
        else:
            # Fit the linear regression model
            model = LinearRegression()
            model.fit(X_window, y_window)

            # Get the beta coefficient for the current month
            beta_coefficient = model.coef_[0][0]

            # Append the beta coefficient to the list
            beta_coefficients.append(beta_coefficient)

    # Pad the beta coefficients list with NaNs to match the length of the isocode's data
    pad_length = len(full_copy.loc[mask]) - len(beta_coefficients)
    beta_coefficients.extend([np.nan] * pad_length)

    # Assign the beta coefficients to the dictionary for the current isocode
    beta_coefficients_dict[isocode] = beta_coefficients

# Flatten the beta coefficients dictionary into a list
beta_coefficients_flat = [beta for betas in beta_coefficients_dict.values() for beta in betas]

# Assign the beta coefficients to the 'this_month_beta' column
full_copy.loc[:, 'this_month_beta'] = beta_coefficients_flat

# Create a new DataFrame to store the results
result = full_copy[['isocode', 'MonthYear', 'deaths', 'lag_deaths', 'this_month_beta']].copy()

# Group the data by 'isocode' and calculate the 75th percentile of the previous 24 months' betas
result['prev_months_beta_percentile'] = result.groupby('isocode')['this_month_beta'].rolling(24, min_periods=1).apply(lambda x: np.percentile(x, 75), raw=True).reset_index(level=0, drop=True)

# Check if the current month's beta is in the 75th percentile of the previous 24 months' betas
result['spike_occurred'] = result['this_month_beta'] > result['prev_months_beta_percentile']

# Display the result
result = result[['isocode', 'MonthYear', 'deaths', 'lag_deaths', 'this_month_beta', 'prev_months_beta_percentile', 'spike_occurred']].sort_values(['isocode', 'MonthYear']).reset_index(drop=True)
result.head(50)


Unnamed: 0,isocode,MonthYear,deaths,lag_deaths,this_month_beta,prev_months_beta_percentile,spike_occurred
0,ABW,198903,0,0,0.077606,0.077606,False
1,ABW,198908,0,0,,,False
2,ABW,199004,0,0,,,False
3,ABW,199006,0,0,,,False
4,ABW,199007,0,0,,,False
5,ABW,199104,0,0,-0.5,,False
6,ABW,199107,0,0,,,False
7,ABW,199212,0,0,,,False
8,ABW,199306,0,0,,,False
9,ABW,199307,0,0,,,False


## The actual one

In [48]:
df = full.copy()
df['lag_deaths'] = df.groupby('isocode')['deaths'].shift(1)

df['delta_deaths'] = np.where((df['lag_deaths'] == 0) & (df['deaths'] == 0), 0,
                              np.where((df['lag_deaths'] == 0) & (df['deaths'] != 0), np.inf,
                                      np.where((df['lag_deaths']).isna() == True, 0,
                                               (df['deaths'] - df['lag_deaths']) / df['lag_deaths'])))

# Group the data by 'isocode' and calculate the 75th percentile of the previous 24 months' betas
df['threshold'] = df.groupby('isocode')['delta_deaths'].transform(lambda x: x.shift(1).rolling(window=24, min_periods=1).quantile(0.75))
df['threshold'] = df['threshold'].fillna(0)

# Check if the current month's beta is in the 75th percentile of the previous 24 months' betas
df['escalation'] = (df['deaths'] >= 0.05) & ((df['delta_deaths'] > df['threshold']) | (df['delta_deaths'] == np.inf))
df['escalation'] = df['escalation'].astype(int)

Unnamed: 0,MonthYear,isocode,month,year,count_events_1,count_events_2,count_events_3,count_events_4,count_events_5,count_events_6,...,civilian_deaths,avg_sources,conflict_counts,conflict_freq,dyad_counts,dyad_freq,lag_deaths,delta_deaths,escalation,threshold
23666,198901,AFG,1,1989,462.0,256.0,338.0,2024.0,264.0,88.0,...,0.0,0.0,7.0,333.0,8.0,727.0,,0.0,0,0.0
23863,198902,AFG,2,1989,916.0,516.0,630.0,2228.0,680.0,276.0,...,23.0,0.0,8.0,333.0,9.0,724.0,693.0,-0.875902,0,0.0
24059,198903,AFG,3,1989,546.0,274.0,206.0,1148.0,370.0,104.0,...,96.0,0.017857,4.0,333.0,5.0,724.0,86.0,19.0,1,-0.218975
24260,198904,AFG,4,1989,364.0,262.0,182.0,976.0,290.0,68.0,...,86.0,0.0,5.0,333.0,5.0,727.0,1720.0,-0.712791,0,9.5
24460,198905,AFG,5,1989,312.0,150.0,156.0,706.0,190.0,20.0,...,28.0,0.017857,7.0,333.0,8.0,727.0,494.0,-0.080972,0,4.75
24658,198906,AFG,6,1989,222.0,64.0,130.0,782.0,224.0,46.0,...,25.0,0.0,5.0,333.0,5.0,727.0,454.0,0.090308,1,0.0
24855,198907,AFG,7,1989,264.0,158.0,180.0,806.0,204.0,52.0,...,10.0,0.0,7.0,333.0,7.0,727.0,495.0,0.252525,1,0.067731
25057,198908,AFG,8,1989,204.0,120.0,122.0,784.0,124.0,30.0,...,1.0,0.0,6.0,333.0,6.0,5321.0,620.0,-0.33871,0,0.171417
25253,198909,AFG,9,1989,198.0,106.0,110.0,564.0,154.0,24.0,...,11.0,0.0,5.0,333.0,5.0,724.0,410.0,-0.826829,0,0.130863
25454,198910,AFG,10,1989,158.0,124.0,72.0,476.0,134.0,16.0,...,19.0,0.083333,3.0,333.0,3.0,724.0,71.0,2.366197,1,0.090308


In [51]:
true_counts = df['escalation'].sum()
total_counts = df['escalation'].count()
percentage_true = (true_counts / total_counts) * 100

summary_table = pd.DataFrame({'True Count': pd.Series(true_counts), 'Percentage True': pd.Series(percentage_true)})
print(summary_table)

   True Count  Percentage True
0        5522         5.960901


In [1]:
import numpy as np

def calculate_escalation(df):
    df['lag_deaths'] = df.groupby('isocode')['deaths'].shift(1)

    df['delta_deaths'] = np.where((df['lag_deaths'] == 0) & (df['deaths'] == 0), 0,
                                  np.where((df['lag_deaths'] == 0) & (df['deaths'] != 0), np.inf,
                                           np.where((df['lag_deaths']).isna() == True, 0,
                                                    (df['deaths'] - df['lag_deaths']) / df['lag_deaths'])))

    # Group the data by 'isocode' and calculate the 75th percentile of the previous 24 months' delta_deaths
    df['threshold'] = df.groupby('isocode')['delta_deaths'].transform(lambda x: x.shift(1).rolling(window=24, min_periods=1).quantile(0.75))
    df['threshold'] = df['threshold'].fillna(0)

    # Check if the current month's delta_deaths exceeds the threshold or is infinity
    df['escalation'] = (df['deaths'] >= 0.05) & ((df['delta_deaths'] > df['threshold']) | (df['delta_deaths'] == np.inf))
    df['escalation'] = df['escalation'].astype(int)

    return df

In [3]:
true_counts = df['escalation'].sum()
total_counts = df['escalation'].count()
percentage_true = (true_counts / total_counts) * 100

summary_table = pd.DataFrame({'True Count': pd.Series(true_counts), 'Percentage True': pd.Series(percentage_true)})
print(summary_table)

   True Count  Percentage True
0           3             50.0
