In [144]:
# loading libraries 

import matplotlib as plt
import pandas as pd
import os
import pycountry
import string
from unidecode import unidecode
import category_encoders as ce

path = os.getcwd()

# loading the data

full = pd.read_csv(path + '/data/merged.csv')

# Dropping columns before 1989
full.drop(full.loc[full['year']<1989].index, inplace=True)

# Filling missing UCDP data with 0
full[["deaths","state_deaths","nonstate_deaths","onesided_deaths","civilian_deaths","avg_sources","conflict_counts",
     "conflict_freq","dyad_counts","dyad_freq"]] = full[["deaths","state_deaths","nonstate_deaths","onesided_deaths",
                                                         "civilian_deaths","avg_sources","conflict_counts",
                                                         "conflict_freq","dyad_counts","dyad_freq"]].fillna(0)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [145]:
full.head(10)

Unnamed: 0,MonthYear,isocode,month,year,count_events_1,count_events_2,count_events_3,count_events_4,count_events_5,count_events_6,...,deaths,state_deaths,nonstate_deaths,onesided_deaths,civilian_deaths,avg_sources,conflict_counts,conflict_freq,dyad_counts,dyad_freq
23666,198901,AFG,1,1989,462.0,256.0,338.0,2024.0,264.0,88.0,...,693.0,693.0,0.0,0.0,0.0,0.0,7.0,333.0,8.0,727.0
23667,198901,AGO,1,1989,110.0,72.0,152.0,430.0,242.0,24.0,...,249.0,249.0,0.0,0.0,0.0,0.0,12.0,327.0,12.0,714.0
23668,198901,ALB,1,1989,64.0,44.0,116.0,192.0,18.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23669,198901,ARE,1,1989,10.0,10.0,16.0,24.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23670,198901,ARG,1,1989,146.0,36.0,10.0,200.0,58.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23671,198901,ARM,1,1989,202.0,62.0,54.0,214.0,84.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23672,198901,ATA,1,1989,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23673,198901,ATG,1,1989,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23674,198901,AUS,1,1989,142.0,26.0,116.0,280.0,72.0,38.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23675,198901,AUT,1,1989,126.0,54.0,114.0,314.0,106.0,20.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Create a copy of the original DataFrame
full_copy = full.copy()

# Define the window size for the trend calculation
window_size = 3

# Calculate the lagged deaths column
full_copy['lag_deaths'] = full_copy.groupby('isocode')['deaths'].shift()

# Create a dictionary to store the beta coefficients for each isocode
beta_coefficients_dict = {}

# Iterate over each unique isocode
for isocode in full_copy['isocode'].unique():
    # Filter the data for the current isocode
    mask = (full_copy['isocode'] == isocode)
    
    # Get the lagged deaths and deaths for the current isocode
    lag_deaths = full_copy.loc[mask, 'lag_deaths']
    deaths = full_copy.loc[mask, 'deaths']
        
    # Fill missing values with appropriate values (e.g., median, mean)
    lag_deaths = lag_deaths.fillna(lag_deaths.median())
    deaths = deaths.fillna(deaths.median())
    
    # Reshape the lagged deaths and deaths to 2D arrays
    X = lag_deaths.values.reshape(-1, 1)
    y = deaths.values.reshape(-1, 1)
    
    # Fit the linear regression model separately for each month
    beta_coefficients = []
    for i in range(window_size, len(X)):
        # Select the window of data for the regression
        X_window = X[i-window_size:i]
        y_window = y[i-window_size:i]
        
        # Fit the linear regression model
        model = LinearRegression()
        model.fit(X_window, y_window)
        
        # Get the beta coefficient for the current month
        beta_coefficient = model.coef_[0][0]
        
        # Append the beta coefficient to the list
        beta_coefficients.append(beta_coefficient)
    
    # Pad the beta coefficients list with NaNs to match the length of the isocode's data
    pad_length = len(full_copy.loc[mask]) - len(beta_coefficients)
    beta_coefficients.extend([np.nan] * pad_length)
    
    # Assign the beta coefficients to the dictionary for the current isocode
    beta_coefficients_dict[isocode] = beta_coefficients

# Flatten the beta coefficients dictionary into a list
beta_coefficients_flat = [beta for betas in beta_coefficients_dict.values() for beta in betas]

# Assign the beta coefficients to the 'this_month_beta' column
full_copy['this_month_beta'] = beta_coefficients_flat

# Create a new DataFrame to store the results
result = full_copy[['isocode', 'MonthYear', 'deaths', 'this_month_beta']].copy()

# Group the data by 'isocode' and calculate the 75th percentile of the previous 24 months' betas
result['prev_months_beta_percentile'] = result.groupby('isocode')['this_month_beta'].rolling(24, min_periods=1).apply(lambda x: np.percentile(x, 75), raw=True).reset_index(level=0, drop=True)

# Check if the current month's beta is in the 75th percentile of the previous 24 months' betas
result['spike_occurred'] = result['this_month_beta'] > result['prev_months_beta_percentile']

# Display the result
result = result[['isocode', 'MonthYear', 'deaths', 'this_month_beta', 'spike_occurred']].sort_values(['isocode', 'MonthYear']).reset_index(drop=True)
result.head(50)


In [86]:
define_escalation(full)

In [81]:
full[['isocode','MonthYear','deaths','lag_deaths','escalation','this_month_beta','beta_threshold']][(full['year']>2020) & (full['year']<2023)].sort_values(['isocode','MonthYear']).head(50)


Unnamed: 0,isocode,MonthYear,deaths,lag_deaths,escalation,this_month_beta,beta_threshold
109436,ABW,202101,0.0,0.0,0.0,0.0,
109673,ABW,202102,0.0,0.0,0.0,0.0,
109911,ABW,202103,0.0,0.0,0.0,0.0,
110148,ABW,202104,0.0,0.0,0.0,0.0,
110386,ABW,202105,0.0,0.0,0.0,0.0,
110624,ABW,202106,0.0,0.0,0.0,0.0,
110861,ABW,202107,0.0,0.0,0.0,0.0,
111098,ABW,202108,0.0,0.0,0.0,0.0,
111335,ABW,202109,0.0,0.0,0.0,0.0,
111572,ABW,202110,0.0,0.0,0.0,0.0,


In [None]:
# figure out if beta makes sense (what are we regressing?) and why the quantile is NaN

In [180]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Create a copy of the original DataFrame
full_copy = full.copy()

# Define the window size for the trend calculation
window_size = 3

# Calculate the lagged deaths column and trend values
full_copy['lag_deaths'] = full_copy.groupby('isocode')['deaths'].shift(1)
full_copy['this_month_trend'] = np.nan

# Create a dictionary to store the trend values for each isocode
trend_values_dict = {}

# Iterate over each unique isocode
for isocode in full_copy['isocode'].unique():
    # Filter the data for the current isocode
    mask = (full_copy['isocode'] == isocode)
    
    # Get the lagged deaths and deaths for the current isocode
    lag_deaths = full_copy.loc[mask, 'lag_deaths']
    deaths = full_copy.loc[mask, 'deaths']
            
    # Fill missing values with appropriate values
    lag_deaths = lag_deaths.fillna(0)
    deaths = deaths.fillna(0)
    
    # Reshape the lagged deaths and deaths to 2D arrays
    X = lag_deaths.values.reshape(-1, 1)
    y = deaths.values.reshape(-1, 1)
    
    # Fit the linear regression model separately for each month
    model = LinearRegression()

    for i in range(window_size, len(X)):
        # Select the window of data for the regression
        X_window = X[i-window_size:i]
        y_window = y[i-window_size:i]
        
        # Fit the linear regression model
        model = LinearRegression()
        model.fit(X_window, y_window)
        
        # Get the slope coefficient for the current month
        slope = model.coef_[0][0] # check that this is accessing the right coefficient (should be)

        # Assign the slope coefficient to the 'this_month_trend' column for the current month
        full_copy.loc[mask, 'this_month_trend'].values[i] = slope

        # Append the slope coefficient to the list
        #trend_values.append(slope) # can we put this into a column, instead of a list? and access with the window?
    
    # Pad the trend values list with NaNs to match the length of the isocode's data
    #pad_length = len(full_copy.loc[mask]) - len(trend_values)
    #trend_values.extend([np.nan] * pad_length) # what exactly is this doing? how do we know its in the right order?
    
    # Assign the trend values to the dictionary for the current isocode
    #trend_values_dict[isocode] = trend_values

# Flatten the trend values dictionary into a list
#trend_values_flat = [trend for trends in trend_values_dict.values() for trend in trends] # not sure why...?

# Assign the trend values to the 'this_month_trend' column
#full_copy['this_month_trend'] = trend_values_flat # doesn't this get the entire trend? and not just the window?

# Create a new DataFrame to store the results
result = full_copy[['isocode', 'MonthYear', 'deaths', 'lag_deaths', 'this_month_trend']].copy()

# Create a new column for the previous months' trend percentiles
result['prev_months_trend_percentile'] = result.groupby('isocode')['this_month_trend'].rolling(24, min_periods=1).apply(lambda x: np.percentile(x, 75), raw=True).reset_index(level=0, drop=True)

# Check if the current month's trend is in the 75th percentile of the previous 24 months' trends
result['spike_occurred'] = result['this_month_trend'] > result['prev_months_trend_percentile']

# Display the result
result = result[['isocode', 'MonthYear', 'deaths', 'lag_deaths', 'this_month_trend', 
                 'prev_months_trend_percentile', 'spike_occurred']].sort_values(['isocode', 'MonthYear']).reset_index(drop=True)
result.head(50)

Unnamed: 0,isocode,MonthYear,deaths,lag_deaths,this_month_trend,prev_months_trend_percentile,spike_occurred
0,ABW,198903,0.0,,,,False
1,ABW,198908,0.0,0.0,,,False
2,ABW,199004,0.0,0.0,,,False
3,ABW,199006,0.0,0.0,,,False
4,ABW,199007,0.0,0.0,,,False
5,ABW,199104,0.0,0.0,,,False
6,ABW,199107,0.0,0.0,,,False
7,ABW,199212,0.0,0.0,,,False
8,ABW,199306,0.0,0.0,,,False
9,ABW,199307,0.0,0.0,,,False


In [181]:
result = result[(result['MonthYear']>202001) & (result['MonthYear']<202301)].sort_values(['isocode', 'MonthYear'])
result.head(50)

Unnamed: 0,isocode,MonthYear,deaths,lag_deaths,this_month_trend,prev_months_trend_percentile,spike_occurred
283,ABW,202002,0.0,0.0,,,False
284,ABW,202003,0.0,0.0,,,False
285,ABW,202004,0.0,0.0,,,False
286,ABW,202005,0.0,0.0,,,False
287,ABW,202006,0.0,0.0,,,False
288,ABW,202007,0.0,0.0,,,False
289,ABW,202008,0.0,0.0,,,False
290,ABW,202009,0.0,0.0,,,False
291,ABW,202010,0.0,0.0,,,False
292,ABW,202011,0.0,0.0,,,False


In [161]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Example dataset
data = {
    'isocode': ['ABW', 'ABW', 'ABW', 'ABW', 'ABW', 'USA', 'USA', 'USA', 'USA', 'USA'],
    'MonthYear': [198901, 198902, 198903, 198904, 198905, 198901, 198902, 198903, 198904, 198905],
    'deaths': [0.0, 0.0, 10.0, 10.0, 10.0, 0.0, 0.0, 0.0, 0.0, 0.0]
}

# Create DataFrame
full_copy = pd.DataFrame(data)

# Define the window size for the trend calculation
window_size = 3

# Calculate the lagged deaths column
full_copy['lag_deaths'] = full_copy.groupby('isocode')['deaths'].shift(1)

# Create a dictionary to store the trend values for each isocode
trend_values_dict = {}

# Iterate over each unique isocode
for isocode in full_copy['isocode'].unique():
    # Filter the data for the current isocode
    mask = (full_copy['isocode'] == isocode)

    # Get the lagged deaths and deaths for the current isocode
    lag_deaths = full_copy.loc[mask, 'lag_deaths']
    deaths = full_copy.loc[mask, 'deaths']

    # Check for extremely large values and replace them with appropriate values
    lag_deaths = lag_deaths.replace([np.inf, -np.inf], np.nan)
    deaths = deaths.replace([np.inf, -np.inf], np.nan)

    # Fill missing values with appropriate values (e.g., median, mean)
    lag_deaths = lag_deaths.fillna(0)
    deaths = deaths.fillna(0)

    # Reshape the lagged deaths and deaths to 2D arrays
    X = lag_deaths.values.reshape(-1, 1)
    y = deaths.values.reshape(-1, 1)

    # Fit the linear regression model separately for each month
    trend_values = []
    for i in range(window_size, len(X)):
        # Select the window of data for the regression
        X_window = X[i-window_size:i]
        y_window = y[i-window_size:i]

        # Check if all values in the window are the same
        if np.all(y_window == y_window[0]):
            trend_values.append(0)  # Assign a trend value of 0 for no changes
        else:
            # Fit the linear regression model
            model = LinearRegression()
            model.fit(X_window, y_window)

            # Get the slope coefficient for the current month
            slope = model.coef_[0][0]

            # Append the slope coefficient to the list
            trend_values.append(slope)

    # Pad the trend values list with NaNs to match the length of the isocode's data
    pad_length = len(full_copy.loc[mask]) - len(trend_values)
    trend_values.extend([np.nan] * pad_length)

    # Assign the trend values to the dictionary for the current isocode
    trend_values_dict[isocode] = trend_values

# Flatten the trend values dictionary into a list
trend_values_flat = [trend for trends in trend_values_dict.values() for trend in trends]

# Assign the trend values to the 'this_month_trend' column
full_copy['this_month_trend'] = trend_values_flat

# Create a new DataFrame to store the results
result = full_copy[['isocode', 'MonthYear', 'deaths', 'lag_deaths', 'this_month_trend']].copy()

# Group the data by 'isocode' and calculate the 75th percentile of the previous 24 months' trends
result['prev_months_trend_percentile'] = result.groupby('isocode')['this_month_trend'].rolling(24, min_periods=1).apply(
    lambda x: np.percentile(pd.Series(x).dropna(), 75) if len(pd.Series(x).dropna()) > 0 else np.nan, raw=True
).reset_index(level=0, drop=True)

# Check if the current month's trend is in the 75th percentile of the previous 24 months' trends
result['spike_occurred'] = result['this_month_trend'] > result['prev_months_trend_percentile']

# Display the result
result = result[['isocode', 'MonthYear', 'deaths', 'lag_deaths', 'this_month_trend',
                 'prev_months_trend_percentile', 'spike_occurred']].sort_values(['isocode', 'MonthYear']).reset_index(drop=True)
print(result)

  isocode  MonthYear  deaths  lag_deaths  this_month_trend  \
0     ABW     198901     0.0         NaN               0.0   
1     ABW     198902     0.0         0.0               0.5   
2     ABW     198903    10.0         0.0               NaN   
3     ABW     198904    10.0        10.0               NaN   
4     ABW     198905    10.0        10.0               NaN   
5     USA     198901     0.0         NaN               0.0   
6     USA     198902     0.0         0.0               0.0   
7     USA     198903     0.0         0.0               NaN   
8     USA     198904     0.0         0.0               NaN   
9     USA     198905     0.0         0.0               NaN   

   prev_months_trend_percentile  spike_occurred  
0                         0.000           False  
1                         0.375            True  
2                         0.375           False  
3                         0.375           False  
4                         0.375           False  
5                

In [166]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Create a DataFrame with the given data
data = {
    'isocode': ['ABW', 'ABW', 'ABW'],
    'MonthYear': [198901, 198902, 198903],
    'deaths': [0.0, 0.0, 0.0],
    'lag_deaths': [None, 0.0, 0.0]
}
df = pd.DataFrame(data)

# Fill missing values with appropriate values (e.g., median, mean)
df.lag_deaths = df.lag_deaths.fillna(0)
df.deaths = df.deaths.fillna(0)

# Convert 'lag_deaths' column to numeric
df['lag_deaths'] = pd.to_numeric(df['lag_deaths'])

# Reshape the data for regression
X = df['lag_deaths'].values.reshape(-1, 1)
y = df['deaths'].values

# Create and fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Get the coefficient and intercept of the model
coefficient = model.coef_[0]
intercept = model.intercept_

# Print the coefficient and intercept
print("Coefficient:", coefficient)
print("Intercept:", intercept)


Coefficient: 0.0
Intercept: 0.0
