In [22]:
import pandas as pd
import os
import time

# work with directories
from pathlib import Path 

# datetime
#import datetime as dt
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

# work with data
import numpy as np

# visualize data
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# Preprocessing, models & metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score, precision_recall_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate

# so it looks pretty
import warnings
warnings.filterwarnings('ignore')

In [10]:
# The data folder lives next to the notebook's working directory, i.e. under its parent.
path = os.getcwd()
parent_path = os.path.abspath(os.path.join(path, os.pardir))

# Build the file path from its components instead of concatenating with a hard-coded '/'.
df_og = pd.read_csv(os.path.join(parent_path, 'data', 'preprocessed_df.csv'))

# Quick sanity checks: dimensions, total number of missing values,
# and the smallest / largest month_year present in the file.
print(df_og.shape)
print(df_og.isna().sum().sum())
print(df_og.month_year.min())  # one print per statement (was a throwaway tuple of two prints)
print(df_og.month_year.max())

#df_og.rename(columns={'armedconf_intp_pop': 'armedconf', 'deaths_all_intp_pop_pc': 'deaths_all_pc'}, inplace=True)
# Reassign the renamed frame rather than mutating df_og in place.
df_og = df_og.rename(columns={'armedconf_intp_pop_since': 'armedconf_since'})

df_og.columns

(54915, 69)
0
2000-01-01
2023-03-01


Index(['MonthYear', 'isocode', 'month', 'year', 'deaths', 'month_year',
       'Country Name', 'deaths_all_pc', 'armedconf', 'past6', 'past12',
       'past60', 'past120', 'armedconf_since', 'deaths_stock',
       'share_events_1', 'share_events_2', 'share_events_3', 'share_events_4',
       'share_events_5', 'share_events_6', 'share_events_7', 'share_events_8',
       'share_events_9', 'share_events_10', 'share_events_11',
       'share_events_12', 'share_events_13', 'share_events_14',
       'share_events_15', 'share_events_16', 'share_events_17',
       'share_events_18', 'share_events_19', 'share_events_20',
       'share_events_gov', 'share_events_opp', 'norm_total_events',
       'event_share_1_stock', 'event_share_2_stock', 'event_share_3_stock',
       'event_share_4_stock', 'event_share_5_stock', 'event_share_6_stock',
       'event_share_7_stock', 'event_share_8_stock', 'event_share_9_stock',
       'event_share_10_stock', 'event_share_11_stock', 'event_share_12_stock',
     

In [38]:
# Working copy of the raw frame with month_year parsed into pandas datetimes;
# df_og itself is left untouched.
df = df_og.assign(month_year=pd.to_datetime(df_og['month_year']))

In [39]:
# Column names used throughout the notebook.
unit_of_analyis = 'isocode'      # grouping column used when shifting targets
target_clsf = 'armedconf'        # column the classification targets are built from
target_regr = 'deaths_all_pc'    # regression target column

# Date boundaries. Rows dated before start_month_year form the training set.
# Alternative end dates tried previously: (2018, 12, 1) and (2021, 12, 1).
start_month_year = dt(year=2018, month=1, day=1)
end_month_year = dt(year=2023, month=3, day=1)

In [50]:
def make_target(df: pd.DataFrame, shifter: int, target: str, group_col=None):
    """Add a forward-looking target column: the max of `target` over the next `shifter` periods.

    For every row, the values of `target` from the following 1..`shifter` rows of
    the same group are collected and their maximum is stored in a new column named
    ``target_clsf_f{shifter}``. If any of those future values is missing (for
    example near the end of a group), the result is NaN (``skipna=False``).

    "Next rows" refers to row order within each group, so the frame is expected
    to be sorted chronologically within each group -- TODO confirm at the caller.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is NOT modified; a new frame is returned.
    shifter : int
        Number of periods to look ahead (must be >= 1).
    target : str
        Name of the column to look ahead on.
    group_col : str, optional
        Column identifying the groups within which the shift is applied.
        Defaults to the notebook-level ``unit_of_analyis``.

    Returns
    -------
    pd.DataFrame
        Copy of `df` with the extra column ``target_clsf_f{shifter}``.

    Raises
    ------
    ValueError
        If `shifter` is smaller than 1.
    """
    if shifter < 1:
        raise ValueError(f'shifter must be >= 1, got {shifter}')
    if group_col is None:
        # Fall back to the notebook-level setting, as the original implementation did.
        group_col = unit_of_analyis

    horizons = list(range(1, shifter + 1))
    by_unit = df.groupby(group_col)[str(target)]

    # Keep the leads in their own frame rather than as temporary columns on `df`:
    # this leaves the caller's frame untouched and cannot clobber an existing
    # column whose name happens to equal f'{target}{i}'.
    leads = pd.concat([by_unit.shift(-step) for step in horizons], axis=1, keys=horizons)

    # Maximum over the look-ahead window; NaN as soon as one lead is missing.
    forward_max = leads.max(axis=1, skipna=False)

    return df.assign(**{'target_clsf_f{}'.format(shifter): forward_max})

In [60]:
# Grid searching on the train data alone does not need one dataframe per horizon.
# Separate frames (df_clsf_f1, df_clsf_f3, df_clsf_f6) were considered in order to
# compare optimal parameters on the test / whole data; with a single shared frame
# we would simply not use the final 6 months of data, which is fine.
fwds = [1, 3, 6]

# Start from a copy of df and add one forward-looking target column per horizon.
df_clsf = df.copy()
for fwd in fwds:
    df_clsf = make_target(df_clsf, shifter=fwd, target=target_clsf)

# Drop every row that contains a missing value in any column.
df_clsf = df_clsf.dropna(axis=0)

df_clsf.shape

# ((54718, 70), (54324, 70), (53733, 70))

(51369, 72)

In [61]:
# Identifier / time columns: dropped while fitting, but added back to identify predictions.
time_var = ['month_year']
id_var = ['isocode']

# Lists of variables to keep for each model.
# NB: the original note on the "past" variables was cut off mid-sentence
# ("past currently only based on total number of deaths per ...").
historical_vars = [
    'deaths_all_pc',
    'armedconf',
    'armedconf_since',
    'share_state_deaths',
    'share_nonstate_deaths',
    'share_onesided_deaths',
    'share_civilian_deaths',
    'deaths_stock',
    'past6',
    'past12',
    'past60',
    'past120',
]

# Event-share columns 1..20 and their matching stock columns.
eventshare_names = [f'share_events_{i}' for i in range(1, 21)]
eventshare_stocks = [f'event_share_{i}_stock' for i in range(1, 21)]

text_gdelt_shares = (
    eventshare_names
    + eventshare_stocks
    + ['share_events_gov', 'share_events_opp', 'norm_total_events']
)
text_gdelt_stocks = list(eventshare_stocks)  # same names as eventshare_stocks, separate list
text_gdelt_admin1 = ['num_regions', 'Adm1_Max', 'Adm1_Mean', 'Adm1_Median']

# The stock columns are already part of text_gdelt_shares, so text_gdelt_stocks is not added again.
text_gdelt_vars = text_gdelt_shares + text_gdelt_admin1

other_vars = ['month_cos', 'month_sin']
only_all_vars = ['cluster']  # previously ['IncomeGroup', 'Region']

# Columns of df that do not belong to any of the variable groups.
used_cols = id_var + historical_vars + text_gdelt_vars + time_var + only_all_vars + other_vars
cols_not_used = set(df.columns).difference(used_cols)
cols_not_used

{'Country Name', 'MonthYear', 'deaths', 'month', 'target_clsf_f6', 'year'}

In [62]:
# Training rows: every observation dated before start_month_year.
# (id_var and time_var are deliberately left out of the feature matrices.)
train_rows = df_clsf['month_year'] < start_month_year

# One feature matrix per model: historical variables, text variables, and all combined.
X_train_hs = df_clsf.loc[train_rows, other_vars + historical_vars]
X_train_tx = df_clsf.loc[train_rows, other_vars + text_gdelt_vars]
X_train_all = df_clsf.loc[
    train_rows,
    other_vars + text_gdelt_vars + historical_vars + only_all_vars,
]

print(X_train_hs.shape, X_train_tx.shape, X_train_all.shape)

(42504, 14) (42504, 49) (42504, 62)


In [63]:
# Targets for the training rows (observations dated before start_month_year),
# one single-column frame per forecast horizon.
train_rows = df_clsf['month_year'] < start_month_year

y_train_f1 = df_clsf.loc[train_rows, ['target_clsf_f1']]
y_train_f3 = df_clsf.loc[train_rows, ['target_clsf_f3']]
y_train_f6 = df_clsf.loc[train_rows, ['target_clsf_f6']]

print(y_train_f1.shape, y_train_f3.shape, y_train_f6.shape)

(42504, 1) (42504, 1) (42504, 1)


In [64]:
from sklearn.metrics import make_scorer

# Random-forest hyper-parameter grid (3 x 3 x 3 x 3 = 81 combinations).
param_grid = {
    # Number of trees in the forest
    'n_estimators': [100, 200, 300],
    # Maximum depth of each tree
    'max_depth': [None, 5, 10],
    # Minimum number of samples required to split an internal node
    'min_samples_split': [2, 5, 10],
    # Minimum number of samples required to be at a leaf node
    'min_samples_leaf': [1, 2, 4],
}

# Metrics evaluated for every parameter combination. The plain metric functions
# are wrapped with make_scorer so they can be passed as scorers; each key becomes
# the suffix of the corresponding cv_results_ entries (e.g. 'mean_test_accuracy').
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in [
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
    ]
}

In [66]:
# Grid search over param_grid for the 'target_clsf_f1' target on the text features.
# 81 parameter combinations x 5 folds = 405 fits (plus the final refit), so:
#   * n_jobs=-1 spreads the fits over all available cores (the serial run was
#     interrupted by hand before finishing);
#   * random_state pins the forests so repeated runs give the same scores.
grid_search_clsf_f1 = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring=scoring,
    refit='accuracy',          # the model is refit with the best mean test accuracy
    return_train_score=True,
    cv=5,
    n_jobs=-1,
)

# scikit-learn expects a 1-D target; y_train_f1 is a single-column frame, so flatten it.
grid_search_clsf_f1.fit(X_train_tx, y_train_f1.to_numpy().ravel())

# Best configuration according to the refit metric (accuracy).
best_params = grid_search_clsf_f1.best_params_
best_score = grid_search_clsf_f1.best_score_

# Access results for each scoring metric (one value per parameter combination).
results_clsf_f1 = grid_search_clsf_f1.cv_results_
mean_test_accuracy = results_clsf_f1['mean_test_accuracy']
mean_test_precision = results_clsf_f1['mean_test_precision']
mean_test_recall = results_clsf_f1['mean_test_recall']


KeyboardInterrupt: 