In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.style
matplotlib.style.use("seaborn")
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow_probability as tfp

import seaborn as sns
sns.color_palette("hls", 17)
import scipy.stats as st
import math

from pingouin import rcorr
import pingouin as pg
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV, RidgeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
#from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_validate, cross_val_score, learning_curve
from sklearn.metrics import label_ranking_average_precision_score, accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score, plot_roc_curve, roc_curve, auc, roc_auc_score,precision_recall_curve, hamming_loss, multilabel_confusion_matrix
from sklearn.preprocessing import MinMaxScaler

import pickle

from sklearn import feature_selection #import chi2

## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K
tfd = tfp.distributions

In [2]:
import sys
import os
sys.path.append('../')
from module.trend_analysis_functions import *
from module.topic_model_plus_class import Topic_Model_plus

In [3]:
test_data = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()),'data','ICS_data','ICS_predictive_sitreps_full_test.csv')).drop(["Unnamed: 0"], axis=1)
train_data = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()),'data','ICS_data','ICS_predictive_sitreps_full_train.csv')).drop(["Unnamed: 0"], axis=1)
val_data = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()),'data','ICS_data','ICS_predictive_sitreps_full_val.csv')).drop(["Unnamed: 0"], axis=1)

In [4]:
meta_predictors = ["TOTAL_PERSONNEL", "TOTAL_AERIAL", "PCT_CONTAINED_COMPLETED",
              "ACRES",  "WF_FSR", "INJURIES", "FATALITIES", "EST_IM_COST_TO_DATE", "STR_DAMAGED",
              "STR_DESTROYED", "NEW_ACRES", "EVACUATION_IN_PROGRESS", 
              "NUM_REPORTS", "DAYS_BURING", #'Combined_Text', 
              'Incident_region_AICC', 
              'Incident_region_CA', 'Incident_region_EACC','Incident_region_GBCC', 'Incident_region_HICC', 
              'Incident_region_NRCC','Incident_region_NWCC', 'Incident_region_RMCC', 'Incident_region_SACC',
              'Incident_region_SWCC', 'INC_MGMT_ORG_ABBREV_1', 'INC_MGMT_ORG_ABBREV_2','INC_MGMT_ORG_ABBREV_3', 
              'INC_MGMT_ORG_ABBREV_4','INC_MGMT_ORG_ABBREV_5', 'INC_MGMT_ORG_ABBREV_B','INC_MGMT_ORG_ABBREV_C', 
              'INC_MGMT_ORG_ABBREV_D','INC_MGMT_ORG_ABBREV_E', 'INC_MGMT_ORG_ABBREV_F']
targets = ["Traffic","Command_Transitions","Evacuations", "Inaccurate_Mapping", "Aerial_Grounding", 
           "Resource_Issues", "Injuries", "Cultural_Resources","Livestock", "Law_Violations", "Military_Base", 
           "Infrastructure", "Extreme_Weather", "Ecological", "Hazardous_Terrain", "Floods", "Dry_Weather"]

# Prepare the Data

In [5]:
def remove_quote_marks(word_list):
    word_list = word_list.strip("[]").split(", ")
    word_list = [w.replace("'","") for w in word_list]
    word_list = " ".join(word_list)
    return word_list

In [6]:
dfs = [train_data, val_data, test_data]
for df in dfs:
    cleaned_combined_text = []
    for text in df['Combined_Text']:
        cleaned_text = remove_quote_marks(text)
        cleaned_combined_text.append(cleaned_text)
    df['Combined_Text'] = cleaned_combined_text

In [7]:
Xtrain = train_data['Raw_Combined_Text']; ytrain = train_data[targets]
Xval = val_data['Raw_Combined_Text']; yval = val_data[targets]
Xtest = test_data['Raw_Combined_Text']; ytest = test_data[targets]

# Vectorize the data

In [8]:
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

INFO:absl:Using C:\Users\srandrad\AppData\Local\Temp\1\tfhub_modules to cache modules.


In [9]:
#fine tune
# model = tf.keras.Sequential([hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4',input_shape=[], dtype=tf.string, trainable=True, name="use"),
#                    tf.keras.layers.Dense(17, activation='sigmoid')])
# model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
# model.fit(Xtrain, ytrain)

In [10]:
# embed = model.get_layer('use')

In [11]:
Xtrain_vec = embed(Xtrain)
Xval_vec = embed(Xval)
Xtest_vec = embed(Xtest)

In [12]:
Xtest_vec.shape

TensorShape([4710, 512])

In [13]:
print("Training:", len(train_data['INCIDENT_ID'].unique()), len(train_data))
print("Test:", len(test_data['INCIDENT_ID'].unique()), len(test_data))
print("Val:", len(val_data['INCIDENT_ID'].unique()), len(val_data))

Training: 7177 39357
Test: 898 4710
Val: 898 4800


In [14]:
Xtest.shape

(4710,)

In [15]:
scaler = MinMaxScaler(clip=True)
Xtrain_vec = pd.DataFrame(scaler.fit_transform(Xtrain_vec))
Xval_vec = pd.DataFrame(scaler.transform(Xval_vec))
Xtest_vec = pd.DataFrame(scaler.transform(Xtest_vec))
predictors = meta_predictors + [str(c) for c in Xtrain_vec.columns]

In [16]:
train = pd.concat([train_data, Xtrain_vec], axis=1)
train.columns = train.columns.astype(str)
val = pd.concat([val_data, Xval_vec], axis=1)
val.columns = val.columns.astype(str)
test = pd.concat([test_data, Xtest_vec], axis=1)
test.columns = test.columns.astype(str)

In [17]:
train[predictors]

Unnamed: 0,TOTAL_PERSONNEL,TOTAL_AERIAL,PCT_CONTAINED_COMPLETED,ACRES,WF_FSR,INJURIES,FATALITIES,EST_IM_COST_TO_DATE,STR_DAMAGED,STR_DESTROYED,...,502,503,504,505,506,507,508,509,510,511
0,0.043860,0.052083,0.000017,0.000077,0.000117,0.000031,0.0,1.135074e-05,0.000000,0.000000,...,0.636268,0.846329,0.264865,0.454591,0.128039,0.556742,0.700797,0.697829,0.303885,0.878636
1,0.043860,0.052083,0.000012,0.000554,0.000838,0.000031,0.0,1.021566e-04,0.000000,0.000000,...,0.431682,0.370078,0.613983,0.707533,0.485717,0.347669,0.802114,0.411110,0.185656,0.879484
2,0.031465,0.041667,0.000006,0.000496,0.000750,0.000031,0.0,5.107832e-05,0.000000,0.000000,...,0.485911,0.523517,0.535792,0.721856,0.332552,0.429471,0.630374,0.560841,0.173279,0.913014
3,0.036677,0.045139,0.000021,0.000088,0.000017,0.000031,0.0,1.135074e-05,0.000000,0.000000,...,0.684792,0.288797,0.271450,0.320276,0.149942,0.336192,0.659898,0.644933,0.416248,0.871129
4,0.036677,0.045139,0.000012,0.000554,0.000000,0.000061,0.0,5.675369e-05,0.000000,0.000000,...,0.604857,0.197506,0.366873,0.363177,0.294586,0.370587,0.583706,0.484965,0.373709,0.878155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39352,0.000191,0.000000,0.000021,0.000516,0.000000,0.000031,0.0,1.135074e-05,0.000000,0.000000,...,0.793546,0.858439,0.268195,0.577951,0.433157,0.412017,0.497516,0.162952,0.510388,0.686927
39353,0.001812,0.000000,0.000021,0.000231,0.000000,0.000031,0.0,5.675369e-06,0.000000,0.000000,...,0.825975,0.680934,0.521232,0.539170,0.404797,0.938619,0.504618,0.564023,0.463166,0.791047
39354,0.001812,0.000000,0.000021,0.000231,0.000000,0.000031,0.0,7.945516e-06,0.000000,0.000000,...,0.778549,0.705868,0.587098,0.605629,0.147264,0.901612,0.204673,0.554008,0.722456,0.659162
39355,0.003623,0.000000,0.000008,0.000386,0.000121,0.000031,0.0,6.810443e-06,0.000000,0.000000,...,0.113637,0.628373,0.715513,0.910740,0.590067,0.694333,0.398491,0.298083,0.177326,0.528702


In [18]:
train[predictors+['INCIDENT_ID']+targets].to_csv(os.path.join(os.path.dirname(os.getcwd()),'data','ICS_data',"ICS_train_sitreps_preprocessed.csv"))
test[predictors+['INCIDENT_ID']+targets].to_csv(os.path.join(os.path.dirname(os.getcwd()),'data','ICS_data',"ICS_test_sitreps_preprocessed.csv"))
val[predictors+['INCIDENT_ID']+targets].to_csv(os.path.join(os.path.dirname(os.getcwd()),'data','ICS_data',"ICS_val_sitreps_preprocessed.csv"))

In [19]:
per_hazard_comparisons = [] #need method, input
comparisons = [] #need method, input

In [129]:
def train_test_model_method(Xtrain, ytrain, Xtest, ytest, input_type, multilabel_name, multilabel_func=None, classifiers=[], classifier_names=[], classifier_params=[{}]):
    test_hamming_loss = []; train_hamming_loss = []
    test_acc = []; train_acc = []
    test_f1 = []; train_f1 = []
    test_precision = []; train_precision = []
    test_recall = []; train_recall = []
    per_hazard = {'test precision':[], 'train precision':[],
                 'test recall': [], 'train recall': [],
                 'test f1': [], 'train f1':[]}
    trained_clfs = []
    i=0
    for clf in tqdm(classifiers):
        if multilabel_func is not None:
            classifier = multilabel_func(clf(**classifier_params[classifier_names[i]]))
        else: classifier = clf(**classifier_params[classifier_names[i]])
        classifier.fit(Xtrain, ytrain)
        train_preds = classifier.predict(Xtrain)
        predictions = classifier.predict(Xtest)

        test_acc.append(accuracy_score(ytest, predictions)); train_acc.append(accuracy_score(ytrain, train_preds))
        per_hazard['test f1'].append(f1_score(ytest ,predictions, average=None,zero_division=0)); test_f1.append(f1_score(ytest, predictions, average='weighted',zero_division=0))
        per_hazard['train f1'].append(f1_score(ytrain, train_preds, average=None,zero_division=0)); train_f1.append(f1_score(ytrain, train_preds, average='weighted',zero_division=0))
        per_hazard['test precision'].append(precision_score(ytest, predictions, average=None, zero_division=0)); test_precision.append(precision_score(ytest, predictions, average='weighted', zero_division=0))
        per_hazard['train precision'].append(precision_score(ytrain, train_preds, average=None, zero_division=0)); train_precision.append(precision_score(ytrain, train_preds, average='weighted', zero_division=0))
        per_hazard['test recall'].append(recall_score(ytest, predictions, average=None, zero_division=0)); test_recall.append(recall_score(ytest, predictions, average='weighted', zero_division=0))
        per_hazard['train recall'].append(recall_score(ytrain, train_preds, average=None, zero_division=0)); train_recall.append(recall_score(ytrain, train_preds, average='weighted', zero_division=0))
        test_hamming_loss.append(hamming_loss(ytest, predictions))
        train_hamming_loss.append(hamming_loss(ytrain, train_preds))
        trained_clfs.append(classifier)
        i+=1
    comparison = pd.DataFrame({"method":[multilabel_name for i in range(len(classifier_names))],
                               "input": [input_type for i in range(len(classifier_names))],
                               "Base Estimator": classifier_names,
                               "train f1": train_f1, "test f1": test_f1,
                              "train accuracy":train_acc, "test accuracy": test_acc,
                              "train recall": train_recall, "test recall": test_recall,
                              "train precision": train_precision, "test precision": test_precision,
                              "train hamming loss": train_hamming_loss, "test hamming loss": test_hamming_loss}).round(3)
    iterables = [[multilabel_name], [input_type], classifier_names, targets]
    per_hazard = {key:[val for arr in per_hazard[key] for val in arr] for key in per_hazard}
    per_hazard_comparison = pd.DataFrame(per_hazard, index=pd.MultiIndex.from_product(iterables, names=["Method", "Input","Base Estimator", "Hazard"])).round(3)
    return comparison, per_hazard_comparison, trained_clfs

### Method: Complement Bayes
- for each target build a binary bayesian classifier for text data
- for each target build a binary bayesian classifier for meta data

In [137]:
xtrain = pd.concat([train[predictors], test[predictors]]); Ytrain = pd.concat([ytrain,ytest])[targets]; xtest = val[predictors]; Ytest = yval[targets]
input_type = 'text'; multilabel_name = 'BR'; multilabel_func=OneVsRestClassifier; classifiers=[ComplementNB]#, MultinomialNB()]; 
classifier_names=['Complement NB']#, 'Multinomial NB']
classifier_params = {'Complement NB':{}}
comparison, per_hazard_comparison, trained_clfs = train_test_model_method(xtrain, Ytrain, xtest, Ytest, input_type, multilabel_name, multilabel_func, classifiers, classifier_names,classifier_params)

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [03:55<00:00, 235.51s/it]


In [143]:
per_hazard_comparison = per_hazard_comparison.drop([col for col in per_hazard_comparison.columns if 'train' in col], axis=1)

In [154]:
per_hazard_comparison['Support'] = support

In [156]:
per_hazard_comparison.index = per_hazard_comparison.index.droplevel('Method').droplevel('Input').droplevel('Base Estimator')

In [161]:
per_hazard_comparison['Hazard'] = per_hazard_comparison.index

In [166]:
per_hazard_comparison.columns = ['Precision', 'Recall', 'F1', 'Support','Hazard']

In [164]:
per_hazard_comparison.index = ['H'+str(i) for i in range(1,len(per_hazard_comparison)+1)]

In [168]:
per_hazard_comparison = per_hazard_comparison[['Hazard','Precision', 'Recall', 'F1', 'Support',]]

In [169]:
per_hazard_comparison

Unnamed: 0,Hazard,Precision,Recall,F1,Support
H1,Traffic,0.379,0.7,0.492,874
H2,Command_Transitions,0.575,0.724,0.641,1488
H3,Evacuations,0.317,0.725,0.441,655
H4,Inaccurate_Mapping,0.212,0.672,0.322,567
H5,Aerial_Grounding,0.046,0.47,0.084,134
H6,Resource_Issues,0.323,0.758,0.453,683
H7,Injuries,0.331,0.714,0.452,693
H8,Cultural_Resources,0.088,0.823,0.159,141
H9,Livestock,0.057,0.517,0.103,143
H10,Law_Violations,0.0,0.0,0.0,0


In [170]:
print(per_hazard_comparison.to_latex())

\begin{tabular}{llrrrr}
\toprule
{} &               Hazard &  Precision &  Recall &     F1 &  Support \\
\midrule
H1  &              Traffic &      0.379 &   0.700 &  0.492 &      874 \\
H2  &  Command\_Transitions &      0.575 &   0.724 &  0.641 &     1488 \\
H3  &          Evacuations &      0.317 &   0.725 &  0.441 &      655 \\
H4  &   Inaccurate\_Mapping &      0.212 &   0.672 &  0.322 &      567 \\
H5  &     Aerial\_Grounding &      0.046 &   0.470 &  0.084 &      134 \\
H6  &      Resource\_Issues &      0.323 &   0.758 &  0.453 &      683 \\
H7  &             Injuries &      0.331 &   0.714 &  0.452 &      693 \\
H8  &   Cultural\_Resources &      0.088 &   0.823 &  0.159 &      141 \\
H9  &            Livestock &      0.057 &   0.517 &  0.103 &      143 \\
H10 &       Law\_Violations &      0.000 &   0.000 &  0.000 &        0 \\
H11 &        Military\_Base &      0.000 &   0.000 &  0.000 &        2 \\
H12 &       Infrastructure &      0.098 &   0.704 &  0.172 &      179 \\
H13

In [151]:
cr = classification_report(yval[targets], trained_clfs[0].predict(val[predictors]),zero_division=0, output_dict=True)


In [153]:
support = []
for key in cr:
    if 'avg' not in key:
        support.append(cr[key]['support'])

In [138]:
comparison

Unnamed: 0,method,input,Base Estimator,train f1,test f1,train accuracy,test accuracy,train recall,test recall,train precision,test precision,train hamming loss,test hamming loss
0,BR,text,Complement NB,0.489,0.496,0.343,0.358,0.75,0.718,0.385,0.402,0.262,0.242


In [28]:
filename = os.path.join(os.path.dirname(os.getcwd()),'models','hazard_model_CB.sav')
pickle.dump(classifier, open(filename, 'wb'))

In [29]:
probs =  classifier.predict_proba(np.concatenate((train[predictors], val[predictors], test[predictors])))
pd.qcut([p for prob in probs for p in prob], 5)

[(0.0477, 0.287], (0.0477, 0.287], (0.00628, 0.0477], (0.0477, 0.287], (0.00628, 0.0477], ..., (-0.000999693, 0.00628], (-0.000999693, 0.00628], (-0.000999693, 0.00628], (-0.000999693, 0.00628], (-0.000999693, 0.00628]]
Length: 830739
Categories (5, interval[float64, right]): [(-0.000999693, 0.00628] < (0.00628, 0.0477] < (0.0477, 0.287] < (0.287, 0.819] < (0.819, 1.0]]

### Method: classifier chain
Note: classifier chains tend to perform worse on larget sets of targets. Also the performance is highly dependent on the order of the chain, so all orderings would ideally be tested.

In [28]:
models = {'knn':KNeighborsClassifier, "svm":SVC, "decision tree":DecisionTreeClassifier, 
          "random forest":RandomForestClassifier, "logisitc regression":LogisticRegression,
          'ridge':RidgeClassifier,'xgboost':XGBClassifier, 'adaboost':AdaBoostClassifier, 
          'CNB':ComplementNB, "mNB":MultinomialNB}
classifier_params = {'knn': [{'n_neighbors':255, 'weights':'uniform', 'p':2},
                             {'n_neighbors':255, 'weights':'uniform', 'p':2},
                             {'n_neighbors':255, 'weights':'uniform', 'p':2}],
                     'svm': [{'C':0.1, 'break_ties':True, 'gamma':'auto'},
                             {'C':0.1, 'break_ties':True, 'gamma':'auto'},
                             {'C':0.1, 'break_ties':True, 'gamma':'auto'}],
                     'decision tree': [{'criterion': 'entropy','max_features': 'auto','class_weight': None,'splitter': 'best'},
                                       {'criterion': 'entropy','max_features': 'auto','class_weight': 'balanced','splitter': 'best'},
                                       {'criterion': 'entropy','max_features': 'auto','class_weight': 'balanced','splitter': 'best'}],
                     'random forest': [{'criterion': 'entropy','max_features': 'auto','n_estimators': 350},
                                      {'criterion': 'entropy','max_features': 'auto','n_estimators': 350},
                                      {'criterion': 'entropy','max_features': 'auto','n_estimators': 350}],
                     'logisitc regression': [{'max_iter': 10000,'multi_class': 'ovr','solver': 'newton-cg','C': 0.1,'class_weight': None},
                                             {'max_iter': 10000, 'multi_class': 'ovr', 'solver': 'newton-cg', 'C': 0.001, 'class_weight': None},
                                             {'max_iter': 10000, 'multi_class': 'ovr', 'solver': 'newton-cg', 'C': 0.001, 'class_weight': None}],
                     'ridge': [{'alpha': 0.0001, 'class_weight': None},
                               {'alpha': 100, 'class_weight': None},
                                {'alpha': 0.0001, 'class_weight': None}],
                     'xgboost': [{'eval_metric': 'logloss','max_depth': 3,'booster': 'gbtree','n_estimators': 125},
                                 {'eval_metric': 'logloss','max_depth': 14,'booster': 'gbtree','n_estimators': 225},
                                 {'eval_metric': 'logloss','max_depth': 20,'booster': 'gbtree','n_estimators': 250}],
                    'adaboost': [{},{},{}],
                    'CNB':[{},{},{}],
                     'mNB':[{},{},{}]}

classifiers = [models[key] for key in models]
classifier_names = [key for key in models]

input_types = ['meta', 'text', 'meta+text']
X_train_inputs = [pd.concat([train[meta_predictors], test[meta_predictors]]),
                  pd.concat([Xtrain_vec, Xtest_vec]), 
                  pd.concat([train[predictors], test[predictors]])]
y_train_inputs = [pd.concat([ytrain,ytest]),pd.concat([ytrain,ytest]),pd.concat([ytrain,ytest])]
X_test_inputs = [val[meta_predictors], Xval_vec, val[predictors]]
y_test_inputs = [yval, yval, yval]

per_hazard_comparisons = [] #need method, input
comparisons = [] #need method, input
cc_mdls = []
for i in range(len(input_types)):
    classifier_params_for_input = {classifier_name: classifier_params[classifier_name][i] for classifier_name in classifier_params}
    comparison, per_hazard_comparison, trained_cc_clfs = train_test_model_method(Xtrain=X_train_inputs[i], ytrain=y_train_inputs[i],
                                                                              Xtest=X_test_inputs[i], ytest=y_test_inputs[i], 
                                                                              input_type=input_types[i], multilabel_name='CC', multilabel_func=ClassifierChain, 
                                                                              classifiers=classifiers, classifier_names=classifier_names, classifier_params=classifier_params_for_input)
    comparisons.append(comparison)
    per_hazard_comparisons.append(per_hazard_comparison)
    cc_mdls.append(trained_cc_clfs)
comparisons_df = pd.concat(comparisons)
per_hazard_comparisons_df = pd.concat(per_hazard_comparisons)

100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [31:42<00:00, 190.21s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 10/10 [4:18:45<00:00, 1552.56s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 10/10 [4:22:02<00:00, 1572.22s/it]


In [29]:
comparisons_df

Unnamed: 0,method,input,Base Estimator,train f1,test f1,train accuracy,test accuracy,train recall,test recall,train precision,test precision,train hamming loss,test hamming loss
0,CC,meta,knn,0.335,0.224,0.415,0.419,0.242,0.16,0.637,0.454,0.105,0.106
1,CC,meta,svm,0.123,0.114,0.4,0.415,0.105,0.095,0.286,0.268,0.112,0.106
2,CC,meta,decision tree,1.0,0.357,1.0,0.302,1.0,0.335,1.0,0.384,0.0,0.13
3,CC,meta,random forest,1.0,0.329,1.0,0.424,1.0,0.249,1.0,0.564,0.0,0.098
4,CC,meta,logisitc regression,0.317,0.229,0.412,0.425,0.225,0.164,0.609,0.462,0.106,0.105
5,CC,meta,ridge,0.295,0.21,0.414,0.426,0.204,0.145,0.688,0.519,0.105,0.103
6,CC,meta,xgboost,0.64,0.364,0.511,0.428,0.528,0.287,0.833,0.535,0.069,0.098
7,CC,meta,adaboost,0.476,0.331,0.425,0.417,0.38,0.253,0.67,0.525,0.095,0.101
8,CC,meta,CNB,0.443,0.422,0.312,0.317,0.673,0.6,0.36,0.356,0.318,0.308
9,CC,meta,mNB,0.389,0.301,0.4,0.417,0.337,0.243,0.493,0.432,0.14,0.137


In [30]:
file = os.path.join(os.path.dirname(os.getcwd()),'results','model_comparisons.xlsx')
with pd.ExcelWriter(file) as writer:
    comparisons_df.to_excel(writer, sheet_name="Model Comparisons")  
    per_hazard_comparisons_df.to_excel(writer, sheet_name="Per Hazard Performance")  

In [31]:
cc_comp = comparisons_df

### Method: multioutput classifier

### Power set label 
problem transformation to multiclass

In [26]:
ytrain['powerlabel'] = ytrain.apply(lambda x : sum([(2**i)*x[targets[i]] for i in range(len(targets))]),axis=1)
yval['powerlabel'] = yval.apply(lambda x : sum([(2**i)*x[targets[i]] for i in range(len(targets))]),axis=1)
ytest['powerlabel'] = ytest.apply(lambda x : sum([(2**i)*x[targets[i]] for i in range(len(targets))]),axis=1)
#ytrain['powerlabel'].hist(bins=np.unique(ytrain['powerlabel']))

In [32]:
input_types = ['meta', 'text', 'meta+text']
X_train_inputs = [pd.concat([train[meta_predictors], test[meta_predictors]]),
                  pd.concat([Xtrain_vec, Xtest_vec]), 
                  pd.concat([train[predictors], test[predictors]])]
y_train_inputs = [pd.concat([ytrain,ytest])['powerlabel'],pd.concat([ytrain,ytest])['powerlabel'],pd.concat([ytrain,ytest])['powerlabel']]
X_test_inputs = [val[meta_predictors], Xval_vec, val[predictors]]
y_test_inputs = [yval['powerlabel'], yval['powerlabel'], yval['powerlabel']]

per_hazard_comparisons = [] #need method, input
comparisons = [] #need method, input
lc_mdls = []

In [33]:
models = {'knn':KNeighborsClassifier, "svm":LinearSVC, "decision tree":DecisionTreeClassifier, 
          #"random forest":RandomForestClassifier, 
          "logisitc regression":LogisticRegression,
          'ridge':RidgeClassifier,'xgboost':XGBClassifier, 'adaboost':AdaBoostClassifier, 
          'CNB':ComplementNB, "mNB":MultinomialNB}

classifiers = [models[key] for key in models]
classifier_names = [key for key in models]

classifier_params = {'knn': [{'n_neighbors': 455, 'weights': 'uniform', 'p': 2},
                             {'n_neighbors': 755, 'weights': 'uniform', 'p': 2},
                             {'n_neighbors': 455, 'weights': 'uniform', 'p': 2}],
                     'svm': [{'multi_class':'crammer_singer','max_iter':100000, 'class_weight':'balanced'},
                            {'multi_class':'crammer_singer','max_iter':100000, 'class_weight':'balanced'},
                            {'multi_class':'crammer_singer','max_iter':100000, 'class_weight':'balanced'}],
                     'decision tree': [{'criterion': 'gini','max_features': 'log2','class_weight': None,'splitter': 'random'},
                                       {'criterion': 'entropy','max_features': 'auto','class_weight': None,'splitter': 'best'},
                                       {'criterion': 'entropy','max_features': 'auto','class_weight': None,'splitter': 'best'}],
                     'random forest': [{'criterion':'entropy','n_estimators':350},
                                      {'criterion':'entropy','n_estimators':350},
                                      {'criterion':'entropy','n_estimators':350}],
                     'logisitc regression': [{'max_iter': 10000,'multi_class': 'multinomial','solver': 'newton-cg','C': 0.1,'class_weight': None},
                                            {'max_iter': 10000, 'multi_class': 'multinomial','solver': 'newton-cg','C': 0.001, 'class_weight': None},
                                            {'max_iter': 10000, 'multi_class': 'multinomial','solver': 'newton-cg','C': 0.001, 'class_weight': None}],
                     'ridge': [{'alpha': 0.0001, 'class_weight': None}, 
                               {'alpha': 10, 'class_weight': None}, 
                               {'alpha': 0.0001, 'class_weight': None}],
                     'xgboost':  [{},{},{}],
                    'adaboost': [{},{},{}],
                    'CNB':[{},{},{}],
                     'mNB':[{},{},{}]
                    }

In [34]:
for i in range(len(input_types)):
    classifier_params_for_input = {classifier_name: classifier_params[classifier_name][i] for classifier_name in classifier_params}
    comparison = train_test_model_method(Xtrain=X_train_inputs[i], ytrain=y_train_inputs[i],
                                                                              Xtest=X_test_inputs[i], ytest=y_test_inputs[i], 
                                                                              input_type=input_types[i], multilabel_name='LC', 
                                                                              classifiers=classifiers, classifier_names=classifier_names, classifier_params=classifier_params_for_input)
    comparisons.append(comparison)
    #per_hazard_comparisons.append(per_hazard_comparison)
    #lc_mdls.append(trained_clfs)
comparisons_df = pd.concat(comparisons)
#per_hazard_comparisons_df = pd.concat(per_hazard_comparisons)

 56%|██████████████████████████████████████████████                                     | 5/9 [15:13<10:37, 159.36s/it]



100%|███████████████████████████████████████████████████████████████████████████████████| 9/9 [21:56<00:00, 146.29s/it]
 56%|███████████████████████████████████████████▎                                  | 5/9 [6:27:35<3:25:37, 3084.42s/it]



100%|████████████████████████████████████████████████████████████████████████████████| 9/9 [8:07:54<00:00, 3252.76s/it]
 56%|███████████████████████████████████████████▎                                  | 5/9 [6:45:49<3:34:22, 3215.60s/it]



100%|████████████████████████████████████████████████████████████████████████████████| 9/9 [8:43:41<00:00, 3491.24s/it]


In [35]:
comparisons_df

Unnamed: 0,method,input,Base Estimator,train f1,test f1,train accuracy,test accuracy,train recall,test recall,train precision,test precision,train hamming loss,test hamming loss
0,LC,meta,knn,0.273,0.276,0.421,0.423,0.421,0.423,0.208,0.213,0.579,0.577
1,LC,meta,svm,0.071,0.002,0.111,0.001,0.111,0.001,0.369,0.443,0.889,0.999
2,LC,meta,decision tree,1.0,0.292,1.0,0.294,1.0,0.294,1.0,0.292,0.0,0.706
3,LC,meta,logisitc regression,0.266,0.274,0.418,0.425,0.418,0.425,0.204,0.204,0.582,0.575
4,LC,meta,ridge,0.261,0.271,0.416,0.424,0.416,0.424,0.195,0.199,0.584,0.576
5,LC,meta,xgboost,0.332,0.298,0.369,0.31,0.369,0.31,0.313,0.289,0.631,0.69
6,LC,meta,adaboost,0.242,0.253,0.41,0.422,0.41,0.422,0.171,0.18,0.59,0.578
7,LC,meta,CNB,0.297,0.307,0.364,0.367,0.364,0.367,0.257,0.267,0.636,0.633
8,LC,meta,mNB,0.272,0.283,0.413,0.419,0.413,0.419,0.212,0.213,0.587,0.581
0,LC,text,knn,0.288,0.287,0.426,0.426,0.426,0.426,0.239,0.222,0.574,0.574


In [36]:
lc_comp = comparisons_df

In [38]:
file = os.path.join(os.path.dirname(os.getcwd()),'results','model_comparisons.xlsx')
prev_comparisons = pd.read_excel(file, sheet_name="Model Comparisons")
prev_hazard_comparisons = pd.read_excel(file, sheet_name="Per Hazard Performance")
comparisons_df = pd.concat([prev_comparisons, comparisons_df])
per_hazard_comparisons_df = pd.concat([prev_hazard_comparisons])#, per_hazard_comparisons_df])

In [39]:
with pd.ExcelWriter(file) as writer:
    comparisons_df.to_excel(writer, sheet_name="Model Comparisons")  
    per_hazard_comparisons_df.to_excel(writer, sheet_name="Per Hazard Performance")  

### Method: One vs Rest classifier
note: this one performs better without the extreme over sampling -> maybe a simple over sampling approach is preferred here. Over fitting is definitely occuring

In [33]:
models = {'knn':KNeighborsClassifier, "svm":SVC, "decision tree":DecisionTreeClassifier, 
          "random forest":RandomForestClassifier, "logisitc regression":LogisticRegression,
          'ridge':RidgeClassifier,'xgboost':XGBClassifier, 'adaboost':AdaBoostClassifier, 
          'CNB':ComplementNB, "mNB":MultinomialNB}
classifiers = [models[key] for key in models]
classifier_names = [key for key in models]
best_ovr_params = {'knn': [{'n_neighbors': 155, 'weights': 'uniform', 'p': 1},
                           {'n_neighbors': 355, 'weights': 'uniform', 'p': 2},
                           {'n_neighbors': 255, 'weights': 'uniform', 'p': 2}],
                   'svm': [{'C': 1, 'class_weight': None, 'gamma': 'auto', 'break_ties': True},
                           {'C': 1, 'class_weight': None, 'gamma': 'auto', 'break_ties': True},
                           {'C': 1, 'class_weight': None, 'gamma': 'auto', 'break_ties': True}],
                   'decision tree': [{'criterion':'gini', 'max_features':'auto', 'class_weight':'balanced', 'splitter':'best'}, 
                                     {'criterion':'entropy', 'max_features':'auto', 'class_weight':'balanced', 'splitter':'best'}, 
                                     {'criterion':'entropy', 'max_features':'sqrt', 'class_weight':'balanced', 'splitter':'best'}],
                   'random forest': [{'criterion':'entropy', 'max_features':'auto', 'class_weight':None, 'n_estimators':350},
                                    {'criterion':'entropy', 'max_features':'auto', 'class_weight':None, 'n_estimators':350},
                                    {'criterion':'entropy', 'max_features':'auto', 'class_weight':None, 'n_estimators':350}],
                   'logisitc regression': [{'max_iter':10000, 'multi_class':'ovr', 'solver':'newtong-cg', 'C':1, 'class_weight':None},
                                           {'max_iter':10000, 'multi_class':'ovr', 'solver':'newtong-cg', 'C':0.001, 'class_weight':None},
                                           {'max_iter': 10000, 'multi_class': 'ovr', 'solver': 'newton-cg', 'C': 0.001, 'class_weight': None}],
                   'ridge': [{'alpha': 0.0001, 'class_weight': None}, 
                             {'alpha': 100, 'class_weight': None}, 
                             {'alpha': 0.0001, 'class_weight': None}],
                   'xgboost': [{'max_depth': 3, 'booster': 'gbtree', 'n_estimators': 100, 'eval_metric':'logloss','use_label_encoder':False},
                               {'max_depth': 23, 'booster': 'gbtree', 'n_estimators': 200, 'eval_metric':'logloss','use_label_encoder':False},
                              {'max_depth': 23, 'booster': 'gbtree', 'n_estimators': 300, 'eval_metric':'logloss','use_label_encoder':False}],#not tuned
                   'adaboost': [{},{},{}],
                   'CNB':[{},{},{}],
                   'mNB':[{},{},{}]}

In [34]:
input_types = ['meta', 'text', 'meta+text']
X_train_inputs = [pd.concat([train[meta_predictors], test[meta_predictors]]),
                  pd.concat([Xtrain_vec, Xtest_vec]), 
                  pd.concat([train[predictors], test[predictors]])]
y_train_inputs = [pd.concat([ytrain,ytest]),pd.concat([ytrain,ytest]),pd.concat([ytrain,ytest])]
X_test_inputs = [val[meta_predictors], Xval_vec, val[predictors]]
y_test_inputs = [yval, yval, yval]

per_hazard_comparisons = [] #need method, input
comparisons = [] #need method, input
ovr_mdls = []

In [35]:
for i in range(len(input_types)):
    classifier_params_for_input = {classifier_name: classifier_params[classifier_name][i] for classifier_name in classifier_params}
    comparison, per_hazard_comparison, trained_clfs = train_test_model_method(Xtrain=X_train_inputs[i], ytrain=y_train_inputs[i],
                                                                              Xtest=X_test_inputs[i], ytest=y_test_inputs[i], 
                                                                              input_type=input_types[i], multilabel_name='BR', multilabel_func=OneVsRestClassifier, 
                                                                              classifiers=classifiers, classifier_names=classifier_names, classifier_params=classifier_params_for_input)
    comparisons.append(comparison)
    per_hazard_comparisons.append(per_hazard_comparison)
    ovr_mdls.append(trained_clfs)
comparisons_df = pd.concat(comparisons)
per_hazard_comparisons_df = pd.concat(per_hazard_comparisons)

100%|███████████████████████████████████████████████████████████████████████████████| 10/10 [1:28:34<00:00, 531.46s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 10/10 [4:38:32<00:00, 1671.29s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 10/10 [4:48:54<00:00, 1733.48s/it]


In [36]:
comparisons_df

Unnamed: 0,method,input,Base Estimator,train f1,test f1,train accuracy,test accuracy,train recall,test recall,train precision,test precision,train hamming loss,test hamming loss
0,BR,meta,knn,0.376,0.28,0.402,0.411,0.293,0.22,0.655,0.472,0.1,0.101
1,BR,meta,svm,0.223,0.227,0.387,0.401,0.203,0.2,0.252,0.262,0.109,0.102
2,BR,meta,decision tree,1.0,0.353,0.999,0.266,1.0,0.337,1.0,0.373,0.0,0.136
3,BR,meta,random forest,1.0,0.357,1.0,0.418,1.0,0.288,1.0,0.567,0.0,0.095
4,BR,meta,logisitc regression,0.329,0.275,0.403,0.415,0.256,0.215,0.64,0.495,0.103,0.1
5,BR,meta,ridge,0.331,0.268,0.408,0.419,0.253,0.208,0.669,0.52,0.102,0.1
6,BR,meta,xgboost,0.669,0.379,0.498,0.412,0.567,0.315,0.846,0.52,0.065,0.097
7,BR,meta,adaboost,0.487,0.355,0.412,0.412,0.397,0.289,0.72,0.539,0.091,0.097
8,BR,meta,CNB,0.461,0.435,0.164,0.168,0.692,0.607,0.371,0.359,0.283,0.273
9,BR,meta,mNB,0.379,0.314,0.388,0.402,0.324,0.265,0.523,0.417,0.11,0.109


In [37]:
ovr_comp = comparisons_df

In [38]:
file = os.path.join(os.path.dirname(os.getcwd()),'results','model_comparisons.xlsx')
prev_comparisons = pd.read_excel(file, sheet_name="Model Comparisons")
prev_hazard_comparisons = pd.read_excel(file, sheet_name="Per Hazard Performance")
comparisons_df = pd.concat([prev_comparisons, comparisons_df])
per_hazard_comparisons_df = pd.concat([prev_hazard_comparisons, per_hazard_comparisons_df])

In [39]:
with pd.ExcelWriter(file) as writer:
    comparisons_df.to_excel(writer, sheet_name="Model Comparisons")  
    per_hazard_comparisons_df.to_excel(writer, sheet_name="Per Hazard Performance")  

In [40]:
cols = ['Method', 'Model', 'Input', 'Hamming', 'Precision', 'Recall', 'F1']
curr_cols = ['method', 'Base Estimator', 'input', 'test hamming loss', 'test precision', 'test recall', 'test f1']

In [113]:
results = comparisons_df#pd.concat([cc_comp,ovr_comp, lc_comp])
results = results[curr_cols]
results.columns = cols
results = results.drop_duplicates()

In [114]:
results = results.loc[results['Model']!='mNB'].reset_index(drop=True)

In [116]:
cc_models = ['knn', 'svm', 'decision tree', 'random forest', 'logistic regression', 'ridge', 'xgboost', 'adaboost', 'CNB']
reindex = []
for i in range(len(cc_models)): #cc models
    reindex.append(i)
    reindex.append(i+len(cc_models))
    reindex.append(i+2*len(cc_models))
for i in range(len(cc_models)): #BR models
    reindex.append(i+3*len(cc_models))
    reindex.append(i+4*len(cc_models))
    reindex.append(i+5*len(cc_models))
for i in range(len(cc_models)-1): #lc models
    reindex.append(i+6*len(cc_models))
    reindex.append(i+(6*(len(cc_models)))+(len(cc_models)-1))
    reindex.append(i+(6*(len(cc_models)))+2*(len(cc_models)-1))

In [117]:
results = results.reindex(reindex)

In [118]:
df = results[['Method', 'Model', 'Input']]
ind = pd.MultiIndex.from_frame(df)
results.index = ind

In [119]:
results = results.drop(['Method', 'Model', 'Input'], axis=1)

In [120]:
results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Hamming,Precision,Recall,F1
Method,Model,Input,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CC,knn,meta,0.106,0.454,0.160,0.224
CC,knn,text,0.106,0.450,0.356,0.383
CC,knn,meta+text,0.101,0.485,0.349,0.388
CC,svm,meta,0.106,0.268,0.095,0.114
CC,svm,text,0.104,0.276,0.082,0.123
...,...,...,...,...,...,...
LC,adaboost,text,0.575,0.181,0.425,0.254
LC,adaboost,meta+text,0.575,0.181,0.425,0.254
LC,CNB,meta,0.633,0.267,0.367,0.307
LC,CNB,text,0.625,0.288,0.375,0.324


In [121]:
print(results.to_latex())

\begin{tabular}{lllrrrr}
\toprule
   &     &           &  Hamming &  Precision &  Recall &     F1 \\
Method & Model & Input &          &            &         &        \\
\midrule
CC & knn & meta &    0.106 &      0.454 &   0.160 &  0.224 \\
   &     & text &    0.106 &      0.450 &   0.356 &  0.383 \\
   &     & meta+text &    0.101 &      0.485 &   0.349 &  0.388 \\
   & svm & meta &    0.106 &      0.268 &   0.095 &  0.114 \\
   &     & text &    0.104 &      0.276 &   0.082 &  0.123 \\
   &     & meta+text &    0.101 &      0.275 &   0.129 &  0.172 \\
   & decision tree & meta &    0.130 &      0.384 &   0.335 &  0.357 \\
   &     & text &    0.145 &      0.341 &   0.325 &  0.332 \\
   &     & meta+text &    0.137 &      0.366 &   0.336 &  0.350 \\
   & random forest & meta &    0.098 &      0.564 &   0.249 &  0.329 \\
   &     & text &    0.099 &      0.543 &   0.182 &  0.254 \\
   &     & meta+text &    0.096 &      0.577 &   0.215 &  0.286 \\
   & logisitc regression & meta &    

In [180]:
comp = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()),'results','dynamic_static_rates_comparison.csv'), index_col=0)

In [184]:
tuples = [('Likelihood', '% Same'), ('Likelihood', 'Average Distance'), ('Severity', '% Same'), ('Severity', 'Average Distance'), ('Likelihood and Severity', '% Same')]
index = pd.MultiIndex.from_tuples(tuples)
comp.columns =index

In [187]:
print(comp.round(3).to_latex())

\begin{tabular}{lrrrrr}
\toprule
{} & \multicolumn{2}{l}{Likelihood} & \multicolumn{2}{l}{Severity} & Likelihood and Severity \\
{} &     \% Same & Average Distance &   \% Same & Average Distance &                  \% Same \\
\midrule
Traffic             &     24.904 &            1.794 &   35.717 &           -0.765 &                  19.164 \\
Command\_Transitions &     30.640 &            1.680 &   27.020 &           -0.908 &                  18.730 \\
Evacuations         &     24.389 &            1.984 &   38.071 &           -0.613 &                  19.363 \\
Inaccurate\_Mapping  &     29.662 &            0.506 &   47.756 &           -0.608 &                  16.670 \\
Aerial\_Grounding    &     19.506 &            1.007 &   29.181 &           -0.730 &                   7.811 \\
Resource\_Issues     &     20.951 &            0.767 &   55.502 &           -0.564 &                  14.603 \\
Injuries            &     17.793 &            0.910 &   97.452 &           -0.021 &            