In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.style
matplotlib.style.use("seaborn")
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from tqdm import tqdm

import seaborn as sns
sns.color_palette("hls", 17)
import scipy.stats as st
import math

from pingouin import rcorr
import pingouin as pg
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV, RidgeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
#from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_validate, cross_val_score, learning_curve
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score, plot_roc_curve, roc_curve, auc, roc_auc_score,precision_recall_curve, hamming_loss
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.preprocessing import MinMaxScaler

from sklearn import feature_selection #import chi2

import gensim
import gensim.downloader as gensim_api
## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

In [2]:
import sys
import os

from sys import platform

# Put the parent directory on sys.path (presumably so the project-local
# `module` package can be imported) and define `smart_nlp_path`.
# `__file__` is not defined inside a Jupyter kernel, so fall back to the
# current working directory when it is missing.
try:
    _notebook_dir = os.path.dirname(os.path.realpath(__file__))
except NameError:
    _notebook_dir = os.getcwd()

if platform == "win32":
    sys.path.append('../')
    # Working directory minus its last backslash-separated component,
    # followed by a trailing "/" component.
    _cwd_parts = os.getcwd().split("\\")
    smart_nlp_path = "\\".join(_cwd_parts[:-1] + ["/"])
else:
    # macOS keeps its original behaviour; Linux and other platforms, which
    # previously fell through both branches, now share it.
    sys.path.append(_notebook_dir + "/..")
    smart_nlp_path = ''

from module.trend_analysis_functions import *
from module.topic_model_plus_class import Topic_Model_plus

In [3]:
# Read the three sitrep splits from <parent of cwd>/data, discarding the
# "Unnamed: 0" column from each.
_data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')


def _read_split(file_name):
    """Load one CSV from the data directory without its "Unnamed: 0" column."""
    return pd.read_csv(os.path.join(_data_dir, file_name)).drop(["Unnamed: 0"], axis=1)


test_data = _read_split('ICS_predictive_sitreps_test.csv')
train_data = _read_split('ICS_predictive_sitreps_train.csv')
val_data = _read_split('ICS_predictive_sitreps_val.csv')

In [4]:
# Candidate predictor columns, assembled group by group (order is preserved).
_report_columns = [
    "TOTAL_PERSONNEL",
    "TOTAL_AERIAL",
    "PCT_CONTAINED_COMPLETED",
    "ACRES",
    "WF_FSR",
    "INJURIES",
    "FATALITIES",
    "EST_IM_COST_TO_DATE",
    "STR_DAMAGED",
    "STR_DESTROYED",
    "NEW_ACRES",
    "EVACUATION_IN_PROGRESS",
    "NUM_REPORTS",
    "DAYS_BURING",
]
_text_columns = ['Combined_Text']
# Incident_region_* columns (presumably one-hot indicators -- confirm against the CSV).
_region_columns = [
    'Incident_region_AICC',
    'Incident_region_CA',
    'Incident_region_EACC',
    'Incident_region_GBCC',
    'Incident_region_HICC',
    'Incident_region_NRCC',
    'Incident_region_NWCC',
    'Incident_region_RMCC',
    'Incident_region_SACC',
    'Incident_region_SWCC',
]
# INC_MGMT_ORG_ABBREV_* columns (presumably one-hot indicators -- confirm against the CSV).
_mgmt_org_columns = [
    'INC_MGMT_ORG_ABBREV_1',
    'INC_MGMT_ORG_ABBREV_2',
    'INC_MGMT_ORG_ABBREV_3',
    'INC_MGMT_ORG_ABBREV_4',
    'INC_MGMT_ORG_ABBREV_5',
    'INC_MGMT_ORG_ABBREV_B',
    'INC_MGMT_ORG_ABBREV_C',
    'INC_MGMT_ORG_ABBREV_D',
    'INC_MGMT_ORG_ABBREV_E',
    'INC_MGMT_ORG_ABBREV_F',
]
predictors = _report_columns + _text_columns + _region_columns + _mgmt_org_columns

# Label columns to be predicted (17 in total).
targets = [
    "Traffic",
    "Command_Transitions",
    "Evacuations",
    "Inaccurate_Mapping",
    "Aerial_Grounding",
    "Resource_Issues",
    "Injuries",
    "Cultural_Resources",
    "Livestock",
    "Law_Violations",
    "Military_Base",
    "Infrastructure",
    "Extreme_Weather",
    "Ecological",
    "Hazardous_Terrain",
    "Floods",
    "Dry_Weather",
]

# Prepare the Data

In [5]:
def remove_quote_marks(word_list):
    """Convert a stringified token list into one space-separated string.

    The input is expected to look like ``"['fire', 'line']"``: the surrounding
    square brackets are stripped, the text is split on ``", "``, every single
    quote character is removed from each piece, and the pieces are joined
    with single spaces.

    Parameters
    ----------
    word_list : str, list, float or None
        The stringified list. An actual list is accepted as well (its ``str()``
        form is parsed the same way). ``None`` and NaN -- what pandas yields
        for an empty CSV cell -- are treated as "no text".

    Returns
    -------
    str
        The cleaned text, or ``""`` for a missing value.
    """
    # NaN is the only float that is not equal to itself.
    is_nan = isinstance(word_list, float) and word_list != word_list
    if word_list is None or is_nan:
        return ""
    tokens = str(word_list).strip("[]").split(", ")
    return " ".join(token.replace("'", "") for token in tokens)

In [6]:
# Rewrite the 'Combined_Text' column of every split as plain space-separated
# text. The frames are modified in place.
dfs = [train_data, val_data, test_data]
for df in dfs:
    df['Combined_Text'] = [remove_quote_marks(raw_text) for raw_text in df['Combined_Text']]

In [7]:
# Features are the cleaned text column; labels are the target columns.
Xtrain, ytrain = train_data['Combined_Text'], train_data[targets]
Xval, yval = val_data['Combined_Text'], val_data[targets]
Xtest, ytest = test_data['Combined_Text'], test_data[targets]

# Vectorize the data

In [8]:
from sentence_transformers import SentenceTransformer

# Encode each split with the same pretrained 'all-MiniLM-L6-v2' model,
# in train / val / test order.
vec_model = SentenceTransformer('all-MiniLM-L6-v2')
Xtrain_vec, Xval_vec, Xtest_vec = (
    vec_model.encode(split_text) for split_text in (Xtrain, Xval, Xtest)
)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1175.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=10177.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=612.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=116.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=39265.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=349.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=90888945.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=53.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=466247.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=350.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=13156.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=190.0), HTML(value='')))




In [12]:
# Shape of the encoded test split.
Xtest_vec.shape

(2504, 384)

In [13]:
# Shape of the raw test text, for comparison with the row count of Xtest_vec.
Xtest.shape

(2504,)

### Method: classifier chain
Note: classifier chains tend to perform worse on larger sets of targets. Performance is also highly dependent on the order of the chain, so ideally all orderings would be tested.

In [14]:
# Compare several base estimators wrapped in a ClassifierChain (no explicit
# chain order is passed) on the embedded text, collecting train and test
# metrics for each one.
classifiers = [LogisticRegression(max_iter=10000,multi_class='ovr'), RandomForestClassifier(random_state=1),
              KNeighborsClassifier(weights='distance'), MLPClassifier(random_state=1), RidgeClassifierCV()]
classifier_names = ['logistic regression', 'random forest', 'knn', 'MLP NN', 'Ridge']

# One entry is appended to each list per base estimator.
test_hamming_loss = []; train_hamming_loss = []
test_acc = []; train_acc = []
test_f1 = []; train_f1 = []
test_precision = []; train_precision = []
test_recall = []; train_recall = []

# Score against exactly the label columns the chain is fitted on.
y_train_chain = ytrain[targets]
y_test_chain = ytest[targets]

for clf in classifiers:
    classifier = ClassifierChain(clf)
    classifier.fit(Xtrain_vec, y_train_chain)
    # predict
    predictions = classifier.predict(Xtest_vec)
    # The train-set predictions were never computed in this cell, so the
    # train metrics below raised NameError on a fresh kernel (or silently
    # used predictions left over from a different cell).
    train_preds = classifier.predict(Xtrain_vec)

    test_acc.append(round(accuracy_score(y_test_chain, predictions), 3))
    train_acc.append(round(accuracy_score(y_train_chain, train_preds), 3))
    test_f1.append(round(f1_score(y_test_chain, predictions, average='macro', zero_division=0), 3))
    train_f1.append(round(f1_score(y_train_chain, train_preds, average='macro', zero_division=0), 3))
    test_precision.append(round(precision_score(y_test_chain, predictions, average='macro', zero_division=0), 3))
    train_precision.append(round(precision_score(y_train_chain, train_preds, average='macro', zero_division=0), 3))
    test_recall.append(round(recall_score(y_test_chain, predictions, average='macro', zero_division=0), 3))
    train_recall.append(round(recall_score(y_train_chain, train_preds, average='macro', zero_division=0), 3))
    test_hamming_loss.append(round(hamming_loss(y_test_chain, predictions), 3))
    train_hamming_loss.append(round(hamming_loss(y_train_chain, train_preds), 3))

# One row per base estimator, train/test columns side by side.
comparison = pd.DataFrame({"Base Estimator": classifier_names,
                           "train f1": train_f1, "test f1": test_f1,
                           "train accuracy": train_acc, "test accuracy": test_acc,
                           "train recall": train_recall, "test recall": test_recall,
                           "train precision": train_precision, "test precision": test_precision,
                           "train hamming loss": train_hamming_loss, "test hamming loss": test_hamming_loss})
                    



In [15]:
# Display the classifier-chain comparison table.
# NOTE(review): the saved output of this cell lists only a "hamming loss"
# column, which does not match the columns assembled for `comparison` in the
# preceding cell -- re-run the notebook to refresh it.
comparison

Unnamed: 0,Base Estimator,hamming loss
0,logistic regression,0.05833
1,random forest,0.059505
2,knn,0.073717
3,MLP NN,0.063968
4,Ridge,0.058941


### Method: multioutput classifier

In [16]:
# Fresh, independent accumulators for every metric (test and train).
test_hamming_loss, train_hamming_loss = [], []
test_acc, train_acc = [], []
test_f1, train_f1 = [], []
test_precision, train_precision = [], []
test_recall, train_recall = [], []

In [17]:
# Each estimator is kept next to its display label so the two lists cannot
# drift out of step.
# Not part of this comparison: SVC(kernel="linear", C=0.025), SVC(gamma=2, C=1),
# GaussianNB(), QuadraticDiscriminantAnalysis().
_named_classifiers = [
    ('Knn', KNeighborsClassifier(weights='distance')),
    ("Linear SVM", LinearSVC(multi_class='crammer_singer', max_iter=100000, class_weight='balanced')),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier(random_state=1, n_estimators=200)),
    ("Logisitc Regression", LogisticRegression(max_iter=10000, multi_class='multinomial')),
    ('MLP NN', MLPClassifier(alpha=1, max_iter=1000)),
    ('Ridge', RidgeClassifierCV()),
    ('Adaboost', AdaBoostClassifier()),
]
classifiers = [estimator for _, estimator in _named_classifiers]
total_names = [label for label, _ in _named_classifiers]

In [18]:
# Fit every estimator directly on the single 'powerlabel' column (no
# MultiOutputClassifier wrapper) and record train/test metrics.
# NOTE(review): 'powerlabel' is not one of `targets`, and the visible cells
# build ytrain/ytest from `targets` only -- confirm where this column is
# created, otherwise this cell raises KeyError on a fresh kernel.
_macro = {'average': 'macro', 'zero_division': 0}
for clf in tqdm(classifiers):
    classifier = clf
    classifier.fit(Xtrain_vec, ytrain['powerlabel'])
    predictions = classifier.predict(Xtest_vec)
    train_preds = classifier.predict(Xtrain_vec)

    _y_test_power = ytest['powerlabel']
    _y_train_power = ytrain['powerlabel']

    test_acc.append(round(accuracy_score(_y_test_power, predictions), 3))
    train_acc.append(round(accuracy_score(_y_train_power, train_preds), 3))

    test_f1.append(round(f1_score(_y_test_power, predictions, **_macro), 3))
    train_f1.append(round(f1_score(_y_train_power, train_preds, **_macro), 3))

    test_precision.append(round(precision_score(_y_test_power, predictions, **_macro), 3))
    train_precision.append(round(precision_score(_y_train_power, train_preds, **_macro), 3))

    test_recall.append(round(recall_score(_y_test_power, predictions, **_macro), 3))
    train_recall.append(round(recall_score(_y_train_power, train_preds, **_macro), 3))

    test_hamming_loss.append(round(hamming_loss(_y_test_power, predictions), 3))
    train_hamming_loss.append(round(hamming_loss(_y_train_power, train_preds), 3))

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [10:43<00:00, 80.49s/it]


In [19]:
# Assemble the results table: one row per model, train/test columns paired.
_column_labels = ["Model",
                  "train f1", "test f1",
                  "train accuracy", "test accuracy",
                  "train recall", "test recall",
                  "train precision", "test precision",
                  "train hamming loss", "test hamming loss"]
_column_values = [total_names,
                  train_f1, test_f1,
                  train_acc, test_acc,
                  train_recall, test_recall,
                  train_precision, test_precision,
                  train_hamming_loss, test_hamming_loss]
comparison = pd.DataFrame(dict(zip(_column_labels, _column_values)))
comparison

Unnamed: 0,Model,train f1,test f1,train accuracy,test accuracy,train recall,test recall,train precision,test precision,train hamming loss,test hamming loss
0,Knn,0.985,0.006,0.991,0.532,0.982,0.007,0.991,0.012,0.009,0.468
1,Linear SVM,0.432,0.004,0.253,0.038,0.962,0.007,0.321,0.007,0.747,0.962
2,Decision Tree,0.985,0.008,0.991,0.448,0.982,0.008,0.991,0.009,0.009,0.552
3,Random Forest,0.986,0.006,0.991,0.538,0.984,0.007,0.99,0.009,0.009,0.462
4,Logisitc Regression,0.054,0.008,0.653,0.543,0.045,0.01,0.098,0.01,0.347,0.457
5,MLP NN,0.002,0.006,0.594,0.538,0.002,0.008,0.002,0.005,0.406,0.462
6,Ridge,0.172,0.008,0.682,0.539,0.157,0.009,0.26,0.007,0.318,0.461
7,Adaboost,0.005,0.007,0.596,0.538,0.006,0.01,0.005,0.006,0.404,0.462


### Method: One vs Rest classifier
Note: this method performs better without the extreme oversampling, so a simple oversampling approach may be preferable here. Overfitting is definitely occurring.

In [16]:
# Keep only the target columns in every label frame.
ytrain, ytest, yval = (labels[targets] for labels in (ytrain, ytest, yval))

In [17]:
# Base estimators for the One-vs-Rest comparison, with matching display names.
# * max_features='sqrt' replaces the deprecated 'auto' alias on the random
#   forest; for classifiers the two mean the same thing, and 'auto' is
#   rejected by newer scikit-learn releases.
# * random_state=1 is set on the stochastic scikit-learn estimators so that
#   re-running the notebook reproduces the same table.
classifiers = [KNeighborsClassifier(weights='uniform', p=1),SVC(kernel="linear", C=0.025),SVC(gamma=2, C=1),
               #GaussianProcessClassifier(1.0 * RBF(1.0)),
               DecisionTreeClassifier(criterion='entropy', max_features='sqrt', class_weight=None, splitter='best',
                                      random_state=1),
               RandomForestClassifier(criterion='entropy',max_features='sqrt', class_weight='balanced', n_estimators=100,
                                      random_state=1),
               LogisticRegression(max_iter=10000,multi_class='ovr',solver='sag', random_state=1),
               MLPClassifier(alpha=1, max_iter=1000, random_state=1), RidgeClassifier(alpha=10),
               AdaBoostClassifier(learning_rate=1, random_state=1),
               XGBClassifier(booster='gbtree', n_estimators=100, max_depth=4)
              ]#GaussianNB(), QuadraticDiscriminantAnalysis()]
# Must stay in the same order as `classifiers`.
classifier_name = ['Knn', "Linear SVM", "RBF SVM", #"Gaussian Process",
                   "Decision Tree", "Random Forest", 
                   "Logisitc Regression", 'MLP NN', 'Ridge', 'Adaboost', 'XGBoost' ]#,'Gaussian NB', 'QDA'
                  

In [18]:
# Fresh, independent accumulators for every metric (test and train).
test_hamming_loss, train_hamming_loss = [], []
test_acc, train_acc = [], []
test_f1, train_f1 = [], []
test_precision, train_precision = [], []
test_recall, train_recall = [], []

In [19]:
# Wrap each base estimator in OneVsRestClassifier, fit it on the embedded
# training text, and record macro-averaged metrics for both splits.
_macro = {'average': 'macro', 'zero_division': 0}
for clf in tqdm(classifiers):
    classifier = OneVsRestClassifier(clf)
    classifier.fit(Xtrain_vec, ytrain)
    predictions = classifier.predict(Xtest_vec)
    train_preds = classifier.predict(Xtrain_vec)

    test_acc.append(round(accuracy_score(ytest, predictions), 3))
    train_acc.append(round(accuracy_score(ytrain, train_preds), 3))

    test_f1.append(round(f1_score(ytest, predictions, **_macro), 3))
    train_f1.append(round(f1_score(ytrain, train_preds, **_macro), 3))

    test_precision.append(round(precision_score(ytest, predictions, **_macro), 3))
    train_precision.append(round(precision_score(ytrain, train_preds, **_macro), 3))

    test_recall.append(round(recall_score(ytest, predictions, **_macro), 3))
    train_recall.append(round(recall_score(ytrain, train_preds, **_macro), 3))

    test_hamming_loss.append(round(hamming_loss(ytest, predictions), 3))
    train_hamming_loss.append(round(hamming_loss(ytrain, train_preds), 3))





100%|███████████████████████████████████████████████████████████████████████████████| 10/10 [1:23:44<00:00, 502.42s/it]


In [20]:
# Assemble the One-vs-Rest results table: one row per model, train/test
# columns paired.
_column_labels = ["Model",
                  "train f1", "test f1",
                  "train accuracy", "test accuracy",
                  "train recall", "test recall",
                  "train precision", "test precision",
                  "train hamming loss", "test hamming loss"]
_column_values = [classifier_name,
                  train_f1, test_f1,
                  train_acc, test_acc,
                  train_recall, test_recall,
                  train_precision, test_precision,
                  train_hamming_loss, test_hamming_loss]
comparison = pd.DataFrame(dict(zip(_column_labels, _column_values)))

In [21]:
# Display the One-vs-Rest comparison table.
comparison

Unnamed: 0,Model,train f1,test f1,train accuracy,test accuracy,train recall,test recall,train precision,test precision,train hamming loss,test hamming loss
0,Knn,0.625,0.105,0.72,0.464,0.537,0.093,0.768,0.149,0.027,0.073
1,Linear SVM,0.001,0.001,0.587,0.537,0.001,0.001,0.039,0.05,0.053,0.062
2,RBF SVM,0.646,0.053,0.777,0.534,0.508,0.04,0.928,0.082,0.02,0.056
3,Decision Tree,1.0,0.111,1.0,0.311,1.0,0.102,1.0,0.151,0.0,0.092
4,Random Forest,1.0,0.032,1.0,0.535,0.999,0.02,1.0,0.083,0.0,0.059
5,Logisitc Regression,0.108,0.06,0.595,0.531,0.073,0.045,0.552,0.147,0.047,0.057
6,MLP NN,0.048,0.05,0.589,0.534,0.035,0.036,0.078,0.081,0.049,0.057
7,Ridge,0.044,0.042,0.59,0.537,0.032,0.03,0.103,0.083,0.049,0.057
8,Adaboost,0.34,0.106,0.574,0.508,0.274,0.076,0.591,0.269,0.049,0.062
9,XGBoost,0.889,0.069,0.856,0.508,0.854,0.051,0.933,0.163,0.01,0.06


## Hyperparameter optimization