In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.style
matplotlib.style.use("seaborn")
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from tqdm import tqdm

import seaborn as sns
sns.color_palette("hls", 17)
import scipy.stats as st
import math

from pingouin import rcorr
import pingouin as pg
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV, RidgeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
#from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_validate, cross_val_score, learning_curve
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score, plot_roc_curve, roc_curve, auc, roc_auc_score,precision_recall_curve, hamming_loss
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.preprocessing import MinMaxScaler

from sklearn import feature_selection #import chi2

import gensim
import gensim.downloader as gensim_api
## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

In [2]:
import sys
import os

from sys import platform
if platform == "darwin":
    sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/..")
    smart_nlp_path = ''
elif platform == "win32":
    sys.path.append('../')
    smart_nlp_path = os.getcwd()
    smart_nlp_path = "\\".join([smart_nlp_path.split("\\")[i] for i in range(0,len(smart_nlp_path.split("\\"))-1)]+["/"])

from module.trend_analysis_functions import *
from module.topic_model_plus_class import Topic_Model_plus

In [3]:
test_data = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()),'data','ICS_predictive_sitreps_test.csv')).drop(["Unnamed: 0"], axis=1)
train_data = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()),'data','ICS_predictive_sitreps_train.csv')).drop(["Unnamed: 0"], axis=1)
val_data = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()),'data','ICS_predictive_sitreps_val.csv')).drop(["Unnamed: 0"], axis=1)

In [4]:
predictors = ["TOTAL_PERSONNEL", "TOTAL_AERIAL", "PCT_CONTAINED_COMPLETED",
              "ACRES",  "WF_FSR", "INJURIES", "FATALITIES", "EST_IM_COST_TO_DATE", "STR_DAMAGED",
              "STR_DESTROYED", "NEW_ACRES", "EVACUATION_IN_PROGRESS", 
              "NUM_REPORTS", "DAYS_BURING", 'Combined_Text', 'Incident_region_AICC', 
              'Incident_region_CA', 'Incident_region_EACC','Incident_region_GBCC', 'Incident_region_HICC', 
              'Incident_region_NRCC','Incident_region_NWCC', 'Incident_region_RMCC', 'Incident_region_SACC',
              'Incident_region_SWCC', 'INC_MGMT_ORG_ABBREV_1', 'INC_MGMT_ORG_ABBREV_2','INC_MGMT_ORG_ABBREV_3', 
              'INC_MGMT_ORG_ABBREV_4','INC_MGMT_ORG_ABBREV_5', 'INC_MGMT_ORG_ABBREV_B','INC_MGMT_ORG_ABBREV_C', 
              'INC_MGMT_ORG_ABBREV_D','INC_MGMT_ORG_ABBREV_E', 'INC_MGMT_ORG_ABBREV_F']
targets = ["Traffic","Command_Transitions","Evacuations", "Inaccurate_Mapping", "Aerial_Grounding", 
           "Resource_Issues", "Injuries", "Cultural_Resources","Livestock", "Law_Violations", "Military_Base", 
           "Infrastructure", "Extreme_Weather", "Ecological", "Hazardous_Terrain", "Floods", "Dry_Weather"]

# Prepare the Data

In [5]:
def remove_quote_marks(word_list):
    word_list = word_list.strip("[]").split(", ")
    word_list = [w.replace("'","") for w in word_list]
    word_list = " ".join(word_list)
    return word_list

In [6]:
dfs = [train_data, val_data, test_data]
for df in dfs:
    cleaned_combined_text = []
    for text in df['Combined_Text']:
        cleaned_text = remove_quote_marks(text)
        cleaned_combined_text.append(cleaned_text)
    df['Combined_Text'] = cleaned_combined_text

In [7]:
Xtrain = train_data['Combined_Text']; ytrain = train_data[targets]
Xval = val_data['Combined_Text']; yval = val_data[targets]
Xtest = test_data['Combined_Text']; ytest = test_data[targets]

# Vectorize the data

In [8]:
from sentence_transformers import SentenceTransformer
vec_model = SentenceTransformer('all-MiniLM-L6-v2')
Xtrain_vec = vec_model.encode(Xtrain)
Xval_vec = vec_model.encode(Xval)
Xtest_vec = vec_model.encode(Xtest)

In [9]:
Xtest_vec.shape

(2504, 384)

In [10]:
Xtest.shape

(2504,)

In [11]:
Xtest_vec[0]

array([ 2.08430318e-03,  5.29891811e-02,  5.63805848e-02,  5.24846353e-02,
       -1.98009294e-02, -5.37135340e-02, -2.94713005e-02,  3.00843758e-03,
       -5.36048114e-02,  3.71615738e-02, -7.25085139e-02,  3.40168476e-02,
       -2.22536419e-02,  2.83411741e-02, -2.35227887e-02,  6.15694299e-02,
       -1.17620267e-01,  1.12996018e-02, -3.51913013e-02, -2.40760781e-02,
       -1.79756782e-03,  7.55831748e-02, -5.51044494e-02,  5.87755553e-02,
       -4.46466506e-02,  2.95554064e-02, -2.21362077e-02,  6.35944530e-02,
        4.99554202e-02, -2.42515374e-02, -5.02032600e-03,  3.16083916e-02,
       -6.29145056e-02,  7.50913890e-03,  7.35858306e-02,  3.09250131e-02,
       -7.78815374e-02,  3.37398276e-02, -2.95815654e-02,  1.87330078e-02,
        2.31621880e-02,  1.03461696e-02,  2.11322047e-02, -2.33576652e-02,
        2.65683723e-03,  1.29131125e-02,  1.86523721e-02, -6.27204701e-02,
        3.05400137e-03, -3.37219276e-02,  1.33483753e-01, -2.40058247e-02,
       -2.51906421e-02, -

In [12]:
Xtest = test_data[predictors]

In [16]:
Xtest['Combined_Text'] = [a for a in Xtest_vec]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Xtest['Combined_Text'] = [a for a in Xtest_vec]


In [17]:
Xtest['Combined_Text']

0       [0.0020843032, 0.05298918, 0.056380585, 0.0524...
1       [-0.014200934, -0.019812403, 0.034033157, 0.01...
2       [-0.08205955, 0.044193313, -0.05975568, 0.0192...
3       [-0.022130225, 0.07685305, 0.11996939, 0.05023...
4       [0.01735855, 0.015201129, 0.14353183, 0.004974...
                              ...                        
2499    [0.023895172, 0.032366198, 0.08411811, 0.05927...
2500    [0.032280732, 0.03709234, 0.04344326, 0.068825...
2501    [-0.073423855, 0.06858109, -0.032805484, 0.028...
2502    [-0.104871266, 0.014583408, 0.037773974, 0.043...
2503    [0.022372281, 0.13807611, -0.0038996136, -0.00...
Name: Combined_Text, Length: 2504, dtype: object

In [19]:
import pickle

In [32]:
filename = os.path.join(os.path.dirname(os.getcwd()),'models','sbert_model.sav')
pickle.dump(vec_model, open(filename, 'wb'))

In [21]:
loaded_model = pickle.load(open(filename, 'rb'))

In [33]:
clf = MLPClassifier(random_state=1)
classifier = ClassifierChain(clf)
classifier.fit(Xtrain_vec, ytrain[targets])
filename = os.path.join(os.path.dirname(os.getcwd()),'models','sbert_likelihood_model1.sav')
pickle.dump(classifier, open(filename, 'wb'))

In [23]:
training_probs = classifier.predict_proba(Xtrain_vec)

In [24]:
training_probs.shape

(18579, 17)

In [25]:
probs_df = pd.DataFrame(training_probs, columns=targets)

In [28]:
Xtrain_meta = train_data[predictors].drop(["Combined_Text"],axis=1)

In [29]:
Xtrain_combined = pd.concat([Xtrain_meta, probs_df], axis=1)

In [30]:
classifier_meta = ClassifierChain(LogisticRegression(max_iter=10000,multi_class='ovr'))
classifier_meta.fit(Xtrain_combined, ytrain[targets])
# predict
Xtest_combined = pd.concat([Xtest.drop(["Combined_Text"],axis=1), pd.DataFrame(classifier.predict_proba(Xtest_vec), columns=targets)], axis=1)
predictions = classifier_meta.predict(Xtest_combined)

In [31]:
filename = os.path.join(os.path.dirname(os.getcwd()),'models','sbert_likelihood_model_2test.sav')
pickle.dump(classifier_meta, open(filename, 'wb'))