In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.style
matplotlib.style.use("seaborn")
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import seaborn as sn
sn.color_palette("hls", 17)
import scipy.stats as st
import math

from pingouin import rcorr
import pingouin as pg

In [2]:
import sys
import os

from sys import platform

# Make the repository root importable and work out the prefix used to build
# data-file paths in later cells.
if platform == "darwin":
    # `__file__` is not defined inside a notebook kernel, so fall back to the
    # current working directory instead of failing with NameError.
    try:
        _notebook_dir = os.path.dirname(os.path.realpath(__file__))
    except NameError:
        _notebook_dir = os.getcwd()
    sys.path.append(_notebook_dir + "/..")
    smart_nlp_path = ''
elif platform == "win32":
    sys.path.append('../')
    # Parent of the working directory, joined with backslashes and terminated
    # with "/" so later cells can append a relative path directly.
    smart_nlp_path = "\\".join(os.getcwd().split("\\")[:-1] + ["/"])
else:
    # Any other platform (e.g. Linux): previously `smart_nlp_path` was left
    # undefined here and the first data-loading cell raised NameError.
    # NOTE(review): later cells build paths with Windows-style backslashes, so
    # the data paths themselves may still need adjusting on these platforms.
    sys.path.append('../')
    smart_nlp_path = ''

from module.trend_analysis_functions import *
from module.topic_model_plus_class import Topic_Model_plus

In [3]:
# Load the cleaned incident summary reports, discard the index column that was
# saved into the CSV, and keep only incidents whose START_YEAR is 2006 or later.
incident_file = smart_nlp_path+r"input data\summary_reports_cleaned.csv"
incident_summary_df = (
    pd.read_csv(incident_file)
    .drop(columns="Unnamed: 0")
    .loc[lambda summaries: summaries["START_YEAR"] >= 2006]
    .reset_index(drop=True)
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Hazard Metrics

In [4]:
# Configuration handed to the project's Topic_Model_plus wrapper.
# Column that identifies the incident each situation report belongs to.
document_id_col = "INCIDENT_ID"
# Additional columns requested alongside the text.
# NOTE(review): presumably these are carried through into ICS.data_df — confirm
# against Topic_Model_plus.
extra_cols = ["CY","DISCOVERY_DATE", "START_YEAR", "REPORT_DOY", "DISCOVERY_DOY",
              "TOTAL_PERSONNEL", "TOTAL_AERIAL", "PCT_CONTAINED_COMPLETED"]
# Text attribute(s) passed to the wrapper.
list_of_attributes = ["Combined Text"]
# Already-preprocessed situation-report data, located under smart_nlp_path.
file = smart_nlp_path+r"\input data\ICS_filtered_preprocessed_data_extra_cols.csv"

ICS = Topic_Model_plus(document_id_col=document_id_col, extra_cols=extra_cols, list_of_attributes=list_of_attributes, combine_cols=False)
# Reads the preprocessed data from `file`; the cell output below echoes the
# path that was used.
ICS.extract_preprocessed_data(file)

  if (await self.run_code(code, result,  async_=asy)):


Preprocessed data extracted from:  C:\Users\srandrad\smart_nlp\/\input data\ICS_filtered_preprocessed_data_extra_cols.csv


In [5]:
# Situation-report frame that the rest of the notebook augments with new columns.
# This is a plain assignment (no .copy()), so the columns added below presumably
# also appear on ICS.data_df — TODO confirm data_df is a plain attribute.
preprocessed_df = ICS.data_df
# Spreadsheet passed as the first argument to calc_metrics in the next cell.
hazard_file = smart_nlp_path+r"\output data\hazard_interpretation_v2.xlsx"

In [6]:
# Compute the per-hazard metrics from the hazard spreadsheet and the situation
# reports. Only `fires` is read by the data-preparation cells below; the
# recorded run of this cell took about five minutes.
(
    time_of_occurence_days,
    time_of_occurence_pct_contained,
    frequency,
    fires,
    frequency_fires,
    categories,
    hazards,
    years,
    ids,
) = calc_metrics(hazard_file, preprocessed_df, rm_outliers=True)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  numpy.float,
100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [05:01<00:00, 33.45s/it]


# Data Preparation
To prepare the data for the models, we need to add new columns, including:
- Region - categorical
- Hazard Occurrence - one hot
- Severity - continuous
- Days Burning - continuous
- Reports so far - continuous
- Total text from all reports so far

#### Region

In [10]:
# Coordination-center regions and the states each one covers. A state that is
# divided between two regions appears as a one-entry dict ({state: [boundary]})
# in both of them and is resolved separately from the plain lookup built below.
regions = {
    'AICC': ['AK'],
    'EACC': ['MN', 'IA', 'WI', 'IL', 'IN', 'MO', 'MI', 'OH', 'WV', 'PA',
             'NY', 'VT', 'NH', 'ME', 'MA', 'RI', 'CT', 'NJ', 'DE', 'MD'],
    'GBCC': ['NV', 'UT', {'ID': ["46n"]}],
    'SACC': [{'TX': ["100w"]}, 'OK', 'AR', 'LA', 'MS', 'AL', 'GA', 'FL', 'SC', 'NC',
             'TN', 'KY', 'VA'],
    'NWCC': ['OR', 'WA'],
    'CA': ['CA'],  # OSCC and ONCC about 38 N
    'NRCC': ['MT', 'ND', {'ID': ["46n"]}],
    'RMCC': ['CO', 'WY', 'SD', 'KS', 'NE'],
    'SWCC': ['NM', 'AZ', {'TX': ["100w"]}],
    'HICC': ['HI'],
}

# `states` lists every whole-state entry in region order; `region_by_state`
# maps each of those states to its region. Split states (dict entries) are
# left out of both.
states = []
region_by_state = {}
for region_name, members in regions.items():
    for member in members:
        if isinstance(member, dict):
            continue
        states.append(member)
        region_by_state[member] = region_name

In [11]:
# States that are divided between two regions. Each entry is a three-item list:
#   [0] (threshold in degrees as a string, compass letter naming the axis),
#   [1] ('less', region used when the incident's coordinate is <= the threshold),
#   [2] ('greater', region used otherwise).
split_states = {'TX':[("100",'w'), ('less','SACC'), ('greater', 'SWCC')],
                'ID':[("46","n"), ('less', 'GBCC'), ('greater', 'NRCC')]}
# Maps a compass letter to the point-of-origin coordinate column that is
# compared against the threshold (n/s -> latitude, w/e -> longitude).
direction_dict = {'n':'POO_LATITUDE', 's':'POO_LATITUDE', 'w':'POO_LONGITUDE', 'e':'POO_LONGITUDE'}

In [12]:
# Assign a region to every incident summary report.
#   1. Use POO_STATE; when it is missing, try to recover the state from the
#      second "_"-separated field of INCIDENT_ID (the part before the first "-").
#   2. Whole-state regions come from `region_by_state`.
#   3. Split states (TX, ID) are resolved by comparing the point-of-origin
#      coordinate with the boundary stored in `split_states`.
# Incidents whose state cannot be resolved get None.
region_per_incident = []
for i in range(len(incident_summary_df)):
    summary_row = incident_summary_df.iloc[i]
    incident_region = None
    id_ = summary_row['INCIDENT_ID']
    state = summary_row['POO_STATE']
    if isinstance(state, float):
        # A float state is treated as "missing" (presumably NaN from read_csv).
        id_ = id_.split("_")
        if len(id_)>1:
            state = id_[1].split("-")[0]
            if len(state)>2:
                state = 'CA'#only one instance of this manually checked and verified it is in california
                print(id_)
    if state in region_by_state:
        incident_region = region_by_state[state]
    elif state in split_states:
        threshold_spec, less_spec, greater_spec = split_states[state]
        (pos, direct) = threshold_spec
        # Compare the coordinate's magnitude with the (positive) boundary.
        # The previous raw comparison put every incident with a negative
        # (signed, western-hemisphere) longitude on the 'less' side of 100 W.
        # Taking abs() gives the same answer for positive-valued coordinates
        # and the intended answer for signed ones.
        # NOTE(review): assumes all incidents lie in the northern/western
        # hemisphere, which matches the 'n'/'w' boundaries in split_states.
        incident_loc = abs(float(summary_row[direction_dict[direct]]))
        if incident_loc <= float(pos):
            incident_region = less_spec[1]
        else:
            # Also reached when the coordinate is NaN (unchanged behaviour).
            incident_region = greater_spec[1]

    region_per_incident.append(incident_region)

incident_summary_df['Incident_region'] = region_per_incident

['2012', '004462', 'COPCO']


In [13]:
# Copy each incident's region from its summary report onto every situation
# report of that incident.
#   * Exactly one summary report  -> its 'Incident_region'.
#   * Several summary reports     -> message printed, region left as None.
#   * No summary report at all    -> None (this used to raise KeyError from
#                                    `incident_df.at[0, ...]` on an empty frame).
# One dictionary lookup per row replaces filtering the whole summary frame for
# every situation report.
summary_ids = incident_summary_df['INCIDENT_ID']
ambiguous_ids = set(summary_ids[summary_ids.duplicated(keep=False)])
region_by_incident = dict(zip(summary_ids, incident_summary_df['Incident_region']))

sitrep_regions = []
for id_ in preprocessed_df['INCIDENT_ID']:
    if id_ in ambiguous_ids:
        print("multiple summary reports")
        rep_region = None
    else:
        rep_region = region_by_incident.get(id_)
    sitrep_regions.append(rep_region)
preprocessed_df['Incident_region'] = sitrep_regions

#### Severity and Hazard Occurrence for sit reps

In [14]:
# One-hot hazard columns for the situation reports: preprocessed_df[hazard] is
# 1 when the report's INCIDENT_ID is among the fires recorded for that hazard
# (in any year) and 0 otherwise.
fires_per_hazard = {hazard:[fire for year in fires[hazard] for fire in fires[hazard][year]] for hazard in fires}
incident_ids = preprocessed_df['INCIDENT_ID'].tolist()

hazard_df_data = {}
for hazard, hazard_fires in fires_per_hazard.items():
    # Set membership keeps the scan linear in the number of reports; testing
    # against the list re-walked every fire id for every report.
    fire_lookup = set(hazard_fires)
    hazard_df_data[hazard] = [1 if fire_id in fire_lookup else 0 for fire_id in incident_ids]

for hazard in fires_per_hazard:
    preprocessed_df[hazard] = hazard_df_data[hazard]

In [15]:
# Severity of a situation report = fatalities + injuries + structures damaged
# + structures destroyed, each converted with int() before summing.
severity_columns = ['FATALITIES', 'INJURIES', 'STR_DAMAGED', 'STR_DESTROYED']
total_severity = [
    sum(int(count) for count in report_counts)
    for report_counts in zip(*(preprocessed_df[column] for column in severity_columns))
]
preprocessed_df['Severity'] = total_severity

#### Severity and Hazard Occurrence for summary reports

In [16]:
# One-hot hazard columns and total severity for the incident summary reports.
fires_per_hazard = {hazard:[fire for year in fires[hazard] for fire in fires[hazard][year]] for hazard in fires}
incident_ids = incident_summary_df['INCIDENT_ID'].tolist()

# Column names are the hazard names with spaces replaced by underscores.
# `hazards` is in the same order as `fires_per_hazard`, so the two are paired
# with zip() rather than converting the column name back with
# replace("_", " "), which looked up the wrong key for any hazard whose
# original name already contained an underscore.
hazards = [hazard.replace(" ", "_") for hazard in fires_per_hazard]
hazard_df_data = {}
for hazard_name, column_name in zip(fires_per_hazard, hazards):
    # Set membership instead of re-scanning the fire list for every incident.
    fire_lookup = set(fires_per_hazard[hazard_name])
    hazard_df_data[column_name] = [1 if fire_id in fire_lookup else 0 for fire_id in incident_ids]
for hazard in hazards:
    incident_summary_df[hazard] = hazard_df_data[hazard]

# Severity = fatalities + injuries + structures damaged + structures destroyed,
# using the summary-report column names; each value is converted with int().
total_severity = []
for i in range(len(incident_summary_df)):
    summary_row = incident_summary_df.iloc[i]
    severity = int(summary_row['FATALITIES']) + int(summary_row['INJURIES_TOTAL']) + int(summary_row['STR_DAMAGED_TOTAL']) + int(summary_row['STR_DESTROYED_TOTAL'])
    total_severity.append(severity)
incident_summary_df['Severity'] = total_severity

#### Number of Reports for incident so far & Days Burning

In [17]:
# Add NUM_REPORTS (the numeric suffix of 'Unique IDs') and DAYS_BURING (report
# day-of-year minus discovery day-of-year, after repairing inconsistent dates).
reports_so_far = []
days_burning_so_far = []
for row_idx in range(len(preprocessed_df)):
    report = preprocessed_df.iloc[row_idx]
    reports_so_far.append(int(report['Unique IDs'].split("_")[-1]))

    report_doy = int(report['REPORT_DOY'])
    discovery_doy = int(report['DISCOVERY_DOY'])
    # A report dated before the discovery needs one of three corrections.
    if report_doy < discovery_doy:
        if report_doy < 30:
            if discovery_doy < 330:
                # report day is days since start, not doy
                report_doy += discovery_doy
            else:
                # fire spans two years
                discovery_doy -= 365
        else:
            # start and report day were incorrectly switched
            report_doy, discovery_doy = discovery_doy, report_doy
    days_burning_so_far.append(report_doy - int(discovery_doy))

preprocessed_df['NUM_REPORTS'] = reports_so_far
# Column name spelling kept as is; it is what ends up in the exported CSV.
preprocessed_df['DAYS_BURING'] = days_burning_so_far

#### Total Text from incident reports so far

In [18]:
# Build 'Total Incident Text': for each situation report, the concatenation of
# 'Combined Text' from every report of the same incident up to and including
# this one. Reports of one incident must be on consecutive rows, because the
# running text restarts whenever INCIDENT_ID differs from the previous row.
#
# The texts are accumulated in a plain list and assigned once at the end. The
# earlier version read rows by position (.iloc[i]) but wrote them by label
# (.at[i]), which only agrees on a default 0..n-1 index, and it never used
# `total_text_so_far`.
incident_id_values = preprocessed_df['INCIDENT_ID'].tolist()
combined_texts = preprocessed_df['Combined Text'].tolist()

total_text_so_far = []
prev_id = None  # no previous row yet; also guarded by `i > 0` below
for i in tqdm(range(len(preprocessed_df))):
    incident_id = incident_id_values[i]
    if i > 0 and prev_id == incident_id:
        total_text_so_far.append(total_text_so_far[-1] + combined_texts[i])
    else:
        total_text_so_far.append(combined_texts[i])
    prev_id = incident_id

# Wrap in an object Series on the frame's own index so each row receives its
# own text regardless of the index labels.
preprocessed_df['Total Incident Text'] = pd.Series(total_text_so_far, index=preprocessed_df.index, dtype=object)


100%|██████████████████████████████████████████████████████████████████████████| 44363/44363 [00:17<00:00, 2609.21it/s]


In [19]:
# Show the augmented situation-report frame; the rendered output is truncated
# to the first and last rows and columns.
preprocessed_df

Unnamed: 0,CY,DISCOVERY_DATE,INCIDENT_ID,PCT_CONTAINED_COMPLETED,START_YEAR,TOTAL_AERIAL,TOTAL_PERSONNEL,REPORT_DOY,DISCOVERY_DOY,Combined Text,...,Infrastructure,Extreme Weather,Ecological,Hazardous Terrain,Floods,Dry Weather,Severity,NUM_REPORTS,DAYS_BURING,Total Incident Text
0,2010,2010-07-15 15:00:00,2000_CA-RRU-062485_VALLEY COMPLEX,80.0,2010.0,5.000000,230.000000,197,196,"[resource, share, cactus]",...,0,1,0,1,0,0,0,0,1,"[resource, share, cactus]"
1,2010,2010-07-15 15:00:00,2000_CA-RRU-062485_VALLEY COMPLEX,60.0,2010.0,5.000000,230.000000,197,196,"[resource, share, incident, cactus, incident, ...",...,0,1,0,1,0,0,0,1,1,"[resource, share, cactus, resource, share, inc..."
2,2010,2010-07-15 15:00:00,2000_CA-RRU-062485_VALLEY COMPLEX,30.0,2010.0,4.000000,165.000000,197,196,"[resource, share, cactus, erratic, wind, due, ...",...,0,1,0,1,0,0,0,2,1,"[resource, share, cactus, resource, share, inc..."
3,2010,2010-07-15 15:00:00,2000_CA-RRU-062485_VALLEY COMPLEX,100.0,2010.0,4.333333,192.333333,197,196,"[resource, share, cactus, cactus, become, vall...",...,0,1,0,1,0,0,0,4,1,"[resource, share, cactus, resource, share, inc..."
4,2010,2010-07-15 15:00:00,2000_CA-RRU-062485_VALLEY COMPLEX,60.0,2010.0,4.333333,192.333333,197,196,"[resource, share, cactus, cactus, become, vall...",...,0,1,0,1,0,0,1,5,1,"[resource, share, cactus, resource, share, inc..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44358,2014,2014-03-15 14:30:00,2014_VAVAS1403037_BEAVER LODGE RD.,100.0,2014.0,0.000000,13.000000,74,74,"[fast, spread, field]",...,0,0,0,0,0,0,2,0,0,"[fast, spread, field]"
44359,2014,2014-03-19 14:00:00,2014_VAVAS1406037_AIRPORT MOUNTAIN,85.0,2014.0,0.000000,18.500000,80,78,"[heavy, plume, primary, carrier]",...,0,0,0,0,0,0,0,1,2,"[heavy, plume, primary, carrier]"
44360,2014,2014-08-20 13:00:00,2014_WA-WFS-513_SAND RIDGE,0.0,2014.0,1.000000,95.000000,234,232,"[heavy, canyon, river, mainly, canyon, come, e...",...,0,0,0,0,0,0,0,0,2,"[heavy, canyon, river, mainly, canyon, come, e..."
44361,2014,2014-08-20 13:00:00,2014_WA-WFS-513_SAND RIDGE,86.0,2014.0,1.000000,120.000000,235,232,"[laid, night, test, wind, remain, canyon, peri...",...,0,0,0,0,0,0,0,1,3,"[heavy, canyon, river, mainly, canyon, come, e..."


In [20]:
# Replace spaces with underscores in the column names of both frames, in place.
for report_frame in (incident_summary_df, preprocessed_df):
    report_frame.columns = report_frame.columns.str.replace(" ", "_", regex=False)

In [22]:
# Write both prepared frames to CSV files in the current working directory.
# The index is written too (to_csv default).
export_targets = (
    (preprocessed_df, "ICS_predictive_model_sitreps.csv"),
    (incident_summary_df, "ICS_predictive_model_summaryreps.csv"),
)
for report_frame, csv_name in export_targets:
    report_frame.to_csv(csv_name)