In [1]:
import pandas as pd
import os
import geopandas as gpd

# Load the ICS-209 situation reports from a local file. The file is larger
# than 100MB and therefore cannot be uploaded to GitHub.
# The path is assembled with os.path.join instead of a hard-coded backslash:
# "\i" is not a valid string escape (newer Python versions warn about it), and
# a literal backslash separator only resolves on Windows.
sitrep_path = os.path.join("data", "ics209-plus-wf_sitreps_1999to2022.csv")
# low_memory=False turns off chunked parsing, so each column's dtype is
# inferred from the whole file rather than chunk by chunk.
sitreps_df = pd.read_csv(sitrep_path, low_memory=False)
# Only the last expression of a cell is rendered, so finish on the column index.
sitreps_df.columns

Index(['Unnamed: 0', 'ACRES', 'ADDNTL_FUEL_MODEL',
       'ADDTNL_COOP_ASSIST_ORG_NARR', 'ANTICIPATED_COMPLETION_DATE',
       'AREA_CLOSURE_FLAG', 'CAUSE', 'COMPLEX', 'COMPLEXITY_LEVEL_NARR',
       'COMPLEX_NAME',
       ...
       'INCTYP_DESC', 'INCTYP_ABBREVIATION', 'DISCOVERY_DOY', 'REPORT_DOY',
       'EVENT_ID', 'NEW_ACRES', 'REPORT_DAY_SPAN', 'EVENT_FINAL_ACRES',
       'WF_FSR', 'MAX_FIRE_PCT_FINAL_SIZE'],
      dtype='object', length=155)

In [2]:
# Print a summary of the loaded frame: index range, column count,
# dtype breakdown and memory usage.
sitreps_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205589 entries, 0 to 205588
Columns: 155 entries, Unnamed: 0 to MAX_FIRE_PCT_FINAL_SIZE
dtypes: float64(57), int64(3), object(95)
memory usage: 243.1+ MB


In [3]:
# Explore the fields relevant to social impact.

# RISK_ASSESSMENT -- per the author's note, values run
# low / Medium / High / Extreme / NaN.
sitreps_df["RISK_ASSESSMENT"].unique()

# LIFE_SAFETY_HEALTH_STATUS_NARR (author's note: covers impacts to personnel too).
# Count its distinct values, NaN included -- about 16699 of them.
sitreps_df["LIFE_SAFETY_HEALTH_STATUS_NARR"].nunique(dropna=False)

16699

In [9]:
# Collect the distinct CURRENT_THREAT_NARR values as an array
uniquerisk = sitreps_df["CURRENT_THREAT_NARR"].unique()

# Wrap that array in a single-column DataFrame
uniquerisk_df = pd.DataFrame({"CURRENT_THREAT_NARR": uniquerisk})

# Write the narrative training subset to disk, leaving out the row index
uniquerisk_df.to_csv('data/processed/uniquerisk_df.csv', index=False)

In [5]:
# Show every situation report whose INCIDENT_ID is the Caldor fire's
sitreps_df.loc[sitreps_df["INCIDENT_ID"].eq("2021_12908560_CALDOR")]

Unnamed: 0.1,Unnamed: 0,ACRES,ADDNTL_FUEL_MODEL,ADDTNL_COOP_ASSIST_ORG_NARR,ANTICIPATED_COMPLETION_DATE,AREA_CLOSURE_FLAG,CAUSE,COMPLEX,COMPLEXITY_LEVEL_NARR,COMPLEX_NAME,...,INCTYP_DESC,INCTYP_ABBREVIATION,DISCOVERY_DOY,REPORT_DOY,EVENT_ID,NEW_ACRES,REPORT_DAY_SPAN,EVENT_FINAL_ACRES,WF_FSR,MAX_FIRE_PCT_FINAL_SIZE
191486,191486,400.0,,,2021-08-31 12:00:00,N,U,False,,,...,Wildfire,WF,226.0,227,12908560,400.0,1.0,221835.0,400.0,0.001803
191487,191487,400.0,,,2021-08-31 12:00:00,N,U,False,,,...,Wildfire,WF,226.0,228,12908560,0.0,1.0,221835.0,0.0,0.001803
191488,191488,2261.0,,,2021-08-31 12:00:00,Y,U,False,,,...,Wildfire,WF,226.0,228,12908560,1861.0,1.0,221835.0,1861.0,0.010192
191489,191489,6500.0,,,2021-08-31 12:00:00,Y,U,False,,,...,Wildfire,WF,226.0,229,12908560,4239.0,1.0,221835.0,4239.0,0.029301
191490,191490,22919.0,Light Logging Slash,,2021-08-31 12:00:00,Y,H,False,,,...,Wildfire,WF,226.0,229,12908560,16419.0,1.0,221835.0,16419.0,0.103316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191584,191584,221835.0,Timber (Litter and Understory),,2021-10-31 12:00:00,Y,U,False,,,...,Wildfire,WF,226.0,290,12908560,0.0,1.0,221835.0,0.0,1.000000
191585,191585,221835.0,Timber (Litter and Understory),,2021-10-31 12:00:00,Y,U,False,,,...,Wildfire,WF,226.0,291,12908560,0.0,1.0,221835.0,0.0,1.000000
191586,191586,221835.0,Timber (Litter and Understory),,2021-10-31 12:00:00,Y,U,False,,,...,Wildfire,WF,226.0,292,12908560,0.0,1.0,221835.0,0.0,1.000000
191587,191587,221835.0,Timber (Litter and Understory),,2021-10-20 12:00:00,Y,U,False,,,...,Wildfire,WF,226.0,293,12908560,0.0,1.0,221835.0,0.0,1.000000


In [10]:
# Restrict sitreps_df to the Caldor and Dixie fire incidents
specific_event_id = ['2021_12908560_CALDOR', '2021_12993824_DIXIE']
is_focus_fire = sitreps_df['INCIDENT_ID'].isin(specific_event_id)
focusfires_df = sitreps_df.loc[is_focus_fire]

# Persist the focus-fire subset, leaving the row index out of the CSV
focusfires_df.to_csv('data/processed/focusfires_df.csv', index=False)

# Summarize the subset (row count, columns, dtypes, memory)
focusfires_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 276 entries, 191486 to 194404
Columns: 155 entries, Unnamed: 0 to MAX_FIRE_PCT_FINAL_SIZE
dtypes: float64(57), int64(3), object(95)
memory usage: 336.4+ KB


Text preprocessing is a crucial step in Natural Language Processing (NLP) that transforms text into a format that is suitable for further analysis. Here are some common techniques:

1. **Tokenization**: This is the process of breaking down text into individual words (or tokens). This is usually the first step in text preprocessing.

    ```python
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(text)
    ```

2. **Lowercasing**: This is done so that the same word is not counted as several different tokens. For example, 'Hello' and 'hello' should be treated as the same word.

    ```python
    text = text.lower()
    ```

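3. **Stopwords Removal**: Stopwords are common words that carry little meaning on their own and are usually removed from texts. Examples of stopwords are 'is', 'the', and 'and'. NLTK's English stopword list is all lowercase, so lowercase the text before tokenizing; otherwise capitalized stopwords such as 'The' will not be filtered out.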

    ```python
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    filtered_text = [word for word in tokens if word not in stop_words]
    ```

4. **Stemming**: This is the process of reducing inflected (or sometimes derived) words to their word stem or root form. For example, 'jumps', 'jumping', 'jumped' are all transformed to 'jump'.

    ```python
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    stemmed_text = [stemmer.stem(word) for word in filtered_text]
    ```

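5. **Lemmatization**: Similar to stemming, but it reduces words to their dictionary base form (lemma), taking the word's part of speech into account. It's more accurate but slower than stemming. Note that `WordNetLemmatizer.lemmatize` treats every word as a noun unless a part-of-speech tag is passed through its `pos` argument.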

    ```python
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_text]
    ```

6. **Removing Punctuation**: Punctuation provides grammatical context that helps a human reader understand a sentence. However, a vectorizer that only counts words and ignores context gains nothing from it, so punctuation is often removed.

    ```python
    import string
    text = text.translate(str.maketrans('', '', string.punctuation))
    ```

7. **Removing HTML tags**: When working with HTML data, we usually need to strip out the HTML tags so that only the text content remains.

    ```python
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    ```