# Case Study: NTSB Reports

This notebook provides an example of MIKA capabilities via a case study on NTSB reports.
The capabilities shown in this example include:
- Information Retrieval 
- Topic Modeling
- FMEA extraction through custom NER
- Trend Analysis

## Case Study set up:

- Package imports
- Data import

In [5]:

import sys, os
#sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..",".."))
sys.path.append(os.path.join("..",".."))
import numpy as np
import pandas as pd
from mika.utils import Data
from mika.ir import search
from datetime import datetime as dt
from mika.kd.topic_model_plus import Topic_Model_plus
from mika.kd import trend_analysis
from mika.kd import FMEA
from sklearn.feature_extraction.text import CountVectorizer
from torch import cuda

  if LooseVersion(module.__version__) < minver:
  other = LooseVersion(other)
  if LooseVersion(module.__version__) < minver:
  other = LooseVersion(other)
  if LooseVersion(module.__version__) < minver:
  other = LooseVersion(other)
  if LooseVersion(module.__version__) < minver:
  other = LooseVersion(other)
  if LooseVersion(module.__version__) < minver:
  other = LooseVersion(other)
  duck_array_version = LooseVersion(duck_array_module.__version__)
  duck_array_version = LooseVersion("0.0.0")
  duck_array_version = LooseVersion("0.0.0")
  other = LooseVersion(other)
  other = LooseVersion(other)
  if LooseVersion(np.__version__) >= "1.20.0":
  other = LooseVersion(other)
  if LooseVersion(pd.__version__) < "0.25.0":
  other = LooseVersion(other)
  if LooseVersion(ipywidgets.__version__) >= LooseVersion("7.0.0"):
  if LooseVersion(ipywidgets.__version__) >= LooseVersion("7.0.0"):
  if LooseVersion(ipywidgets.__version__) >= LooseVersion("7.0.0"):
  if LooseVersion(ipywidgets.__versi

In [9]:
# Move the working directory two levels up from where the notebook was started.
# The target is resolved only once per kernel, so re-running this cell does not
# keep climbing further up the directory tree (the original pair of
# os.chdir('../') calls moved up two more levels on every re-run).
if 'REPO_ROOT' not in globals():
    REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
os.chdir(REPO_ROOT)
os.getcwd()

'c:\\Users\\srandrad\\smart_nlp'

In [16]:
# Dataset settings; the file path is relative to the current working directory.
ntsb_database_name = 'NTSB'
ntsb_document_id_col = 'id'
# narr_cause: narrative accident cause; narr_accf: narrative accident final
ntsb_text_columns = ['narr_cause', 'narr_accf']
ntsb_filepath = os.path.join("data/NTSB/ntsb_full.csv")

# Load the raw (not preprocessed) records.
# NOTE(review): load_kwargs requests dtype=str for every column; presumably this
# is forwarded to the underlying CSV reader - confirm against Data.load.
ntsb_data = Data()
ntsb_data.load(
    ntsb_filepath,
    preprocessed=False,
    text_columns=ntsb_text_columns,
    name=ntsb_database_name,
    load_kwargs={'dtype': str},
)

# Create document ids and combine the narrative columns; incomplete rows are kept.
ntsb_data.prepare_data(
    create_ids=True,
    combine_columns=ntsb_text_columns,
    remove_incomplete_rows=False,
)

  self.data_df['index'] = self.data_df.index
Combining Columns…: 100%|██████████| 243/243 [00:00<?, ?it/s]
  self.data_df["Combined Text"] = combined_text
Creating Unique IDs…: 100%|██████████| 243/243 [00:00<00:00, 2026.35it/s]

data preparation:  0.0 minutes 




  self.data_df["Unique IDs"] = unique_ids


## Information Retrieval
Two model options are available for IR:
1. fine-tuned model
2. pre-trained distilroberta model

In [None]:
# fine tuned model
model = os.path.join('models', 'fine_tuned_llis_model')
query = 'fatigue crack'
ir_ntsb = search(ntsb_data, model)
# The sentence embeddings must come from the NTSB documents being searched, so
# they are cached under data/NTSB (the original pointed at the LLIS embeddings).
embeddings_path = os.path.join('data', 'NTSB', 'ntsb_sentence_embeddings_finetune.npy')
if os.path.exists(embeddings_path):
    # Reuse embeddings saved by a previous run.
    ir_ntsb.load_sentence_embeddings(embeddings_path)
else:
    # First run: compute the embeddings and write them to embeddings_path.
    # NOTE(review): assumes get_sentence_embeddings also keeps the embeddings on
    # the search object so run_search can use them immediately - confirm.
    ir_ntsb.get_sentence_embeddings(embeddings_path)
print(ir_ntsb.run_search(query,return_k=5))

In [None]:
# pretrained model
model = 'all-distilroberta-v1'
query = 'fatigue crack'
ir_ntsb = search(ntsb_data, model)
# The sentence embeddings must come from the NTSB documents being searched, so
# they are cached under data/NTSB (the original pointed at the LLIS embeddings).
embeddings_path = os.path.join('data', 'NTSB', 'ntsb_sentence_embeddings.npy')
if os.path.exists(embeddings_path):
    # Reuse embeddings saved by a previous run.
    ir_ntsb.load_sentence_embeddings(embeddings_path)
else:
    # First run: compute the embeddings and write them to embeddings_path.
    # NOTE(review): assumes get_sentence_embeddings also keeps the embeddings on
    # the search object so run_search can use them immediately - confirm.
    ir_ntsb.get_sentence_embeddings(embeddings_path)
print(ir_ntsb.run_search(query,return_k=5))

## Topic Modeling

Here we apply BERTopic via `Topic_Model_plus` to create a taxonomy of failure information.

In [None]:
# Shared vectorizer: 1- to 3-grams with English stop words removed.
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")
tm = Topic_Model_plus(text_columns=ntsb_text_columns, data=ntsb_data)

# First pass: default sentence transformer / UMAP / HDBSCAN settings, then save the model.
tm.bert_topic(
    sentence_transformer_model=None,
    umap=None,
    hdbscan=None,
    count_vectorizor=vectorizer_model,
    ngram_range=(1,3),
    BERTkwargs={},
    from_probs=False,
    thresh=0.01,
)
tm.save_bert_model()

# Second pass: custom BERTopic keyword arguments, topics taken from probabilities.
# TODO(review): open question carried over from the original cell - why is
# BERTopic run twice?
BERTkwargs = dict(top_n_words=20, min_topic_size=150)
tm.bert_topic(
    count_vectorizor=vectorizer_model,
    BERTkwargs=BERTkwargs,
    from_probs=True,
)
tm.save_bert_results(from_probs=True)
tm.save_bert_taxonomy()

## FMEA using Named-Entity Recognition

In [None]:
# Severity categories defined according to FAA Order 8040.4B:
# https://www.faa.gov/documentLibrary/media/Order/FAA_Order_8040.4B.pdf
def _to_count(value):
    """Coerce a person count to a finite, non-negative float.

    Missing, NaN, negative, infinite or unparseable values become 0.0 so that
    counts read from a file as strings (or left blank) can still be classified.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return 0.0
    # The chained comparison is False for NaN, negatives and infinity.
    if not (0 <= number < float('inf')):
        return 0.0
    return number


def NTSB_severity(damage, inj_level, inj_tot_f, persons_onboard): #damage, ev_highest_injury, inj_tot_f
    """Classify one NTSB record into a severity category.

    Parameters
    ----------
    damage : str
        Aircraft damage code; 'MINR', 'SUBS' and 'DEST' are recognised, any
        other value is treated as no/unknown damage.
    inj_level : str
        Highest injury level code; 'NONE', 'MINR', 'SERS' and 'FATL' are
        recognised, any other value is classified from the damage code alone.
    inj_tot_f : number or str
        Total number of fatalities.
    persons_onboard : number or str
        Number of persons on board; 0 means unknown, in which case the
        fatality count is used instead.

    Returns
    -------
    str
        One of 'Minimal', 'Minor', 'Major', 'Hazardous' or 'Catastrophic'.
    """
    # Codes are compared case-insensitively and without surrounding whitespace.
    damage = str(damage).strip().upper()
    inj_level = str(inj_level).strip().upper()
    fatalities = _to_count(inj_tot_f)
    onboard = _to_count(persons_onboard)
    if int(onboard) == 0:
        onboard = fatalities
    # Guard the division: with no fatalities and no known occupants the share is 0.
    pct_fatal = fatalities / onboard if onboard else 0.0

    # hazardous: serious injuries
    if inj_level == 'SERS':
        return 'Hazardous'
    if inj_level == 'FATL':
        # hazardous: at most 2 fatalities, or fewer than 75% of those on board
        # NOTE(review): a fatal accident whose one or two occupants all died is
        # rated Hazardous here, exactly as in the original branch order - confirm
        # this is the intended reading of the order.
        if fatalities <= 2 or pct_fatal < 0.75:
            return 'Hazardous'
        # catastrophic: more than 2 fatalities and at least 75% of those on board
        return 'Catastrophic'

    # No injuries, minor injuries, or an unrecognised injury code: the damage
    # level decides. hull loss (DEST) -> hazardous, substantial (SUBS) -> major,
    # slight (MINR) -> minor.
    severity_by_damage = {'DEST': 'Hazardous', 'SUBS': 'Major', 'MINR': 'Minor'}
    if damage in severity_by_damage:
        return severity_by_damage[damage]
    # No or unknown damage: minor injuries are still 'Minor', otherwise 'Minimal'.
    # (The original left `severity` unassigned for these combinations.)
    return 'Minor' if inj_level == 'MINR' else 'Minimal'


def severity_func(df, damage_col='damage', injury_col='ev_highest_injury',
                  fatalities_col='inj_tot_f', persons_onboard_col=None):
    """Add a 'severity' column to ``df`` by applying NTSB_severity to each row.

    The default column names are the NTSB field names listed next to
    NTSB_severity's signature (damage, ev_highest_injury, inj_tot_f).
    # NOTE(review): confirm these match the columns of the frame that
    # calc_severity passes in.

    Parameters
    ----------
    df : pandas.DataFrame
        Records to classify; modified in place and also returned.
    damage_col, injury_col, fatalities_col : str
        Columns holding the damage code, highest injury level and fatality count.
    persons_onboard_col : str or None
        Column holding the number of persons on board. When None, 0 is passed
        for every row, which NTSB_severity treats as "unknown".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with the 'severity' column set.
    """
    # An empty frame may not have the input columns at all; nothing to classify.
    if len(df) == 0:
        df['severity'] = []
        return df
    if persons_onboard_col is None:
        onboard_values = [0] * len(df)
    else:
        onboard_values = df[persons_onboard_col]
    rows = zip(df[damage_col], df[injury_col], df[fatalities_col], onboard_values)
    df['severity'] = [
        NTSB_severity(damage, inj_level, fatalities, onboard)
        for damage, inj_level, fatalities, onboard in rows
    ]
    return df

In [None]:
# Checkpoint of the NER model used to extract FMEA entities.
model_checkpoint = os.path.join("models", "FMEA-ner-model", "checkpoint-1424")
print(model_checkpoint)

# Report which device is available and release any cached GPU memory.
device = 'cuda' if cuda.is_available() else 'cpu'
cuda.empty_cache()
print(device)

fmea = FMEA()
fmea.load_model(model_checkpoint)
print("loaded model")
# NOTE(review): 'Combined Text' is created by ntsb_data.prepare_data on the
# in-memory frame, while this call reads the raw CSV at ntsb_filepath - confirm
# that load_data can find that column when loading from the file.
input_data = fmea.load_data('Combined Text', ntsb_document_id_col, filepath=ntsb_filepath, formatted=False)

print("loaded data")
preds = fmea.predict()
df = fmea.get_entities_per_doc()
fmea.group_docs_with_meta(grouping_col='Occurrence_Description', additional_cols=['phase_no'])
# Keep the grouped entities before severity/frequency/risk are added.
fmea.grouped_df.to_csv(os.path.join(os.getcwd(),"ntsb_fmea_raw.csv"))
fmea.calc_severity(severity_func, from_file=False)
# These steps are called on the FMEA object itself, like every other step in
# this cell; the original went through `fmea.test_class`, which looks like a
# leftover from a unit-test fixture rather than an attribute of FMEA.
fmea.calc_frequency(year_col="ev_year") #add year column
fmea.calc_risk()
fmea.post_process_fmea(phase_name='additional', id_name='test', max_words=1)
fmea.fmea_df.to_csv(os.path.join(os.getcwd(),"NTSB_FMEA.csv"))