In [1]:
import pandas as pd
import networkx as nx
import regex as re
import matplotlib.pyplot as plt
import matplotlib as mat
from pyvis.network import Network

In [2]:
df = pd.read_pickle("US_Accidents_Processed_df.pkl")

In [3]:
# All of the different findings to split into network diagrams
findings_ls = list(set(df['finding_description']))
findings_ls

['Aircraft-Aircraft systems-Landing gear system-Brake-Not specified - C',
 'Organizational issues-Support/oversight/monitoring-Training-Initial training-Maintenance provider - F',
 'Environmental issues-Conditions/weather/phenomena-Wind-Dust devil/whirlwind-Ability to respond/compensate - F',
 'Environmental issues-Conditions/weather/phenomena-Wind-Tailwind-Compliance w/ procedure - F',
 'Aircraft-Aircraft power plant-Engine fuel and control-Fuel divider-Not specified - C',
 'Environmental issues-Operating environment-Airport facilities/design-Runway/landing area length-Decision related to condition',
 'Aircraft-Aircraft power plant-Engine bleed air system-Compressor bleed control-Damaged/degraded - C',
 'Environmental issues-Conditions/weather/phenomena-Turbulence-(general)-Effect on equipment',
 'Aircraft-Aircraft power plant-Engine starting-(general)-Incorrect use/operation - C',
 'Aircraft-Aircraft handling/service-Towing and taxiing-Taxiing-Incorrect use/operation - C',
 'Organiza

In [None]:
# Preprocessing the dataset to be easier to read and follow
# Removing unecessary part at end
findings_ls_ver1 = []
for finding in findings_ls:
    if re.split('-', finding)[-1] == ' C':
        finding = finding.replace('- C', '')
    if re.split('-', finding)[-1] == ' F':
        finding = finding.replace('- F', '')
    findings_ls_ver1.append(finding)

In [9]:
# Locate all of the (general) options and instead combine it with the next part
findings_ls_ver2 = []
for finding in findings_ls_ver1:
    if '(general)' in finding:
        if finding.count('(general)') == 1:
            # know which specific index the 'general' is in and recreate the previous format
            ix = finding.find('(general)')
            ls = re.split('-', finding[ix:])
            if len(ls) == 2:
                finding = finding[0:ix] + ls[0] + ' ' + ls[1] # finding = before (general) + (general) + rest of the finding
            else:
                print(finding, '- Recheck')
        else:
            # what if (general) occurs more than once? Remove one of the instances and repeat the same process as above
            finding = finding.replace('-(general)', '', 1)
            ix = finding.find('(general)')
            ls = re.split('-', finding[ix:])
            if len(ls) == 2:
                finding = finding[0:ix] + ls[0] + ' ' + ls[1] # finding = before (general) + (general) + rest of the finding
            else:
                print(finding, '- Recheck')
    findings_ls_ver2.append(finding)

In [11]:
# double check and remove all empty space after all findings and then use list(set())
findings_ls_ver3 = list(set([finding.rstrip() for finding in findings_ls_ver2]))
print(str(len(findings_ls_ver2)) + 'vs ' + str(len(findings_ls_ver3)))

6163vs 3793


In [12]:
# renaming it over the original version to keep clarity
findings_ls = findings_ls_ver3

In [13]:
# Count the number of layers that will be needed in the network diagram - max and min
word_no_ls = []
for finding in findings_ls:
    word_no = len(re.split('-', finding))
    word_no_ls.append(word_no)
max_word_no = max(word_no_ls)
min_word_no = min(word_no_ls)
print('Min connections are: ', min_word_no)
print('Max connections are: ', max_word_no)

Min connections are:  2
Max connections are:  6


In [14]:
# identify the unique words for each layer
type_ls = []
sys_ls = []
subsys_ls = []
details_ls = []
moredet_ls = []
evenmore_ls = []
ix = 0
for finding in findings_ls:
    # for each finding I want to append each word to a separate list
    # but I also want the connection
    # remove trailing characters to enable the only values to be found and shown using .rstrip()
    ix += 1
    word_ls = re.split('-', finding)
    if len(word_ls) == 2:
        type_ls.append(word_ls[0].rstrip())
        sys_ls.append(word_ls[1].rstrip())
    elif len(word_ls) == 3:
        type_ls.append(word_ls[0].rstrip())
        sys_ls.append(word_ls[1].rstrip())
        subsys_ls.append(word_ls[2].rstrip())
    elif len(word_ls) == 4:
        type_ls.append(word_ls[0].rstrip())
        sys_ls.append(word_ls[1].rstrip())
        subsys_ls.append(word_ls[2].rstrip())
        details_ls.append(word_ls[3].rstrip())
    elif len(word_ls) == 5:
        type_ls.append(word_ls[0].rstrip())
        sys_ls.append(word_ls[1].rstrip())
        subsys_ls.append(word_ls[2].rstrip())
        details_ls.append(word_ls[3].rstrip())
        moredet_ls.append(word_ls[4].rstrip())
    else:
        type_ls.append(word_ls[0].rstrip())
        sys_ls.append(word_ls[1].rstrip())
        subsys_ls.append(word_ls[2].rstrip())
        details_ls.append(word_ls[3].rstrip())
        moredet_ls.append(word_ls[4].rstrip())
        evenmore_ls.append(word_ls[5].rstrip())

In [15]:
type_ls = set(type_ls)
sys_ls = set(sys_ls)
subsys_ls = set(subsys_ls)
details_ls = set(details_ls)
moredet_ls = set(moredet_ls)
evenmore_ls = set(evenmore_ls)

## Create the network diagram of the connections

In [17]:
def NetworkLinks(type_ls, findings_ls, ix1, ix2):
    LinkDict = {}
    for type_name in type_ls:
        ls = []
        for finding in findings_ls:
            type_names_found = re.split('-', finding)
            if len(type_names_found) < 2:
                continue
            else:
                type_name_found = type_names_found[ix1]
                system = re.split('-', finding)[ix2]
                if type_name == type_name_found:
                    ls.append(system)
                else:
                    continue
        ls = list(set(ls))
        LinkDict[type_name] = ls 
    return LinkDict

In [18]:
def NetworkLinks_ver2(subsys_ls, findings_ls, ix1, ix2): # only for subsystem to more details
    LinkDict = {}
    for type_name in subsys_ls:
        ls = []
        for finding in findings_ls:
            type_names_found = re.split('-', finding)
            if len(type_names_found) <= 3: # condition for the 
                continue
            else:
                type_name_found = type_names_found[ix1]
                system = type_names_found[ix2]
                if type_name == type_name_found:
                    ls.append(system)
                else:
                    continue
        ls = list(set(ls))
        LinkDict[type_name] = ls
    return LinkDict

In [19]:
type_to_sys = NetworkLinks(type_ls, findings_ls, 0, 1)
sys_to_sub = NetworkLinks_ver2(sys_ls, findings_ls, 1, 2)
sub_to_det = NetworkLinks_ver2(subsys_ls, findings_ls, 2, 3) # need to add an extra condition for sets of strings that don't have any more words
#det_to_more = NetworkLinks_ver2(details_ls, findings_ls, 3, 4)

In [20]:
list(sys_to_sub.values())[0]

['Personality/attitude',
 'Cognitive limitation',
 'Perception/orientation/illusio',
 'Mental/emotional state',
 'Perception/orientation/illusion',
 'Attention/monitoring']

In [21]:
key_ls = list(type_to_sys.keys())
value_ls = list(type_to_sys.values())

## Choose a specific type no. for visualisation

In [22]:
type_no = 0
for key in key_ls:
    print(key, ' correspond to Type No. ', type_no)
    type_no += 1

main system  correspond to Type No.  0
Environmental issues  correspond to Type No.  1
Aircraft  correspond to Type No.  2
Not determined  correspond to Type No.  3
Organizational issues  correspond to Type No.  4
Personnel issues  correspond to Type No.  5


In [25]:
type_no = 2
# plt.figure(3,figsize=(15, 15)) 
G = nx.Graph()
for value in value_ls[type_no]:
    G.add_edge(key_ls[type_no], value)
nt = Network(notebook = True)
nt.from_nx(G)
nt.show('nx.html')

nx.html


In [24]:
# How to build on top of the graph already made?
sys_to_sub_ls = list(sys_to_sub.keys())
value_test_ls = []
#plt.figure(3,figsize=(20, 20)) 
# pos = nx.bfs_layout(G, key_ls[type_no]) # recreation of the layout I've been using for my spider diagrams
pos = nx.kamada_kawai_layout(G, scale = 10) # recommended layout type for clearest visualisation BUT overlapping labels
for value in value_ls[type_no]:
    for system in sys_to_sub_ls:
        if value == system:
            for subsys in sys_to_sub[system]:
                value_test_ls.append(subsys)
                G.add_edge(system, subsys)
nt = Network() #notebook = True
nt.from_nx(G)
nt.show('nx.html', notebook = False)
#nx.draw(G, pos=pos, with_labels = True, node_shape="s", node_color="none", bbox=dict(facecolor="skyblue", edgecolor='black', boxstyle='round, pad=0.2'), font_size=20)
#plt.show()

nx.html


## Adding Probability of fault based on the size of node

## Choose a specific system/subsystem to visualise

In [None]:
key_ls = list(sub_to_det.keys())
value_ls = list(sub_to_det.values())

In [None]:
subsys_no = 0
for key in key_ls:
    print(key, ' = Subsys No. ', type_no)
    type_no += 1

In [None]:
subsys_no = 143
plt.figure(3,figsize=(20, 20)) 
S = nx.Graph()
for value in value_ls[subsys_no]:
    S.add_edge(key_ls[subsys_no], value)
nx.draw(S, with_labels = True, node_shape="s", node_color="none", bbox=dict(facecolor="skyblue", edgecolor='black', boxstyle='round, pad=0.2'), font_size=12)
plt.show()

In [None]:
# repeat the process for the other details
sub_to_det_ls = list(sub_to_det.keys())
plt.figure(3,figsize=(25, 25)) 
# pos = nx.bfs_layout(G, key_ls[type_no]) #would this one work where each branch becomes its own layout
# Ideally limit the number of loops based on thevalues already in the chart
for value in value_test_ls:
    for subsystem in sub_to_det_ls:
        if value == subsystem:
            for subsys in sub_to_det[subsystem]:
                S.add_edge(system, subsys)
nx.draw(S, with_labels = True, node_shape="s", node_color="none", bbox=dict(facecolor="skyblue", edgecolor='black', boxstyle='round, pad=0.2'), font_size=5)
plt.show()

In [None]:
# How to build on top of the graph already made?
sys_to_sub_ls = list(sys_to_sub.keys())
sub_ls = list(sys_to_sub.values())
fig = plt.figure()
for value in value_ls[type_no]:
    for system in sys_to_sub_ls:
        if value == system:
            for subsys in sys_to_sub[system]:
                G.add_edge(system, subsys)
nx.draw(G, with_labels = True, node_shape="s", node_color="none", bbox=dict(facecolor="skyblue", edgecolor='black', boxstyle='round, pad=0.2'), font_size=5, ax=fig.add_subplot())
if True: 
    # Save plot to file
    # mat.use("Agg") 
    # fig.savefig("graph.png")

In [None]:
list(sys_to_sub.keys())

In [None]:
value_ls = list(type_to_sys.values())
value_ls[0]