## Import Packages and Load Data

In [23]:
import csv
import json

# Parse the gold release JSON into `gold_data`.
with open('../data/MaintIE/gold_release.json', mode='r', encoding='utf-8') as gold_file:
    gold_data = json.load(gold_file)

# Parse the silver release JSON into `silver_data`.
with open('../data/MaintIE/silver_release.json', mode='r', encoding='utf-8') as silver_file:
    silver_data = json.load(silver_file)

In [24]:
# Get undesirable events from MaintIE
def get_events(maintie_data):
    """Collect the Undesirable* entity mentions found in MaintIE records.

    Every entity's mention is its token span joined with single spaces. The
    pair ``[mention, record text]`` is appended under each category whose name
    occurs as a substring of the entity's ``type``. The per-category counts and
    their total are printed before the result is returned.

    Args:
        maintie_data: iterable of records with 'entities', 'tokens' and 'text'.

    Returns:
        dict mapping 'UndesirableState', 'UndesirableProperty' and
        'UndesirableProcess' to lists of ``[mention, text]`` pairs.
    """
    categories = ('UndesirableState', 'UndesirableProperty', 'UndesirableProcess')
    events = {category: [] for category in categories}

    for record in maintie_data:
        for entity in record['entities']:
            first = entity['start']
            last = entity['end']
            mention = " ".join(record['tokens'][first:last])
            sentence = record['text']
            # Substring match, so nested types such as "State/UndesirableState/..." qualify.
            for category in categories:
                if category in entity['type']:
                    events[category].append([mention, sentence])

    for category in categories:
        print("{:<20} {}".format(category, len(events[category])))
    total_events = sum(len(found) for found in events.values())
    print("{:<20} {}".format("Total", total_events))
    return events

# Write the data to a CSV file
def write_csv(data, csv_file, header):
    """Write ``header`` followed by the rows in ``data`` to ``csv_file``.

    The target is opened in write mode as UTF-8, so an existing file at that
    path is overwritten.
    """
    # newline='' leaves line-ending handling to the csv writer.
    with open(csv_file, mode='w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(data)

In [25]:
# Print the count of entities and relations
def print_count(title, count_dict):
    """Print ``count_dict`` as a two-column table with a trailing total.

    Args:
        title: heading shown above the key column.
        count_dict: mapping of label -> numeric count; rows follow its
            iteration order.
    """
    row = "{:<30} {}".format
    divider = "-" * 40

    print(row(f"{title}", "Count"))
    print(divider)
    for label, amount in count_dict.items():
        print(row(label, amount))
    print(divider)
    print(row("Total", sum(count_dict.values())))
    print()

# Analyse the datasets from MaintIE
def maintie_analysis(maintie_data):
    """Count entities and relations (total and unique) in a MaintIE dataset.

    Four tables are printed via ``print_count``: entity counts, relation
    counts, unique-entity counts and unique-relation counts.

    * Entities are grouped by the first segment of their ``type`` (the part
      before the first '/').
    * An entity is unique by (lower-cased, stripped mention text, top-level
      type); a relation is unique by (head key, tail key, relation type).

    Args:
        maintie_data: iterable of records with 'tokens', 'entities' and
            'relations'; relation 'head'/'tail' index into the record's
            'entities' list.
    """
    entity_count, relation_count = {}, {}
    unique_entity_count, unique_relation_count = {}, {}
    # Sets (not lists) keep the "already seen?" checks O(1); the keys are
    # tuples of strings and therefore hashable.
    seen_entities, seen_relations = set(), set()

    for record in maintie_data:
        # Build each entity's (text, top-level type) key once; relations
        # reuse these keys by index instead of rebuilding the text.
        entity_keys = [
            (
                " ".join(record['tokens'][entity['start']:entity['end']]).lower().strip(),
                entity['type'].split('/')[0],
            )
            for entity in record['entities']
        ]

        for entity_key in entity_keys:
            entity_type = entity_key[1]
            entity_count[entity_type] = entity_count.get(entity_type, 0) + 1
            if entity_key not in seen_entities:
                seen_entities.add(entity_key)
                unique_entity_count[entity_type] = unique_entity_count.get(entity_type, 0) + 1

        for relation in record['relations']:
            relation_type = relation['type']
            relation_count[relation_type] = relation_count.get(relation_type, 0) + 1

            relation_key = (
                entity_keys[relation['head']],
                entity_keys[relation['tail']],
                relation_type,
            )
            if relation_key not in seen_relations:
                seen_relations.add(relation_key)
                unique_relation_count[relation_type] = unique_relation_count.get(relation_type, 0) + 1

    print_count("Entities", entity_count)
    print_count("Relations", relation_count)
    print_count("Unique Entities", unique_entity_count)
    print_count("Unique Relations", unique_relation_count)

In [26]:
# Print number of tokens in the maintie data
def number_tokens_analysis(maintie_data, data_name):
    """Print the minimum, maximum and average token count per record.

    Args:
        maintie_data: iterable of records, each with a 'tokens' list.
        data_name: label used in the heading line.

    The minimum is taken from the actual record lengths rather than from a
    fixed starting value, so it is correct for records of any length. For
    empty input all three statistics are printed as zero instead of raising
    ``ZeroDivisionError``.
    """
    lengths = [len(record['tokens']) for record in maintie_data]

    if lengths:
        min_tokens = min(lengths)
        max_tokens = max(lengths)
        avg_tokens = round(sum(lengths) / len(lengths), 2)
    else:
        # Nothing to measure: report zeros rather than dividing by zero.
        min_tokens = max_tokens = 0
        avg_tokens = 0.0

    print(f"{data_name} Tokens Count")
    print("{:<20} {}".format("Minimum Tokens:", min_tokens))
    print("{:<20} {}".format("Maximum Tokens:", max_tokens))
    print("{:<20} {}".format("Average Tokens:", avg_tokens))
    print()

In [27]:
# Print all possible head-tail relations from the maintie data
def maintie_head_tail(data):
    """Print every distinct (head type, relation, tail type) combination.

    Types are reduced to their first '/'-separated segment, except that an
    entity whose type contains "Undesirable" is shown by its second segment.
    The combinations are de-duplicated, sorted and printed one per line as
    ``head - relation -> tail``, followed by their count.
    """
    combos = set()
    for record in data:
        for relation in record['relations']:
            head_idx = relation['head']
            tail_idx = relation['tail']
            head_label = record['entities'][head_idx]['type']
            tail_label = record['entities'][tail_idx]['type']
            head_kind = head_label.split('/')[0]
            tail_kind = tail_label.split('/')[0]

            # Refine to the Undesirable State / Property / Process subtype.
            # NOTE(review): the head takes precedence, so when both ends are
            # Undesirable the tail keeps its top-level type - confirm intended.
            if "Undesirable" in head_label:
                head_kind = head_label.split('/')[1]
            elif "Undesirable" in tail_label:
                tail_kind = tail_label.split('/')[1]

            combos.add((head_kind, tail_kind, relation['type']))

    ordered = sorted(combos)
    for head_kind, tail_kind, relation_name in ordered:
        print(f"{head_kind} - {relation_name} -> {tail_kind}")
    print("Total:", len(ordered))

## Gold Dataset Analysis

In [28]:
# Entity and relation counts for the gold data (totals and unique)
print("Entities and Relations:")
maintie_analysis(maintie_data=gold_data)

# Distinct head / relation / tail type combinations
print("Types of Relations:")
maintie_head_tail(data=gold_data)

# Token counts per record (min, max, avg)
print("\nTokens Analysis:")
number_tokens_analysis(maintie_data=gold_data, data_name="MaintIE Gold")


Entities and Relations:
Entities                       Count
----------------------------------------
PhysicalObject                 1994
State                          438
Activity                       784
Process                        146
Property                       35
----------------------------------------
Total                          3397

Relations                      Count
----------------------------------------
hasPart                        533
hasParticipant/hasPatient      1206
isA                            364
hasParticipant/hasAgent        166
contains                       38
hasProperty                    34
----------------------------------------
Total                          2341

Unique Entities                Count
----------------------------------------
PhysicalObject                 672
State                          100
Activity                       141
Process                        24
Property                       7
------------------------------

## Silver Dataset Analysis

In [29]:
# Entity and relation counts for the silver data (totals and unique)
print("Entities and Relations:")
maintie_analysis(maintie_data=silver_data)

# Distinct head / relation / tail type combinations
print("Types of Relations:")
maintie_head_tail(data=silver_data)

# Token counts per record (min, max, avg)
print("\nTokens Analysis:")
number_tokens_analysis(maintie_data=silver_data, data_name="MaintIE Silver")


Entities and Relations:
Entities                       Count
----------------------------------------
Activity                       5045
State                          2747
PhysicalObject                 13472
Process                        728
Property                       130
----------------------------------------
Total                          22122

Relations                      Count
----------------------------------------
hasPatient                     7761
hasPart                        3837
isA                            2512
hasAgent                       789
contains                       178
hasProperty                    123
----------------------------------------
Total                          15200

Unique Entities                Count
----------------------------------------
Activity                       373
State                          396
PhysicalObject                 2372
Process                        118
Property                       32
-----------------

## Extract Undesirable Events from MaintIE Gold Dataset

In [32]:
# Extract the Undesirable* events from the gold data and write one CSV per category.
events_list = get_events(gold_data)
# Rows from get_events hold only [event, text]; no "Failure Mode" values are
# written here, so that column is empty in the output files.
header = ["Event", "Text", "Failure Mode"]
directory = '../data/FMC-MWO2KG'
# File names share one 'undesirable_<category>.csv' pattern; the property and
# process files were previously written with a doubled leading "u".
write_csv(events_list['UndesirableState'], f'{directory}/undesirable_state.csv', header)
write_csv(events_list['UndesirableProperty'], f'{directory}/undesirable_property.csv', header)
write_csv(events_list['UndesirableProcess'], f'{directory}/undesirable_process.csv', header)

UndesirableState     438
UndesirableProperty  32
UndesirableProcess   146
Total                616
