## imports

In [17]:
import re
import numpy as np
import pandas as pd
from typing import Text
import re
import json

!pip install gensim
import gensim

!pip install GPyM-TM
from GPyM_TM import GSDMM

!pip install bertopic
from bertopic import BERTopic

!pip install top2vec
!pip install top2vec[sentence_transformers]
!pip install top2vec[sentence_encoders]
from top2vec import Top2Vec


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting plotly>=4.7.0
  Downloading plotly-5.24.1-py3-none-any.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting umap-learn>=0.5.0
  Using cached umap_learn-0.5.6-py3-none-any.wh

  from .autonotebook import tqdm as notebook_tqdm


# Prepare Dataset

In [10]:
def cleanTokens(row, mode, tokenised):
  tokens = []
  if mode == "normal":
    text = re.sub("[^A-Za-z\s]","",row["text"])
    tokens = text.split(" ")
  elif mode == "remove_common":
    text = re.sub("[^A-Za-z\s]","",row["text"])
    tokens = text.split(" ")
    common_words = ["id", "num"]
    tokens = [token for token in tokens if token not in common_words]
  elif mode == "use_subclass":
    for e in row["entities"]:
      start = e["start"]
      end = e["end"]
      etype = e["type"].split("/")
      if len(etype) > 1 and etype[0] == "PhysicalObject":
          tokens.append(etype[-2]) # subclass with inherent function
      elif end == len(row['tokens']):
          t = " ".join(row['tokens'][start:])
          tokens.append(t)
      else:
          t = " ".join(row['tokens'][start:end])
          tokens.append(t)
  elif mode == "append_function":
    for e in row["entities"]:
      start = e["start"]
      end = e["end"]
      etype = e["type"].split("/")
      if len(etype) > 1 and etype[0] == "PhysicalObject":
          if end == len(row['tokens']):
              t = " ".join(row['tokens'][start:])
              tokens.append(t)
          else:
              t = " ".join(row['tokens'][start:end])
              tokens.append(t)
          tokens.append(etype[-2][:-6].lower()) # subclass with inherent function
      elif end == len(row['tokens']):
          t = " ".join(row['tokens'][start:])
          tokens.append(t)
      else:
          t = " ".join(row['tokens'][start:end])
          tokens.append(t)

  if not tokenised:
    tokens = " ".join(tokens)
  return tokens



def prepareDataset(filename, mode="normal", tokenised=True):

    # Opening JSON file
    f = open(filename, "r")

    # returns JSON object as a dictionary
    data = json.load(f)

    documents = []
    for row in data:
        relationTypes =[r["type"] for r in row["relations"]]
        if "hasParticipant/hasPatient" in relationTypes or "hasParticipant/hasAgent" in relationTypes:
          tokens = cleanTokens(row, mode, tokenised)
          documents.append(tokens)
          continue
        typesEntity = [r["type"].split("/")[0] for r in  row["entities"]]
        if "State" in typesEntity or "Process" in typesEntity or "Property" in typesEntity:
            tokens = cleanTokens(row, mode, tokenised)
            documents.append(tokens)
    return documents



documents = prepareDataset('../data/gold_release.json',"remove_common", False)
print(documents)



# GSDMM

### Replacing equipment with Inherent functions 

In [11]:
def runGSDMM(documents, num_topics=40):
  data_dmm = GSDMM.DMM(documents, num_topics)
  data_dmm.topicAssigmentInitialise()
  data_dmm.inference()
  psi, theta, selected_psi, selected_theta = data_dmm.worddist()
  finalAssignments = data_dmm.writeTopicAssignments() # Records the final topic assignments for the documents
  coherence_topwords = data_dmm.writeTopTopicalWords(finalAssignments) # Record the top words for each document
  score = data_dmm.coherence(coherence_topwords, len(finalAssignments)) #Calculates and stores the coherence
  print("Final number of topics found: " + str(len(finalAssignments)))
  return score, coherence_topwords, finalAssignments

In [24]:
documents = prepareDataset('../data/gold_release.json','remove_common')
score, coherence_topwords, finalAssignments = runGSDMM(documents, 40)

corpus=1058, words=897, K=40, a=0.100000, b=0.100000, nTopWords=10, iters=15
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 10
iteration: 11
iteration: 12
iteration: 13
iteration: 14
[ 1  2  3  4  6  9 10 11 12 13 14 16 20 22 23 24 25 26 28 29 30 31 34 35
 36 38]
repair hand in cracked crack left  side window right 
out change engine  pump universal drive cabin water shaft 
replace unserviceable and hose in machine inverter auxilliary batteries battery 
hand mechanical inspection hour right left roller track chain guide 
on fault brake alarm drag park unserviceable all dash light 
 change out cylinder hose replace hand leaking left steering 
bearings cylinder steering grease pulley excess have play no has 
motor and new disconnect reconnect electrical seal clean components air 
not working and boarding fault side all autospray feature are 
 and inspection NDT service machine clean week lube ho

In [16]:
documents = prepareDataset('../data/gold_release.json', "use_subclass")
score, coherence_topwords, finalAssignments = runGSDMM(documents, 40)

corpus=1058, words=297, K=40, a=0.100000, b=0.100000, nTopWords=10, iters=15
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 10
iteration: 11
iteration: 12
iteration: 13
iteration: 14
[ 1  3  5  7  8  9 10 11 13 15 16 18 21 22 25 29 30 34 37]
GeneratingObject DrivingObject change out replace EmittingObject leaking TransformingObject InterfacingObject SensingObject unserviceable 
ProtectingObject leak DrivingObject GeneratingObject TransformingObject EmittingObject repair Substance clean leaking 
MatterProcessingObject DrivingObject ProtectingObject replace PresentingObject leaking inspect change out plugged CoveringObject 
HoldingObject repair crack ProtectingObject cracks Substance leaking Organism needs SensingObject 
EmittingObject RestrictingObject error GeneratingObject temperature error not cold faults need electrical faults HumanInteractionObject 
GuidingObject HoldingObject ControllingO

# BERTopic

In [19]:
documents = prepareDataset('../data/gold_release.json', "remove_common", False)
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(documents)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,309,-1_leaking_inspection_cylinder_broken,"[leaking, inspection, cylinder, broken, out, c...",[ inspection inspection mast raise cylinder pi...
1,0,64,0_fault_transmission_diagnose_repair,"[fault, transmission, diagnose, repair, electr...","[diagnose and repair ladder fault, electrical ..."
2,1,54,1_tyre_position_tyres_change,"[tyre, position, tyres, change, damaged, and, ...","[change tyre position, change out tyre positi..."
3,2,50,2_conditioner_air_compressor_unserviceable,"[conditioner, air, compressor, unserviceable, ...","[air conditioner unserviceable, air condition..."
4,3,34,3_oil_leak_engine_leaks,"[oil, leak, engine, leaks, text, investigate, ...","[repair engine oil leak text, repair engine o..."
5,4,32,4_light_ignition_dash_switch,"[light, ignition, dash, switch, lights, blown,...","[all dash lights on, blown light on boom, repl..."
6,5,31,5_hose_blown_steering_oring,"[hose, blown, steering, oring, hoses, pump, re...","[ replace blown hose, change out blown steerin..."
7,6,31,6_pump_joints_contamination_water,"[pump, joints, contamination, water, change, f...","[ change out engine oil pump, change out univ..."
8,7,27,7_steering_ball_studs_cylinder,"[steering, ball, studs, cylinder, movement, ac...","[ replace leaking steering cylinder, change ou..."
9,8,26,8_valve_dust_safety_valves,"[valve, dust, safety, valves, air, leaking, le...","[pressure control valve in cabin leaking, comp..."


### appending inherent function to sentences

In [20]:
function_documents = prepareDataset('../data/gold_release.json',"append_function", False)

topic_model3 = BERTopic()
topics, probs = topic_model3.fit_transform(function_documents)
topic_model3.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,160,-1_transforming_radio_sub_holding,"[transforming, radio, sub, holding, error, con...",[aerial transforming radio transforming needs ...
1,0,94,0_hose_guiding_steering_blown,"[hose, guiding, steering, blown, leaking, pipe...","[blown hose guiding steering controlling, chan..."
2,1,52,1_sensing_sensor_switch_chip,"[sensing, sensor, switch, chip, contamination,...","[replace coolant sensor sensing, replace press..."
3,2,48,2_conditioner_air_emitting_compressor,"[conditioner, air, emitting, compressor, gener...","[repair air conditioner emitting, change out a..."
4,3,42,3_cylinder_driving_lift_steering,"[cylinder, driving, lift, steering, stick, cha...",[change out steering cylinder driving leaking ...
5,4,42,4_oil_leak_protecting_engine,"[oil, leak, protecting, engine, leaks, sample,...","[repair oil protecting leak engine driving, re..."
6,5,41,5_holding_roller_pin_mast,"[holding, roller, pin, mast, raise, inspection...",[inspection mast raise cylinder driving pin ho...
7,6,38,6_transforming_transmission_transformer_fault,"[transforming, transmission, transformer, faul...","[change out transmission transforming, repair ..."
8,7,38,7_brake_restricting_swing_park,"[brake, restricting, swing, park, error, tempe...",[change out brake restricting swing transformi...
9,8,36,8_tyre_guiding_damaged_rim,"[tyre, guiding, damaged, rim, change, damage, ...","[change tyre guiding, change tyre guiding, cha..."


# Top2Vec

In [None]:
model = Top2Vec(documents, embedding_model='universal-sentence-encoder',speed="deep-learn",min_count=1)

n = model.get_num_topics()
print(n)
topic_words, word_scores, topic_nums = model.get_topics(n)

for topic in topic_nums:
    model.generate_topic_wordcloud(topic)