## Investigative POS Tagging (nltk vs spaCy)

### Downloads

In [1]:
pip install -U spacy

Collecting spacy
  Using cached spacy-3.2.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Using cached spacy_loggers-1.0.1-py3-none-any.whl (7.0 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Using cached pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
Collecting thinc<8.1.0,>=8.0.12
  Using cached thinc-8.0.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (653 kB)
Collecting langcodes<4.0.0,>=3.2.0
  Using cached langcodes-3.3.0-py3-none-any.whl (181 kB)
Collecting spacy-legacy<3.1.0,>=3.0.8
  Using cached spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Using cached catalogue-2.0.6-py3-none-any.whl (17 kB)
Collecting typer<0.5.0,>=0.3.0
  Using cached typer-0.4.0-py3-none-any.whl (27 kB)
Collecting srsly<3.0.0,>=2.4.1
  Using cached srsly-2.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (451 kB)
Collecting pathy>=0.3.5
  Using cached pathy-0

In [2]:
pip install plantuml



### Imports

In [3]:
# Standard library imports
import string
from os.path import abspath

# nltk Imports
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# plantUML Imports
import plantuml
from plantuml import PlantUML

# Spacy Imports
import spacy
from spacy.lang.en import English

# TextBlob Imports
from textblob import TextBlob

### nltk downloads

In [4]:
# Download NLTK's 'punkt' tokenizer data; the call's return value is the
# cell output (True in the recorded run).
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Download the 'averaged_perceptron_tagger' data (unpacked under taggers/ in
# the recorded run); the call's return value is the cell output.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

### Text POS Tags 
Investigating the POS Tagging Attribute

In [20]:
# Tokenize one sample requirement and tag each token with its part of speech.
# The tab after the item number is written as "\t" so it is visible.
sample_requirement = " 2.\tLR_Bus establishes connection between Left_Side_FGS and Right_Side_FGS."
text = nltk.word_tokenize(sample_requirement)

# l1 is a list of (token, tag) pairs, e.g. ('LR_Bus', 'NNP').
l1 = nltk.pos_tag(text)
l1

[('2', 'CD'),
 ('.', '.'),
 ('LR_Bus', 'NNP'),
 ('establishes', 'VBZ'),
 ('connection', 'NN'),
 ('between', 'IN'),
 ('Left_Side_FGS', 'NNP'),
 ('and', 'CC'),
 ('Right_Side_FGS', 'NNP'),
 ('.', '.')]

In [21]:
# Noun extraction: keep only the tokens tagged as proper nouns ('NNP').
# A comprehension keeps its loop variables local, so the notebook's global
# namespace no longer ends up with a `tuple` name that shadows the builtin.
nltk_nouns = [token for token, tag in l1 if tag == 'NNP']

nltk_nouns

['LR_Bus', 'Left_Side_FGS', 'Right_Side_FGS']

### Creating a text file to store the info to be run in PlantUML

In [23]:
# Specification text: nine numbered requirement sentences.
# Each item has the form " <n>.\t<sentence>"; the items are concatenated
# without any extra separator to form one string.
requirement_items = (
    " 1.\tThe FGS_System consists of four components: the Left_Side_FGS, the Right_Side_FGS, an LR_Bus, and an RL_Bus.",
    " 2.\tLR_Bus establishes connection between Left_Side_FGS and Right_Side_FGS.",
    " 3.\tRL_Bus establishes connection between Left_Side_FGS and Right_Side_FGS.",
    " 4.\tThe Left_Side_FGS accepts as input a boolean value of Left_Transfer_Switch and Left_Primary_Side.",
    " 5.\tThe Right_Side_FGS accepts as input a boolean value of Right_Transfer_Switch and Right_Primary_Side.",
    " 6.\tThe Left_Side_FGS takes input from a synchronous clock CLK1.",
    " 7.\tThe LR_Bus takes input from CLK2.",
    " 8.\tThe Right_Side_FGS takes input from a synchronous clock CLK3.",
    " 9.\tThe RL_Bus takes input from CLK4.",
)
text = "".join(requirement_items)


In [24]:
 # Preview how the specification text is split into sentences.
 # `sent_tokenize` comes from the notebook's import cell rather than from an
 # import buried in this cell.
 sent_tokenize(text)

[' 1.',
 'The FGS_System consists of four components: the Left_Side_FGS, the Right_Side_FGS, an LR_Bus, and an RL_Bus.',
 '2.',
 'LR_Bus establishes connection between Left_Side_FGS and Right_Side_FGS.',
 '3.',
 'RL_Bus establishes connection between Left_Side_FGS and Right_Side_FGS.',
 '4.',
 'The Left_Side_FGS accepts as input a boolean value of Left_Transfer_Switch and Left_Primary_Side.',
 '5.',
 'The Right_Side_FGS accepts as input a boolean value of Right_Transfer_Switch and Right_Primary_Side.',
 '6.',
 'The Left_Side_FGS takes input from a synchronous clock CLK1.',
 '7.',
 'The LR_Bus takes input from CLK2.',
 '8.',
 'The Right_Side_FGS takes input from a synchronous clock CLK3.',
 '9.',
 'The RL_Bus takes input from CLK4.']

In [25]:
# Build the PlantUML description from the specification text and write it to
# 'model_specs.txt' (the file is created, or overwritten if it already exists).


def _proper_nouns(tagged_tokens):
    """Return, in order, the tokens whose POS tag is 'NNP' (proper noun)."""
    return [token for token, tag in tagged_tokens if tag == 'NNP']


def _relation_lines(tokens, nouns, connection_num):
    """Build the PlantUML lines for one tokenized sentence.

    The first proper noun is used as the source and every later proper noun
    as a target:

    * if the sentence contains the token "connection", each target yields
      ``[source]-[target] :C<n>``;
    * if the sentence contains the token "input", each target yields
      ``[source]-target``.

    A sentence containing both tokens yields both lines per target, the
    connection line first.

    Args:
        tokens: word tokens of the sentence.
        nouns: proper nouns extracted from the same sentence.
        connection_num: running counter used for the ``C<n>`` labels.

    Returns:
        A ``(lines, connection_num)`` pair: the generated lines (each ending
        in a newline) and the updated counter.
    """
    has_connection = "connection" in tokens
    has_input = "input" in tokens

    lines = []
    # With fewer than two proper nouns there is no target, so nothing is emitted.
    for target in nouns[1:]:
        if has_connection:
            lines.append(f"[{nouns[0]}]-[{target}] :C{connection_num}\n")
            connection_num += 1
        if has_input:
            lines.append(f"[{nouns[0]}]-{target}\n")
            # NOTE(review): the counter also advances for input lines even
            # though they carry no C<n> label; kept as-is so labels of any
            # later connection sentences do not change - confirm intent.
            connection_num += 1
    return lines, connection_num


# Split the specification into sentences.
doc = sent_tokenize(text)

connection_num = 1
uml_lines = ["skinparam nodesep 150\n"]

# Process sentence by sentence: tokenize, POS-tag, extract proper nouns,
# then turn them into PlantUML relations.
for sent in doc:
    token_sentence = nltk.word_tokenize(sent)
    pos_tag_token = nltk.pos_tag(token_sentence)
    nltk_nouns = _proper_nouns(pos_tag_token)

    sentence_lines, connection_num = _relation_lines(
        token_sentence, nltk_nouns, connection_num
    )
    uml_lines.extend(sentence_lines)

# Write everything through a single handle; the `with` block closes the file,
# so no nested re-opening or explicit close() is needed.
with open('model_specs.txt', 'w') as spec_file:
    spec_file.writelines(uml_lines)

### Creating System Diagram using plantUML

In [26]:
# Send the generated description to the PlantUML web service for rendering.
plantuml_img_url = 'http://www.plantuml.com/plantuml/img/'
server = PlantUML(
    url=plantuml_img_url,
    basic_auth={},
    form_auth={},
    http_opts={},
    request_opts={},
)

# The call's return value is the cell output (True in the recorded run).
spec_path = abspath('model_specs.txt')
server.processes_file(spec_path)


True