In [None]:
# Importing required libraries

# docx2python is used to extract text, images, tables, and other data from .docx files
from docx2python import docx2python

# os module provides functions for interacting with the operating system
import os

# numpy is used for mathematical operations on large, multi-dimensional arrays and matrices
import numpy as np

# pandas is used for data manipulation and analysis
import pandas as pd

# TSNE from sklearn.manifold is used for dimensionality reduction
from sklearn.manifold import TSNE

# matplotlib.pyplot is used for creating static, animated, and interactive visualizations in Python
import matplotlib.pyplot as plt

# SentenceTransformer is used for training and using transformer models for generating sentence embeddings
from sentence_transformers import SentenceTransformer

# tqdm is used to make loops show a smart progress meter
from tqdm import tqdm

# torch is the main package in PyTorch, it provides a multi-dimensional array with support for autograd operations like backward()
import torch

# AutoModelForCausalLM, AutoTokenizer, pipeline are from the transformers library by Hugging Face which provides state-of-the-art machine learning models like BERT, GPT-2, etc.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# euclidean distance and cosine distance
from scipy.spatial import distance

# random generator for the last figure
import random

In [None]:
# Folder that is scanned for the .docx documents to analyse
docInputFolder = "./short__2"

# Excel file that receives the generated requirements
generatedFile = "./output_generated_mini.xlsx"

# Embedding model (currently disabled: both candidates are commented out,
# so this cell does not define a `model` variable)
#model = SentenceTransformer("sentence-t5-large")
# model = SentenceTransformer("sentence-transformers/gtr-t5-xxl")


# Identifier of the generative model that is loaded further down with
# AutoModelForCausalLM / AutoTokenizer (an alternative is kept in the trailing comment)
modelP = "microsoft/Phi-3-mini-128k-instruct"  #"JoofytheBloofy/T5LargeTest"

In [None]:
# Check whether PyTorch can see a CUDA device; the generative model is loaded
# further down with device_map="cuda"
torch.cuda.is_available()

In [None]:
# Silence every Python warning for the rest of the notebook run.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by the libraries used below — consider narrowing it.
import warnings

warnings.filterwarnings("ignore")

## Part 1: List the documents' sections with "latency"

In the first step, we go through the documents in the input folder (configured in `docInputFolder`) and extract the sections of these documents that contain the word "latency". We store the results in a dictionary.

In [None]:
def extractLatencySections(doc,
                           keywordsInLine=("latency",),
                           keywordsInSections=("abstract",
                                               "acknowledgements",
                                               "appendix",
                                               "bibliography",
                                               "conclusion",
                                               "definition",
                                               "glossary",
                                               "index",
                                               "introduction",
                                               "references",
                                               "table of contents",
                                               "table of figures")):
    """Return the sections of a .docx file that mention one of the keywords.

    The document text is read line by line. Every line containing "<h" is
    treated as a section heading (assumes docx2python renders headings as
    <h1>, <h2>, ... tags when html=True — TODO confirm for the installed
    version). All following lines, and the heading line itself, belong to that
    section until the next heading. Lines before the first heading are ignored.

    A section is relevant when at least one of its lines contains a word from
    ``keywordsInLine`` and its heading contains none of the words from
    ``keywordsInSections``. Both comparisons are case-insensitive substring
    matches.

    Parameters
    ----------
    doc : str
        Path of the .docx file.
    keywordsInLine : iterable of str, optional
        Words that make a section relevant. Defaults to ("latency",).
    keywordsInSections : iterable of str, optional
        Words that exclude a section when they occur in its heading
        (front/back matter such as "introduction" or "references").

    Returns
    -------
    dict
        Maps the heading line to the list of lines of the section (the first
        entry is the heading line itself). When several sections share the
        same heading text, only the relevant ones are kept and their lines are
        concatenated under that heading, in document order.
    """
    lineKeywords = [word.lower() for word in keywordsInLine]
    excludedTitleWords = [word.lower() for word in keywordsInSections]

    doc_result = docx2python(doc,paragraph_styles = True, html=True)

    # One record per heading *occurrence*. Keying the bookkeeping by heading
    # text alone would let a repeated heading (e.g. several "General"
    # subsections) overwrite the content of an earlier, relevant section.
    lstSections = []
    currentSection = None

    for oneLine in tqdm(doc_result.text.split('\n')):
        if "<h" in oneLine:
            titleLower = oneLine.lower()
            currentSection = {
                "title": oneLine,
                "lines": [],
                # the exclusion depends only on the heading, so decide it once
                "excluded": any(word in titleLower for word in excludedTitleWords),
                "relevant": False,
            }
            lstSections.append(currentSection)

        # text before the first heading does not belong to any section
        if currentSection is None:
            continue

        currentSection["lines"].append(oneLine)

        if not currentSection["excluded"] and not currentSection["relevant"]:
            lineLower = oneLine.lower()
            if any(word in lineLower for word in lineKeywords):
                currentSection["relevant"] = True

    # keep only the relevant sections, in document order
    dictSections = {}
    for section in lstSections:
        if section["relevant"]:
            dictSections.setdefault(section["title"], []).extend(section["lines"])

    return dictSections

In [None]:
# one entry per relevant section:
# [section title, document file name, line 1 of the section, line 2, ...]
lstAllLines = []

# for each .docx file in the input folder, extract the relevant sections with
# extractLatencySections and append them to lstAllLines.
# sorted() makes the processing order (and therefore the row order of the
# output files) reproducible: os.listdir returns the entries in arbitrary order
for doc in tqdm(sorted(os.listdir(docInputFolder))):

    if not doc.endswith(".docx"):
        continue

    # reading a document can fail (e.g. for a damaged file), so we report the
    # problem and carry on with the remaining documents instead of stopping
    try:
        dictSections = extractLatencySections(os.path.join(docInputFolder, doc))
    except Exception as e:
        print(f"Error with {doc}: {e}")
        continue

    # flatten: one list per section, prefixed with its title and document
    for key in dictSections:
        lstAllLines.append([key, doc] + list(dictSections[key]))

In [None]:
# one row per relevant section: column 0 is the section title, column 1 the
# document file name, the remaining columns hold the lines of the section
dfTest = pd.DataFrame(lstAllLines)

# documents that contributed at least one relevant section; column 1 only
# exists when at least one section was collected, so guard against a KeyError
dfTest[1].unique() if 1 in dfTest.columns else []

In [None]:
# one row per relevant section: [section title, document name, serialized content]
# (despite the name, no embeddings are stored: the encoding step below is
# commented out)
lstEmbeddings = []
iCounter = 0

for oneLine in tqdm(lstAllLines):

    # positions 0 and 1 hold the section title and the document name; the
    # slice keeps everything from position 3 onwards
    # NOTE(review): position 2 is the heading line itself (a repeat of the
    # title) — confirm that skipping it here is intended
    sentences = oneLine[3:]

    # embedding step, currently disabled
    #embeddings = model.encode(sentences)
        
    # store the lines as a single string (the string form of the list), with
    # "$" and newline characters replaced by "_"
    lstOneLine = [oneLine[0], oneLine[1], str(sentences).replace("$", "_").replace("\n", "_")]

    lstEmbeddings.append(lstOneLine)

In [None]:
# Write the relevant sections (title, document, serialized content) to
# topic_relevant.xlsx without the DataFrame index; a later cell reads this file back
df = pd.DataFrame(lstEmbeddings, columns=["Section", "Document", "Sentences"])
df.to_excel("topic_relevant.xlsx", index=False)

## Part 4: Create requirements based on each of these sections

In the last step, we create new requirements based on the sections identified in the previous steps.

In [None]:
# Read back the relevant sections that were saved to topic_relevant.xlsx
dfDistances = pd.read_excel("./topic_relevant.xlsx")

# one list per row, in the column order of the file: [Section, Document, Sentences]
lstDist = dfDistances.values.tolist()

In [None]:
# Fix PyTorch's random seed
torch.random.manual_seed(0)

# Load the generative model named in modelP onto the GPU (device_map="cuda").
# NOTE(review): trust_remote_code=True permits code shipped with the model
# repository to run locally — confirm the repository is trusted.
modelInstr = AutoModelForCausalLM.from_pretrained(
    modelP, 
    device_map="cuda", 
    torch_dtype="auto",     
    trust_remote_code=True, 
    attn_implementation='eager',
)
# Tokenizer loaded from the same model identifier
tokenizerInstr = AutoTokenizer.from_pretrained(modelP)

In [None]:
def _buildRequirementPrompt(content, section, document):
    """Build the prompt for one section, or return None when the section has no usable text."""
    # content is the string form of a list of lines; it is split on commas and
    # the first piece is dropped, as are the pieces that only represent empty strings
    pieces = content.split(",")
    pieces = [x for x in pieces[1:] if x not in ['', " ''", " '']"]]
    content_str = " ".join(pieces)

    # text that refers to a figure: ask the model to prepend a warning
    if "Figure" in content_str:
        return f"Based on this : {content_str}, write the requirement about {section} from {document}. Add this text at the beginning: '* This part of the standard contains a figure, the generated requirement can be inaccurate, please consult the original text for details.' "

    # the same for tables, at least the ones that can be identified by the
    # word "Table " at the very beginning of the text
    if "Table " in content_str[:10]:
        return f"Summarize this table. {content_str}. Based on this summary, write a requirement about {section} from {document}. Add this text at the beginning '* This part of the standard contains a table, the generated requirement can be inaccurate, please consult the original text for details.' "

    # nothing left to base a requirement on, e.g. when the keyword only
    # occurs in the section title
    if len(content_str) < 2:
        return None

    return f"Write a requirement about {section} from {document} based on this: {content_str}. "


def createRequirement2(content, section, document, model, tokenizer):
    """Generate a requirement for one document section with the generative model.

    Parameters
    ----------
    content : str
        Serialized content of the section (string form of a list of lines).
    section : str
        Title of the section; inserted into the prompt.
    document : str
        Name of the document; inserted into the prompt.
    model, tokenizer
        Model and tokenizer handed to the "text-generation" pipeline.

    Returns
    -------
    str
        The generated text, or a fixed notice when the section contains no
        usable text (in that case the model is not called).
    """
    strContent = _buildRequirementPrompt(content, section, document)
    if strContent is None:
        return "This section is empty. The word latency is probably only in a section title"

    messages = [
        {"role": "user", "content": strContent},
    ]

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )

    generation_args = {
        "max_new_tokens": 500,
        "return_full_text": False,
        "temperature": 0.0,
        # temperature only applies to sampling; request greedy decoding
        # explicitly so that the deterministic intent does not depend on the
        # model's default generation configuration
        "do_sample": False,
    }

    output = pipe(messages, **generation_args)

    return output[0]['generated_text']

In [None]:
lstGenerated = []
iCounter = 0

# sections whose serialized content has this many characters or more are not
# sent to the model
maxContentLength = 4095

# [section, document] of every section left out because of its length, so the
# omission is visible instead of silent
lstSkippedSections = []

# generate one requirement per relevant section
for oneLine in tqdm(lstDist):
    if len(oneLine[2]) >= maxContentLength:
        lstSkippedSections.append([oneLine[0], oneLine[1]])
        continue

    strRequirement = createRequirement2(oneLine[2], oneLine[0], oneLine[1], modelInstr, tokenizerInstr)
    lstGenerated.append([oneLine[0], oneLine[1], oneLine[2], strRequirement])

    # write the results after every section so that an interrupted run keeps
    # what has been generated so far
    dfOutput = pd.DataFrame(lstGenerated, columns=["Section", "Document", "Content", "Generated requirement"])
    dfOutput.to_excel(generatedFile, index=False)

if lstSkippedSections:
    print(f"Skipped {len(lstSkippedSections)} section(s) with {maxContentLength} or more characters:")
    for skippedSection, skippedDocument in lstSkippedSections:
        print(f"  {skippedDocument}: {skippedSection}")

In [None]:
# Final write of all generated requirements to generatedFile; this also
# creates the file (with headers only) when lstGenerated is empty
dfOutput = pd.DataFrame(lstGenerated, columns=["Section", "Document", "Content", "Generated requirement"])
dfOutput.to_excel(generatedFile, index=False)