In [12]:
import os
from docx import Document
import pandas as pd

# Function to extract text from a .docx file and find the word "latency"
def process_docx(file_path):
    document = Document(file_path)
    found_entries = []
    chapter_text = []
    chapter_start = False
    chapter = ""
    
    for para in document.paragraphs:
        if para.style.name.startswith('Heading'):  # Identify chapter start by heading styles
            if chapter_start:  # Save previous chapter if it had "latency"
                if 'latency' in " ".join(chapter_text).lower():
                    chapter_text_limited = " ".join(" ".join(chapter_text).split()[:5000])
                    found_entries.append({
                        'file_name': os.path.basename(file_path),
                        'chapter': chapter,
                        'chapter_text': chapter_text_limited
                    })
            chapter_start = True
            chapter = para.text
            chapter_text = [para.text]
        elif chapter_start:
            chapter_text.append(para.text)

    # Check last chapter
    if chapter_start and 'latency' in " ".join(chapter_text).lower():
        chapter_text_limited = " ".join(" ".join(chapter_text).split()[:5000])
        found_entries.append({
            'file_name': os.path.basename(file_path),
            'chapter': chapter,
            'chapter_text': chapter_text_limited
        })

    return found_entries

# Main function to process all .docx files in the folder
def process_folder(folder_path):
    all_entries = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.docx'):
            file_path = os.path.join(folder_path, filename)
            entries = process_docx(file_path)
            all_entries.extend(entries)

    return all_entries

# Save the extracted information to an Excel file
def save_to_excel(entries, output_file):
    df = pd.DataFrame(entries)
    if not df.empty:
        df.to_excel(output_file, index=False)
    else:
        print("No entries found.")

# Specify the folder containing the .docx files and the output Excel file
folder_path = "./short__2"
output_file = 'output.xlsx'

# Process the folder and save the results to an Excel file
entries = process_folder(folder_path)
save_to_excel(entries, output_file)

print(f"Processing complete. Data saved to {output_file}")

model = SentenceTransformer("sentence-t5-large")


Processing complete. Data saved to output.xlsx


NameError: name 'SentenceTransformer' is not defined

## Part 4: create requirements based on each of these sections

In the last step, we create new requirements based on the sections identified in the previous steps.

In [11]:
# read the lstDst from the Excel file
dfDistances = pd.read_excel("./output.xlsx")       #("./temp_sections_for_generation.xlsx")

lstDist = dfDistances.values.tolist()

model = SentenceTransformer("sentence-t5-large")

torch.cuda.is_available()

import warnings

warnings.filterwarnings("ignore")
#modelP = "microsoft/Phi-3-mini-128k-instruct"

NameError: name 'SentenceTransformer' is not defined

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set the manual seed for reproducibility
torch.manual_seed(0)

# Load the model and tokenizer
modelInstr = AutoModelForCausalLM.from_pretrained(
    modelP, 
    device_map="cuda", 
    torch_dtype=torch.float16, 
    revision="float16", 
    attn_implementation='eager',
)

tokenizerInstr = AutoTokenizer.from_pretrained(modelP)

In [None]:
torch.random.manual_seed(0)

modelInstr = AutoModelForCausalLM.from_pretrained(
    modelP, 
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True, 
    attn_implementation='eager',
)
tokenizerInstr = AutoTokenizer.from_pretrained(modelP)

In [None]:
def createRequirement(content, type, model, tokenizer):
    content1 = content.split(",")
    content1 = [x for x in content1[1:] if x not in ['', " ''", " '']"]]
    content_str = " ".join(content1)

    # this is about signalling, payload, c/c. 
    # typeStr = type.split("_")[1]
    
    # strContent = f"Based on this : {content_str}. Write the requirement in the following format 'The {typeStr} of the system shall ' "
    strContent = f"Based on this : {content_str}. Write the requirement in the following format 'The system shall ' "
    
    messages = [
        {"role": "user", "content": strContent},
    ]

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )

    generation_args = {
        "max_new_tokens": 500,
        "return_full_text": False,
        "temperature": 0.0,
        "do_sample": False,
    }

    output = pipe(messages, **generation_args)
    
    return output[0]['generated_text']

In [None]:
def createRequirement2(content, section, document, model, tokenizer):
    content1 = content.split(",")
    content1 = [x for x in content1[1:] if x not in ['', " ''", " '']"]]
    content_str = " ".join(content1)

    # check if the text contains the word Figure
    # if it does, then we add a footnote before to warn the user
    if "Figure" in content_str:
        #strContent = f"Based on this : {content_str}. Write the requirement in the following format 'The system shall '. Start with '* This part of the standard contains a figure, the generated requirement can be inaccurate, please consult the original text for details.' "
        strContent = f"Based on this : {content_str}, write the requirement about {section} from {document}. Add this text at the beginning: '* This part of the standard contains a figure, the generated requirement can be inaccurate, please consult the original text for details.' "
    # the same for tables, at least the ones that we can identify
    elif "Table " in content_str[:10]:
        #strContent = f"Summarize this table. {content_str}. Based on this summary, write the requirement in the following format 'The system shall '. Start with '* This part of the standard contains a table, the generated requirement can be inaccurate, please consult the original text for details.' "
        strContent = f"Summarize this table. {content_str}. Based on this summary, write a requirement about {section} from {document}. Add this text at the beginning '* This part of the standard contains a table, the generated requirement can be inaccurate, please consult the original text for details.' "
    
    # and for the empty text, e.g., when the word latency is only in the title
    # we do not generate anything and warn the user
    elif len(content_str) < 2:
        return "This section is empty. The word latency is probably only in a section title"
    # otherwise, we generate the requirement
    else: 
        #strContent = f"Based on this : {content_str}. Write the requirement in the following format 'The system shall ' "
        strContent = f"Write a requirement about {section} from {document} based on this: {content_str}. "
    
    messages = [
        {"role": "user", "content": strContent},
    ]

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )

    generation_args = {
        "max_new_tokens": 500,
        "return_full_text": False,
        "temperature": 0.0,
    }

    output = pipe(messages, **generation_args)
    
    return output[0]['generated_text']

In [None]:
lstGenerated = []
iCounter = 0

# we generate new requirements for the partially covered ones
if len(lstDist) > 0:
    for oneLine in tqdm(lstDist):
        if (len(oneLine[3]) < 4095):
            strRequirement = createRequirement2(oneLine[3], oneLine[0], oneLine[1], modelInstr, tokenizerInstr)
            lstGenerated.append([oneLine[0], oneLine[1], oneLine[2], oneLine[3], strRequirement])
            dfOutput = pd.DataFrame(lstGenerated, columns=["Section", "Document", "Distance", "Content", "Generated requirement"])
            dfOutput.to_excel(generatedFile, index=False)

In [None]:
dfOutput = pd.DataFrame(lstGenerated, columns=["Section", "Document",  "Distance", "Content", "Generated requirement"])
dfOutput.to_excel(generatedFile, index=False)

# and generate html
dfOutput.to_html("generated.html")