# 3REQ system

Requirement analysis system.

In [1]:
from docx2python import docx2python
import os

import numpy as np
import pandas as pd

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# suppress warnings
import warnings
warnings.filterwarnings("ignore")

## Step 1: List the documents's sections with "latency"

In the first step, we go through the documents in the folder "input_standards" and we extract which sections of these documents contain th word "latency". We store the results in a dictionary.

In [3]:
def extractLatencySections(doc):
    strSectionTitle = ""
    dictSections = {}
    listLatency = []

    doc_result = docx2python(doc,paragraph_styles = True, html=True)

    # we iterate over all lines
    # look for the section titles (which have the tag <h1>, <h2>, <h3>, etc.)
    # then we add the content of each section to the dictionary
    # and if there is a word "latency" somewhere in the section, we add the section title to the listLatency
    for oneLine in doc_result.text.split('\n'):
        if "<h" in oneLine:
            strSectionTitle = oneLine
            dictSections[strSectionTitle] = []

        if strSectionTitle != "":  
            dictSections[strSectionTitle].append(oneLine)

        keywordsInLine = ["latency"]  #, "latencies"]
        keywordsInSections = ["references", "introduction"]

        if any(word in oneLine.lower() for word in keywordsInLine) and not any(word in strSectionTitle.lower() for word in keywordsInSections): 
            listLatency.append(strSectionTitle)

            
    # remove the keys from the dictionary if they are not part of the listLatency
    # as we want to get only the relevant sections, i.e., the one with the word latency
    for key in list(dictSections.keys()):
        if key not in listLatency:
            del dictSections[key]

    # return the dictionary with the relevant sections
    return dictSections


In [4]:
docInputFolder = "./23_standards"

# this is the return list of all the lines in the document
lstAllLines = []

# for each .docx file in the input folder
# extract the sections with latency using the extractLatencySections function
# and print the sections
for doc in os.listdir(docInputFolder):    

    if doc.endswith(".docx"):
        print(f"Processing {doc}")

        # since things can go wrong with the latency library, 
        # we use a try except block to avoid the program to stop
        try: 
            dictSections = extractLatencySections(os.path.join(docInputFolder, doc))
        
            # we list the content
            # as a long list of sections 
            for key in dictSections:

                lstOneLine = [key, doc]

                for line in dictSections[key]:
                    lstOneLine.append(line)
                    
                lstAllLines.append(lstOneLine)

        except Exception as e:
            print(f"Error with {doc}: {e}")

Processing 23003-i20.docx
Processing 23008-i00.docx
Processing 23015-i00.docx
Processing 23038-i00.docx
Processing 23040-i00.docx
Processing 23042-i00.docx
Processing 23053-h00.docx
Processing 23122-i60.docx
Processing 23142-i00.docx
Processing 23218-i00.docx
Processing 23247-i20.docx
Processing 23256-i10.docx
Processing 23273-i20.docx
Processing 23287-i00.docx
Processing 23288-i20.docx
Processing 23289-i60.docx
Processing 23304-i20.docx
Processing 23316-i20.docx
Processing 23380-i00.docx
Processing 23433-i00.docx
Processing 23434-i50.docx
Processing 23435-i00.docx
Processing 23436-i00.docx
Processing 23501-i22.docx
Processing 23502-i20.docx
Processing 23503-i20.docx
Processing 23527-i00.docx
Processing 23540-i00.docx
Processing 23542-i00.docx
Processing 23545-090.docx
Processing 23548-i20.docx
Processing 23554-j10.docx
Processing 23558-i30.docx
Processing 23586-i00.docx
Processing 23632-i20.docx


In [5]:
model = SentenceTransformer("all-MiniLM-L6-v2")

# list with all embeddings for the sections
lstEmbeddings = []

for oneLine in lstAllLines:

    # the content of the section starts on the third position of the list
    sentences = oneLine[3:]

    # Sentences are encoded by calling model.encode()
    embeddings = model.encode(sentences)
    
    # Print the average embeddings for all the sentences 
    # in this section
    avg_embedding = np.mean(embeddings, axis=0)
    
    lstOneLine = [oneLine[0], oneLine[1], 2, str(sentences).replace("$", "_").replace("\n", "_"), avg_embedding]

    lstEmbeddings.append(lstOneLine)


## Step 2: Find the relevant sections

In this step, we take the sections identified in Step 1 and we compare them to a list of right and wrong requirements. The list is stored in the file List.xlsx

In [6]:
# open the file List.xlsx using pandas
# and read the workshop NR
df = pd.read_excel("List_w15_A3.xlsx", sheet_name="R_NR")

# convert the dataframe to a list of lists
lstReference = df.values.tolist()

lstReference[0]

['L4S (Low Latency, Low Loss and Scalable Throughput) is described in IETF RFC 9330, IETF RFC 9331 and IETF RFC 9332.',
 1]

In [7]:
# list with all embeddings for the sections
lstEmbeddingsRef = []

for oneLine in lstReference:

    # the content of the section starts on the third position of the list
    sentences = oneLine[0]

    # Sentences are encoded by calling model.encode()
    embeddings = model.encode(sentences)
    
    # Print the average embeddings for all the sentences 
    # in this section
    avg_embedding = embeddings
    
    lstOneLine = [oneLine[0], 'REF', oneLine[1], oneLine[1], avg_embedding]

    lstEmbeddingsRef.append(lstOneLine)

In [8]:
# concatenate the two lists
lstEmbeddingsAll = lstEmbeddings + lstEmbeddingsRef
print(len(lstEmbeddings))

105


In [9]:
print(lstEmbeddingsRef[0][4])

[ 4.50192802e-02 -8.73258989e-03 -3.91247123e-02 -6.23441301e-02
  4.43251617e-02 -1.12622185e-02 -6.79076016e-02  4.40965109e-02
  2.99488716e-02  2.12680735e-02 -2.46547088e-02  3.49943787e-02
 -3.91769037e-02  8.50155891e-04 -3.32857743e-02 -2.51700990e-02
  9.21339616e-02 -4.88570379e-03 -3.33831273e-02 -5.85653819e-02
  7.03603122e-03 -1.56739987e-02 -7.60865882e-02 -1.65004861e-02
  2.01428905e-02 -4.42030765e-02 -4.38724123e-02 -7.46411756e-02
  9.20377206e-03 -3.61588374e-02  5.97690372e-03  3.19808396e-03
 -3.00377561e-03 -2.15972662e-02 -8.69956464e-02  1.54021354e-02
  1.21773876e-01 -6.39606342e-02  2.69940440e-02  9.44455042e-02
  2.97552557e-03 -2.79472619e-02  4.35789376e-02  5.96883669e-02
 -4.23361659e-02  5.33416718e-02 -3.77632752e-02 -2.56944057e-02
 -6.09357506e-02 -6.06428012e-02  6.15060739e-02  3.07978764e-02
 -2.83457283e-02  4.24012914e-02 -6.19458593e-02 -7.12357461e-02
 -1.30300120e-01  7.22962320e-02  3.31694866e-03  1.18047848e-01
  2.67540421e-02 -9.29740

In [10]:
from sklearn.metrics.pairwise import euclidean_distances

# for each line in lstEmbeddings
# we calculate the euclidean distance with each line in lstEmbeddingsRef



lstRelevant = []

for oneLine in lstEmbeddings:  
    lstDistPos = []
    lstDistNeg = []
    for oneLineRef in lstEmbeddingsRef:
        if oneLineRef[2] == 1:
            # euclidean distance between the two embeddings
            dist = euclidean_distances([oneLine[4]], [oneLineRef[4]])
            lstDistPos.append(dist[0][0])
        if oneLineRef[2] == 0:
            # euclidean distance between the two embeddings
            dist = euclidean_distances([oneLine[4]], [oneLineRef[4]])
            lstDistNeg.append(dist[0][0])
    
    # now calculate the average for both lists
    avgDistPos = np.mean(lstDistPos)
    avgDistNeg = np.mean(lstDistNeg)

    if avgDistPos < avgDistNeg:
        print(f"Section {oneLine[0]} is relevant")
        # add the class to the list
        oneLine.append(1)
        lstRelevant.append(oneLine)
    else:
        #print(f"Section {oneLine[0]} is not relevant")
        # add the class to the list
        oneLine.append(0)

Section <h4>5.4.3.3	Priority Level</h4> is relevant
Section <h4>9.7.2.1	Data transmission quality measurement </h4> is relevant
Section <h4>9.7.2.3	Data transmission quality measurement reported by SEALDD client</h4> is relevant
Section <h4>14.3.5.1	General</h4> is relevant
Section <h5>14.3.5.2.2	Procedure for a group of UEs</h5> is relevant
Section <h4>14.3.7.2	TSC stream availability discovery procedure</h4> is relevant
Section <h5>14.3.9.2.1	Procedure triggered by correlated source and destination requests</h5> is relevant
Section <h5>14.3.12.4.3	Network parameter coordination procedure</h5> is relevant
Section <h4>9.5.2.2	Network slice optimization based on VAL server policy</h4> is relevant
Section <h4>9.11.2.1	Procedure for VAL server-triggered and network-based network slice adaptation for VAL application</h4> is relevant
Section <h4>9.17.2.1	Network Slice Information delivery request</h4> is relevant
Section <h2>3.1	Definitions</h2> is relevant
Section <h2>3.2	Abbreviations</h2

In [11]:
print(len(lstRelevant))

36


In [12]:
# here we save all the relevant sections to an Excel file
# and the non-relevant ones too
import pandas as pd

dfOutput = pd.DataFrame(lstEmbeddings, columns=["Section", "Document", "Class", "Content", "Embedding", "Relevance"])

dfOutput.to_excel("./output__w15__23_A3.xlsx", index=False)

## Step 3: Find which requirements are impacted

In this step, we compare the relevant sections with the existing requirements. Based on the distance, we can determine which requirements are impacted. The requirements are provided as a separete list. 

In [13]:
print(f'Percentage relevant: {len(lstRelevant)/len(lstEmbeddings)*100:.2f}%')

Percentage relevant: 34.29%


In [14]:
# read the requirements from the excel file requirements.xlsx, worksheet LR
df = pd.read_excel("List_w15_A3.xlsx", sheet_name="LR")

# convert to list
lstRequirements = df.values.tolist()
lstRequirements[0]

# now we calculate the embeddings for each of these requirements
lstEmbeddingsReq = []

for oneLine in lstRequirements:
    
        # the content of the section starts on the third position of the list
        sentences = oneLine[1]
    
        # Sentences are encoded by calling model.encode()
        embeddings = model.encode(sentences)
        
        # Print the average embeddings for all the sentences 
        # in this section
        avg_embedding = embeddings
        
        lstOneLine = [oneLine[0], 'latency', oneLine[1], oneLine[1], avg_embedding]
    
        lstEmbeddingsReq.append(lstOneLine)

In [15]:
print(len(lstEmbeddingsReq))

16


In [16]:
# now we calculate the euclidean distance between the requirements and the sections
# that are relevant
lstDist = []
lstRelevantDist = []

for oneLine in lstRelevant:
    for oneLineReq in lstEmbeddingsReq:
        # euclidean distance between the two embeddings
        dist = euclidean_distances([oneLine[4]], [oneLineReq[4]])
        lstDist.append([oneLine[0], oneLine[1], oneLineReq[0], dist[0][0], oneLine[3]])

# now we sort the list by the distance
lstDist.sort(key=lambda x: x[2])

# and we print them
for i in range(len(lstDist)):
    print(f"Section {lstDist[i][0]} is close to requirement {lstDist[i][2]} with distance {lstDist[i][3]:.2f}")
    # add this to a list
    lstRelevantDist.append([lstDist[i][0], lstDist[i][1], lstDist[i][2], lstDist[i][3], lstDist[i][4]])

# save the list to an Excel file
dfOutput = pd.DataFrame(lstRelevantDist, columns=["Section", "Document", "Requirement", "Distance", "Content"])

# sort it by section and document
dfOutput = dfOutput.sort_values(by=["Section", "Document"])

dfOutput.to_excel("./output_reqs_distances_w15__23_A3.xlsx", index=False)

Section <h4>5.4.3.3	Priority Level</h4> is close to requirement LR1_signalling with distance 1.11
Section <h4>5.4.3.3	Priority Level</h4> is close to requirement LR1_signalling with distance 1.12
Section <h4>5.4.3.3	Priority Level</h4> is close to requirement LR1_signalling with distance 1.04
Section <h4>5.4.3.3	Priority Level</h4> is close to requirement LR1_signalling with distance 1.05
Section <h4>9.7.2.1	Data transmission quality measurement </h4> is close to requirement LR1_signalling with distance 1.09
Section <h4>9.7.2.1	Data transmission quality measurement </h4> is close to requirement LR1_signalling with distance 1.05
Section <h4>9.7.2.1	Data transmission quality measurement </h4> is close to requirement LR1_signalling with distance 1.07
Section <h4>9.7.2.1	Data transmission quality measurement </h4> is close to requirement LR1_signalling with distance 1.08
Section <h4>9.7.2.3	Data transmission quality measurement reported by SEALDD client</h4> is close to requirement LR1_sig

In [17]:
print(len(lstDist))
print(len(lstRelevant))
print(len(lstEmbeddingsReq))

576
36
16


In [18]:
# average the distance in dfOutput per section, document and requirement
dfOutput["Distance"] = dfOutput["Distance"].astype(float)
dfGrouped = dfOutput.groupby(["Section", "Document", "Requirement", "Content"])

#convert dfGrouped to a dataframe
dfGrouped = dfGrouped.agg({"Distance": "mean"}).reset_index()

dfGrouped.to_excel("./output_reqs_distances_w15__23_A3.xlsx", index=False)

## Step 4: Find if it is a new requirement

In the last step, we look at the distances and then we find if they too far away from the existing requirements. If they are, we can consider them as new requirements.