In [48]:
import json
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from glob import glob
from itertools import product
import re

In [4]:
with open("checkpoints/requirements_extraction/claude_reqs_list_v1_20250429_081756.md") as f:
    mdreqs = f.read()

In [42]:
all_reqs = []
sections = list(re.finditer("#+ REQ-[0-9]+", mdreqs))
md_splits = []
for i in range(len(sections)-1):

    md_splits.append(mdreqs[sections[i].span()[1]:sections[i+1].span()[0]])
md_splits.append(mdreqs[sections[-1].span()[1]:])

counter = 0
for split in md_splits:
    headings = list(re.finditer("\*\*\w+\*\*\:", split))
    for i in range(len(headings)-1):
        heading = headings[i]
        if split[heading.span()[0]:heading.span()[1]].lower() == "**description**:":
            req_text = split[heading.span()[1]:headings[i+1].span()[0]]
            all_reqs.append({"text": req_text, "raw": split, "id": f"md{counter}"})
            counter += 1
            break
    heading = headings[-1]
    if split[heading.span()[0]:heading.span()[1]].lower() == "**description**:":
        req_text = split[heading.span()[1]:]
        all_reqs.append({"text": req_text, "raw": split, "id": f"md{counter}"})
        counter += 1

In [43]:
ragreq_list = []
with open("checkpoints/requirements_extraction/RAG/plan_net_reqs.json") as f:
    ragreqs = json.load(f)
for ragreq in ragreqs['responses']['processed']:
    if ragreq['contains_requirement']:
        ragreq_list.extend([{'text': x['Requirement*'], 'raw': x} for x in ragreq['extracted_requirement']])
counter = 0
for rl in ragreq_list:
    rl['id'] = f"rag{counter}"
    counter += 1

In [87]:
from uuid import uuid4

In [89]:
str(uuid4())

'91364f64-be7e-4138-ab09-3b7de4434c4f'

In [44]:
all_reqs.extend(ragreq_list)

In [90]:
all_reqs

[{'text': ' "Access to the Plan-Net service should not require authentication, and the server should not maintain any records that could associate the consumer with the entities that were queried."\n',
  'raw': '\n\n**Summary**: No authentication for directory access\n**Description**: "Access to the Plan-Net service should not require authentication, and the server should not maintain any records that could associate the consumer with the entities that were queried."\n**Verification**: Inspection\n**Notes**: Actor: Server, Conformance: SHOULD, Conditional: False\n**Source**: Privacy Considerations Section\n\n',
  'id': 'md0'},
 {'text': ' "A conformant Plan-Net service SHALL NOT require a directory mobile application to send consumer identifying information in order to query content."\n',
  'raw': '\n\n**Summary**: No consumer identification required\n**Description**: "A conformant Plan-Net service SHALL NOT require a directory mobile application to send consumer identifying informatio

In [46]:
model = SentenceTransformer("all-mpnet-base-v2")

for i, areq in enumerate(all_reqs):
    if (i+1) % 50 == 0:
        print(f"{i+1} of {len(all_reqs)}", end='\r')
    areq['embedding'] = model.encode(areq['text'])


200 of 232

In [50]:

scored_prod_list = [['Requirement1', "Requirement2", "Similarity Score"]]
prod_list = list(product(all_reqs, all_reqs))
for i, prod in enumerate(prod_list):
    counter = i+1
    if counter % 1000 == 0:
        print(f"Pair {counter} of {len(prod_list)}", end='\r')
    if prod[0]['id'] != prod[1]['id']:
        emb0 = prod[0]['embedding']
        emb1 = prod[1]['embedding']
        score = float(util.pytorch_cos_sim(emb0, emb1)[0][0])
        scored_prod_list.append([prod[0]['id'], prod[1]['id'], score])
print(f"Pair {counter} of {len(prod_list)}", end='\r')

Pair 53824 of 53824

In [51]:
df = pd.DataFrame(scored_prod_list[1:], columns=scored_prod_list[0])
df.head()

Unnamed: 0,Requirement1,Requirement2,Similarity Score
0,md0,md1,0.69088
1,md0,md2,0.724847
2,md0,md3,0.180561
3,md0,md4,0.228241
4,md0,md5,0.578607


In [82]:
dupdf = df[df['Similarity Score'] >= 0.98]
groups = {}
for r1 in set(list(dupdf['Requirement1'].unique()) + list(dupdf['Requirement2'].unique())):
    subdf = dupdf[(dupdf['Requirement1'] == r1)|(dupdf['Requirement2'] == r1)]
    grouped_ids = set(subdf['Requirement1'].to_list() + subdf['Requirement2'].to_list())
    for gid in grouped_ids:
        if gid not in groups:
            groups[gid] = grouped_ids
        else:
            new_group = groups[gid].union(grouped_ids)
            groups[gid] = new_group 

unique_groups = set()
for val in groups.values():
    unique_groups.add(frozenset(val))

to_keep = []
dup_ids = []
for ugroup in unique_groups:
    glist = list(ugroup)
    to_keep.append(glist[0])
    dup_ids.extend(glist[1:])
    
filtered_allreqs = list(filter(lambda x: x['id'] not in dup_ids, all_reqs))

In [84]:
embeds_only = {}
for areq in filtered_allreqs:
    embeds_only[areq['id']] = [float(x) for x in areq.pop('embedding')]

In [None]:
filtered_allreqs

[{'text': ' "Access to the Plan-Net service should not require authentication, and the server should not maintain any records that could associate the consumer with the entities that were queried."\n',
  'raw': '\n\n**Summary**: No authentication for directory access\n**Description**: "Access to the Plan-Net service should not require authentication, and the server should not maintain any records that could associate the consumer with the entities that were queried."\n**Verification**: Inspection\n**Notes**: Actor: Server, Conformance: SHOULD, Conditional: False\n**Source**: Privacy Considerations Section\n\n',
  'id': 'md0'},
 {'text': ' "A conformant Plan-Net service SHALL NOT require a directory mobile application to send consumer identifying information in order to query content."\n',
  'raw': '\n\n**Summary**: No consumer identification required\n**Description**: "A conformant Plan-Net service SHALL NOT require a directory mobile application to send consumer identifying informatio

In [74]:
set(list(dupdf['Requirement1'].unique()) + list(dupdf['Requirement2'].unique()))

{'md0',
 'md14',
 'md18',
 'md2',
 'md21',
 'md26',
 'md30',
 'md34',
 'md38',
 'rag0',
 'rag2'}

In [70]:
set(subdf['Requirement1'].to_list() + subdf['Requirement2'].to_list()).union(set(['11']))

{'11', 'md0', 'rag0'}

In [59]:
dupdf

Unnamed: 0,Requirement1,Requirement2,Similarity Score
85,md0,rag0,0.98236
549,md2,rag2,0.984251
3251,md14,md18,1.0
3254,md14,md21,1.0
3259,md14,md26,1.0
3263,md14,md30,1.0
3267,md14,md34,1.0
3271,md14,md38,0.983983
4172,md18,md14,1.0
4178,md18,md21,1.0
