In [1]:
import ollama
import json

# Define the function to extract regulatory data
def extract_regulatory_data(regulatory_text, model="llama3:8b"):
    """Ask a local Ollama model to break a regulatory statement into semantic roles.

    The prompt embeds the annotation schema, five worked examples and the
    statement itself, and requests a JSON reply (``format="json"``).

    Args:
        regulatory_text: The statement to annotate; it is interpolated verbatim
            into the prompt.
        model: Name of the Ollama model to query. Defaults to ``"llama3:8b"``.

    Returns:
        dict: The parsed JSON object returned by the model. An empty dict is
        returned when the reply is not valid JSON or is valid JSON but not an
        object, so that callers can treat it as a failed attempt and retry
        instead of crashing in the middle of a batch.
    """
    prompt = f"""
You are an expert in the legal domain and computational linguistics. Follow these steps strictly:

1. Convert passive voice input sentences into active voice.
2. Break down the input into syntactic parts following the schema precisely.
3. Before categorizing any part, explicitly verify: "Can this logically belong to this category based on provided markers?" If yes, categorize it; if not, do not categorize it.
4. Use schema markers ONLY as hints; do NOT include markers in the final JSON output.

Schema:
{{
    "Type": ["Definition", "Prohibition", "Obligation", "Fact", "Penalty", "Permission", "Recommendation", "Exemption"],
    "Action": "VP excluding modality, condition, exception, and reason annotations",
    "Condition": ["SRel << (condition marker)", "PP << (condition marker)", "Ssub << (condition marker)", "NP < (VPinf !<< (exception marker) !<< (reason marker))", "NP < (VPart !<< (exception marker) !<< (reason marker))"],
    "Condition Marker": ["if", "in case of", "provided that", "when", "in the context of", "limit", "who", "whose", "which"],
    "Modality": ["VN < (modality marker)"],
    "Modality Marker": ["may", "must", "is prohibited from", "should", "shall", "can", "need to", "required to", "is authorized to"],
    "Actor": ["subject dependency and NP < (actor marker)", "object dependency and passive voice and PP < P $ (NP < (actor marker))", "object dependency and active voice and NP < (actor marker)"],
    "Actor Marker": ["firm", "expert", "staff", "company", "consumer", "tax"],
    "Artifact": ["NP < (artifact marker)", "NP !<< (violation marker) | !<< (time marker) | !<< (situation marker) | !<< (sanction marker) | !<< (reference marker) | !<< (location marker) | !<< (action marker)"],
    "Artifact Marker": ["document", "agreement", "certificate", "license", "permit", "warrant", "pass"],
    "Exception": ["Srel << (exception marker)", "Ssub << (exception marker)", "NP < (VPinf !<< (exception marker))", "PP << (exception marker)", "NP << (P < (exception marker) $ VPinf)"],
    "Exception Marker": ["with the exception of", "except for", "derogation", "apart from", "other than"],
    "Presence": ["NP < (location marker)"],
    "Presence Marker": ["site", "place", "target market", "customer base", "street"],
    "Reason": ["Srel << (reason marker)", "Ssub << (reason marker)", "PP << (reason marker)", "NP < (VPart << (reason marker))", "NP << (P < (reason marker) $ VPinf)"],
    "Reason Marker": ["in order to", "for the purpose of", "so as to", "so that", "in the interest of", "in view of"],
    "Sanction": ["NP < (sanction marker)"],
    "Sanction Marker": ["punishment", "jail sentence", "imprisonment", "prison term", "fine"],
    "Situation": ["NP < (situation marker)"],
    "Situation Marker": ["renewal", "inspection", "registration", "deliberation"],
    "Time": ["NP < (time marker)", "PP < (P < (time marker)) $ NP"],
    "Time Marker": ["before", "after", "temporary", "permanent", "period", "day", "year", "month", "date"],
    "Violation": ["NP < (violation marker)"],
    "Violation Marker": ["offence", "crime", "misdemeanor", "civil wrong", "infraction", "transgression"]
}}

Sample:
'Input': 'Firms should be able to provide us with the information they are using to monitor whether they are achieving outcomes for consumers with characteristics of vulnerability that are as good as those for other consumers (see monitoring and evaluation in Chapter 5).',
  'JSON': 
    'Type': 'Obligation',
    'Actor': 'Firm',
    'Modality': 'should',
    'Action': 'be able to provide us with the information they are using',
    'Reason': 'to monitor whether they are achieving outcomes as good as those for other consumers',
    'Condition': 'for consumers with characteristics of vulnerability',
    'Artifact': 'monitoring and evaluation in Chapter 5',
    'Violation': null,
    'Exception': null,
    'Presence': null,
    'Time': null,
  
 'Input': 'Where possible, staff should be able to respond to the consumer’s needs promptly so that action is taken to ensure harm does not occur or become more severe.',
  'JSON': 
    'Type': 'Obligation',
    'Actor': 'Staff',
    'Modality': 'should',
    'Action': "respond to the consumer\'s needs promptly",
    'Reason': 'action is taken to ensure harm does not occur or become more severe',
    'Condition': 'where possible',
    'Artifact': null,
    'Violation': null,
    'Exception': null,
    'Presence': null,
    'Time': null,
 'Input': 'Firms should improve the skills and capability of staff in a way that is proportionate.',
  'JSON': 
    'Type': 'Obligation',
    'Actor': 'Firm',
    'Modality': 'should',
    'Action': 'improve the skills and capability of staff',
    'Reason': null,
    'Condition': 'in a way that is proportionate',
    'Artifact': null,
    'Violation': null,
    'Exception': null,
    'Presence': null,
    'Time': null,
 'Input': 'If a firm’s business model intentionally exploits vulnerable consumers, this would be a clear breach of our Principles.',
  'JSON': 
    'Type': 'Prohibition',
    'Actor': 'Firm’s business model',
    'Modality': 'if',
    'Action': 'intentionally exploits vulnerable consumers',
    'Reason': null,
    'Condition': null,
    'Artifact': null,
    'Violation': 'breach of our Principles',
    'Exception': null,
    'Presence': null,
    'Time': null,
 'Input': 'Firms should ensure they record and process data in line with requirements of data protection legislation, see Appendix 1.',
  'JSON': 
    'Type': 'Obligation',
    'Actor': 'Firm',
    'Modality': 'should',
    'Action': 'ensure they record and process data in line with requirements of data protection legislation',
    'Reason': null,
    'Condition': null,
    'Artifact': 'Appendix 1',
    'Violation': null,
    'Exception': null,
    'Presence': null,
    'Time': null,



Return strictly structured JSON adhering to the schema above, excluding the markers themselves.

Input:
"{regulatory_text}"

JSON:
    """

    # Generate response from the selected model via Ollama
    response = ollama.generate(model=model, prompt=prompt, format="json")

    # A truncated or otherwise malformed reply is reported as "no result"
    # rather than raised, so batch callers can retry the statement.
    try:
        parsed = json.loads(response["response"])
    except json.JSONDecodeError:
        return {}

    # Callers index the result like a mapping; reject JSON arrays / scalars.
    return parsed if isinstance(parsed, dict) else {}

# Smoke test: annotate one two-sentence regulation excerpt and pretty-print the result.
regulation_example = (
    "Under Article 5(1)(d) firms should take care to ensure the accuracy of "
    "information they record about customers and vulnerabilities. "
    "This may be challenging for firms where vulnerabilities are temporary, "
    "and firms should consider this in the context of the customer service they provide."
)

structured_data = extract_regulatory_data(regulation_example)

print(json.dumps(structured_data, indent=2))

{
  "Type": "Obligation",
  "Action": "take care to ensure the accuracy of information they record about customers and vulnerabilities",
  "Condition": "for firms where vulnerabilities are temporary",
  "Reason": null,
  "Artifact": null,
  "Violation": null,
  "Exception": null,
  "Presence": null,
  "Time": null
}


In [3]:
import pandas as pd
import datetime

# Read the GDPR workbook and keep the "Statement" column as a plain Python list.
data = pd.read_excel("data/Generated_Ontology_GDPR_Output.xlsx")
statement_column = data["Statement"]
statements_all = statement_column.tolist()

import pandas as pd

def is_valid_output(json_output):
    """Tell whether an extraction result carries the mandatory semantic roles.

    Args:
        json_output: The value parsed from the model reply. Anything that is
            not a dict (``None``, a JSON array, a string, ...) is rejected
            instead of raising ``AttributeError``.

    Returns:
        bool: ``True`` only when ``json_output`` is a dict whose "Type",
        "Actor" and "Action" entries are all present and truthy.
    """
    if not isinstance(json_output, dict):
        return False
    required_fields = ("Type", "Actor", "Action")
    return all(json_output.get(field) for field in required_fields)

knowledge_graph = []
statements = statements_all[:50]
MAX_ATTEMPTS = 3  # extraction attempts per statement

for text in statements:
    now = datetime.datetime.now()

    print(now.time())
    print(text)

    # Keep the most recent dict the model produced; stop early once it has all
    # mandatory roles. The last attempt is kept even if it is still incomplete.
    json_output = None
    for _ in range(MAX_ATTEMPTS):
        try:
            candidate = extract_regulatory_data(text)
        except json.JSONDecodeError:
            # A malformed reply counts as a failed attempt; it must not abort
            # the run and discard everything collected so far.
            continue
        if not isinstance(candidate, dict):
            continue
        json_output = candidate
        if is_valid_output(json_output):
            break
    print(json_output)

    if json_output:
        json_output["InputText"] = text  # Add the original text to the JSON output
        knowledge_graph.append(json_output)

# Save the knowledge graph to a JSON file
output_json_filename = "Output/my_knowledge_graph_output.json"
with open(output_json_filename, 'w', encoding="utf-8") as json_file:
    json.dump(knowledge_graph, json_file, indent=4)

print(f"Knowledge graph saved to {output_json_filename}")

# Convert the knowledge graph to a DataFrame and save to an Excel file
df = pd.DataFrame(knowledge_graph)
# Reorder columns to have 'InputText' as the first column. An empty graph has
# no columns at all, so the reorder is skipped rather than raising KeyError.
if "InputText" in df.columns:
    columns = ["InputText"] + [col for col in df.columns if col != "InputText"]
    df = df[columns]

output_excel_filename = "Output/my_knowledge_graph_output.xlsx"
df.to_excel(output_excel_filename, index=False)

print(f"Knowledge graph saved to {output_excel_filename}")

09:37:23.897430
I(Legislative acts)REGULATIONSREGULATION (EU) 2016/679 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCILof 27 April 2016on the protection of natural persons with regard to the processing of personal data and on the free movement of such data, and repealing Directive 95/46/EC (General Data Protection Regulation)(Text with EEA relevance)THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,Having regard to the Treaty on the Functioning of the European Union, and in particular Article 16 thereof, Having regard to the proposal from the European Commission,After transmission of the draft legislative act to the national parliaments,Having regard to the opinion of the European Economic and Social Committee (1), Having regard to the opinion of the Committee of the Regions (2),Acting in accordance with the ordinary legislative procedure (3), Whereas:The protection of natural persons in relation to the processing of personal data is a fundamental right.
{'Type': 'Definition',

In [4]:

# Read the DPA workbook; the statements are assumed to sit in its first column.
dpa_data = pd.read_excel("data/Corpus/DPA11_Cookieyes_Obligation_Output.xlsx")
first_column = dpa_data.iloc[:, 0]
dpa_statements = first_column.tolist()

def is_valid_output(json_output):
    """Tell whether an extraction result carries the mandatory semantic roles.

    Args:
        json_output: The value parsed from the model reply. Anything that is
            not a dict (``None``, a JSON array, a string, ...) is rejected
            instead of raising ``AttributeError``.

    Returns:
        bool: ``True`` only when ``json_output`` is a dict whose "Type",
        "Actor" and "Action" entries are all present and truthy.
    """
    if not isinstance(json_output, dict):
        return False
    required_fields = ("Type", "Actor", "Action")
    return all(json_output.get(field) for field in required_fields)

knowledge_graph = []
statements = dpa_statements[:20]  # Process the first 20 statements
MAX_ATTEMPTS = 3  # extraction attempts per statement

for text in statements:
    # Keep the most recent dict the model produced; stop early once it has all
    # mandatory roles. The last attempt is kept even if it is still incomplete.
    json_output = None
    for _ in range(MAX_ATTEMPTS):
        try:
            candidate = extract_regulatory_data(text)
        except json.JSONDecodeError:
            # A malformed reply counts as a failed attempt; it must not abort
            # the run and discard everything collected so far.
            continue
        if not isinstance(candidate, dict):
            continue
        json_output = candidate
        if is_valid_output(json_output):
            break

    if json_output:
        json_output["InputText"] = text  # Add the original text to the JSON output
        knowledge_graph.append(json_output)

# Save the knowledge graph to a JSON file
output_json_filename = "Output/my_dpa_knowledge_graph_output.json"
with open(output_json_filename, 'w', encoding="utf-8") as json_file:
    json.dump(knowledge_graph, json_file, indent=4)

print(f"Knowledge graph saved to {output_json_filename}")

# Convert the knowledge graph to a DataFrame and save to an Excel file
df = pd.DataFrame(knowledge_graph)
# Reorder columns to have 'InputText' as the first column. An empty graph has
# no columns at all, so the reorder is skipped rather than raising KeyError.
if "InputText" in df.columns:
    columns = ["InputText"] + [col for col in df.columns if col != "InputText"]
    df = df[columns]

output_excel_filename = "Output/my_dpa_knowledge_graph_output.xlsx"
df.to_excel(output_excel_filename, index=False)

print(f"Knowledge graph saved to {output_excel_filename}")

Knowledge graph saved to Output/my_dpa_knowledge_graph_output.json
Knowledge graph saved to Output/my_dpa_knowledge_graph_output.xlsx


In [2]:
import pandas as pd
from jarowinkler import jarowinkler_similarity
from rouge_score import rouge_scorer
import openai
import re
import os
from openai import OpenAI

from ollama import chat


# Never hard-code a key here. A key already exported in the environment is
# kept; only when none is set do we fall back to an empty placeholder, as the
# original cell did, so that the client object can still be created.
# NOTE(review): the OpenAI client is only referenced by backends that are
# currently disabled in this cell — confirm whether it is still needed.
os.environ.setdefault("OPENAI_API_KEY", "")
client = OpenAI()
openai.api_key = os.environ["OPENAI_API_KEY"]

def jaro_winkler_similarity(s1, s2):
    """Return the score ``jarowinkler_similarity`` assigns to the pair ``(s1, s2)``."""
    score = jarowinkler_similarity(s1, s2)
    return score

# Section headers the prompt asks the model to emit, in the order they are returned.
_SECTION_LABELS = (
    "Improvements",
    "Improved Statement",
    "Improved Statement KG",
    "Reference Statement KG",
    "Policy Statement KG",
)
# Longest label first so "Improved Statement KG" is never read as "Improved Statement".
_LABEL_ALTERNATION = "|".join(
    re.escape(label) for label in sorted(_SECTION_LABELS, key=len, reverse=True)
)
# A header at the start of a line, optionally preceded by list numbering,
# markdown emphasis/heading marks or quotes (e.g. `**Improvements**`, `3. "Improved Statement KG":`).
_HEADER_AT_LINE_START = re.compile(
    r"^[ \t]*(?:\d+[.)][ \t]*)?[#*\"'`_ \t]*(" + _LABEL_ALTERNATION + r")",
    re.MULTILINE,
)
# Fallback: a label anywhere in the text (how the reply was originally split).
_HEADER_ANYWHERE = re.compile("(" + _LABEL_ALTERNATION + ")")
# Characters left over around a section once its header has been cut away.
_HEADER_DECORATION = " \t\r\n*:#"


def _split_response_sections(response_text):
    """Map every expected section label to its text ('' when the section is absent)."""
    candidates = []
    for pattern in (_HEADER_AT_LINE_START, _HEADER_ANYWHERE):
        first_hits = {}
        for match in pattern.finditer(response_text):
            first_hits.setdefault(match.group(1), match)
        candidates.append(first_hits)
    # Use whichever pattern recognises more sections; ties favour the stricter one.
    headers = max(candidates, key=len)

    ordered = sorted(headers.values(), key=lambda match: match.start())
    sections = dict.fromkeys(_SECTION_LABELS, "")
    for position, header in enumerate(ordered):
        is_last = position + 1 == len(ordered)
        stop = len(response_text) if is_last else ordered[position + 1].start()
        body = response_text[header.end():stop]
        sections[header.group(1)] = body.strip(_HEADER_DECORATION)
    return sections


def generate_improved_statement(kg_row, statement, model="llama3:8b"):
    """Ask a chat model to rewrite ``statement`` so it matches a reference knowledge graph.

    The reply is printed and then split into the five sections requested by the
    prompt. Sections the model did not produce come back as empty strings, so a
    badly formatted reply no longer raises ``UnboundLocalError``/``IndexError``.

    Args:
        kg_row: Mapping-like row holding the reference roles "Type", "Actor",
            "Action", "Reason", "Condition", "Modality", "Exception",
            "Violation", "Artifact" and "Presence".
        statement: The policy statement to improve.
        model: Name of the Ollama chat model. Defaults to ``"llama3:8b"``.

    Returns:
        tuple[str, str, str, str, str]: ``(improvements, improved_statement,
        improved_statement_kg, reference_statement_kg, policy_statement_kg)``.
    """
    prompt = f"""
    Reference Knowledge Graph:
    Type: {kg_row['Type']}
    Actor: {kg_row['Actor']}
    Action: {kg_row['Action']}
    Reason: {kg_row['Reason']}
    Condition: {kg_row['Condition']}
    Modality: {kg_row['Modality']}
    Exception: {kg_row['Exception']}
    Violation: {kg_row['Violation']}
    Artifact: {kg_row['Artifact']}
    Presence: {kg_row['Presence']}
    
    Original Statement:
    {statement}
    
    In the first part as "Improvements" describe what type of statement it is and describe what semantic roles need to be improved based on the reference knowledge graph above to improve the statement based on that. If there is an nan value, do not mention in the description and just say what improvement needs to be done.
    In the second part as "Improved Statement" consider the knowledge graph and improve the statement with reference to it. In the improved statement, include all semantic roles that exist in the knowledge graph and remove any semantic roles that do not exist in the knowledge graph. The length of improved statement should not be very different than the original statement.
    Generate the improved statement in a natural language.
    In the third part assess different semantic roles in the improved statement and create the knowledge graph of it.
    In the fourth part return the reference knowledge graph.
    In the fifth part assess different semantic roles in {statement} and create a new knowledge graph for it.
    Return the third, fourth, and fifth outputs in a dictionary format. The output format is 
    'Type': ['Type'],
    'Actor': ['Actor'],
    'Action': ['Action'],
    'Reason': ['Reason'],
    'Condition': ['Condition'],
    'Modality': ['Modality'],
    'Exception': ['Exception'],
    'Violation': ['Violation'],
    'Artifact': ['Artifact'],
    'Presence': ['Presence'].
    The first output is "Improvements", which was done in the first part. The second output is "Improved Statement", which was done in the second part above.
    The third output is the knowledge graph of "Improved Statement". The fourth output is the reference knowledge graph, which is {kg_row}. The fifth output is "Policy Statement KG" as the knowledge graph of {statement}, which was extracted in the fifth part.
    You must return 5 outputs. The outputs are "Improvements", "Improved Statement", "Improved Statement KG", "Reference Statement KG", and "Policy Statement KG":
    """

    # Other backends tried during development (OpenAI gpt-3.5-turbo-instruct /
    # gpt-4, llama3.2:1b, deepseek-r1, smollm) can be selected through `model`
    # when they are served by Ollama.
    response = chat(
        messages=[{'role': 'user', 'content': prompt}],
        model=model,
    )
    response_text = response.message.content or ""
    print(response_text)

    sections = _split_response_sections(response_text)

    # Drop a role listing ("Type: ... Presence: ...") the model may have echoed
    # inside the improved statement.
    improved_statement_part = re.sub(
        r"Type: .*?Presence: .*?\n", "", sections["Improved Statement"], flags=re.DOTALL
    ).strip()

    return (
        sections["Improvements"],
        improved_statement_part,
        sections["Improved Statement KG"],
        sections["Reference Statement KG"],
        sections["Policy Statement KG"],
    )


    
def process_statements(kg_filename, statements_filename, output_filename, max_rows=20):
    """Improve policy statements against a reference KG and export the comparison.

    Rows of the two workbooks are paired by position. For every pair the
    statement is rewritten with ``generate_improved_statement``; ROUGE-1/2/L
    F-measures and Jaro-Winkler scores over the knowledge-graph text are then
    written, one row per pair, to ``output_filename``.

    Args:
        kg_filename: Excel file holding the reference knowledge graph.
        statements_filename: Excel file holding the policy statements in an
            "InputText" column.
        output_filename: Path of the Excel report to write.
        max_rows: Number of leading rows read from each workbook (default 20,
            the previously hard-coded cap). ``None`` processes every row.

    Raises:
        ValueError: If, after applying ``max_rows``, the two workbooks do not
            have the same number of rows.
    """
    # Read the reference KG and statements from Excel files
    kg_df = pd.read_excel(kg_filename).iloc[:max_rows]
    statements_df = pd.read_excel(statements_filename).iloc[:max_rows]

    # Ensure both files have the same number of rows (checked after the cap,
    # so only the processed rows need to line up).
    if len(kg_df) != len(statements_df):
        raise ValueError("The number of rows in the reference KG and statements file must be the same.")

    # Initialize the ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Column order of the report. The capitalisation of
    # 'Improved POlicy vs Regulation ROUGE-2' is kept as-is because existing
    # spreadsheets and downstream readers may rely on that exact header.
    result_columns = [
        'Reference Regulation Statement',
        'Original Policy Statement',
        'Improved Statement',
        'Improvements',
        'Reference Statement KG',
        'Policy Statement KG',
        'Improved Statement KG',
        'Original Policy vs Regulation Semantic Role Similarity Score',
        'Improved Policy vs Regulation Semantic Role Similarity Score',
        'Original Policy vs Regulation ROUGE-1',
        'Original Policy vs Regulation ROUGE-2',
        'Original Policy vs Regulation ROUGE-L',
        'Improved Policy vs Regulation ROUGE-1',
        'Improved POlicy vs Regulation ROUGE-2',
        'Improved Policy vs Regulation ROUGE-L',
    ]

    records = []
    paired_rows = zip(kg_df.iterrows(), statements_df.iterrows())
    for index, ((_, kg_row), (_, statement_row)) in enumerate(paired_rows):
        statement = statement_row['InputText']

        # The regulation text belongs to the reference KG row. Previously it
        # was read from the policy row too, which made every "Policy vs
        # Regulation" ROUGE score a comparison of a text with itself.
        # NOTE(review): assumes the reference workbook keeps its source text in
        # an 'InputText' column; without one the old behaviour is retained.
        statement_reg = kg_row.get('InputText', statement)
        if not isinstance(statement_reg, str):
            statement_reg = statement

        (improvement,
         improved_statement,
         improved_statement_kg,
         reference_statement_kg,
         policy_statement_kg) = generate_improved_statement(kg_row, statement)

        # ROUGE F-measures against the regulation text
        rouge_policy_regulation = scorer.score(statement, statement_reg)
        rouge_improved_regulation = scorer.score(improved_statement, statement_reg)

        records.append({
            'Reference Regulation Statement': statement_reg,
            'Original Policy Statement': statement,
            'Improved Statement': improved_statement,
            'Improvements': improvement,
            'Reference Statement KG': reference_statement_kg,
            'Policy Statement KG': policy_statement_kg,
            'Improved Statement KG': improved_statement_kg,
            # Jaro-Winkler over the raw KG text returned by the model
            'Original Policy vs Regulation Semantic Role Similarity Score':
                jaro_winkler_similarity(policy_statement_kg, reference_statement_kg),
            'Improved Policy vs Regulation Semantic Role Similarity Score':
                jaro_winkler_similarity(improved_statement_kg, reference_statement_kg),
            'Original Policy vs Regulation ROUGE-1': rouge_policy_regulation['rouge1'].fmeasure,
            'Original Policy vs Regulation ROUGE-2': rouge_policy_regulation['rouge2'].fmeasure,
            'Original Policy vs Regulation ROUGE-L': rouge_policy_regulation['rougeL'].fmeasure,
            'Improved Policy vs Regulation ROUGE-1': rouge_improved_regulation['rouge1'].fmeasure,
            'Improved POlicy vs Regulation ROUGE-2': rouge_improved_regulation['rouge2'].fmeasure,
            'Improved Policy vs Regulation ROUGE-L': rouge_improved_regulation['rougeL'].fmeasure,
        })

        # Log the progress
        print(f"Processed row {index + 1}/{len(kg_df)}: {improved_statement}")

    # One report row per statement pair, columns in the documented order
    result_df = pd.DataFrame(records, columns=result_columns)

    # Save the result to a new Excel file
    result_df.to_excel(output_filename, index=False)

# Driver: compare the DPA statements with the manually corrected GDPR knowledge graph.

# Reference KG built from the GDPR articles and corrected by hand.
# (Uncorrected alternative: "Output/my_knowledge_graph_output.xlsx")
kg_filename = "Output/gdpr_articles_24_43_kg_output_corrected.xlsx"
# Policy statements; the text is read from the "InputText" column.
statements_filename = "Output/my_dpa_knowledge_graph_output.xlsx"
# Destination workbook for the improved statements and their scores.
output_filename = 'Output/gdpr_articles_Improved_Policy_4_gpt4.xlsx'

process_statements(
    kg_filename=kg_filename,
    statements_filename=statements_filename,
    output_filename=output_filename,
)

print(f"Output saved to {output_filename}")


Here are the outputs:

**Improvements**
The original statement is a Policy Statement, and it needs to be improved by including the semantic role of "Type: Obligation" and "Reason: processing is performed in accordance with this Regulation". The semantic roles that need to be improved are "Action", "Condition", and "Modality".

**Improved Statement**
The Processor shall implement appropriate technical and organisational measures to ensure and to be able to demonstrate that processing is performed in accordance with this Regulation, taking into account the nature, scope, context and purposes of processing as well as the risks of varying likelihood and severity for the rights and freedoms of natural persons.

**Improved Statement KG**
Type: Obligation
Actor: controller
Action: implement appropriate technical and organisational measures to ensure and to be able to demonstrate that processing is performed in accordance with this Regulation
Reason: processing is performed in accordance with 

In [1]:
import json
import pandas as pd
from jarowinkler import jarowinkler_similarity
from sklearn.metrics.pairwise import cosine_similarity
import re
from openai import OpenAI
import os
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import config
from sentence_transformers import SentenceTransformer

# Configure the OpenAI client; the API key comes from the local `config` module.
os.environ["OPENAI_API_KEY"] = config.token_key
client = OpenAI()

# Hugging Face checkpoints used for the BERT and RoBERTa encoders.
bert_model_name, roberta_model_name = "bert-base-uncased", "roberta-base"

# For each checkpoint load the tokenizer first and the model second.
bert_tokenizer, bert_model = (
    loader.from_pretrained(bert_model_name) for loader in (AutoTokenizer, AutoModel)
)
roberta_tokenizer, roberta_model = (
    loader.from_pretrained(roberta_model_name) for loader in (AutoTokenizer, AutoModel)
)

# Sentence-Transformers checkpoint used for the STSB similarity column.
stsb_model = SentenceTransformer('stsb-roberta-base')


# def get_OpenAI_embedding(text, model_ID="text-embedding-3-small"):
#     """
#     Generate embeddings using OpenAI's embedding model.
#     """
#     text = text.replace("\n", " ")  # Remove newline characters for consistency
#     response = client.embeddings.create(input=[text], model=model_ID)
#     return np.array(response.data[0].embedding)

def get_bert_embedding(text):
    """Embed ``text`` with the module-level BERT model.

    The input is tokenized (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the hidden state at position 0 (the
    [CLS] token) is returned as a squeezed NumPy array.
    """
    encoded = bert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    # Position 0 of every sequence holds the [CLS] representation.
    cls_vector = hidden_states[:, 0, :]
    return cls_vector.squeeze().numpy()

def get_roberta_embedding(text):
    """Embed ``text`` with the module-level RoBERTa model.

    The input is tokenized (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the hidden state at position 0 (the
    sequence-start token, RoBERTa's counterpart of [CLS]) is returned as a
    squeezed NumPy array.
    """
    encoded = roberta_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        hidden_states = roberta_model(**encoded).last_hidden_state
    # Position 0 of every sequence is used as the sentence representation.
    first_token_vector = hidden_states[:, 0, :]
    return first_token_vector.squeeze().numpy()

def get_stsb_embedding(text):
    """Return whatever the module-level STSB sentence model encodes ``text`` to."""
    embedding = stsb_model.encode(text)
    return embedding

def get_bin_embedding(text):
    """Stand-in for the BIN embedding: returns a random 768-dimensional vector.

    WARNING: ``text`` is ignored. Every call draws fresh values from NumPy's
    global random generator, so any similarity computed from this output is
    noise until real BIN embedding logic (or precomputed values) replaces it.
    """
    embedding_size = 768
    placeholder = np.random.rand(embedding_size)
    return placeholder




# Load the JSON file
with open("Output/my_knowledge_graph_output.json", "r") as file:
    data = json.load(file)

# Extract the relevant fields
fields = ["Type", "Actor", "Action", "Reason", "Condition", "Modality", "Exception", "Violation", "Artifact", "Presence"]
n = len(data)

# Per-field n x n matrices; allocated here but not filled in by this cell.
similarity_matrices = {field: np.zeros((n, n)) for field in fields}

# One dict per compared pair; turned into a DataFrame below
results = []

# Embed every statement once. The pair loop used to re-run each encoder for
# both members of every pair (O(n^2) forward passes); the vectors themselves
# are unchanged. Each is stored as a (1, dim) row for cosine_similarity.
texts = [entry["InputText"] for entry in data]
bert_embeddings = [get_bert_embedding(text).reshape(1, -1) for text in texts]
roberta_embeddings = [get_roberta_embedding(text).reshape(1, -1) for text in texts]
stsb_embeddings = [get_stsb_embedding(text).reshape(1, -1) for text in texts]

for i in range(n):
    for j in range(i + 1, n):
        text1 = texts[i]
        text2 = texts[j]

        # Compute Jaro-Winkler similarity on the raw statement text
        overall_sim = jarowinkler_similarity(text1, text2)

        # Cosine similarity of the cached BERT / RoBERTa / STSB embeddings
        bert_cos_sim = cosine_similarity(bert_embeddings[i], bert_embeddings[j])[0][0]
        roberta_cos_sim = cosine_similarity(roberta_embeddings[i], roberta_embeddings[j])[0][0]
        stsb_cos_sim = cosine_similarity(stsb_embeddings[i], stsb_embeddings[j])[0][0]

        # BIN embeddings are still a random placeholder, so they are drawn per
        # pair exactly as before; this column carries no signal yet.
        bin_emb1 = get_bin_embedding(text1)
        bin_emb2 = get_bin_embedding(text2)
        bin_cos_sim = cosine_similarity(bin_emb1.reshape(1, -1), bin_emb2.reshape(1, -1))[0][0]

        # Store results
        row = {
            "Text 1": text1,
            "Text 2": text2,
            "Overall Similarity": overall_sim,
            "BERT Cosine Similarity": bert_cos_sim,
            "RoBERTa Cosine Similarity": roberta_cos_sim,
            "STSB Cosine Similarity": stsb_cos_sim,
            "BIN Cosine Similarity": bin_cos_sim
        }
        # Jaro-Winkler similarity of each semantic role of the two graphs
        for field in fields:
            text1_field = data[i].get(field, "")
            text2_field = data[j].get(field, "")
            similarity = jarowinkler_similarity(text1_field, text2_field)
            row[f"{field} Similarity"] = similarity
        results.append(row)

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Save the results to an Excel file; the message below reports this same path
# (it previously printed a different file name from the one written).
results_path = "Output/my_knowledge_graph_similarity_results_with_bert_roberta.xlsx"
results_df.to_excel(results_path, index=False)

print(f"Similarity results saved to '{results_path}'")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/672 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Similarity results saved to 'Output/knowledge_graph_similarity_results_with_bert_roberta.xlsx'


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE  # Import SMOTE for oversampling
import ast
from imblearn.over_sampling import RandomOverSampler  # Importing RandomOverSampler
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# The labelled similarity scores are read from one worksheet of the workbook.
sheet_name = 'JW_Cos_1_2_3_4_5_6_R1_R2_7_9_11'
df = pd.read_excel("data/Score_Label.xlsx", sheet_name=sheet_name)  # optional row cap: [:1343]

# Optional undersampling of the majority class (disabled). When enabled it
# keeps as many label-0 rows as there are label-1 rows and shuffles the result:
# df_majority = df[df['Label'] == 0]
# df_minority = df[df['Label'] == 1]
# df_majority_undersampled = df_majority.sample(n=len(df_minority), random_state=42)
# df_balanced = pd.concat([df_majority_undersampled, df_minority])
# df = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Embedding-based similarity scores; these stay pandas Series.
BERT_cos_score = df['BERT_Cosine_Similarity_Text']
Roberta_cos_score = df['Roberta_Cosine_Similarity_Text']
stsb_cos_score = df['STSB_Cosine_Similarity_Text']
BERT_cos_score_KG = df['BERT_Cosine_Similarity_KG']
stsb_cos_score_KG = df['STSB_Cosine_Similarity_KG']

# Columns converted to plain Python lists.
bin_cos_score = df['Binary_Cos'].tolist()
label = df['Final_Label'].tolist()

# Inputs that are currently switched off:
# cos_vec = df['Cos'].tolist()
# cos_score = df['Cosine_Similarity'].tolist()
# jw_score = df['Jaro_Wink_Similarity_By_Text'].tolist()

# Each 'JW' cell is parsed with ast.literal_eval into a Python object.
jw_vec = list(map(ast.literal_eval, df['JW'].tolist()))
# cos_vec = [ast.literal_eval(lst) for lst in cos_vec]

# Right-pad every vector with zeros up to the longest vector's length.
max_len = max(map(len, jw_vec))
jw_vec = [row + [0] * (max_len - len(row)) for row in jw_vec]
# cos_vec = [vec + [0] * (max_len - len(vec)) for vec in cos_vec]

def remove_indices(lst, indices_to_remove):
    """Return a copy of every sequence in ``lst`` without the given positions.

    Args:
        lst: Iterable of sequences (for example a list of feature vectors).
        indices_to_remove: Iterable of zero-based positions to drop from each
            sequence. It is materialised once into a set, so lists, sets, dict
            keys and one-shot iterators (generators) all behave the same and
            every membership test is O(1).

    Returns:
        A new list holding one new list per input sequence, containing the
        elements whose position is not in ``indices_to_remove``, in their
        original order. The inputs are not modified.
    """
    # Build the lookup once: testing membership against the raw argument would
    # exhaust a generator on the first sequence and is linear for a list.
    positions_to_drop = set(indices_to_remove)
    return [
        [value for index, value in enumerate(seq) if index not in positions_to_drop]
        for seq in lst
    ]
    
# Positions to drop from every JW vector. An empty *set* keeps all of them
# (a bare `{}` would be an empty dict, which only worked by accident).
indices_to_remove = set()
jw_vec = remove_indices(jw_vec, indices_to_remove)
# cos_vec = remove_indices(cos_vec, indices_to_remove)

# Build one feature row per sample: the JW vector followed by the BERT,
# RoBERTa, STSB and binary cosine scores, in that order.
# - New lists are created, so jw_vec itself is no longer mutated through an alias.
# - zip() walks the pandas Series positionally, so the result does not depend
#   on the DataFrame's index labels (Series[n] is a label lookup).
# Disabled alternatives: cos_vec as the base vector, and the extra scores
# cos_score, jw_score, BERT_cos_score_KG and stsb_cos_score_KG.
similarity_vector = [
    list(jw_features) + [bert_score, roberta_score, stsb_score, binary_score]
    for jw_features, bert_score, roberta_score, stsb_score, binary_score in zip(
        jw_vec, BERT_cos_score, Roberta_cos_score, stsb_cos_score, bin_cos_score
    )
]

# zip() stops at the shortest input; make a silent truncation impossible.
assert len(similarity_vector) == len(label), "feature rows and labels differ in length"

# Feature matrix and label vector.
X = np.array(similarity_vector)  # Ensure X is a NumPy array with consistent shape
# X = [arr[22:] for arr in similarity_vector]
y = np.array(label)
rs = 22
# Step 1: Split the data into training (90%) and testing (10%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=rs)

# Step 2: Apply RandomOverSampler to the training data only
oversampler = RandomOverSampler(random_state=rs)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

In [4]:
from sklearn.model_selection import train_test_split
import numpy as np

# Feature matrix and label vector as NumPy arrays.
X = np.array(similarity_vector)
y = np.array(label)

rs = 533  # seed used for the hold-out split below

# Hold out 10% of the samples as a test set; the remaining 90% is for training.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=rs, test_size=0.1)

# A further train/validation split (80% / 10% of the original data) is disabled:
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1 / 0.9, random_state=rs)

# Report how many samples ended up in each split.
print(f"Training set size: {len(X_tr)} samples")
# print(f"Validation set size: {X_val.shape[0]} samples")
print(f"Test set size: {len(X_te)} samples")

Training set size: 4142 samples
Test set size: 461 samples


In [5]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Sample pool used by the repeated-split loop further down in this cell; it
# starts out as the training split.
extended_X, extended_y = X_tr, y_tr

# ---------------------------------------------------------------------------
# Earlier model experiments, all disabled and kept for reference only.
# ---------------------------------------------------------------------------

# Stacking ensemble, variant 1:
# base_learners = [
#     ('rf', RandomForestClassifier(n_estimators=40, max_depth=10, random_state=42)),
#     ('gb', GradientBoostingClassifier(max_depth=10, n_estimators=40, random_state=42)),
#     ('xgb', XGBClassifier(max_depth=10, min_child_weight=1, n_estimators=40, random_state=42))
# ]

# Stacking ensemble, variant 2:
# base_learners = [
#     ('rf', RandomForestClassifier(n_estimators=25, max_depth=8, random_state=42)),
#     ('gb', GradientBoostingClassifier(max_depth=8, n_estimators=25, random_state=42)),
#     ('xgb', XGBClassifier(max_depth=4, min_child_weight=5, n_estimators=50, random_state=42))
# ]

# Stacking classifier with logistic regression as the final estimator:
# final_estimator = LogisticRegression(C=1, penalty='l2', solver='lbfgs', random_state=42)
# model = StackingClassifier(estimators=base_learners, final_estimator=final_estimator)

# Plain XGBoost:
# model = XGBClassifier(max_depth=10, min_child_weight=5, n_estimators=500, use_label_encoder=False, eval_metric='logloss')

# XGBoost with explicit hyperparameters:
# model = XGBClassifier(
#     objective='binary:logistic',
#     n_estimators=25,
#     learning_rate=0.1,
#     max_depth=9,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     min_child_weight=1,
#     scale_pos_weight=1,  # Adjust this for imbalanced data
#     gamma=0,
#     reg_alpha=0,
#     reg_lambda=1,
#     eval_metric='logloss',
#     random_state=42
# )

# Multi-layer perceptron:
# from sklearn.neural_network import MLPClassifier
# model = MLPClassifier(hidden_layer_sizes=(50,25), activation='tanh', solver='adam', alpha=0.01, learning_rate='adaptive', max_iter=2000, random_state=42)

# Active model: a random forest with 100 trees and a fixed seed.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)



# Per-class F1 scores and accuracies collected over the repeated splits.
f1_scores_0 = []
f1_scores_1 = []
accuracies = []

# Random seeds for the repeated splits: 0, 3, 6, ..., 48.
subsets = list(range(0, 50, 3))

# Decision threshold applied to the predicted probability of class 1.
# Loop-invariant, so it is set once here.
manual_threshold = 0.5  # Set your desired threshold here

for i in subsets:
    rs = i
    # Draw a fresh 95% / 5% split from the (growing) sample pool.
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=rs)
    X_train, X_test, y_train, y_test = train_test_split(extended_X, extended_y, test_size=0.05, random_state=rs)

    # Balance the classes on the training part only.
    oversampler = RandomOverSampler(random_state=rs)
    X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

    # Disabled alternative: undersample instead of oversample.
    # from imblearn.under_sampling import RandomUnderSampler
    # undersampler = RandomUnderSampler(random_state=rs)
    # X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

    model.fit(X_train_resampled, y_train_resampled)
    # model.fit(X_train, y_train)

    # Column 1 of predict_proba is taken as the probability of label 1.
    y_probs = model.predict_proba(X_test)[:, 1]
    y_pred = (y_probs >= manual_threshold).astype(int)

    # Misclassified test rows are appended to the pool so that later rounds see
    # them more often.
    # NOTE: those rows are already in the pool, so this creates duplicates that
    # a later split can place in both the training and the test part. The
    # per-round metrics below are therefore optimistic; the hold-out evaluation
    # after the loop is the unbiased figure.
    false_labeled_indices = np.where(y_pred != y_test)[0]
    false_labeled_X = X_test[false_labeled_indices]
    false_labeled_y_true = y_test[false_labeled_indices]

    extended_X = np.concatenate((extended_X, false_labeled_X), axis=0)
    extended_y = np.concatenate((extended_y, false_labeled_y_true), axis=0)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    # Pin the label set so the '0' and '1' entries always exist, even if a
    # small test split happens to miss one class entirely.
    class_report_dict = classification_report(
        y_test, y_pred, labels=[0, 1], output_dict=True, zero_division=0
    )
    f1_score_0 = class_report_dict['0']['f1-score']
    f1_score_1 = class_report_dict['1']['f1-score']

    f1_scores_0.append(f1_score_0)
    f1_scores_1.append(f1_score_1)

    print('Classification Report:')
    print(classification_report(y_test, y_pred, labels=[0, 1], zero_division=0))

print('Average Accuracy:', sum(accuracies) / len(accuracies))

# Evaluation on the hold-out split (X_te, y_te) created earlier in the
# notebook. These rows were never part of the sample pool, so this is the
# estimate to report for the final fitted model.
y_te_probs = model.predict_proba(X_te)[:, 1]
y_te_pred = (y_te_probs >= manual_threshold).astype(int)
print('Held-out Test Accuracy:', accuracy_score(y_te, y_te_pred))
print('Held-out Test Classification Report:')
print(classification_report(y_te, y_te_pred, labels=[0, 1], zero_division=0))

# # Output the F1-score lists
# print('F1-scores for label 0:', f1_scores_0)
# print('F1-scores for label 1:', f1_scores_1)


Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.57      0.65        77
           1       0.78      0.89      0.83       131

    accuracy                           0.77       208
   macro avg       0.76      0.73      0.74       208
weighted avg       0.77      0.77      0.76       208

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.71      0.75        86
           1       0.81      0.88      0.84       124

    accuracy                           0.81       210
   macro avg       0.81      0.79      0.80       210
weighted avg       0.81      0.81      0.81       210

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80        82
           1       0.88      0.87      0.87       130

    accuracy                           0.84       212
   macro avg       0.84      0.84      0.84       212
weigh

In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import warnings

# Ignore all warnings
warnings.filterwarnings('ignore')

# Column order used to build the feature matrix for the new data.
# NOTE(review): presumably this mirrors the training layout (JW component
# scores followed by BERT, RoBERTa, STSB and binary cosine) — confirm against
# the training sheet before trusting the predictions.
desired_order = [
    "Type Similarity", "Actor Similarity", "Action Similarity", "Reason Similarity",
    "Condition Similarity", "Modality Similarity", "Exception Similarity",
    "Violation Similarity", "Artifact Similarity", "Presence Similarity",
    "Overall Similarity",
    "BERT Cosine Similarity", "RoBERTa Cosine Similarity",
    "STSB Cosine Similarity", "BIN Cosine Similarity"
]

# Load the new data from the Excel file
df_new = pd.read_excel("Output/my_knowledge_graph_similarity_results_with_bert_roberta.xlsx")  # Ensure correct file path

# Extract the feature columns in the desired order
X_new = df_new[desired_order].values

# `model` and `manual_threshold` come from the training cell above; run it first.
# Apply the same decision rule that was evaluated during training
# (probability of label 1 >= manual_threshold). model.predict() resolves ties
# differently and ignores any tuned threshold.
y_probs_new = model.predict_proba(X_new)[:, 1]
y_pred_new = (y_probs_new >= manual_threshold).astype(int)

# Add the predictions to the new DataFrame as the last column
df_new['Predicted_Label'] = y_pred_new

# Save the updated DataFrame to a new Excel file. One variable holds the path
# so the confirmation message names the file that was actually written.
predictions_path = "Output/my_knowledge_graph_similarity_results_with_predictions.xlsx"
df_new.to_excel(predictions_path, index=False)

print(f"Predictions saved to '{predictions_path}'")

Predictions saved to 'Output/knowledge_graph_similarity_results_with_predictions.xlsx'
