In [42]:
import ollama
import json
import pandas as pd
import datetime
import chromadb
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
import numpy as np


In [1]:


# Define the function to extract regulatory data
def extract_regulatory_data(regulatory_text):
    prompt = f"""
You are an expert in the legal domain and computational linguistics. Follow these steps strictly:

1. Convert passive voice input sentences into active voice.
2. Break down the input into syntactic parts following the schema precisely.
3. Before categorizing any part, explicitly verify: "Can this logically belong to this category based on provided markers?" If yes, categorize it; if not, do not categorize it.
4. Use schema markers ONLY as hints; do NOT include markers in the final JSON output.

Schema:
{{
    "Type": ["Definition", "Prohibition", "Obligation", "Fact", "Penalty", "Permission", "Recommendation", "Exemption"],
    "Action": "VP excluding modality, condition, exception, and reason annotations",
    "Condition": ["SRel << (condition marker)", "PP << (condition marker)", "Ssub << (condition marker)", "NP < (VPinf !<< (exception marker) !<< (reason marker))", "NP < (VPart !<< (exception marker) !<< (reason marker))"],
    "Condition Marker": ["if", "in case of", "provided that", "when", "in the context of", "limit", "who", "whose", "which"],
    "Modality": ["VN < (modality marker)"],
    "Modality Marker": ["may", "must", "is prohibited from", "should", "shall", "can", "need to", "required to", "is authorized to"],
    "Actor": ["subject dependency and NP < (actor marker)", "object dependency and passive voice and PP < P $ (NP < (actor marker))", "object dependency and active voice and NP < (actor marker)"],
    "Actor Marker": ["firm", "expert", "staff", "company", "consumer", "tax"],
    "Artifact": ["NP < (artifact marker)", "NP !<< (violation marker) | !<< (time marker) | !<< (situation marker) | !<< (sanction marker) | !<< (reference marker) | !<< (location marker) | !<< (action marker)"],
    "Artifact Marker": ["document", "agreement", "certificate", "license", "permit", "warrant", "pass"],
    "Exception": ["Srel << (exception marker)", "Ssub << (exception marker)", "NP < (VPinf !<< (exception marker))", "PP << (exception marker)", "NP << (P < (exception marker) $ VPinf)"],
    "Exception Marker": ["with the exception of", "except for", "derogation", "apart from", "other than"],
    "Presence": ["NP < (location marker)"],
    "Presence Marker": ["site", "place", "target market", "customer base", "street"],
    "Reason": ["Srel << (reason marker)", "Ssub << (reason marker)", "PP << (reason marker)", "NP < (VPart << (reason marker))", "NP << (P < (reason marker) $ VPinf)"],
    "Reason Marker": ["in order to", "for the purpose of", "so as to", "so that", "in the interest of", "in view of"],
    "Sanction": ["NP < (sanction marker)"],
    "Sanction Marker": ["punishment", "jail sentence", "imprisonment", "prison term", "fine"],
    "Situation": ["NP < (situation marker)"],
    "Situation Marker": ["renewal", "inspection", "registration", "deliberation"],
    "Time": ["NP < (time marker)", "PP < (P < (time marker)) $ NP"],
    "Time Marker": ["before", "after", "temporary", "permanent", "period", "day", "year", "month", "date"],
    "Violation": ["NP < (violation marker)"],
    "Violation Marker": ["offence", "crime", "misdemeanor", "civil wrong", "infraction", "transgression"]
}}

Sample:
'Input': 'Firms should be able to provide us with the information they are using to monitor whether they are achieving outcomes for consumers with characteristics of vulnerability that are as good as those for other consumers (see monitoring and evaluation in Chapter 5).',
  'JSON': 
    'Type': 'Obligation',
    'Actor': 'Firm',
    'Modality': 'should',
    'Action': 'be able to provide us with the information they are using',
    'Reason': 'to monitor whether they are achieving outcomes as good as those for other consumers',
    'Condition': 'for consumers with characteristics of vulnerability',
    'Artifact': 'monitoring and evaluation in Chapter 5',
    'Violation': null,
    'Exception': null,
    'Presence': null,
    'Time': null,
  
 'Input': 'Where possible, staff should be able to respond to the consumer’s needs promptly so that action is taken to ensure harm does not occur or become more severe.',
  'JSON': 
    'Type': 'Obligation',
    'Actor': 'Staff',
    'Modality': 'should',
    'Action': "respond to the consumer\'s needs promptly",
    'Reason': 'action is taken to ensure harm does not occur or become more severe',
    'Condition': 'where possible',
    'Artifact': null,
    'Violation': null,
    'Exception': null,
    'Presence': null,
    'Time': null,
 'Input': 'Firms should improve the skills and capability of staff in a way that is proportionate.',
  'JSON': 
    'Type': 'Obligation',
    'Actor': 'Firm',
    'Modality': 'should',
    'Action': 'improve the skills and capability of staff',
    'Reason': null,
    'Condition': 'in a way that is proportionate',
    'Artifact': null,
    'Violation': null,
    'Exception': null,
    'Presence': null,
    'Time': null,
 'Input': 'If a firm’s business model intentionally exploits vulnerable consumers, this would be a clear breach of our Principles.',
  'JSON': 
    'Type': 'Prohibition',
    'Actor': 'Firm’s business model',
    'Modality': 'if',
    'Action': 'intentionally exploits vulnerable consumers',
    'Reason': null,
    'Condition': null,
    'Artifact': null,
    'Violation': 'breach of our Principles',
    'Exception': null,
    'Presence': null,
    'Time': null,
 'Input': 'Firms should ensure they record and process data in line with requirements of data protection legislation, see Appendix 1.',
  'JSON': 
    'Type': 'Obligation',
    'Actor': 'Firm',
    'Modality': 'should',
    'Action': 'ensure they record and process data in line with requirements of data protection legislation',
    'Reason': null,
    'Condition': null,
    'Artifact': 'Appendix 1',
    'Violation': null,
    'Exception': null,
    'Presence': null,
    'Time': null,



Return strictly structured JSON adhering to the schema above, excluding the markers themselves.

Input:
"{regulatory_text}"

JSON:
    """

    # Generate response from Llama model via Ollama
    response = ollama.generate(model="llama3:8b", prompt=prompt, format="json")

    # Parse and return JSON output
    return json.loads(response["response"])

# Example usage:
regulation_example = """Under Article 5(1)(d) firms should take care to ensure the accuracy of information they record about customers and vulnerabilities. This may be challenging for firms where vulnerabilities are temporary, and firms should consider this in the context of the customer service they provide."""

structured_data = extract_regulatory_data(regulation_example)

print(json.dumps(structured_data, indent=2))

{
  "Type": "Obligation",
  "Action": "take care to ensure the accuracy of information they record about customers and vulnerabilities",
  "Condition": null,
  "Reason": null,
  "Actor": "Firm",
  "Modality": "should",
  "Artifact": null,
  "Violation": null,
  "Exception": null,
  "Presence": null,
  "Time": null
}


In [296]:
#gdpr_articles_json_filename = 'Output/gdpr_articles_24_43_qa_output.json'
gdpr_articles_json_filename = 'Output/gdpr_chapter4_recital_qa_correct_output.json'

with open(gdpr_articles_json_filename) as json_file:
    article_json_data = json.load(json_file)
print(len(article_json_data))

statements_all =[]
for article in article_json_data:
    statements_all.append(article['InputText'])

print(len(statements_all))                         
                         

46
46


In [None]:
import pandas as pd
import datetime
#data = pd.read_excel("data/GDPR_Obligation_Output.xlsx")
#statements_all = data["Text"].tolist()

#data = pd.read_csv('data/cejas_dpa_compliance_req.txt', sep="\t", header=None)
#data.columns = ["Statement"]

#data = pd.read_excel("data/Generated_Ontology_GDPR_Output.xlsx")
#statements_all = data["Statement"].tolist()

import pandas as pd

def is_valid_output(json_output):
    required_fields = ["Type", "Actor", "Action"]
    for field in required_fields:
        if not json_output.get(field):
            return False
    return True

knowledge_graph = []
#statements = statements_all[:50]
statements = statements_all

for index,text in enumerate(statements):
    now = datetime.datetime.now()
    
    print(now.time())
    print(index)
    print(text)
    retries = 3
    json_output = None
    while retries > 0:
        json_output = extract_regulatory_data(text)
        if is_valid_output(json_output):
            break
        retries -= 1
    print(json_output)
    
    if json_output:
        json_output["InputText"] = text  # Add the original text to the JSON output
        knowledge_graph.append(json_output)


In [298]:



# Save the knowledge graph to a JSON file
#output_json_filename = "Output/my_knowledge_graph_output.json"
#output_json_filename = "Output/cejas_dpa_compliance_graph_output.json"

#output_json_filename = "Output/gdpr_articles_24_43_kg_output.json"
output_json_filename = "Output/gdpr_chapter4_recital_kg_output.json"

with open(output_json_filename, 'w') as json_file:
    json.dump(knowledge_graph, json_file, indent=4)

print(f"Knowledge graph saved to {output_json_filename}")

# Convert the knowledge graph to a DataFrame and save to an Excel file
df = pd.DataFrame(knowledge_graph)
# Reorder columns to have 'InputText' as the first column
columns = ["InputText"] + [col for col in df.columns if col != "InputText"]
df = df[columns]

print(columns)

#output_excel_filename = "Output/cejas_dpa_compliance_graph_output.xlsx"

#output_excel_filename = "Output/my_knowledge_graph_output.xlsx"

#output_excel_filename = "Output/gdpr_articles_24_43_kg_output.xlsx"
output_excel_filename = "Output/gdpr_chapter4_recital_kg_output.xlsx"

df.to_excel(output_excel_filename, index=False)

print(f"Knowledge graph saved to {output_excel_filename}")

Knowledge graph saved to Output/gdpr_chapter4_recital_kg_output.json
['InputText', 'Type', 'Actor', 'Modality', 'Action', 'Reason', 'Condition', 'Artifact', 'Violation', 'Exception', 'Presence', 'Time', 'Condition Marker', 'JSON', 'Situation', 'Sanction', 'Violations', 'Exceptions']
Knowledge graph saved to Output/gdpr_chapter4_recital_kg_output.xlsx


In [None]:

data = pd.read_excel("Output/gdpr_articles_24_43_kg_output_corrected.xlsx")
# one more entry in the following txt as noticed as indvidual sentence at the end of recital 86 with no qa but seems an obligation
#data = pd.read_excel("Output/gdpr_chapter4_recital_kg_output_corrected.xlsx")

statements_all = data["InputText"].tolist()
print(len(statements_all))


    # Iterate over each row and generate the improved statement
for index,kg_row in enumerate(data.iterrows()):
        
        kg_row = kg_row[1]  # Extract the row data
        modality = kg_row['Modality']
    
        print("index:%d %s" %(index,kg_row['Sanction']))
        artifact = kg_row['Artifact']
        artifact = str(artifact)

        continue
    
        if not artifact== 'nan':
            print(artifact)
            
            art_list = re.split(r"[\,|\[|\]]",artifact)

            for art in art_list:
                art=  art.strip()
                if len(art)>0:
                    print(art)
        if index> 1:
            break
        

In [348]:
import ollama
import json

def generate_question_answer(context):
    empty_result= "{}"
    try:

        
        messages = f'''
            Your task is to generate realistic and applied questions that pertain to the provided regulatory or compliance material. Ensure that the context explicitly contains the answer to the question and the question should not expect a binary answer. the answer should be concise"
            Each single question and answer into a valid JSON format. Each JSON should contain a single question with a single answer. Only respond with valid JSON and no additional text. Test the JSON for validity before returning it.
        Input:
        "{context}"
        '''

        response = ollama.generate(model="llama3:8b", prompt=messages,format="json")

        return json.loads(response["response"])

        #question_list = chat_completion.choices[0].message.content.strip().split('\n')
        #questions = [re.sub(r'^\d+\.\s+', '', q).strip() for q in question_list if q.strip().endswith('?')]
    except Exception as e:
        print(f"An error occurred: {e}")
        time.sleep(20)
    return json.loads(empty_result)
    


In [349]:
# Example usage:
regulation_example = """The processing of personal data should be designed to serve mankind. The right to the protection of personal data is not an absolute right; it must be considered in relation to its function in society and be balanced against other fundamental rights, in accordance with the principle of proportionality."""

structured_data = generate_question_answer(regulation_example)

print(json.dumps(structured_data, indent=2))


{
  "question": "What is the purpose of processing personal data according to this regulatory material?",
  "answer": "The processing of personal data should be designed to serve mankind."
}


In [5]:
import pandas as pd
import datetime

data = pd.read_excel("data/Generated_Ontology_GDPR_Output.xlsx")
obligations_all = data[data.Type=="['Obligation']"]

print(len(obligations_all))


409


In [52]:
use_sentences=False
if use_sentences:
    #obligations_all = pd.read_csv('data/gdpr_articles_sentences_clean.txt', sep="\t", header=None)
    # article sentences related to controller and processor
    #obligations_all = pd.read_csv('data/gdpr_articles_24_43_sentences_clean.txt', sep="\t", header=None)
  
    obligations_all = pd.read_csv('data/gdpr_recitals_sentences_clean.txt', sep="\t", header=None)
else:
    
    #obligations_all = pd.read_csv('data/gdpr_articles_clean.txt', sep="\t", header=None)
    obligations_all = pd.read_csv('data/gdpr_recitals.txt', sep="\t", header=None)
    
obligations_all.columns = ["Statement"]
print(len(obligations_all))

173


In [None]:
import random
statements_all = obligations_all["Statement"].tolist()



def is_valid_output(json_output):
    required_fields = ["question", "answer"]
    for field in required_fields:
        if not json_output.get(field):
            return False
    return True

question_answers = []
if use_sentences:
    # shuffle sentences
    #random.shuffle(statements_all)
    #statements = statements_all[:50]
    statements = statements_all
else:
    statements = statements_all
    #statements = statements_all[:173]

current_article =''
current_recital = '1'
process_recitals = True

for index,text in enumerate(statements):
    now = datetime.datetime.now()

    current_recital = str(index+1)
    #if index==100:
    #    break
        
    print("%d %s" %(index,now.time()))
    print(text)
    print(current_recital)

    if text.startswith("Article"):
        current_article = text.split(" ")[1]
    print(current_article)     
        
    # skip certain lines in gdpr articles
    if text== '\n' or text == '' or text.startswith("Article") or text.startswith("CHAPTER") or text.startswith("Section"):
        continue
        
    retries = 3
    json_output = None
    while retries > 0:
        
        json_output = generate_question_answer(text)
        if is_valid_output(json_output):
            break
        retries -= 1
    print(json_output)
    
    if json_output:
        #json_output["InputText"] = text  # Add the original text to the JSON output
        json_output["InputText"] = text.replace(u"\u2018", "'").replace(u"\u2019", "'")  # change unicode chars for quotes
        if process_recitals:
            json_output["Recital"] = current_recital
        else:    
            json_output["Article"] = current_article
        question_answers.append(json_output)

# Save the knowledge graph to a JSON file
#output_json_filename = "Output/obligation_question_answer_output.json"
if use_sentences:
    output_json_filename = "Output/gdpr_recital_sentences_question_answer_output.json"
    #output_json_filename = "Output/gdpr_articles_sentences_question_answer_output.json"
    #output_json_filename = "Output/gdpr_articles_24_43_sentences_question_answer_output.json"
   
    
else:    
    output_json_filename = "Output/gdpr_recital_question_answer_output.json"
    #output_json_filename = "Output/gdpr_articles_question_answer_output.json"

with open(output_json_filename, 'w') as json_file:
    json.dump(question_answers, json_file, indent=4)

print(f"obligations question answer  saved to {output_json_filename}")


In [None]:

samples =f'''

        'Input':
        "Public authorities to which personal data are disclosed in accordance with a legal obligation for the exercise of their official mission, such as tax and customs authorities, financial investigation units, independent adminis\u00ad trative authorities, or financial market authorities responsible for the regulation and supervision of securities markets should not be regarded as recipients if they receive personal data which are necessary to carry out a particular inquiry in the general interest, in accordance with Union or Member State law. The requests for disclosure sent by the public authorities should always be in writing, reasoned and occasional and should not concern the entirety of a filing system or lead to the interconnection of filing systems. The processing of personal data by those public authorities should comply with the applicable data-protection rules according to the purposes of the processing"

        'JSON':
        {
        "question": "What type of public authorities are exempt from being considered recipients if they receive personal data necessary for a particular inquiry?",
        "answer": "Tax and customs authorities, financial investigation units, independent administrative authorities, or financial market authorities responsible for the regulation and supervision of securities markets" 
        }
        'Input': 
        "In the absence of an adequacy decision, the controller or processor should take measures to compensate for the lack of data protection in a third country by way of appropriate safeguards for the data subject. Such appropriate safeguards may consist of making use of binding corporate rules, standard data protection clauses adopted by the Commission, standard data protection clauses adopted by a supervisory authority or contractual clauses authorised by a supervisory authority. Those safeguards should ensure compliance with data protection requirements and the rights of the data subjects appropriate to processing within the Union, including the availability of enforceable data subject rights and of effective legal remedies, including to obtain effective adminis\u00ad trative or judicial redress and to claim compensation, in the Union or in a third country. They should relate in particular to compliance with the general principles relating to personal data processing, the principles of data protection by design and by default. Transfers may also be carried out by public authorities or bodies with public authorities or bodies in third countries or with international organisations with corresponding duties or functions, including on the basis of provisions to be inserted into administrative arrangements, such as a memorandum of understanding, providing for enforceable and effective rights for data subjects. Authorisation by the competent supervisory authority should be obtained when the safeguards are provided for in administrative arrangements that are not legally binding."
        'JSON':
        {
        "question": "What types of safeguards can a controller or processor use to compensate for the lack of an adequacy decision when transferring personal data to a third country?",
        "answer": "Binding corporate rules, standard data protection clauses adopted by the Commission, standard data protection clauses adopted by a supervisory authority, contractual clauses authorised by a supervisory authority"
        }
        'Input':
        "The decision should be agreed jointly by the lead supervisory authority and the supervisory authorities concerned and should be directed towards the main or single establishment of the controller or processor and be binding on the controller and processor. The controller or processor should take the necessary measures to ensure compliance with this Regulation and the implementation of the decision notified by the lead supervisory authority to the main establishment of the controller or processor as regards the processing activities in the Union."
        'JSON':
        {
        "question": "What is the process for agreeing on a decision regarding a data controller or processor?",
        "answer": "The decision should be agreed jointly by the lead supervisory authority and the supervisory authorities concerned."
        }

        
        '''
        samples =f'''
        "question": "What type of public authorities are exempt from being considered recipients if they receive personal data necessary for a particular inquiry?"
        "answer": "Tax and customs authorities, financial investigation units, independent administrative authorities, or financial market authorities responsible for the regulation and supervision of securities markets" 
         "question": "What types of safeguards can a controller or processor use to compensate for the lack of an adequacy decision when transferring personal data to a third country?"
        "answer": "Binding corporate rules, standard data protection clauses adopted by the Commission, standard data protection clauses adopted by a supervisory authority, contractual clauses authorised by a supervisory authority"
        "question": "What is the process for agreeing on a decision regarding a data controller or processor?"
        "answer": "The decision should be agreed jointly by the lead supervisory authority and the supervisory authorities concerned."
        '''
        '''


In [7]:
import time

def generate_question_answer_with_samples(context):
    empty_result= "{}"
    try:

        samples =f'''

        'Input':
        "Public authorities to which personal data are disclosed in accordance with a legal obligation for the exercise of their official mission, such as tax and customs authorities, financial investigation units, independent adminis\u00ad trative authorities, or financial market authorities responsible for the regulation and supervision of securities markets should not be regarded as recipients if they receive personal data which are necessary to carry out a particular inquiry in the general interest, in accordance with Union or Member State law. The requests for disclosure sent by the public authorities should always be in writing, reasoned and occasional and should not concern the entirety of a filing system or lead to the interconnection of filing systems. The processing of personal data by those public authorities should comply with the applicable data-protection rules according to the purposes of the processing"
        'JSON':
        "question": "What type of public authorities are exempt from being considered recipients if they receive personal data necessary for a particular inquiry?"
        "answer": "Tax and customs authorities, financial investigation units, independent administrative authorities, or financial market authorities responsible for the regulation and supervision of securities markets" 
        'Input': 
        "In the absence of an adequacy decision, the controller or processor should take measures to compensate for the lack of data protection in a third country by way of appropriate safeguards for the data subject. Such appropriate safeguards may consist of making use of binding corporate rules, standard data protection clauses adopted by the Commission, standard data protection clauses adopted by a supervisory authority or contractual clauses authorised by a supervisory authority. Those safeguards should ensure compliance with data protection requirements and the rights of the data subjects appropriate to processing within the Union, including the availability of enforceable data subject rights and of effective legal remedies, including to obtain effective adminis\u00ad trative or judicial redress and to claim compensation, in the Union or in a third country. They should relate in particular to compliance with the general principles relating to personal data processing, the principles of data protection by design and by default. Transfers may also be carried out by public authorities or bodies with public authorities or bodies in third countries or with international organisations with corresponding duties or functions, including on the basis of provisions to be inserted into administrative arrangements, such as a memorandum of understanding, providing for enforceable and effective rights for data subjects. Authorisation by the competent supervisory authority should be obtained when the safeguards are provided for in administrative arrangements that are not legally binding."
        'JSON':
        "question": "What types of safeguards can a controller or processor use to compensate for the lack of an adequacy decision when transferring personal data to a third country?",
        "answer": "Binding corporate rules, standard data protection clauses adopted by the Commission, standard data protection clauses adopted by a supervisory authority, contractual clauses authorised by a supervisory authority"
        '''

        
        
        messages = f'''
            Your task is to generate realistic and applied questions that pertain to the provided regulatory or compliance material. Ensure that the context explicitly contains the answer to the question and the question should not return a binary answer. the answer should be concise"
            Each single question and answer should be in a valid JSON format. Each JSON should contain a single question with a single answer. Only respond with valid JSON and no additional text. Test the JSON for validity before returning it.
            Below are a few examples of a sample input and the returned JSON:
            "{samples}"
        Input:
        "{context}"
        '''

        #print(messages)
        
        response = ollama.generate(model="llama3:8b", prompt=messages,format="json")

        return json.loads(response["response"])

        
    except Exception as e:
        print(f"An error occurred: {e}")
        time.sleep(20)
    return json.loads(empty_result)

In [8]:
regulation_example = """The processing of personal data should be designed to serve mankind. The right to the protection of personal data is not an absolute right; it must be considered in relation to its function in society and be balanced against other fundamental rights, in accordance with the principle of proportionality."""

print(regulation_example)

structured_data = generate_question_answer_with_samples(regulation_example)

print(json.dumps(structured_data, indent=2))

The processing of personal data should be designed to serve mankind. The right to the protection of personal data is not an absolute right; it must be considered in relation to its function in society and be balanced against other fundamental rights, in accordance with the principle of proportionality.
{
  "question": "What is the primary consideration for processing personal data according to this regulation?",
  "answer": "The processing of personal data should be designed to serve mankind"
}


In [12]:
# Example usage:
import datetime
use_value= 4
if use_value==1:
    
    regulation_example = """The processing of personal data should be designed to serve mankind. The right to the protection of personal data is not an absolute right; it must be considered in relation to its function in society and be balanced against other fundamental rights, in accordance with the principle of proportionality."""
elif use_value==2:
    regulation_example = """
    Those developments require a strong and more coherent data protection framework in the Union, backed by strong enforcement, given the importance of creating the trust that will allow the digital economy to develop across the internal market. Natural persons should have control of their own personal data. Legal and practical certainty for natural persons, economic operators 
    and public authorities should be enhanced.
    """

elif use_value==3:
    regulation_example = """
    The processing of personal data of data subjects who are in the Union by a controller or processor not established in the Union should
    also be subject to this Regulation when it is related to the monitoring of the behaviour of such data subjects in so far as their behaviour takes place within
    the Union. In order to determine whether a processing activity can be considered to monitor the behaviour of data subjects, it should be ascertained whether natural persons 
    are tracked on the internet including potential subsequent use of personal data processing techniques which consist of profiling a natural person, particularly in order to take decisions concerning her or him or for analysing 
    or predicting her or his personal preferences, behaviours and attitudes.
    """
elif use_value==4:
    regulation_example = """"
    The establishment of supervisory authorities in Member States, empowered to perform their tasks and exercise their powers with complete independence, is an essential component of the protection of natural persons with regard to the processing of their personal 
    data. Member States should be able to establish more than one supervisory authority, to reflect their constitutional, organisational and administrative structure
    """

now = datetime.datetime.now()
print(now.time())

print(regulation_example)

structured_data = generate_question_answer_with_samples(regulation_example)

now = datetime.datetime.now()
print(now.time())

print(json.dumps(structured_data, indent=2))

10:10:53.379958
"
    The establishment of supervisory authorities in Member States, empowered to perform their tasks and exercise their powers with complete independence, is an essential component of the protection of natural persons with regard to the processing of their personal 
    data. Member States should be able to establish more than one supervisory authority, to reflect their constitutional, organisational and administrative structure
    
10:10:59.375514
{
  "question": "How many supervisory authorities can a Member State establish?",
  "answer": "More than one"
}


In [26]:
import pandas as pd
use_sentences=True
if use_sentences:
    #obligations_all = pd.read_csv('data/gdpr_articles_sentences_clean.txt', sep="\t", header=None)
    # articles related to controller and processor
    #obligations_all = pd.read_csv('data/gdpr_articles_24_43_sentences_clean.txt', sep="\t", header=None)
    obligations_all = pd.read_csv('data/gdpr_recitals_sentences_clean.txt', sep="\t", header=None)
else:
    
    obligations_all = pd.read_csv('data/gdpr_articles_clean.txt', sep="\t", header=None)
    #obligations_all = pd.read_csv('data/gdpr_recitals.txt', sep="\t", header=None)
    
obligations_all.columns = ["Statement"]
print(len(obligations_all))

169


In [None]:
import random
statements_all = obligations_all["Statement"].tolist()



def is_valid_output(json_output):
    required_fields = ["question", "answer"]
    for field in required_fields:
        if not json_output.get(field):
            return False
    return True

question_answers = []
if use_sentences:
    # shuffle sentences
    #random.shuffle(statements_all)
    #statements = statements_all[:50]
    statements = statements_all
else:
    statements = statements_all
    #statements = statements_all[:173]
current_article =''

for index,text in enumerate(statements):
    now = datetime.datetime.now()

    #if index==100:
    #    break
        
    print("%d %s" %(index,now.time()))
    print(text)

    
    if text.startswith("Article"):
        current_article = text.split(" ")[1]
    print(current_article)
    
    # skip certain lines in gdpr articles
    if text== '\n' or text == '' or text.startswith("Article") or text.startswith("CHAPTER") or text.startswith("Section"):
        continue
        
    retries = 3
    json_output = None
    while retries > 0:
        
        json_output = generate_question_answer_with_samples(text)
        if is_valid_output(json_output):
            break
        retries -= 1
    print(json_output)
    
    if json_output:
        #json_output["InputText"] = text  # Add the original text to the JSON output
        json_output["InputText"] = text.replace(u"\u2018", "'").replace(u"\u2019", "'")  # change unicode chars for quotes
        json_output["Article"] = current_article

        question_answers.append(json_output)

# Save the knowledge graph to a JSON file
#output_json_filename = "Output/obligation_question_answer_output.json"
if use_sentences:
    #output_json_filename = "Output/gdpr_recital_sentences_question_answer_output.json"
    output_json_filename = "Output/gdpr_articles_24_43_sentences_question_answer_one_shot_output.json"

    #output_json_filename = "Output/gdpr_articles_sentences_question_answer_one_shot_output.json"
else:    
    #output_json_filename = "Output/gdpr_recital_question_answer_output.json"
    output_json_filename = "Output/gdpr_articles_question_answer_one_shot_output.json"

with open(output_json_filename, 'w') as json_file:
    json.dump(question_answers, json_file, indent=4)

print(f"obligations question answer  saved to {output_json_filename}")


In [2]:
gdpr_articles_json_filename = 'Output/gdpr_articles_24_43_qa_output.json'

with open(gdpr_articles_json_filename) as json_file:
    article_json_data = json.load(json_file)
print(len(article_json_data))

122


In [398]:
for record in article_json_data:
    print(record)
    print(record['Article'])
    break

{'question': 'What kind of measures should a controller implement to ensure and demonstrate compliance with Article 24?', 'answer': 'Appropriate technical and organisational measures.These measures shall be reviewed where necessary', 'InputText': 'Taking into account the nature, scope, context and purposes of processing as well as the risks of varying likelihood and severity for the rights and freedoms of natural persons, the controller shall implement appropriate technical and organisational measures to ensure and to be able to demonstrate that processing is performed in accordance with this Regulation.  Those measures shall be reviewed and updated where necessary.', 'Article': '24'}
24


In [399]:
print(article_json_data[0]['question'])

What kind of measures should a controller implement to ensure and demonstrate compliance with Article 24?


In [3]:
gdpr_recitals_json_filename = 'Output/gdpr_chapter4_recital_qa_correct_output.json'

with open(gdpr_recitals_json_filename) as json_file:
    recital_json_data = json.load(json_file)
print(len(recital_json_data))

46


In [None]:
for record in recital_json_data:
    print(record['Recital'])

In [4]:
import chromadb

In [5]:
client = chromadb.Client()

if "articles" in [c.name for c in client.list_collections()]:
      
    client.delete_collection(name="articles")

collection = client.create_collection(name="articles")

# store each document in a vector embedding database
for i, article in enumerate(article_json_data):
    input_text = article['InputText']  
    response = ollama.embed(model="all-minilm", input=input_text)
    embeddings = response["embeddings"]
    collection.add(
    ids=[str(i)+":article:" + article['Article']],
    embeddings=embeddings,
    documents=[input_text]
  )


In [6]:
#if client.get_collection(name="recitals"):
if "recitals" in [c.name for c in client.list_collections()]:
    
    client.delete_collection(name="recitals")

recital_collection = client.create_collection(name="recitals")

# store each document in a vector embedding database
for i, recital in enumerate(recital_json_data):
    input_text = recital['InputText']  
    response = ollama.embed(model="all-minilm", input=input_text)
    embeddings = response["embeddings"]
    recital_collection.add(
    ids=[str(i)+":recital:" + recital['Recital']],
    embeddings=embeddings,
    documents=[input_text]
  )


In [55]:
sample_article_text = article_json_data[5]['InputText']
print(sample_article_text)

The arrangement referred to in paragraph 1 shall duly reflect the respective roles and relationships of the joint controllers vis-à-vis the data subjects.  The essence of the arrangement shall be made available to the data subject.


In [7]:
print(collection.count())

122


In [8]:
print(recital_collection.count())

46


In [256]:
# an example input
sample_recital_question = recital_json_data[34]['question']
sample_recital_answer = recital_json_data[34]['answer']
sample_recital_text = recital_json_data[34]['InputText']

print(sample_recital_question)
print(sample_recital_answer)
print(sample_recital_text)



When would a data protection impact assessment be required for monitoring publicly accessible areas?
Especially when using optic-electronic devices or when the competent supervisory authority considers that the processing is likely to result in a high risk to the rights and freedoms of data subjects, in particular because they prevent data subjects from exercising a right or using a service or a contract, or because they are carried out systematically on a large scale.
A data protection impact assessment is equally required for monitoring publicly accessible areas on a large scale, especially when using optic-electronic devices or for any other operations where the competent supervisory authority considers that the processing is likely to result in a high risk to the rights and freedoms of data subjects, in particular because they prevent data subjects from exercising a right or using a service or a contract, or because they are carried out systematically on a large scale.


In [None]:

for recital in recital_json_data:
    recital_text = recital['InputText']
    
    # generate an embedding for the input and retrieve the most relevant doc
    response = ollama.embed(
      model="all-minilm",
      input=recital_text
    )

    results = collection.query(
      query_embeddings=[response["embeddings"][0]],
      n_results=2
    )

    
    
    closest_dist = results['distances'][0][0]
    closest_id = results['ids'][0][0]
    if closest_dist < 0.3:
        print(recital)
        print("%3.3f" %(closest_dist))
        print(closest_id)
        data = results['documents'][0][0]
        print(data)


In [190]:
print(sample_recital_text)

 Where a controller or a processor not established in the Union is processing personal data of data subjects who are in the Union whose processing activities are related to the offering of goods or services, irrespective of whether a payment of the data subject is required, to such data subjects in the Union, or to the monitoring of their behaviour as far as their behaviour takes place within the Union, the controller or the processor should designate a representative, unless the processing is occasional, does not include processing, on a large scale, of special categories of personal data or the processing of personal data relating to criminal convictions and offences, and is unlikely to result in a risk to the rights and freedoms of natural persons, taking into account the nature, context, scope and purposes of the processing or if the controller is a public authority or body. The representative should act on behalf of the controller or the processor and may be addressed by any super

In [386]:
print(recital_json_data[0])

{'InputText': 'Taking into account the nature, scope, context and purposes of processing as well as the risks of varying likelihood and severity for the rights and freedoms of natural persons, the controller shall implement appropriate technical and organisational measures to ensure and to be able to demonstrate that processing is performed in accordance with this Regulation.  Those measures shall be reviewed and updated where necessary.', 'question': 'What kind of measures should a controller implement to ensure and demonstrate compliance with Article 24?', 'answer': 'Appropriate technical and organisational measures.These measures shall be reviewed where necessary', 'expanded_score': '0.3608', 'expanded': ['The controller shall maintain accurate and up-to-date records of processing activities, including information on the categories of personal data processed, the source of such data, and the recipients to whom it was disclosed', 'The controller shall provide adequate training to emp

In [None]:
Experiment: Provide an answer to each recital utilising contextal data based on closest matching articles 

In [116]:
use_recital_question =True

deep_dataset = []
for index,recital in enumerate(recital_json_data):
    print(index)
    recital_text = recital['InputText']
    recital_query= recital_text
    
    if use_recital_question:
        recital_query = recital['question']    
    
    # generate an embedding for the input and retrieve the most relevant doc
    response = ollama.embed(
      model="all-minilm",
      input=recital_query
    )

    results = collection.query(
      query_embeddings=[response["embeddings"][0]],
      n_results=3
    )

    closest_dist = results['distances'][0][0]

    if closest_dist < 100.0:
        #print(index)
   
        print("%3.3f" %(closest_dist))
        #print("closest document id %s" %(closest_id))
       
            
        closest_id = results['ids'][0][0]
        #data = results['documents'][0][0]
        # add all returned documents not just the first
        data = results['documents'][0]
        
        print("recital_text:")
        print(recital_text)
        #print("returned document")
        #print(data)
        
        recital_question = recital['question']
        recital_answer = recital['answer']
        print("recital question:")
        print(recital_question)
    
        print("recital answer:")
        print(recital_answer)
    
        print("response from LLM:")
        #generate a response combining the prompt and data we retrieved in step 2
        output = ollama.generate(
          model="llama3:8b",
          #prompt=f"Using this data: {data}. Respond to this prompt: {recital_question}"
          prompt=f"Using this contextual data: {data}.Provide a concise answer to the given query: {recital_question}"
     
        ) 
    
        print(output['response']) 
    
 
        retrieved_contexts = data

        test_case = LLMTestCase(input=recital_question, expected_output =recital_answer,actual_output=output['response'], retrieval_context= retrieved_contexts)
    
        deep_dataset.append(test_case)
    
        

deepeval_dataset = EvaluationDataset(test_cases=deep_dataset)
   
    

0
0.976
recital_text:
 In order to ensure a consistent level of protection for natural persons throughout the Union and to prevent divergences hampering the free movement of personal data within the internal market, a Regulation is necessary to provide legal certainty and transparency for economic operators, including micro, small and medium-sized enterprises, and to provide natural persons in all Member States with the same level of legally enforceable rights and obligations and responsibilities for controllers and processors, to ensure consistent monitoring of the processing of personal data, and equivalent sanctions in all Member States as well as effective cooperation between the supervisory authorities of different Member States. The proper functioning of the internal market requires that the free movement of personal data within the Union is not restricted or prohibited for reasons connected with the protection of natural persons with regard to the processing of personal data. To

KeyboardInterrupt: 

In [408]:

for index,article in enumerate(article_json_data):
    print(index)
    article_text = article['InputText']
    
    article_question = article['question']
    article_answer = article['answer']

    # generate an embedding for the input and retrieve the most relevant doc
    response = ollama.embed(
      model="all-minilm",
      input=article_question
    )

    results = collection.query(
      query_embeddings=[response["embeddings"][0]],
      n_results=3
    )

    closest_dist = results['distances'][0][0]

    if closest_dist < 10.0:
        #print(index)
   
        print("closest distance in vector store:%3.3f" %(closest_dist))
        #print("closest document id %s" %(closest_id))
       
            
        closest_id = results['ids'][0][0]
        #data = results['documents'][0][0]
        # add all returned documents not just the first
        data = results['documents'][0]
        
        print("article_text:")
        print(article_text)
        #print("returned document")
        #print(data)
        
        print("article question:")
        print(article_question)
    
        print("article answer:")
        print(article_answer)
    
        print("response from LLM:")
        #generate a response combining the prompt and data we retrieved in step 2
        output = ollama.generate(
          model="llama3:8b",
          #prompt=f"Using this data: {data}. Respond to this prompt: {recital_question}"
          prompt=f"Using this contextual data: {data}.Provide a concise answer to the given query: {article_question}"
     
        ) 
    
        print(output['response']) 
    if index == 19:
        break
    

0
closest distance in vector store:0.669
article_text:
Taking into account the nature, scope, context and purposes of processing as well as the risks of varying likelihood and severity for the rights and freedoms of natural persons, the controller shall implement appropriate technical and organisational measures to ensure and to be able to demonstrate that processing is performed in accordance with this Regulation.  Those measures shall be reviewed and updated where necessary.
article question:
What kind of measures should a controller implement to ensure and demonstrate compliance with Article 24?
article answer:
Appropriate technical and organisational measures.These measures shall be reviewed where necessary
response from LLM:
According to the provided contextual data, a controller should implement "appropriate data protection policies" as referred to in paragraph 1. Additionally, adherence to approved codes of conduct (Article 40) or approved certification mechanisms (Article 42) m

In [263]:

# generate an embedding for the input and retrieve the most relevant doc as context to LLM query
response = ollama.embed(
      model="all-minilm",
      input=sample_recital_text
)

results = collection.query(
      query_embeddings=[response["embeddings"][0]],
      n_results=2
)

    
    
closest_dist = results['distances'][0][0]
closest_id = results['ids'][0][0]
if closest_dist < 10.0:
        print(sample_recital_text)
        print("closest distance:%3.3f" %(closest_dist))
        print("closest id:%s" %closest_id)
        #data = results['documents'][0][0]
        # return all documents
        data = results['documents'][0][0]
        
        print("closest documents:")
        print(data)

print("sample question and given answer") 
input =sample_recital_text
#input= sample_recital_question
#print(input)
#print(sample_recital_answer)

print(" response from LLM ")

#generate a response combining the prompt and data we retrieved in step 2
output = ollama.generate(
  model="llama3:8b",
  #prompt=f"Using this data: {data}. Respond to this prompt: {input}"
  #prompt=f"Using this contextual data: {data}.Provide a concise answer to the given query: {input}"
  #prompt=f"Using this contextual data: {data}, provide three additional concise obligations as one sentence only, where the obligation is not present in the original input text: {input}. Do not output any other text except the obligation. Output a maximum of three obligations. Each obligation should be a sentence starting with a capital letter and ending with a period"
  prompt=f"Using this contextual data only: {data}, provide three additional concise obligations as one sentence only, where the obligation is not present in the original input text: {input}. Do not output any other text except the obligation. Output a maximum of three obligations. The obligation should be in the contextual data {data}"
    
     
  
  #prompt=f"Using this contextual data: {data}.Provide additional information not present in the original context: {input}"
) 

print(output['response']) 


A data protection impact assessment is equally required for monitoring publicly accessible areas on a large scale, especially when using optic-electronic devices or for any other operations where the competent supervisory authority considers that the processing is likely to result in a high risk to the rights and freedoms of data subjects, in particular because they prevent data subjects from exercising a right or using a service or a contract, or because they are carried out systematically on a large scale.
closest distance:0.469
closest id:60:article:35
closest documents:
A data protection impact assessment referred to in paragraph 1 shall in particular be required in the case of: (a) a systematic and extensive evaluation of personal aspects relating to natural persons which is based on automated processing, including profiling, and on which decisions are based that produce legal effects concerning the natural person or similarly significantly affect the natural person; (b) processin

expand recitals using articles 

In [None]:

import re

expanded_recitals= [] 

for index,recital in enumerate(recital_json_data):
    print(index)
    recital_text = recital['InputText']
    
    # generate an embedding for the input and retrieve the most relevant doc
    response = ollama.embed(
      model="all-minilm",
      input=recital_text
    )

    results = collection.query(
      query_embeddings=[response["embeddings"][0]],
      n_results=3
    )

    closest_dist = results['distances'][0][0]

   
    print("%3.3f" %(closest_dist))
       
            
    closest_id = results['ids'][0][0]
        #data = results['documents'][0][0]
        # add all returned documents not just the first
    data = results['documents'][0][0]
        
        #print("recital_text:")
    print(recital_text)
        #print("returned document")
    print(data)
        
    
    input= recital_text
    
    print("response from LLM:")
        #generate a response combining the prompt and data we retrieved in step 2
    output = ollama.generate(
          model="llama3:8b",
          prompt=f"Using this contextual data: {data}, provide three additional concise obligations as one sentence only, where the obligation is not present in the original input text: {input}. Do not output any other text except the obligation. Output a maximum of three obligations. Each obligation should be a sentence starting with a capital letter and ending with a period"  
     
        ) 

    
    other_obligations = []
    
    print(output['response']) 
    sentences = re.split(r' *[0-9\:\*\.\?!;] *', output['response'])

    print(sentences)
    
    for sentence in sentences:
        if not "concise obligation" in sentence:
            
            sentence = sentence.strip()
             
            if len(sentence) > 0:
                print(sentence)
                other_obligations.append(sentence)
        
    new_recital= {}
    new_recital['InputText'] = recital['InputText']
    new_recital['question'] = recital['question']
    new_recital['answer'] = recital['answer']
    new_recital['expanded_score'] = closest_dist
    new_recital['expanded'] = other_obligations
    new_recital['Recital'] = recital['Recital']
    expanded_recitals.append(new_recital)

output_json_filename = "Output/gdpr_chapter4_expanded_recitals.json"

with open(output_json_filename, 'w') as json_file:
    json.dump(expanded_recitals, json_file, indent=4)

print(f"expanded recitals saved to {output_json_filename}")

In [255]:
gdpr_expanded_recital_json_filename = 'Output/gdpr_chapter4_expanded_recitals.json'

with open(gdpr_expanded_recital_json_filename) as json_file:
    recital_json_data = json.load(json_file)
print(len(recital_json_data))

46


In [None]:

expanded_articles= [] 

for index,article in enumerate(article_json_data):
    print(index)
    article_text = article['InputText']
    
    # generate an embedding for the input and retrieve the most relevant doc
    response = ollama.embed(
      model="all-minilm",
      input=article_text
    )

    results = recital_collection.query(
      query_embeddings=[response["embeddings"][0]],
      n_results=2
    )

    closest_dist = results['distances'][0][0]

   
    print("%3.3f" %(closest_dist))
       
            
    closest_id = results['ids'][0][0]
        #data = results['documents'][0][0]
        # add all returned documents not just the first
    data = results['documents'][0]
        
        #print("recital_text:")
    print(article_text)
        #print("returned document")
    print(data)
        
    
    input= article_text
    
    print("response from LLM:")
        #generate a response combining the prompt and data we retrieved in step 2
    output = ollama.generate(
          model="llama3:8b",
          #prompt=f"Using this contextual data: {data}, provide three additional concise obligations as one sentence only, where the obligation is not present in the original input text: {input}. Do not output any other text except the obligation. Output a maximum of three obligations. Each obligation should be a single sentence and should be contained in the contextual data {data}"  
          prompt=f"Using this contextual data: {data}, provide three additional concise obligations as one sentence only, where the obligation is not present in the original input text: {input}. Do not output any other text except the obligation. Output a maximum of three obligations. Each obligation should be a single sentence.  Each obligation should be contained explicitly  in the contextual data"  
     
        ) 
    
    print(output['response'])

    
    other_obligations = []
    
    print(output['response']) 
    sentences = re.split(r' *[0-9\:\*\.\?!;] *', output['response'])

    print(sentences)
    
    for sentence in sentences:
        if not "concise obligation" in sentence:
            
            sentence = sentence.strip()
             
            if len(sentence) > 0:
                print(sentence)
                other_obligations.append(sentence)
    new_article= {}
    new_article['InputText'] = article['InputText']
    new_article['question'] = article['question']
    new_article['answer'] = article['answer']
    new_article['expanded_score'] = f"%3.4f" %closest_dist
    new_article['expanded'] = other_obligations
    new_article['Article'] = article['Article']
    expanded_articles.append(new_article)

output_json_filename = "Output/gdpr_24_43_expanded_articles2.json"

with open(output_json_filename, 'w') as json_file:
    json.dump(expanded_articles, json_file, indent=4)

print(f"expanded recitals saved to {output_json_filename}")

In [269]:
gdpr_expanded_recital_json_filename = 'Output/gdpr_24_43_expanded_articles2.json'

with open(gdpr_expanded_recital_json_filename) as json_file:
    recital_json_data = json.load(json_file)
print(len(recital_json_data))

122


use Langchain

In [11]:
from langchain_ollama import OllamaEmbeddings
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.vectorstores import InMemoryVectorStore

from langchain_core.documents import Document
from langchain_ollama import OllamaLLM

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
       

In [12]:
embeddings = OllamaEmbeddings(
model='all-minilm',
)


In [13]:

vector_store = InMemoryVectorStore(embeddings)



In [14]:

documents= []
for index, article in enumerate(article_json_data):
   
    content = article["InputText"]
    identifier =  str(index+1)+":article:" + article['Article']
    
    document_1 = Document(id=identifier, page_content=content,metadata={"id": identifier})
    print(document_1.metadata)
    
    documents.append(document_1)
    
vector_store.add_documents(documents)


print(len(vector_store.store.items()))    


{'id': '1:article:24'}
{'id': '2:article:24'}
{'id': '3:article:25'}
{'id': '4:article:25'}
{'id': '5:article:25'}
{'id': '6:article:26'}
{'id': '7:article:26'}
{'id': '8:article:27'}
{'id': '9:article:27'}
{'id': '10:article:28'}
{'id': '11:article:28'}
{'id': '12:article:28'}
{'id': '13:article:28'}
{'id': '14:article:28'}
{'id': '15:article:28'}
{'id': '16:article:28'}
{'id': '17:article:28'}
{'id': '18:article:28'}
{'id': '19:article:28'}
{'id': '20:article:28'}
{'id': '21:article:28'}
{'id': '22:article:28'}
{'id': '23:article:28'}
{'id': '24:article:28'}
{'id': '25:article:28'}
{'id': '26:article:29'}
{'id': '27:article:30'}
{'id': '28:article:30'}
{'id': '29:article:30'}
{'id': '30:article:30'}
{'id': '31:article:30'}
{'id': '32:article:30'}
{'id': '33:article:30'}
{'id': '34:article:30'}
{'id': '35:article:30'}
{'id': '36:article:30'}
{'id': '37:article:30'}
{'id': '38:article:30'}
{'id': '39:article:30'}
{'id': '40:article:30'}
{'id': '41:article:31'}
{'id': '42:article:32'}
{

In [134]:
for index, (id, doc) in enumerate(vector_store.store.items()):
    print(id)
    print(doc['id'])
    
    print(doc['metadata'])
    break

1:article:24
1:article:24
{'id': '1:article:24'}


In [21]:
sample_recital_question = "What types of harm can result from personal data processing?"
 
input = sample_recital_question
# Use the vectorstore as a retriever
retriever = vector_store.as_retriever()

# Retrieve the most similar text
retrieved_documents = retriever.invoke(input)
print(len(retrieved_documents))
# Show the retrieved document's content

for doc in retrieved_documents:
    print(doc.id)
    print(doc.page_content)
    print(f'{doc.metadata}')
    
    
    

4
58:article:35
Where a type of processing in particular using new technologies, and taking into account the nature, scope, context and purposes of the processing, is likely to result in a high risk to the rights and freedoms of natural persons, the controller shall, prior to the processing, carry out an assessment of the impact of the envisaged processing operations on the protection of personal data.
{'id': '58:article:35'}
55:article:34
The communication to the data subject referred to in paragraph 1 of this Article shall describe in clear and plain language the nature of the personal data breach and contain at least the information and measures referred to in points (b) (c) and (d) of Article 33 (3)
{'id': '55:article:34'}
61:article:35
A data protection impact assessment referred to in paragraph 1 shall in particular be required in the case of: (a) a systematic and extensive evaluation of personal aspects relating to natural persons which is based on automated processing, includin

In [167]:
results = vector_store.similarity_search_with_score(
    query=input, k=4
)
for doc, score in results:
    print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")
    print(doc.metadata)


* [SIM=0.886298] Representatives of controllers or processors not established in the Union 1.  Where Article 3 (2) applies, the controller or the processor shall designate in writing a representative in the Union.  The obligation laid down in paragraph 1 of this Article shall not apply to: (a) processing which is occasional, does not include, on a large scale, processing of special categories of data as referred to in Article 9 (1) or processing of personal data relating to criminal convictions and offences referred to in Article 10, and is unlikely to result in a risk to the rights and freedoms of natural persons, taking into account the nature, context, scope and purposes of the processing; or (b) a public authority or body. [{'id': '8:article:27'}]
{'id': '8:article:27'}
* [SIM=0.786815] The representative shall be established in one of the Member States where the data subjects, whose personal data are processed in relation to the offering of goods or services to them, or whose beha

In [184]:
# need to specify search_kwargs or do not get a doc id or meta data (?)
retriever = vector_store.as_retriever(
    #search_type="mmr",
    #search_kwargs={"k": 2, "fetch_k": 2},
    search_kwargs={"k": 4},
)
retrieved_documents= retriever.invoke(input)

for doc in retrieved_documents:
    print(doc.id)
    print(doc.page_content)
    print(f'{doc.metadata}')


1:article:24
Taking into account the nature, scope, context and purposes of processing as well as the risks of varying likelihood and severity for the rights and freedoms of natural persons, the controller shall implement appropriate technical and organisational measures to ensure and to be able to demonstrate that processing is performed in accordance with this Regulation.  Those measures shall be reviewed and updated where necessary.
{'id': '1:article:24'}
45:article:32
Taking into account the state of the art, the costs of implementation and the nature, scope, context and purposes of processing as well as the risk of varying likelihood and severity for the rights and freedoms of natural persons, the controller and the processor shall implement appropriate technical and organisational measures to ensure a level of security appropriate to the risk, including inter alia as appropriate: a process for regularly testing, assessing and evaluating the effectiveness of technical and organisa

In [186]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain
# the following adds a score to the meta data by creating a retrieval function that is a wrapper to similariry search
# to allow for a similarity score to be recorded in the metadata. However was unable to get this to work  
# in the langchain

@chain
def wrapped_retriever(query: str) -> List[Document]:
    docs, scores = zip(*vector_store.similarity_search_with_score(query,k=4))
    for doc, score in zip(docs, scores):
        #print(doc.metadata)
        doc.metadata["score"] = score

    return docs

In [178]:
print(article_json_data[0])


{'question': 'What kind of measures should a controller implement to ensure and demonstrate compliance with Article 24?', 'answer': 'Appropriate technical and organisational measures.These measures shall be reviewed where necessary', 'InputText': 'Taking into account the nature, scope, context and purposes of processing as well as the risks of varying likelihood and severity for the rights and freedoms of natural persons, the controller shall implement appropriate technical and organisational measures to ensure and to be able to demonstrate that processing is performed in accordance with this Regulation.  Those measures shall be reviewed and updated where necessary.', 'Article': '24'}


In [187]:
input = article_json_data[0]['InputText']


retrieved_documents= wrapped_retriever.invoke(input)

for doc in retrieved_documents:
    print(doc.id)
    print(doc.page_content)
    print(f'{doc.metadata}')


1:article:24
Taking into account the nature, scope, context and purposes of processing as well as the risks of varying likelihood and severity for the rights and freedoms of natural persons, the controller shall implement appropriate technical and organisational measures to ensure and to be able to demonstrate that processing is performed in accordance with this Regulation.  Those measures shall be reviewed and updated where necessary.
{'id': '1:article:24', 'score': 1.0}
45:article:32
Taking into account the state of the art, the costs of implementation and the nature, scope, context and purposes of processing as well as the risk of varying likelihood and severity for the rights and freedoms of natural persons, the controller and the processor shall implement appropriate technical and organisational measures to ensure a level of security appropriate to the risk, including inter alia as appropriate: a process for regularly testing, assessing and evaluating the effectiveness of technica

In [25]:
sample_recital_question="What types of harms can result from personal data processing?"
system_prompt = (
    "Use only the given context to answer the question. "
    "Context: {context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
llm = OllamaLLM(model="llama3:8b")


question_answer_chain = create_stuff_documents_chain(llm, prompt)
# not able to use this wrapped retreiver in a chain 
#chain = create_retrieval_chain(wrapped_retriever, question_answer_chain)
chain = create_retrieval_chain(retriever, question_answer_chain)

print("sample query:")
print(sample_recital_question)

result = chain.invoke({"input": sample_recital_question})

context_docs = result['context']
for context_doc in context_docs:
    print(context_doc.id)
    print(context_doc.page_content)
    print(context_doc.metadata)
    
print(result['answer'])


sample query:
What types of harms can result from personal data processing?
58:article:35
Where a type of processing in particular using new technologies, and taking into account the nature, scope, context and purposes of the processing, is likely to result in a high risk to the rights and freedoms of natural persons, the controller shall, prior to the processing, carry out an assessment of the impact of the envisaged processing operations on the protection of personal data.
{'id': '58:article:35'}
55:article:34
The communication to the data subject referred to in paragraph 1 of this Article shall describe in clear and plain language the nature of the personal data breach and contain at least the information and measures referred to in points (b) (c) and (d) of Article 33 (3)
{'id': '55:article:34'}
61:article:35
A data protection impact assessment referred to in paragraph 1 shall in particular be required in the case of: (a) a systematic and extensive evaluation of personal aspects re

In [23]:
print(result)

{'input': 'What types of harm can result from personal data processing?', 'context': [Document(id='58:article:35', metadata={'id': '58:article:35'}, page_content='Where a type of processing in particular using new technologies, and taking into account the nature, scope, context and purposes of the processing, is likely to result in a high risk to the rights and freedoms of natural persons, the controller shall, prior to the processing, carry out an assessment of the impact of the envisaged processing operations on the protection of personal data.'), Document(id='55:article:34', metadata={'id': '55:article:34'}, page_content='The communication to the data subject referred to in paragraph 1 of this Article shall describe in clear and plain language the nature of the personal data breach and contain at least the information and measures referred to in points (b) (c) and (d) of Article 33 (3)'), Document(id='61:article:35', metadata={'id': '61:article:35'}, page_content='A data protection 

In [None]:

for index,recital in enumerate(recital_json_data):
    print(index)
    recital_question = recital['question']
    recital_answer= recital['answer']
    print("query:")
    print(recital_question)
    print(recital_answer)

    result = chain.invoke({"input": recital_question})

    print("response")
    
    context_docs = result['context']
    for context_doc in context_docs:
        #print(context_doc.page_content)
        print(context_doc.metadata)
    
    
    print(result['answer'])
 

Langchain LCEL implementation

In [227]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


system_prompt = (
    "Use only the given context to answer the following question. "
    "Context: {context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

prompt_text = """"
{context}

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""


prompt = PromptTemplate.from_template(prompt_text)



llm = OllamaLLM(model="llama3:8b")



qa_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

print("sample query:")
print(sample_recital_question)

#result = qa_chain.invoke({"input":sample_recital_question})
result = qa_chain.invoke(sample_recital_question)
    
print(result)

sample query:
What kind of measures should a controller establish to demonstrate compliance with this Regulation?
According to the provided context, a controller should establish technical and organisational measures that are proportionate to processing activities, including implementing appropriate data protection policies. Additionally, adherence to approved codes of conduct or certification mechanisms can be used as an element to demonstrate compliance with the regulation's obligations. These measures should be reviewed and updated as necessary to ensure and demonstrate compliance.


use knowledge graph to create a prompt

In [21]:
def generate_answer_based_on_kg_enhance_prompt(kg_row, article,use_question=True):
    
    article_text = article['InputText']
    
    article_question = article['question']
    article_answer = article['answer']

    query_text = article_text
    if use_question:
        query_text =article_question
    # generate an embedding for the input and retrieve the most relevant doc
    # if we use just the question the result will not be so close
    response = ollama.embed(
      model="all-minilm",
      input=query_text
      #input=article_text  
    )

    results = collection.query(
      query_embeddings=[response["embeddings"][0]],
      n_results=3
    )

    #print(results)
    
    closest_dist = results['distances'][0][0]

    if closest_dist < 10.0:
        #print(index)
   
        print("closest distance in vector store:%3.3f" %(closest_dist))
        #print("closest document id %s" %(closest_id))
       
            
        closest_id = results['ids'][0][0]
        #data = results['documents'][0][0]
        # add all returned documents not just the first
        data = results['documents'][0]
        
        print("article_text:")
        print(article_text)
        #print("returned document")
        #print(data)
        
        print("article question:")
        print(article_question)
    
        print("article answer:")
        print(article_answer)
    
        #Using this contextual data: {data} and the following reference knowledge graph, provide a concise answer to the given query: {article_question}
        #Using the following reference knowledge graph, provide a concise answer to the given query: {article_question}
        
        promptText = f"""
        Using this contextual data: {data} and the following reference knowledge graph, provide a concise answer to the given query: {article_question}
        
        Reference Knowledge Graph:
        Type: {kg_row['Type']}
        Actor: {kg_row['Actor']}
        Action: {kg_row['Action']}
        Reason: {kg_row['Reason']}
        Condition: {kg_row['Condition']}
        Modality: {kg_row['Modality']}
        Exception: {kg_row['Exception']}
        Violation: {kg_row['Violation']}
        Artifact: {kg_row['Artifact']}
        Presence: {kg_row['Presence']}
         """
        #generate a response combining the prompt and data we retrieved in step 2
        output = ollama.generate(
          model="llama3:8b",
          prompt=promptText
        )
        #print(promptText)

        print("response from LLM:")

        print(output['response'])

        return data,output['response']
        

        


In [60]:
def generate_answer_based_on_context(article,use_question=True):
    
    article_text = article['InputText']
    
    article_question = article['question']
    article_answer = article['answer']

    query_text = article_text
    if use_question:
        query_text =article_question
    # generate an embedding for the input and retrieve the most relevant doc
    # if we use just the question the result will not be so close
    response = ollama.embed(
      model="all-minilm",
      input=query_text
      #input=article_text  
    )

    results = collection.query(
      query_embeddings=[response["embeddings"][0]],
      n_results=3
    )

    #print(results)
    
    closest_dist = results['distances'][0][0]

    if closest_dist < 10.0:
        #print(index)
   
        print("closest distance in vector store:%3.3f" %(closest_dist))
        #print("closest document id %s" %(closest_id))
       
            
        closest_id = results['ids'][0][0]
        #data = results['documents'][0][0]
        # add all returned documents not just the first
        data = results['documents'][0]
        
        print("article_text:")
        print(article_text)
        #print("returned document")
        #print(data)
        
        print("article question:")
        print(article_question)
    
        print("article answer:")
        print(article_answer)
    
        #Using this contextual data: {data} and the following reference knowledge graph, provide a concise answer to the given query: {article_question}
        #Using the following reference knowledge graph, provide a concise answer to the given query: {article_question}
        
        promptText = f"""
        Using this contextual data: {data}  provide a concise answer to the given query: {article_question}
         """
        #generate a response combining the prompt and data we retrieved in step 2
        output = ollama.generate(
          model="llama3:8b",
          prompt=promptText
        )
        #print(promptText)

        print("response from LLM:")

        print(output['response'])

        return data,output['response']
       
        
        


In [93]:
def generate_answer_based_on_related_kg_enhance_prompt(kg_rows, article):
    
    article_text = article['InputText']
    
    article_question = article['question']
    article_answer = article['answer']

    # generate an embedding for the input and retrieve the most relevant doc
    response = ollama.embed(
      model="all-minilm",
      input=article_question
    )

    results = collection.query(
      query_embeddings=[response["embeddings"][0]],
      n_results=3
    )

    
    closest_dist = results['distances'][0][0]

    print("closest distance in vector store:%3.3f" %(closest_dist))
        #print("closest document id %s" %(closest_id))
       
            
    closest_id = results['ids'][0][0]
    print("closest article in vector store:%s" %closest_id)
        #data = results['documents'][0][0]
        # add all returned documents not just the first
    data = results['documents'][0]
        
    print("article_text:")
    print(article_text)
    # find the article index of the closest embedding and use corresponding knowedge graph
    article_index = int(closest_id.split(":")[0])

    #print(article_index)
    # use kg associated with closest matchin article
    kg_row = kg_rows[article_index]
        #print("returned document")
        #print(data)
        
    print("article question:")
    print(article_question)
    
    print("article answer:")
    print(article_answer)
    
    
#Using this contextual data: {data} and the following reference knowledge graph, provide a concise answer to the given query: {article_question}
        
    promptText = f"""
        Using this contextual data: {data} and the following reference knowledge graph, provide a concise answer to the given query {article_question}
        
        Reference Knowledge Graph:
        Type: {kg_row['Type']}
        Actor: {kg_row['Actor']}
        Action: {kg_row['Action']}
        Reason: {kg_row['Reason']}
        Condition: {kg_row['Condition']}
        Modality: {kg_row['Modality']}
        Exception: {kg_row['Exception']}
        Violation: {kg_row['Violation']}
        Artifact: {kg_row['Artifact']}
        Presence: {kg_row['Presence']}
         """
    #print(promptText)

    print("response from LLM:")
    
    #generate a response combining the prompt and data we retrieved in step 2
    output = ollama.generate(
          model="llama3:8b",
          prompt=promptText
        )
        
    print(output['response'])
        
    return data, output['response']    

        


In [None]:

kg_data = pd.read_excel("Output/gdpr_articles_24_43_kg_output_corrected.xlsx")
# one more entry in the following txt as noticed as indvidual sentence at the end of recital 86 with no qa but seems an obligation
#data = pd.read_excel("Output/gdpr_chapter4_recital_kg_output_corrected.xlsx")

statements_all = kg_data["InputText"].tolist()
print(len(statements_all))



    # Iterate over each row and generate the improved statement
for index,(kg_row,article_data) in enumerate(zip(kg_data.iterrows(),article_json_data)):

        print(index)
        #print(article_data)
        kg_row = kg_row[1]
        #print(kg_row)
        generate_answer_based_on_kg_enhance_prompt(kg_row,article_data)

        if index==19:
            break

In [None]:

kg_data = pd.read_excel("Output/gdpr_articles_24_43_kg_output_corrected.xlsx")
# one more entry in the following txt as noticed as indvidual sentence at the end of recital 86 with no qa but seems an obligation
#data = pd.read_excel("Output/gdpr_chapter4_recital_kg_output_corrected.xlsx")

statements_all = kg_data["InputText"].tolist()
print(len(statements_all))

kg_list = [] 
for kg_row in kg_data.iterrows():
    kg_row = kg_row[1]
    kg_list.append(kg_row)
  
for index,article_data in enumerate(article_json_data):

        print(index)
        generate_answer_based_on_related_kg_enhance_prompt(kg_list,article_data)

        if index==19:
            break

In [None]:

kg_data = pd.read_excel("Output/gdpr_chapter4_recital_kg_output_corrected.xlsx")
# one more entry in the following txt as noticed as indvidual sentence at the end of recital 86 with no qa but seems an obligation
#data = pd.read_excel("Output/gdpr_chapter4_recital_kg_output_corrected.xlsx")

statements_all = kg_data["InputText"].tolist()
print(len(statements_all))


kg_list = [] 
for kg_row in kg_data.iterrows():
    kg_row = kg_row[1]
    kg_list.append(kg_row)
  

deep_dataset= []
    # Iterate over each row and generate the improved statement
for index,recital_data in enumerate(recital_json_data):


        recital_query= recital_data['question']
    
        recital_question= recital_data['question']
        recital_answer= recital_data['answer']
    
        print(index)
       
        # generate an embedding for the input and retrieve the most relevant recital entry
        response = ollama.embed(
          model="all-minilm",
          input=recital_query
        )

        # find the recital that matches the recital question . This may not be the recital that was the same one

        results = recital_collection.query(
          query_embeddings=[response["embeddings"][0]],
          n_results=2
        )

        closest_dist = results['distances'][0][0]

   
        #print("%3.3f" %(closest_dist))

        closest_id = results['ids'][0][0]
        #print("closest recital in vector store:%s" %closest_id)
        #data = results['documents'][0][0]
        
        recital_index = int(closest_id.split(":")[0])

        #print(recital_index)
        kg_row = kg_list[recital_index]


        retrieved_contexts,response = generate_answer_based_on_kg_enhance_prompt(kg_row,recital_data)


    
        retrieved_contexts = data

        test_case = LLMTestCase(input=recital_question, expected_output =recital_answer,actual_output=response, retrieval_context= retrieved_contexts)
    
        deep_dataset.append(test_case)
 
        

  
deepeval_dataset = EvaluationDataset(test_cases=deep_dataset)
         
        

In [None]:
knowledge graph document store based on knowedge graph embeddings

In [29]:
import json
client = chromadb.Client()


#kg_data = pd.read_excel("Output/gdpr_chapter4_recital_kg_output_corrected.xlsx")
# one more entry in the following txt as noticed as indvidual sentence at the end of recital 86 with no qa but seems an obligation
#kg_data = pd.read_excel("Output/gdpr_chapter4_recital_kg_output_corrected.xlsx")

kg_data = pd.read_excel("Output/gdpr_articles_24_43_kg_output_corrected.xlsx")

statements_all = kg_data["InputText"].tolist()
print(len(statements_all))


kg_list = [] 
for kg_row in kg_data.iterrows():
    kg_row = kg_row[1]
    kg_list.append(kg_row)
    
if "kg_articles" in [c.name for c in client.list_collections()]:
      
    client.delete_collection(name="kg_articles")

kg_article_collection = client.create_collection(name="kg_articles")

# store each kg as a vector embedding database
for i, article in enumerate(article_json_data):
    
    input_text = kg_list[i]

    doc_dict = input_text.to_dict()
    #print(doc_dict)
    input_text=json.dumps(doc_dict)
    
    #print(input_text)
    response = ollama.embed(model="all-minilm", input=input_text)
    embeddings = response["embeddings"]
    kg_article_collection.add(
    ids=[str(i)+":article:" + article['Article']],
    embeddings=embeddings,
    documents=[input_text]   
  )

print(kg_article_collection.count())

122
122


In [111]:
'''
recital_list= []    
recital_data = pd.read_excel("Output/gdpr_chapter4_recital_kg_output_corrected.xlsx")

for recital_row in recital_data.iterrows():
    recital_row = recital_row[1]
    recital_list.append(recital_row)
    
recital_row =kg_list[0]
recital_row= recital_list[4] 
#print(recital_row)
'''

distance_below_threshold =0
deep_dataset = []
for index in range(len(recital_json_data)):
    print(index)
       
    recital_query = recital_json_data[index]['question']
    
    recital_question = recital_json_data[index]['question']
    
    recital_answer = recital_json_data[index]['answer']
    
    query = {}
    #query['InputText'] = recital_row['question']
    query['InputText'] = recital_query
    query['Type'] =''  
    query['Action'] =''
    
    query['Condition'] =''                                                     
    query['Modality'] =''                        
    query['Actor'] =''                                                         
    query['Reason'] =''        
    query['Artifact'] =''      
    query['Violation'] =''                                                
    query['Exception'] =''                                                
    query['Presence'] =''                                                 
    query['Time'] =''                                                     
    query['Sanction'] =''                                                 
    query['Situation'] =''
    query_data = json.dumps(query)
    recital_query= query_data
    #recital_query = json.dumps(recital_row.to_dict())
    #print(recital_query)
    response = ollama.embed(
              model="all-minilm",
              input=recital_query
            )
    
    results = kg_article_collection.query(
              query_embeddings=[response["embeddings"][0]],
              n_results=1
            )
    distance = float(results['distances'][0][0])
    print("kg distance:%f" %distance)

    if distance < 1.0:
    
    #if distance < 0.5:
       
        article_kg = results['documents'][0][0]
        article_kg=json.loads(article_kg)
        print("article kg")
        print(article_kg)
        distance_below_threshold+=1   
        
        data,response= generate_answer_based_on_kg_enhance_prompt(article_kg,recital_json_data[index])
    else:
        data,response = generate_answer_based_on_context(recital_json_data[index])
        
    
    retrieved_contexts = data

    test_case = LLMTestCase(input=recital_question, expected_output =recital_answer,actual_output=response, retrieval_context= retrieved_contexts)
    
    deep_dataset.append(test_case)
 
        
        
print("number using knowledge graph %d" %(distance_below_threshold))

  
deepeval_dataset = EvaluationDataset(test_cases=deep_dataset)

        
    


0
kg distance:0.682028
article kg
{'InputText': 'The data protection officer shall be bound by secrecy or confidentiality concerning the performance of his or her tasks, in accordance with Union or Member State law. ', 'Type': 'Obligation', 'Action': 'be bound by secrecy or confidentiality concerning the performance of his or her tasks', 'Condition': ' ', 'Modality': 'shall', 'Actor': 'data protection officer', 'Reason': nan, 'Artifact': '[Union or Member State law]', 'Violation': nan, 'Exception': nan, 'Presence': nan, 'Time': nan, 'Sanction': nan, 'Situation': nan}
closest distance in vector store:0.976
article_text:
 In order to ensure a consistent level of protection for natural persons throughout the Union and to prevent divergences hampering the free movement of personal data within the internal market, a Regulation is necessary to provide legal certainty and transparency for economic operators, including micro, small and medium-sized enterprises, and to provide natural persons i

In [90]:
def get_closest_match_kg(recital_question):
    
    fields = ['InputText', 'Action','Reason', 'all']
    article_kgs = []
    distances = []
    article_kg_set = set()
    print(recital_question)
    for field in fields:
        print(field)
        query = {}
        #query['InputText'] = recital_row['question']
        query['InputText'] = ''
        query['Type'] =''  
        query['Action'] =''
        
        query['Condition'] =''                                                     
        query['Modality'] =''                        
        query['Actor'] =''                                                         
        query['Reason'] =''        
        query['Artifact'] =''      
        query['Violation'] =''                                                
        query['Exception'] =''                                                
        query['Presence'] =''                                                 
        query['Time'] =''                                                     
        query['Sanction'] =''                                                 
        query['Situation'] =''

        if field == all:
            query[field[0]] = recital_question
            query[field[1]] = recital_question
            query[field[2]] = recital_question

        else:
            query[field] = recital_question
        
        query_data = json.dumps(query)
        recital_query= query_data
        #recital_query = json.dumps(recital_row.to_dict())
        #print(recital_query)
        response = ollama.embed(
                  model="all-minilm",
                  input=recital_query
                )
        
        results = kg_article_collection.query(
                  query_embeddings=[response["embeddings"][0]],
                  n_results=1
                )
        distance = float(results['distances'][0][0])
        print("kg distance:%f" %distance)
        distances.append(distance)
        article_kg = results['documents'][0][0]
        # need to add this as a string
        article_kg_set.add(article_kg)
       
        article_kg=json.loads(article_kg)
        article_kgs.append(article_kg)
        #print("article kg")
        #print(article_kg)
    low_dist = np.min(distances)
    lowest  = np.argmin(distances)
    print("lOWEST DISTANCE field:%s" %(fields[lowest]))
    print(" number unique kgs:%d" %len(article_kg_set))
    '''
    for index,article in enumerate(article_kg_set):
        article=json.loads(article)
            
        print(article)
        if index == 0:
            composite=article
        else:
            composite['InputText'] = composite['InputText'] +" " +article['InputText']
            composite['Action'] = composite['Action'] +" " +article['Action']
          
    print("composite")
    print(composite)
    '''
           
    return low_dist,article_kgs[lowest]
        
       
        

In [None]:
for index in range(len(recital_json_data)):
    print(index)
    recital_query = recital_json_data[index]['question']
    distance,article_kg= get_closest_match_kg(recital_query)

   

In [106]:

deep_dataset = []

distance_below_threshold = 0
for index in range(len(recital_json_data)):
    print(index)
    recital_query = recital_json_data[index]['question']
    distance,article_kg= get_closest_match_kg(recital_query)

    
    recital_query = recital_json_data[index]['question']
    
    recital_question = recital_json_data[index]['question']
    
    recital_answer = recital_json_data[index]['answer']
    print("kg distance:%f" %distance)

    # use 0.5 as threshold for evaluation
    if distance < 1.0:
       
        print("article kg")
        print(article_kg)
        distance_below_threshold += 1
       
        data,response= generate_answer_based_on_kg_enhance_prompt(article_kg,recital_json_data[index])
    else:
        data,response = generate_answer_based_on_context(recital_json_data[index])
        
    
    retrieved_contexts = data

    test_case = LLMTestCase(input=recital_question, expected_output =recital_answer,actual_output=response, retrieval_context= retrieved_contexts)
    
    deep_dataset.append(test_case)
    
        
print("number using knowledge graph %d" %(distance_below_threshold))
  
deepeval_dataset = EvaluationDataset(test_cases=deep_dataset)

    

0
What is the primary objective of introducing a Regulation to ensure consistent protection for natural persons throughout the Union?
InputText
kg distance:0.682028
Action
kg distance:0.731787
Reason
kg distance:0.716642
all
kg distance:0.772929
lOWEST DISTANCE field:InputText
 number unique kgs:1
{'InputText': 'The data protection officer shall be bound by secrecy or confidentiality concerning the performance of his or her tasks, in accordance with Union or Member State law. ', 'Type': 'Obligation', 'Action': 'be bound by secrecy or confidentiality concerning the performance of his or her tasks', 'Condition': ' ', 'Modality': 'shall', 'Actor': 'data protection officer', 'Reason': nan, 'Artifact': '[Union or Member State law]', 'Violation': nan, 'Exception': nan, 'Presence': nan, 'Time': nan, 'Sanction': nan, 'Situation': nan}
composite
{'InputText': 'The data protection officer shall be bound by secrecy or confidentiality concerning the performance of his or her tasks, in accordance w

In [None]:
Experiment find article closest matching recital query and utilise the kg associated with that given article to enhance query 

In [101]:
kg_data = pd.read_excel("Output/gdpr_articles_24_43_kg_output_corrected.xlsx")
# one more entry in the following txt as noticed as indvidual sentence at the end of recital 86 with no qa but seems an obligation
#data = pd.read_excel("Output/gdpr_chapter4_recital_kg_output_corrected.xlsx")

statements_all = kg_data["InputText"].tolist()
print(len(statements_all))

kg_list = [] 
for kg_row in kg_data.iterrows():
    kg_row = kg_row[1]
    kg_list.append(kg_row)


deep_dataset= []
    # Iterate over each row and generate the improved statement
for index,recital_data in enumerate(recital_json_data):


        recital_query= recital_data['question']
    
        recital_question= recital_data['question']
        recital_answer= recital_data['answer']
    
        print(index)
       
        retrieved_contexts,response = generate_answer_based_on_related_kg_enhance_prompt(kg_list,recital_data)

    
        retrieved_contexts = data

        test_case = LLMTestCase(input=recital_question, expected_output =recital_answer,actual_output=response, retrieval_context= retrieved_contexts)
    
        deep_dataset.append(test_case)
 
        

  
deepeval_dataset = EvaluationDataset(test_cases=deep_dataset)

122
0
closest distance in vector store:0.976
closest article in vector store:0:article:24
article_text:
 In order to ensure a consistent level of protection for natural persons throughout the Union and to prevent divergences hampering the free movement of personal data within the internal market, a Regulation is necessary to provide legal certainty and transparency for economic operators, including micro, small and medium-sized enterprises, and to provide natural persons in all Member States with the same level of legally enforceable rights and obligations and responsibilities for controllers and processors, to ensure consistent monitoring of the processing of personal data, and equivalent sanctions in all Member States as well as effective cooperation between the supervisory authorities of different Member States. The proper functioning of the internal market requires that the free movement of personal data within the Union is not restricted or prohibited for reasons connected with th

In [505]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch.nn as nn
import torch

query1 = "To enhance transparency and compliance with this Regulation"
query2 = " produce transparency and compliance with the regulation"

response = ollama.embed(
          model="all-minilm",
          input=query1
        )
embed1 = response['embeddings'][0]
embed1= np.array(embed1)


response = ollama.embed(
          model="all-minilm",
          input=query2
        )
embed2 = response['embeddings'][0]
embed2= np.array(embed2)

embed1 = embed1.reshape(1, -1)
print(embed1.shape)
embed2 = embed2.reshape(1, -1)

print( cosine_similarity(embed1,embed2)[0][0])

cos = nn.CosineSimilarity(dim=1, eps=1e-6)

embed1 = torch.from_numpy(embed1)

embed2 = torch.from_numpy(embed2)

print(cos(embed1,embed2).item())


(1, 384)
0.9084467754306719
0.9084467754306722


In [112]:
from deepeval.metrics import (
  ContextualRelevancyMetric,
  ContextualRecallMetric,
  ContextualPrecisionMetric,
  AnswerRelevancyMetric,
  FaithfulnessMetric
)

from deepeval.metrics import GEval

metric_with_steps = False
# if steps are provided is this any better?
if metric_with_steps:

    correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK"
    ],
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    threshold=0.5

)
else:
    correctness_metric = GEval(
        name="Correctness",
        criteria="Determine if the 'actual output' is correct based on the 'expected output'.",
        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
        threshold=0.5
    )


from deepeval import evaluate

contextual_precision = ContextualPrecisionMetric()
contextual_recall = ContextualRecallMetric()
contextual_relevancy = ContextualRelevancyMetric()
answer_relevancy = AnswerRelevancyMetric(threshold=0.8)
faithfulness = FaithfulnessMetric()

#evaluate(dataset, metrics=[contextual_precision, contextual_recall, contextual_relevancy, answer_relevancy, faithfulness])

evaluation_output = evaluate(deepeval_dataset, metrics=[correctness_metric])
#evaluation_output = evaluate(deepeval_dataset, metrics=[correctness_metric,answer_relevancy])


Output()



Metrics Summary

  - ✅ Correctness [GEval] (score: 0.8, threshold: 0.5, strict: False, evaluation model: llama3:8b (Ollama), reason: The actual output provides a clear explanation of the main objectives for ensuring free movement of personal data in the EU, aligning with the expected output's focus on protecting individuals' rights. However, it does not directly address the restriction or prohibition of personal data processing, which is a key aspect of the expected output., error: None)

For test case:

  - input: What are the main objectives of ensuring the free movement of personal data in the EU?
  - actual output: Based on the given contextual data, the main objectives of ensuring the free movement of personal data in the EU are:

1. To process only necessary personal data for each specific purpose.
2. To limit access to personal data without individual's intervention.
3. To implement technical and organisational measures to ensure the security and integrity of personal data.

T

In [113]:
test_results = evaluation_output.test_results

In [114]:
sorted_test_results = sorted(test_results, key=lambda x: int(x.name.replace("test_case_","")))

In [115]:
print("Answer correctness")
number_success=0
for test in sorted_test_results:
    print(test.name)
    print(test.input)
    print(test.actual_output)
    print(test.expected_output)
    print("%s" %test.metrics_data[0].success)
    if (test.metrics_data[0].success==True):
        
        number_success+=1
    print(test.metrics_data[0].score)
    print(test.metrics_data[0].reason)
    print(" ")
    

print("number Correctness successful:%d" %number_success)



Answer correctness
test_case_0
What is the primary objective of introducing a Regulation to ensure consistent protection for natural persons throughout the Union?
Based on the provided contextual data, the primary objective of introducing a Regulation to ensure consistent protection for natural persons throughout the Union is to implement appropriate technical and organisational measures to ensure that processing is performed in accordance with the Regulation. This is stated in the first sentence: "The controller shall implement appropriate technical and organisational measures to ensure and to be able to demonstrate that processing is performed in accordance with this Regulation."
to provide legal certainty and transparency for economic operators, including micro, small and medium-sized enterprises and to provide natural persons in all Member states with the same legal of legally enforceable rights and obligations and responsibilities for controllers and processors, to ensure consiste