# Create Knowledge Graph

## Importing packages

In [347]:
# System modules
import os

# json structure
import json

# Keyword Extraction Functions
import re

# Neo4j integration
import neo4j
from neo4j import GraphDatabase
from langchain_community.graphs import Neo4jGraph

# OpenAI integration
import openai

# pandas functionalities
import pandas as pd

# LangChain functionalities
from langchain_experimental.graph_transformers import LLMGraphTransformer 
from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
from langchain.document_loaders import PyPDFLoader
from langchain.chains import GraphCypherQAChain

# Langfuse integration
from langfuse import Langfuse
from langfuse.decorators import observe

## Create Knowledge Graph

### Credentials

In [None]:
# Placeholder credentials and connection settings used by the cells below.
# Replace the placeholder values before running.
# NOTE(review): avoid committing real secrets into the notebook; consider
# loading them from the environment or a secrets store instead.
llm_key = "your_key"
neo4j_username = "your_username"
neo4j_password = "your_password"
neo4j_uri = "bolt://localhost:7687"

### GPT-4o API

In [3]:
# Publish the OpenAI key through the environment, then build the chat model
# (temperature 0, model "gpt-4o") and the graph transformer that wraps it.
# NOTE(review): ChatOpenAI presumably reads OPENAI_API_KEY from the
# environment — confirm against the langchain_openai documentation.
os.environ["OPENAI_API_KEY"] = llm_key
llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
llm_transformer = LLMGraphTransformer(llm=llm)

### Neo4j API

In [None]:
# Publish the Neo4j connection settings as environment variables;
# execute_cypher_query reads these same variables when it opens a driver.
os.environ["NEO4J_URI"] = neo4j_uri
os.environ["NEO4J_USERNAME"] = neo4j_username
os.environ["NEO4J_PASSWORD"] = neo4j_password

# Constructed without arguments — presumably it picks up the NEO4J_* variables
# set above (confirm against the langchain_community documentation).
graph = Neo4jGraph()

## Langfuse

In [None]:
# Langfuse callback handler configured with placeholder keys.
# NOTE(review): `langfuse_handler` is not referenced by any of the visible
# cells — confirm where it is meant to be passed as a callback.
from langfuse.callback import CallbackHandler
langfuse_handler = CallbackHandler(
    public_key="your_public_key",
    secret_key="your_secret_key",
    host="https://cloud.langfuse.com"
)

### Load Data

In [None]:
def process_pdf(pdf_file):
    """
    Loads a PDF through PyPDFLoader, prints a short preview, and returns the result.

    The preview covers at most the first three returned documents and shows,
    for each, its ``page`` metadata entry and the first 200 characters of text.

    Parameters:
        pdf_file (str): The path to the PDF file.

    Returns:
        list: The Document objects produced by ``load_and_split()``.
    """
    pages = PyPDFLoader(pdf_file).load_and_split()

    # Preview only; the full list is returned untouched.
    for preview in pages[:3]:
        page_number = preview.metadata['page']
        excerpt = preview.page_content[:200]
        print(f"Page: {page_number}\nContent: {excerpt}...\n")

    return pages

In [None]:
# Load the source PDF for the knowledge graph (path is relative to the
# notebook's working directory) and print a preview via process_pdf.
pdf_path = "KG_Datensatz.pdf"
documents = process_pdf(pdf_path)

### Construction of the Knowledge Graph

In [None]:
# Let the LLM-backed transformer turn the loaded documents into graph documents
# (the cells below read their `.nodes` and `.relationships`).
graph_documents = llm_transformer.convert_to_graph_documents(documents)

In [None]:
# Sanity check: show up to five nodes extracted from the first graph document.
for node in graph_documents[0].nodes[:5]:
    print(node)

In [None]:
# Sanity check: show up to five relationships extracted from the first graph document.
for relationship in graph_documents[0].relationships[:5]:
    print(relationship)

In [None]:
# Write the extracted graph documents into the Neo4j database behind `graph`.
graph.add_graph_documents(graph_documents)

# Retrieval-Augmented Generation

### Load cybercrime cases from a CSV file

In [246]:
def create_prompts_from_csv(file_path: str, prompt_text: str) -> list:
    """
    Reads a CSV file and generates one prompt per row.

    Each prompt consists of ``prompt_text`` followed by the labelled case
    fields taken from that row.

    Parameters:
        file_path (str): The path to the CSV file. The file must provide the
            columns "Date and Time", "Location", "Incident Description",
            "Information on Suspects or Witnesses" and "Damage (€)".
        prompt_text (str): Instruction text to prepend to each prompt.

    Returns:
        list: A list of formatted prompts, in CSV row order.

    Raises:
        KeyError: If a row is processed and one of the required columns is missing.
    """

    # Load the CSV file
    df = pd.read_csv(file_path)

    # Prompt layout; the first label mirrors the CSV column "Date and Time"
    # (it previously read "Data and Time", a typo that was sent to the LLM).
    prompt_template = """{instruction}\n\nDate and Time: {date_time};\nLocation: {location};\nIncident Description: {incident_description};\nInformation on Suspects or Witnesses: {suspect_info};\nDamage(€): {damage};"""

    # One prompt per CSV row
    return [
        prompt_template.format(
            instruction=prompt_text,
            date_time=row["Date and Time"],
            location=row["Location"],
            incident_description=row["Incident Description"],
            suspect_info=row["Information on Suspects or Witnesses"],
            damage=row["Damage (€)"],
        )
        for _, row in df.iterrows()
    ]

### Cybercrime Case Pre-Analysis

In [556]:
# Instruction text prepended to every case for the pre-analysis step.
# The requested answer format (label, colon, double-quoted value) is what
# extract_cybercrime_details parses, so keep both in sync when editing.
baseline_analysis_prompt_text = """
Answer the following questions briefly and precisely:

Task:
1. What "Cybercrime Phenomenon" is present in the given case?
2. What "Investigation measures" are appropriate?

Provide your answer in the following format and spelling:
- Cybercrime Phenomenon: "Phenomenon"

- Investigation measures: "measure1, measure2, ..."
"""

In [557]:
# Context text for the pre-analysis step; it is handed to
# analyse_cybercrime_prompt as `context`, which sends it with the "assistant" role.
baseline_analysis_context = """
This case describes a cybercrime incident from the perspective of an affected individual. 
It provides details about the event, including the circumstances, 
potential threats, and any available information about suspects or damages. 
"""

In [558]:
# System-role message for the pre-analysis step: sets the expert persona and
# asks for investigation measures as keywords rather than sentences.
baseline_analysis_system_message = """
As a cybercrime investigation expert, 
you assist law enforcement in analyzing cases by identifying cybercrime phenomena and 
investigative methods. These methods should be presented 
as keywords, such as "Email Analysis" or "Domain Analysis," without full sentences.
"""

In [559]:
# Build one pre-analysis prompt per CSV row.
# NOTE(review): `file_path` is not defined in any visible cell — it must point
# to the cybercrime cases CSV before this cell runs; confirm where it is set.
baseline_analysis_prompts = create_prompts_from_csv(file_path, baseline_analysis_prompt_text)

In [560]:
# Preview the first generated prompt (the recorded cell output follows).
print(baseline_analysis_prompts[0])


Answer the following questions briefly and precisely:

Task:
1. What "Cybercrime Phenomenon" is present in the given case?
2. What "Investigation measures" are appropriate?

Provide your answer in the following format and spelling:
- Cybercrime Phenomenon: "Phenomenon"

- Investigation measures: "measure1, measure2, ..."


Data and Time: 2025-02-28 21:10;
Location: Online;
Incident Description: I was searching for a free streaming app on my Smart TV and found a site that offered a download link. After installation, my TV restarted, and a message appeared on the screen: 'Your device has been locked! Pay €300 in Bitcoin to unlock it.' I could no longer use any functions, and even resetting the device did not work. The TV manufacturer's support confirmed that this was ransomware and advised me not to pay.;
Information on Suspects or Witnesses: The app was downloaded from a website called 'best-streaming-apps.com'.;
Damage(€): 300;


### OpenAI Pre-Analysis

In [561]:
def analyse_cybercrime_prompt(api_key: str, prompt: str, context: str, system_message: str) -> str:
    """
    Sends one request to the OpenAI chat completions API and returns the reply text.

    The conversation is submitted in this order: ``system_message`` with the
    system role, ``prompt`` with the user role, and ``context`` with the
    assistant role.

    :param api_key: Your OpenAI API key
    :param prompt: The input text describing a cybercrime situation
    :param context: Additional context text, sent as the assistant message
    :param system_message: Instructions sent as the system message
    :return: The message content of the first choice in the API response
    """
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": context},
    ]

    openai_client = openai.OpenAI(api_key=api_key)
    completion = openai_client.chat.completions.create(
        model="chatgpt-4o-latest",
        messages=conversation,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [584]:
def _find_labelled_value(label_pattern: str, text: str):
    """Return the value written after ``<label>:`` in *text*, or None when it cannot be found."""
    # Preferred form: the value is wrapped in straight or typographic double
    # quotes. The optional "**" around the colon tolerates Markdown bold labels
    # such as '**Label:** "value"', which LLMs frequently produce.
    quoted = re.search(
        label_pattern + r'\s*(?:\*\*)?\s*:\s*(?:\*\*)?\s*(?:"(.*?)"|“(.*?)”)',
        text,
        re.IGNORECASE,
    )
    if quoted:
        straight, curly = quoted.groups()
        return straight if straight is not None else curly

    # Fallback: unquoted value. Only horizontal whitespace is allowed around
    # the colon so the match stays on the label's own line and never swallows
    # the following field.
    bare = re.search(
        label_pattern + r'[ \t]*(?:\*\*)?[ \t]*:[ \t]*(?:\*\*)?[ \t]*([^\n]+)',
        text,
        re.IGNORECASE,
    )
    if bare:
        value = bare.group(1).strip().strip('*"“”').strip()
        return value or None
    return None


def extract_cybercrime_details(output: str) -> dict:
    """
    Extracts the cybercrime phenomenon and recommended investigation measures from the input string.

    The labels "Cybercrime Phenomenon" and "Investigation measures" are matched
    case-insensitively. Values in double quotes are returned without the quotes;
    Markdown bold around the label, typographic quotes and unquoted single-line
    values are accepted as well.

    :param output: The raw response containing cybercrime details.
    :return: A dictionary with the keys "identified_phenomenon" and
             "recommended_investigation_measures"; a value is None when the
             corresponding label (or its value) is not found.
    """
    return {
        "identified_phenomenon": _find_labelled_value(r'cybercrime\s*phenomenon', output),
        "recommended_investigation_measures": _find_labelled_value(r'investigation\s*measures', output),
    }


### Generate Cypher Query

In [563]:
def _escape_cypher_string(value) -> str:
    """Escape *value* so it can sit inside a double-quoted Cypher string literal."""
    # Backslash must be handled first, otherwise the escapes added below
    # would themselves be escaped again.
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\r", "\\r")
        .replace("\n", "\\n")
    )


def generate_cypher_query(phenomenon, measures, llm_key):
    """
    Builds the Cypher query that retrieves knowledge-graph context for one case.

    The query embeds ``phenomenon`` and ``measures`` via apoc.ml.openai.embedding,
    picks the Phenomenon node with the highest cosine similarity, and returns that
    node, its Cybercrime and Category nodes, connected subgraphs, the
    investigation questions, and the Investigation_method nodes whose similarity
    to ``measures`` exceeds 0.80, all converted to JSON strings.

    The three arguments are placed inside double-quoted Cypher string literals.
    They are escaped first, because ``phenomenon`` and ``measures`` come from LLM
    output and a stray quote or backslash would otherwise break or alter the query.

    NOTE(review): the API key is still part of the returned query text, so the
    query should be treated as sensitive; passing values as query parameters
    would avoid this but requires a change to the executor as well.

    :param phenomenon: Text describing the identified cybercrime phenomenon
    :param measures: Text listing the recommended investigation measures
    :param llm_key: OpenAI API key passed to apoc.ml.openai.embedding
    :return: The Cypher query as a string
    """
    phenomenon_literal = _escape_cypher_string(phenomenon)
    measures_literal = _escape_cypher_string(measures)
    key_literal = _escape_cypher_string(llm_key)

    return f"""
CALL apoc.ml.openai.embedding(
["{phenomenon_literal}"],
"{key_literal}"
)
YIELD index, text AS search_text, embedding
MATCH (p:Phenomenon)
WHERE p.embedding IS NOT NULL
WITH p, gds.similarity.cosine(p.embedding, embedding) AS score
ORDER BY score DESC
LIMIT 1
MATCH (c {{id: "Cybercrime"}})-[:INCLUDES_PHENOMENON]->(p)
MATCH (c)-[:HAS_CATEGORY]->(category:Category)
MATCH (category)
CALL apoc.path.subgraphAll(category, {{
relationshipFilter: ">",
minLevel: 1,
maxLevel: 3
}}) YIELD nodes AS category_nodes
MATCH (p)
CALL apoc.path.subgraphAll(p, {{
relationshipFilter: ">",
minLevel: 1,
maxLevel: 5
}}) YIELD nodes AS phenomenon_nodes, relationships AS phenomenon_relationships
MATCH (i {{id: "Investigation"}})-[:CONTAINS]->(q {{id: "Investigation Questions"}})
MATCH (q)-[:INCLUDES]->(question:Question)
WITH p, c, category, phenomenon_nodes, phenomenon_relationships, category_nodes, i, q, COLLECT(question) AS investigation_questions

CALL apoc.ml.openai.embedding(
["{measures_literal}"],
"{key_literal}"
)
YIELD index AS investigation_index, text AS investigation_search_text, embedding AS investigation_embedding
MATCH (m:Investigation_method)
WHERE m.embedding IS NOT NULL
WITH p, c, category, phenomenon_nodes, phenomenon_relationships, category_nodes, i, q, investigation_questions,
m, gds.similarity.cosine(m.embedding, investigation_embedding) AS investigation_score
WITH p, c, category, phenomenon_nodes, phenomenon_relationships, category_nodes, i, q, investigation_questions,
CASE WHEN investigation_score > 0.80 THEN m ELSE NULL END AS filtered_m,
CASE WHEN investigation_score > 0.80 THEN investigation_score ELSE NULL END AS filtered_score
OPTIONAL MATCH (filtered_m)
CALL apoc.path.subgraphAll(filtered_m, {{
relationshipFilter: ">",
minLevel: 1,
maxLevel: 5
}})
YIELD nodes AS investigation_method_nodes

RETURN DISTINCT
    apoc.convert.toJson(p) AS Identified_Phenomenon,
    apoc.convert.toJson(c) AS Related_Cybercrime,
    apoc.convert.toJson(category) AS Cybercrime_Category,
    apoc.convert.toJson(phenomenon_nodes) AS Phenomenon_Connected_Nodes,
    apoc.convert.toJson(phenomenon_relationships) AS Phenomenon_Connections,
    apoc.convert.toJson(category_nodes) AS Cybercrime_Category_Details_Nodes,
    apoc.convert.toJson(i) AS Investigation_Node,
    apoc.convert.toJson(q) AS Investigation_Questions_Node,
    apoc.convert.toJson(investigation_questions) AS Investigation_Questions,
    apoc.convert.toJson(COALESCE(COLLECT(filtered_m), [])) AS Most_Similar_Investigation_Methods,
    apoc.convert.toJson(COALESCE(COLLECT(investigation_method_nodes), [])) AS Investigation_Method_Details;
"""

In [564]:
def execute_cypher_query(query: str):
    """
    Runs a Cypher query against the Neo4j instance configured through the
    NEO4J_URI, NEO4J_USERNAME and NEO4J_PASSWORD environment variables.

    A new driver is opened for the call and closed again before returning.

    :param query: The Cypher query to execute
    :return: The query result as a list of dictionaries, or None if
             executing the query raised an exception (the error is printed)
    """
    uri = os.environ["NEO4J_URI"]
    credentials = (os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    driver = GraphDatabase.driver(uri, auth=credentials)

    try:
        with driver.session() as session:
            # Materialise every record while the session is still open
            rows = []
            for record in session.run(query):
                rows.append(record.data())
            return rows
    except Exception as e:
        print(f"Error executing query: {e}")
        return None
    finally:
        # Runs on both the success and the error path
        driver.close()

### Format the retrieved information from the Neo4j Knowledge Graph

In [565]:
def remove_embedding(data):
    """
    Return a copy of ``data`` in which every dictionary key named 'embedding'
    has been dropped, descending recursively through dictionaries and lists.

    Values that are neither a dictionary nor a list are returned unchanged.
    """
    if isinstance(data, list):
        return [remove_embedding(element) for element in data]

    if not isinstance(data, dict):
        return data

    stripped = {}
    for key, value in data.items():
        if key == "embedding":
            continue
        stripped[key] = remove_embedding(value)
    return stripped

In [566]:
def format_neo4j_response(neo4j_output):
    """
    Cleans the Neo4j response and serialises it as a JSON string.

    Every value that is itself a JSON string is decoded, all 'embedding' keys
    are removed from the resulting structures, and the cleaned records are
    wrapped as ``{"Query": [...]}``.

    :param neo4j_output: List of Neo4j records as dictionaries, or None
                         (execute_cypher_query returns None when a query
                         fails); None is treated as an empty result
    :return: JSON-formatted string (indent 4) without 'embedding' entries
    """
    # A failed query yields None instead of a list; format it as an empty
    # result rather than raising TypeError while iterating.
    if neo4j_output is None:
        neo4j_output = []

    cleaned_data = []

    for record in neo4j_output:
        cleaned_record = {}
        for key, value in record.items():
            try:
                # If the value is an embedded JSON string, decode it
                cleaned_value = json.loads(value)
            except (json.JSONDecodeError, TypeError):
                # Not JSON text (or not a string at all): keep the value as is
                cleaned_value = value

            # Remove 'embedding' from the JSON data
            cleaned_record[key] = remove_embedding(cleaned_value)

        cleaned_data.append(cleaned_record)

    return json.dumps({"Query": cleaned_data}, indent=4)


## Report Generation

In [567]:
# Instruction text prepended to every case for the report-generation step.
# It fixes the report length, the "JSON only" constraint and the four sections.
report_prompt_text = """
Create a structured police investigation report with a length of 150 words for the given cybercrime case, 
utilizing information from a Knowledge Graph in JSON format.

Important Information:
- Use full sentences and detailed explanations.
- Clearly structure the report with the following sections and headings.
- You are only allowed to use information provided by the structured JSON

REPORT STRUCTURE:
1. Case Categorization  
   - Classify the case as either "Cybercrime in the narrow sense" or "Cybercrime in the broad sense".  
   - Justify your classification with relevant definitions.

2. Phenomenon Analysis  
   - Identify the specific cybercrime type(s) involved.  
   - Explain how this technique works in general (e.g., How phishing attacks operate, technical aspects).

3. Legal Assessment  
   - Identify which German laws (StGB, BDSG, UrhG) apply.  
   - Cite specific paragraphs and explain why they are relevant.  

4. Investigation Strategy  
   - List concrete forensic measures (e.g., E-Mail Header analysis, specific investigation questions).  
   - Provide step-by-step descriptions of how each measure is conducted.  
"""

In [568]:
# Build one report prompt per CSV row (same rows and order as the baseline prompts
# when the same file is used).
# NOTE(review): `file_path` is not defined in any visible cell — confirm where it is set.
report_prompts = create_prompts_from_csv(file_path, report_prompt_text)

In [569]:
# Context text for the report-generation step; it is handed to
# analyse_cybercrime_prompt as `context`, which sends it with the "assistant" role.
report_context = """
You are a cybercrime investigation assistant supporting law enforcement with a Neo4j Knowledge Graph.
The structured JSON contains the extracted knowledge, serving as the basis for analyzing the case.
Your task is to utilize all relevant details to provide a clear and concise assessment.

Key Requirements:
- Use only data from the JSON.

Use this context to enhance the accuracy and detail of your response.    
"""

In [570]:
def update_json_in_system_message(json_output):
    """
    Build the system message for the report step.

    :param json_output: The formatted knowledge-graph JSON to append
    :return: A fixed introductory sentence, a blank line, then ``json_output``
    """
    preamble = "You are extracting information from the Knowledge Graph, where the structured JSON provides the necessary knowledge for analyzing the given cybercrime case."
    return f"{preamble}\n\n{json_output}"

### RAG Results

In [586]:
def process_cybercrime_cases_sequentially(baseline_prompts, baseline_context, baseline_system_message, llm_key, report_prompts, report_context):
    """
    Sequentially runs the full RAG pipeline for each cybercrime case and
    stores the results in a dictionary.

    For every (baseline prompt, report prompt) pair the function asks the LLM
    for a pre-analysis, extracts the phenomenon and investigation measures,
    queries the knowledge graph with them, and finally asks the LLM for a
    report with the retrieved JSON placed in the system message.

    Parameters:
        baseline_prompts (list): List of baseline cybercrime case prompts.
        baseline_context (str): Context for the baseline analysis.
        baseline_system_message (str): System message for baseline analysis.
        llm_key (str): API key for the LLM analysis and the embedding calls.
        report_prompts (list): List of report prompts corresponding to each baseline prompt.
        report_context (str): Context for the report analysis.

    Returns:
        dict: Maps each 1-based case index to a dictionary with the keys
        "report_prompt", "cypher_query", "formatted_json" and "cybercrime_report".

    Raises:
        ValueError: If baseline_prompts and report_prompts differ in length.
    """
    rag_results = {}

    # Ensure the lists have the same length
    if len(baseline_prompts) != len(report_prompts):
        raise ValueError("baseline_prompts and report_prompts must have the same length")

    total_cases = len(baseline_prompts)

    # Iterate through each prompt pair with an index
    for index, (baseline_prompt, report_prompt) in enumerate(zip(baseline_prompts, report_prompts), start=1):
        print(f"Processing case {index}/{total_cases}: {baseline_prompt}")

        # Step 1: Generate the LLM response based on the baseline prompt
        baseline_cybercrime_llm_response = analyse_cybercrime_prompt(llm_key, baseline_prompt, baseline_context, baseline_system_message)

        # Step 2: Extract relevant cybercrime details
        baseline_cybercrime_details = extract_cybercrime_details(baseline_cybercrime_llm_response)

        phenomenon = baseline_cybercrime_details["identified_phenomenon"]
        measures = baseline_cybercrime_details["recommended_investigation_measures"]

        # Step 3: Generate the Cypher query using extracted details
        cybercrime_cypher_query = generate_cypher_query(phenomenon, measures, llm_key)

        # Step 4: Execute the generated Cypher query to retrieve information
        cybercrime_informations = execute_cypher_query(cybercrime_cypher_query)
        if cybercrime_informations is None:
            # The executor returns None after a failed query (it already
            # printed the error). Continue with an empty result so one failing
            # case does not abort the remaining cases.
            cybercrime_informations = []

        # Step 5: Format the retrieved Neo4j response into JSON
        json_output = format_neo4j_response(cybercrime_informations)

        # Step 6: Update system message with JSON output
        report_system_message = update_json_in_system_message(json_output)

        # Step 7: Generate the LLM response based on the report prompt
        cybercrime_report = analyse_cybercrime_prompt(llm_key, report_prompt, report_context, report_system_message)

        # Save the result indexed by the prompt number.
        # NOTE: the stored query text contains llm_key (generate_cypher_query
        # embeds it), so treat rag_results as sensitive.
        rag_results[index] = {
            "report_prompt": report_prompt,
            "cypher_query": cybercrime_cypher_query,
            "formatted_json": json_output,
            "cybercrime_report": cybercrime_report
        }

        # Optional: Print progress
        print(f"Finished processing case {index}\n")

    return rag_results

In [None]:
# Run the complete pipeline for all cases; results are keyed by 1-based case index.
rag_results = process_cybercrime_cases_sequentially(baseline_analysis_prompts, baseline_analysis_context, baseline_analysis_system_message, llm_key, report_prompts, report_context)