In [None]:
# Cell 0: Install Dependencies

# First, let's install the necessary Python packages: 'openai', 'neo4j', and 'azure-storage-blob'.
# These libraries allow us to work with OpenAI, Neo4j, and Azure Blob Storage, respectively.
# In a Jupyter Notebook, you can install packages using pip; uncomment the line below and run this cell:
# !pip install openai neo4j azure-storage-blob
#
# Note: You only need to run this cell once.  If you are running this code in a non-notebook
#       environment, you'll need to install these packages using your system's terminal
#       (e.g.,  pip install openai neo4j azure-storage-blob).

In [None]:
# Cell 1: Import Libraries

# Next, we'll import the Python libraries that we'll use in this notebook.
# These libraries provide functions for working with XML data, connecting to Neo4j, handling JSON,
# accessing Azure Blob Storage, and interacting with the OpenAI API.
import os
from xml.etree import ElementTree as ET
from neo4j import GraphDatabase
import json
from azure.storage.blob import BlobServiceClient
import openai
from openai import AzureOpenAI

In [None]:
# Cell 2: Azure Blob Storage Setup

# Now, we'll set up the connection to Azure Blob Storage.  You'll need to provide your Azure Blob Storage account URL.
# We'll use this to access the Form 13F data files.
# Replace with your actual Azure Blob Storage account URL
account_url = "https://neo4jdataset.blob.core.windows.net/"
container_name = "form13-raw"
# No credential is passed to the client.
# NOTE(review): presumably the container allows anonymous (public) read access - confirm.
blob_service_client = BlobServiceClient(account_url=account_url)

In [None]:
# Cell 3: OpenAI Setup

# Here, we configure the connection to Azure OpenAI.  You'll need to provide your Azure OpenAI
# credentials, including the endpoint, API version, API key, and deployment name.
#
# Secrets are read from environment variables so they are never stored in the notebook:
#   AZURE_OPENAI_API_KEY      (required)
#   AZURE_OPENAI_ENDPOINT     (optional, defaults to the endpoint below)
#   AZURE_OPENAI_API_VERSION  (optional)
#   AZURE_OPENAI_DEPLOYMENT   (optional)
# Any key that was ever hardcoded in this notebook should be treated as leaked and rotated.
API_ENDPOINT = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://oneblinkopenaigenericservice.openai.azure.com/",
)
API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION", "2024-02-01")
# Raises KeyError naming the missing variable if the key has not been exported.
API_KEY = os.environ["AZURE_OPENAI_API_KEY"]
deployment_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "oneblink-gp4ouseast")

# NOTE(review): these module-level settings look like the pre-1.0 `openai` configuration
# style; the explicit client below is configured independently - confirm they are still needed.
openai.api_type = "azure"
openai.api_base = API_ENDPOINT
openai.api_version = API_VERSION
openai.api_key = API_KEY

# Client used by extract_entities() for chat completions.
client = AzureOpenAI(
    api_key=API_KEY,
    api_version=API_VERSION,
    azure_endpoint=API_ENDPOINT
)

In [None]:
# Cell 4: Neo4j Setup

# Now, let's set up the connection to the Neo4j graph database.  You'll need to provide your
# Neo4j connection URI, username, and password.
#
# The password is read from the environment so it is never stored in the notebook:
#   NEO4J_PASSWORD  (required)
#   NEO4J_URI       (optional, defaults to the URI below)
#   NEO4J_USERNAME  (optional, defaults to 'neo4j')
# Any password that was ever hardcoded in this notebook should be treated as leaked and rotated.
NEO4J_URI = os.environ.get("NEO4J_URI", 'neo4j+s://neo4j-partners.com:443')
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", 'neo4j')
# Raises KeyError naming the missing variable if the password has not been exported.
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [None]:
# Cell 5: Helper Functions

# We'll define a few helper functions to make the code more organized and reusable.
# These functions will handle reading XML data from Azure Blob Storage, extracting entities using OpenAI, and creating nodes in Neo4j.
def read_xml_from_azure(filename):
    """
    Download a blob from Azure Blob Storage and return the XML document embedded in it.

    The blob is read as raw bytes. The span from the first XML declaration
    (``<?xml version="1.0"``) through the first ``</XML>`` closing tag that
    follows it is decoded as UTF-8 and returned.

    Args:
        filename: Name of the blob inside ``container_name``.

    Returns:
        The decoded text span (the closing ``</XML>`` tag is included), or
        None when either marker is missing or any error occurs while
        downloading or decoding. Errors are printed, not raised.
    """
    xml_declaration = b'<?xml version="1.0"'
    closing_tag = b'</XML>'
    try:
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=filename)
        # Do not pass `encoding` here: the markers below are bytes, so readall()
        # must hand back bytes. Asking for a decoded str made every lookup fail.
        content = blob_client.download_blob(max_concurrency=1).readall()

        # Find the actual XML content; only accept a closing tag after the declaration.
        xml_start = content.find(xml_declaration)
        xml_end = content.find(closing_tag, xml_start) if xml_start != -1 else -1

        if xml_start == -1 or xml_end == -1:
            print(f"Could not find XML content in {filename}")
            return None

        # Decode only the relevant part, keeping the closing tag.
        return content[xml_start:xml_end + len(closing_tag)].decode('utf-8')

    except Exception as e:
        print(f"Error reading file {filename}: {e}")
        return None

def extract_entities(xml_content):
    """
    Extracts entities from the XML content using Azure OpenAI.

    Sends the XML text to the chat-completions deployment named by
    ``deployment_name`` and parses the reply as JSON.

    Args:
        xml_content: XML text to analyse. Empty or None input short-circuits.

    Returns:
        The parsed JSON reply (expected keys: managerName, street1, street2,
        city, stateOrCounty, zipCode), or None when the input is empty, the
        request fails, or the reply is not valid JSON. Errors are printed,
        not raised.
    """
    if not xml_content:
        return None

    # A plain template filled by a single str.format() call. The XML is passed as
    # a format argument, so braces inside the XML are never interpreted.
    # NOTE(review): the "stateOrCounty" key spelling is what create_nodes() reads;
    # keep the two in sync if it is ever corrected.
    prompt = """
    Extract the following information from the XML content and return it as a JSON object:
    * "managerName": The text content of the <name> tag under <filingManager>
    * "street1": The text content of the <com:street1> tag under <address>
    * "street2": The text content of the <com:street2> tag under <address> (if present)
    * "city": The text content of the <com:city> tag under <address>
    * "stateOrCounty": The text content of the <com:stateOrCountry> tag under <address>
    * "zipCode": The text content of the <com:zipCode> tag under <address>

    Return the JSON object without any markdown formatting or code block indicators.
    
    XML Content:
    {xml_text}
    """.format(xml_text=xml_content)

    # Captured as soon as it is available so the error path can report it without
    # touching the response object again (which could itself raise).
    raw_reply = None
    try:
        response = client.chat.completions.create(
            model=deployment_name,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300
        )
        raw_reply = response.choices[0].message.content
        # The model may still wrap the JSON in a markdown fence; strip it.
        cleaned_result = raw_reply.replace('```json', '').replace('```', '').strip()
        return json.loads(cleaned_result)
    except Exception as e:
        print(f"Error extracting entities: {e}")
        print(f"Response content: {raw_reply if raw_reply is not None else 'No response'}")
        return None

def create_nodes(tx, data):
    """
    Creates nodes in Neo4j based on the extracted entities.

    Merges a ``Manager`` node (when ``managerName`` is present), an ``Address``
    node (when ``street1`` is present), and a ``HAS_ADDRESS`` relationship
    between them when both exist.

    Args:
        tx: Object exposing ``run(query, **params)``, e.g. a Neo4j transaction.
        data: Dict of extracted entities; falsy input is a no-op.

    Returns:
        None.
    """
    if not data:
        return

    manager_name = data.get("managerName")

    # Create Manager node
    if manager_name:
        tx.run("MERGE (m:Manager {name: $name})",
               name=manager_name)

    # Keep only the known address fields that actually carry a value.
    address_keys = ("street1", "street2", "city", "stateOrCounty", "zipCode")
    address_props = {k: v for k, v in data.items()
                     if k in address_keys and v is not None}

    # Only create an address if at least street1 exists.
    if not address_props.get("street1"):
        return

    # Property names are spliced into the query text, but they can only come from
    # the fixed whitelist above; the values always travel as query parameters.
    props_string = ", ".join(f"{k}: ${k}" for k in address_props)
    tx.run(f"""
            MERGE (a:Address {{{props_string}}})
        """, **address_props)

    # Create relationship between Manager and Address. The address is matched on
    # the same properties that were just merged, so a different address that merely
    # shares street1 is not linked.
    if manager_name:
        tx.run(f"""
                MATCH (m:Manager {{name: $name}})
                MATCH (a:Address {{{props_string}}})
                MERGE (m)-[:HAS_ADDRESS]->(a)
            """, name=manager_name, **address_props)


In [None]:
# Cell 6: Main Execution

# This is the main part of the script.  It defines the files to be processed and calls the
# helper functions to extract data and load it into Neo4j.
def process_files(filenames):
    """
    Run the read -> extract -> load pipeline for each blob name.

    For every name, the XML is fetched with read_xml_from_azure(), entities are
    extracted with extract_entities(), and create_nodes() is executed in a
    write transaction. Files that yield no entities are reported and skipped.

    Args:
        filenames: Iterable of blob names to process.
    """
    with driver.session() as session:
        for filename in filenames:
            xml_content = read_xml_from_azure(filename)
            entities = extract_entities(xml_content)
            if not entities:
                print(f"Skipping {filename}: no entities extracted")
                continue
            session.execute_write(create_nodes, entities)
            print(f"Loaded entities from {filename}")


if __name__ == "__main__":
    sample_files = [
        'raw_2023-07-18_archives_edgar_data_1108893_0001108893-23-000005.txt',
        'raw_2023-07-18_archives_edgar_data_1488921_0001085146-23-002736.txt',
        'raw_2023-07-18_archives_edgar_data_1163165_0001104659-23-081874.txt',
        'raw_2023-07-18_archives_edgar_data_1567459_0000950123-23-006124.txt'
    ]
    try:
        process_files(sample_files)
    finally:
        # Release the connection pool even if processing fails part-way.
        driver.close()