In [25]:
import json
import getpass
import os
import textwrap
from dotenv import load_dotenv
import uuid

from langchain.docstore.document import Document
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.graphs import Neo4jGraph

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI

# Load environment variables from the local .env file. override=True is
# passed so values from the file replace variables already set in the process.
load_dotenv('.env', override=True)

# Neo4j connection settings.
NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
# Fall back to the 'neo4j' database when the variable is unset or empty.
NEO4J_DATABASE = os.environ.get('NEO4J_DATABASE') or 'neo4j'

# OpenAI settings.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
OPENAI_BASE_URL = os.environ.get('OPENAI_BASE_URL')

# Echo the Neo4j URI that was picked up.
print('>>>>>>>>>>', NEO4J_URI)

# Post id and block-structured content taken from the incoming event.
postId = 73
content = [
    {
        "id": "c6a352f4-1898-405c-919d-c91dc628f6c2",
        "type": "paragraph",
        "props": {
            "textColor": "default",
            "backgroundColor": "default",
            "textAlignment": "left"
        },
        "content": [
            {
                "type": "text",
                "text": "빨가\n배가 너무 부릅니다."
            }
        ],
        "children": []
    },
    {
        "id": "ab9bcf5f-2c92-4cc6-95da-8da5211ca0c1",
        "type": "paragraph",
        "props": {
            "textColor": "default",
            "backgroundColor": "default",
            "textAlignment": "left"
        },
        "content": [
            {
                "type": "text",
                "text": "의식의 흐름."
            }
        ],
        "children": []
    },
    {
        "id": "df2b011c-9f8d-40ca-9512-06d0c9579fa6",
        "type": "paragraph",
        "props": {
            "textColor": "default",
            "backgroundColor": "default",
            "textAlignment": "left"
        },
        "content": [
            {
                "type": "text",
                "text": "이것이 개발."
            }
        ],
        "children": []
    }
]

# content is already a list of dicts, so no JSON parsing is needed.
data = content

# Block types whose inline text is collected.
TEXT_BLOCK_TYPES = ("paragraph", "mention", "heading")


def extract_block_text(blocks, block_types=TEXT_BLOCK_TYPES):
    """Concatenate the inline text of all blocks whose type is in *block_types*.

    Every inline item of a matching block that carries a "text" key
    contributes, in order, so a block made of several inline runs keeps all
    of its text rather than only the first run.

    Args:
        blocks: Iterable of block dicts, each with optional "type" and
            "content" keys; "content" is expected to be a list of inline
            dicts.
        block_types: Block "type" values to include.

    Returns:
        The collected text joined without any separator ("" when nothing
        matches).
    """
    pieces = []
    for block in blocks:
        if block.get("type") not in block_types:
            continue
        # A missing or empty "content" simply contributes nothing.
        for inline in block.get("content") or []:
            if isinstance(inline, dict) and "text" in inline:
                pieces.append(inline["text"])
    return "".join(pieces)


# Text of all matching blocks, joined into a single string.
result_text = extract_block_text(data)

print("파싱 완료")

print("파싱 완료 어브젝트: ", result_text)



# Generate a random (version 4) UUID and echo it.
random_uuid = uuid.uuid4()

print(f"{random_uuid}")



def split_form10k_data_from_file(file, splitter=None, max_chunks=20):
    """Split selected sections of a form JSON file into chunk records.

    The JSON file is read once, and the texts stored under the keys
    'item1', 'item1a', 'item7' and 'item7a' are split into chunks. Each
    chunk becomes a dict holding the chunk text plus metadata copied from
    the file ('names', 'cik', 'cusip6', 'source') and derived from the
    file name and loop position.

    Args:
        file: Path to the JSON file. The form id is the file's base name
            without its final extension.
        splitter: Object with a ``split_text(text)`` method returning a
            list of chunks. Defaults to the module-level ``text_splitter``,
            which must then be defined elsewhere before this is called.
        max_chunks: Maximum number of chunks kept per section.

    Returns:
        A list of chunk-record dicts, in section order and then chunk order.

    Raises:
        KeyError: If the file lacks one of the expected keys.
    """
    active_splitter = text_splitter if splitter is None else splitter

    # Close the file deterministically instead of leaking the handle.
    with open(file, encoding='utf-8') as handle:
        file_as_object = json.load(handle)

    # Loop-invariant; also works for paths without a directory separator.
    form_id = os.path.splitext(os.path.basename(file))[0]

    chunks_with_metadata = []  # accumulates the chunk records
    for item in ['item1', 'item1a', 'item7', 'item7a']:
        print(f'Processing {item} from {file}')
        item_text = file_as_object[item]
        # Keep only the first max_chunks chunks of each section.
        item_text_chunks = active_splitter.split_text(item_text)[:max_chunks]
        for chunk_seq_id, chunk in enumerate(item_text_chunks):
            chunks_with_metadata.append({
                'text': chunk,
                # metadata from looping...
                'f10kItem': item,
                'chunkSeqId': chunk_seq_id,
                # constructed metadata...
                'formId': f'{form_id}',  # pulled from the filename
                'chunkId': f'{form_id}-{item}-chunk{chunk_seq_id:04d}',
                # metadata from file...
                'names': file_as_object['names'],
                'cik': file_as_object['cik'],
                'cusip6': file_as_object['cusip6'],
                'source': file_as_object['source'],
            })
        print(f'\tSplit into {len(item_text_chunks)} chunks')
    return chunks_with_metadata
  
  
  

# Identifier and text of the chunk being stored.
chunkId = random_uuid
chunkText = result_text

# Create the Chunks node for this chunkId (or reuse it on a re-run with the
# same id) and attach it to the post.
# The MERGE is keyed on chunkId on purpose: a bare `MERGE (c:Chunks)` matches
# every existing Chunks node, so after the first run no new node would be
# created, the text would never be set, and all old chunks would be linked to
# this post.
# NOTE(review): the post is matched by internal node id (ID(p)), while the
# embedding query later in this file matches Post by its `id` property —
# confirm which one postId actually refers to. ID() also appears to be
# deprecated in Neo4j 5 in favour of elementId() — TODO confirm.
merge_chunk_node_query = """
MERGE(mergedChunk:Chunks {chunkId: $chunkId})
    ON CREATE SET
        mergedChunk.text = $chunkText
    WITH mergedChunk
        MATCH (p:Post) WHERE ID(p) = $postId
        MERGE (mergedChunk)-[:EMBED]->(p)
RETURN mergedChunk
"""

# Initialise the Neo4j graph connection.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE
)

# chunkId is a uuid.UUID; it is sent as a string so it can be stored as a
# plain string property.
kg.query(
    merge_chunk_node_query,
    params={
        'chunkId': str(chunkId),
        'chunkText': chunkText,
        'postId': postId,
    },
)




# Cypher that creates the vector index over Chunks.textEmbeddings
# (1536 dimensions, cosine similarity) unless it already exists.
create_vector_index_query = """
    CREATE VECTOR INDEX `embeddedPost` IF NOT EXISTS
    FOR (c:Chunks) ON (c.textEmbeddings)
    OPTIONS { indexConfig: {
        `vector.dimensions`: 1536,
        `vector.similarity_function`: 'cosine'    
    }}
"""
vector = kg.query(create_vector_index_query)

# List every index currently in the database for a visual check.
current_indexes = kg.query("SHOW INDEXES")
print("인덱스 생성 쿼리", current_indexes)

print("벡터 인덱스 생성 완료")

# Compute an embedding for every Chunks node that has none yet, store it in
# textEmbeddings, then link those chunks to the post.
# NOTE(review): this query matches Post by its `id` property, while the merge
# query earlier in this file matches by internal node id (ID(p)) — confirm
# that both resolve to the same post.
embed_and_link_query = """
    MATCH (chunks:Chunks) WHERE chunks.textEmbeddings IS NULL
    WITH chunks, genai.vector.encode(
        chunks.text, 
        "OpenAI", 
        {
            token: $openAiApiKey, 
            endpoint: $openAiEndpoint
        }) AS vector
    CALL db.create.setNodeVectorProperty(chunks, "textEmbeddings", vector)
    
    WITH chunks
    MATCH (post:Post {id: $postId}) 
    MERGE (chunks)-[:EMBED]->(post) 
    RETURN chunks, post
"""
embed_and_link_params = {
    "openAiApiKey": OPENAI_API_KEY,
    "openAiEndpoint": OPENAI_BASE_URL,
    "postId": postId,
}
kg.query(embed_and_link_query, params=embed_and_link_params)

# Print the extracted text.
print("결과 : ", result_text)


>>>>>>>>>> neo4j+ssc://849cc9ca.databases.neo4j.io:7687
파싱 완료
파싱 완료 어브젝트:  빨가
배가 너무 부릅니다.의식의 흐름.이것이 개발.
ca651d9f-06cd-4d40-9b32-abec0b5d4c4b
인덱스 생성 쿼리 [{'id': 2, 'name': 'Document_unique_idx', 'state': 'ONLINE', 'populationPercent': 100.0, 'type': 'RANGE', 'entityType': 'NODE', 'labelsOrTypes': ['Document'], 'properties': ['id'], 'indexProvider': 'range-1.0', 'owningConstraint': 'Document_unique_idx', 'lastRead': neo4j.time.DateTime(2024, 3, 25, 14, 8, 47, 785000000, tzinfo=<UTC>), 'readCount': 54}, {'id': 9, 'name': 'embeddedPost', 'state': 'ONLINE', 'populationPercent': 100.0, 'type': 'VECTOR', 'entityType': 'NODE', 'labelsOrTypes': ['Chunks'], 'properties': ['textEmbeddings'], 'indexProvider': 'vector-2.0', 'owningConstraint': None, 'lastRead': None, 'readCount': 0}, {'id': 11, 'name': 'form_10k_chunks', 'state': 'ONLINE', 'populationPercent': 100.0, 'type': 'VECTOR', 'entityType': 'NODE', 'labelsOrTypes': ['Chunk'], 'properties': ['textEmbedding'], 'indexProvider': 'vector-2.0', 'o