In [1]:
import neo4j
import os.path
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
# from llama_index.core import PropertyGraphIndex
from llama_index.core import SimpleDirectoryReader
from llama_index.llms.huggingface import HuggingFaceLLM 
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from llama_index.core.indices.property_graph import SimpleLLMPathExtractor
# from langchain.graphs import PropertyGraph
# from langchain.indexes import PropertyGraphIndex
import warnings
from ollama import chat
import openai
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm










In [2]:
# Backend switch: True selects the OpenAI-hosted models, False the local
# Hugging Face models referenced by the two paths below.
using_openai = True

# Local model locations (only read on the Hugging Face branch).
llm_path = "E:/Temp/model/Llama-3.2-1B"
embed_path = "E:/Temp/model/bert-base-uncased"

# Files whose contents are the secrets; they are read by later cells.
openai_key_path = "E:/Temp/util/openai_key"
neo4j_password_path = "E:/Temp/util/password"

# Neo4j connection settings.
neo4j_uri = "bolt://localhost:7687"
neo4j_username = "neo4j"

In [3]:
def GetTxt(path):
    """Return the full text content of the file at ``path``.

    Args:
        path: Filesystem path of the file to read.

    Returns:
        The file's contents as a string, or ``""`` when ``path`` is not an
        existing regular file.
    """
    if not os.path.isfile(path):
        return ""
    # The context manager closes the handle deterministically instead of
    # leaving it open until garbage collection.
    with open(path, "r") as handle:
        return handle.read()

def ConnectDriver(uri, auth):
    """Create a Neo4j driver for ``uri`` using the given credentials.

    Args:
        uri: Connection URI of the Neo4j server.
        auth: Credentials forwarded to the driver (e.g. a
            ``(username, password)`` tuple).

    Returns:
        The object returned by ``neo4j.GraphDatabase.driver``.
    """
    # ``auth`` is a keyword-only parameter of GraphDatabase.driver, so it must
    # be passed by name; a positional second argument is rejected.
    return neo4j.GraphDatabase.driver(uri, auth=auth)

In [4]:
# Read the Neo4j password from disk; GetTxt yields "" when the file is absent.
neo4j_password = GetTxt(neo4j_password_path)
if not neo4j_password:
    print("Password file not found.")

In [5]:
# Select the LLM and embedding model according to the using_openai switch.
if using_openai:
    # Strip surrounding whitespace so a trailing newline in the key file does
    # not become part of the key.
    openai_key = GetTxt(openai_key_path).strip()
    if openai_key:
        os.environ["OPENAI_API_KEY"] = openai_key
    else:
        # Do not overwrite an OPENAI_API_KEY that may already be set in the
        # environment with an empty string.
        print("OpenAI API Key not found.")

    llm = OpenAI(model="gpt-3.5-turbo", temperature=0.0)
    embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")
else:
    # Later cells reference openai_key regardless of the chosen backend, so
    # keep the name defined on this branch too.
    openai_key = ""
    llm = HuggingFaceLLM(model_name=llm_path)
    embed_model = HuggingFaceEmbedding(model_name=embed_path)

In [6]:
def get_completion(prompt, model='gpt-3.5-turbo', api_key=None):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text of the user message.
        model: Name of the chat model to query.
        api_key: OpenAI API key. When omitted, the ``OPENAI_API_KEY``
            environment variable is used.

    Returns:
        The ``content`` of the first choice in the response.

    Raises:
        ValueError: If no non-empty API key is available.
    """
    # Resolve the key at call time: a default bound at definition time needs
    # the key variable to exist before this cell runs and never refreshes.
    if api_key is None:
        api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("An API key must be provided.")

    messages = [{'role': 'user', 'content': prompt}]

    if hasattr(openai, "OpenAI"):
        # openai>=1.0 removed openai.ChatCompletion; requests go through a
        # client object and responses are attribute-based.
        client = openai.OpenAI(api_key=api_key)
        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content

    # Legacy (pre-1.0) module-level interface.
    openai.api_key = api_key
    response = openai.ChatCompletion.create(model=model, messages=messages)
    return response['choices'][0]['message']['content']

In [8]:
def categorize_codes_with_langchain(api_key, codes, model="gpt-3.5-turbo"):
    """Ask a chat model to group grounded-theory codes into categories.

    Args:
        api_key: OpenAI API key handed to the chat model.
        codes: The codes to categorise; substituted into the prompt as-is.
        model: Name of the chat model to use.

    Returns:
        Whatever ``ChatOpenAI.predict`` returns for the filled-in prompt.
    """
    # Temperature 0 asks for deterministic results.
    llm_client = ChatOpenAI(model=model, openai_api_key=api_key, temperature=0)

    grouping_template = PromptTemplate(
        input_variables=["codes"],
        template="""
        Examine the list of codes generated from the grounded theory analysis and identify groups or categories of codes that share similar themes or concepts. 
        For each pair or set of codes, assess whether they represent the same or closely related ideas. Consider the context of the codes, their underlying meanings, and the patterns that emerge across the data. 
        Group codes into categories based on their conceptual similarity, and describe the rationale behind each grouping. 
        Additionally, note if any codes overlap or can be merged into broader categories.
        {codes}
        """
    )

    # Fill in the template and return the model's answer directly.
    filled_prompt = grouping_template.format(codes=codes)
    return llm_client.predict(filled_prompt)


In [9]:
# Load every file found in the data directory.
documents = (
    SimpleDirectoryReader("E:/Temp/data/")
    .load_data()
)

In [None]:
def extract_codes_with_langchain(api_key, excerpt, model="gpt-3.5-turbo" ):
    """Ask a chat model for grounded-theory open codes describing ``excerpt``.

    Args:
        api_key: OpenAI API key handed to the chat model.
        excerpt: Interview text to be coded; it may contain any characters,
            including curly braces.
        model: Name of the chat model to use.

    Returns:
        Whatever ``ChatOpenAI.predict`` returns for the filled-in prompt (the
        prompt requests comma-separated codes).
    """
    chat = ChatOpenAI(
        model=model,
        openai_api_key=api_key,
        temperature=0
    )

    # The template is a plain string with a {text} placeholder. Interpolating
    # the excerpt with an f-string would make any brace inside the excerpt
    # part of the template, where it is then treated as a placeholder.
    prompt_template = PromptTemplate(
        input_variables=["text"],
        template="""I have qualitative interview data that I want to analyze using grounded theory methodology. 
    Please help me generate open codes by identifying key ideas, concepts, and patterns in the text, these will be called 'codes'. 
    Focus on labeling discrete segments of the data with descriptive codes that capture their essence without imposing preconceived categories. 
    The goal is to remain open to emergent themes and meanings inherent in the data. 
    Example Input Text: 'Participants frequently mentioned feeling overwhelmed by the constant demands of their work environment, 
    but they also described finding moments of satisfaction when completing challenging projects. 
    One participant said, 'I often feel like I'm drowning in tasks, but when I finally finish something, it's so rewarding'. 
    Desired Output in CSV format with codes upto 5 words: ('Feeling overwhelmed', 'Constant work demands', 'Moments of satisfaction', 'Rewarding experiences after task completion'). 
    Do this the data and ensure use of only CSV format, don't include quotation marks or any other special character except comma: {text}"""
    )

    prompt = prompt_template.format(text=excerpt)

    response = chat.predict(prompt)

    return response

In [None]:
# Collect one {document index: model response} mapping per loaded document.
# A plain loop is used so results gathered so far remain in all_codes if a
# request fails part-way through.
all_codes = []
for doc_index, document in enumerate(documents):
    all_codes.append(
        {doc_index: extract_codes_with_langchain(openai_key, document.text)}
    )

In [10]:
import pickle
def load_dict_from_pickle(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The object produced by ``pickle.load``.
    """
    with open(filename, 'rb') as pickle_file:
        return pickle.load(pickle_file)

# Example usage
# NOTE: unpickling can execute arbitrary code; only load files you created.
loaded_data = load_dict_from_pickle('data.pkl')
# Show a short preview; printing the whole structure floods the cell output.
print(loaded_data[:3])



In [27]:
# Number of entries loaded from the pickle file.
print(len(loaded_data))

540


In [28]:
# Split each entry's comma-separated response into individual codes.
codes_dict = {}  # entry index -> list of codes for that entry
temp = []        # flat list of every code, in order, duplicates included
for i, code in enumerate(loaded_data):
    # Each entry is indexed by its own position to obtain the response text.
    # Empty fragments (e.g. from a trailing comma or an empty response) are
    # dropped so no blank code is indexed or written to the graph.
    codes_dict[i] = [j.strip() for j in code[i].split(',') if j.strip()]
    temp.extend(codes_dict[i])

In [29]:
# Show the total count and a short preview instead of dumping every code.
print(len(temp), temp[:20])



In [31]:
# Preview the first few entries instead of printing the entire mapping.
print(list(codes_dict.items())[:3])



In [15]:
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from sentence_transformers import SentenceTransformer


In [16]:
class SentenceTransformerWrapper(Embeddings):
    """Adapter exposing a SentenceTransformer model through ``Embeddings``."""

    def __init__(self, model_name: str):
        """Load the sentence-transformers model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Embed several texts.

        Returns:
            A list with one vector (a list of floats) per input text.
        """
        # encode() returns an array by default, while the Embeddings
        # interface is specified in terms of plain lists; convert before
        # returning so any consumer of the interface can use the result.
        return self.model.encode(texts, show_progress_bar=True).tolist()

    def embed_query(self, text):
        """Embed a single text and return its vector as a list of floats."""
        return self.model.encode([text])[0].tolist()

In [17]:
# Instantiate the embedding wrapper with a multilingual paraphrase model.
embedding_model = SentenceTransformerWrapper(
    model_name="paraphrase-multilingual-mpnet-base-v2"
)

In [18]:
# Build the FAISS index over the flat list of codes.
vector_store = FAISS.from_texts(texts=temp, embedding=embedding_model)

Batches:   0%|          | 0/95 [00:00<?, ?it/s]

Batches: 100%|██████████| 95/95 [00:02<00:00, 34.11it/s]


In [19]:
from neo4j import GraphDatabase
# Connect using the URI from the configuration cell rather than a second,
# hard-coded copy of the same address.
driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))


In [32]:
def create_node(tx, node_type, node_name):
    """MERGE a node labelled ``node_type`` whose ``name`` is ``node_name``.

    Args:
        tx: Object whose ``run`` method executes the Cypher statement.
        node_type: Label placed directly into the query text (it is not
            passed as a query parameter).
        node_name: Value bound to the ``$name`` query parameter.
    """
    cypher = "MERGE (n:{} {{name: $name}})".format(node_type)
    tx.run(cypher, name=node_name)

def create_relationship(tx, source, target, relationship):
    """MERGE a directed ``relationship`` from ``source`` to ``target``.

    Both endpoints are matched by their ``name`` property only; no label is
    part of the match.

    Args:
        tx: Object whose ``run`` method executes the Cypher statement.
        source: ``name`` of the start node.
        target: ``name`` of the end node.
        relationship: Relationship type placed directly into the query text.
    """
    cypher = (
        "\n"
        "    MATCH (a {name: $source}), (b {name: $target})\n"
        f"    MERGE (a)-[:{relationship}]->(b)\n"
        "    "
    )
    tx.run(cypher, source=source, target=target)

In [53]:
# Write one Excerpt node per entry, one Code node per code, and a HAS_CODE
# relationship from each excerpt to each of its codes.
with driver.session() as session:
    # Session.write_transaction was renamed to execute_write in driver 5.0;
    # prefer the new name and fall back to the old one on older drivers.
    run_write = getattr(session, "execute_write", None) or session.write_transaction

    for excerpt_id, codes in codes_dict.items():
        excerpt_name = f"Excerpt {excerpt_id}"
        run_write(create_node, "Excerpt", excerpt_name)
        for code in codes:
            run_write(create_node, "Code", code)
            run_write(create_relationship, excerpt_name, code, "HAS_CODE")

In [None]:
def find_similar_codes(code, vector_store, k=5):
    """Return the texts of the ``k`` entries most similar to ``code``.

    Args:
        code: Query text.
        vector_store: Object providing ``similarity_search(query, k=...)``
            whose results expose a ``page_content`` attribute.
        k: Number of results to request (defaults to 5).

    Returns:
        A list with the ``page_content`` of each result, in the order the
        store returned them. The query text itself may be among them; callers
        that need to exclude it must filter.
    """
    matches = vector_store.similarity_search(code, k=k)
    return [match.page_content for match in matches]

In [None]:
def create_relationship_if_not_exists(tx, node1, node2, relationship_type):
    """Link two nodes unless they are already linked in either direction.

    Args:
        tx: Object whose ``run`` method executes the Cypher statement.
        node1: ``name`` of the first node.
        node2: ``name`` of the second node.
        relationship_type: Relationship type placed directly into the query
            text.
    """
    # An undirected MERGE pattern matches an existing relationship of this
    # type in either direction and creates one (left to right, node1 -> node2)
    # only when none exists. One statement replaces the former two existence
    # checks plus a separate CREATE.
    query = f"""
    MATCH (a {{name: $node1}}), (b {{name: $node2}})
    MERGE (a)-[:{relationship_type}]-(b)
    """
    tx.run(query, node1=node1, node2=node2)

In [None]:
# Connect each code to its nearest neighbours in the vector store with a
# SIMILAR_TO relationship, skipping pairs that are already linked.
with driver.session() as session:
    # Session.write_transaction was renamed to execute_write in driver 5.0;
    # prefer the new name and fall back to the old one on older drivers.
    run_write = getattr(session, "execute_write", None) or session.write_transaction

    # A code can occur in several excerpts. Visit each distinct code once, in
    # first-seen order: repeating the search and the write for a code that
    # was already handled adds nothing to the graph.
    unique_codes = dict.fromkeys(
        code for codes_1 in codes_dict.values() for code in codes_1
    )
    for code in unique_codes:
        for related_code in find_similar_codes(code, vector_store):
            if code != related_code:  # Avoid self-loops
                run_write(
                    create_relationship_if_not_exists, code, related_code, "SIMILAR_TO"
                )

In [39]:
# Connect each code to its nearest neighbours with a directed SIMILAR_TO
# relationship. create_relationship MERGEs a -> b, so a pair can end up linked
# in both directions when each code appears among the other's neighbours.
with driver.session() as session:
    # Session.write_transaction was renamed to execute_write in driver 5.0;
    # prefer the new name and fall back to the old one on older drivers.
    run_write = getattr(session, "execute_write", None) or session.write_transaction

    # A code can occur in several excerpts. Visit each distinct code once, in
    # first-seen order: the MERGE is idempotent, so repeats add nothing.
    unique_codes = dict.fromkeys(
        code for codes_1 in codes_dict.values() for code in codes_1
    )
    for code in unique_codes:
        for related_code in find_similar_codes(code, vector_store):
            if code != related_code:  # Avoid self-loops
                run_write(create_relationship, code, related_code, "SIMILAR_TO")