In [73]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,build_text_unit_context,LocalContextBuilder
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

In [74]:
#INPUT_DIR = "./inputs/operation dulce"
INPUT_DIR = "/home/cip/ce/ix05ogym/Majid/LLM/GraphRag/elec_graph/output"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

In [75]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 2229


Unnamed: 0,level,title,type,description,source_id,degree,human_readable_id,id,graph_embedding,community,size,entity_type,top_level_node_id,x,y
0,0,MIKE JUDD,PERSON,"Mike Judd is an author of the book ""Soldering ...",e82f9f1a4a5f409251687ca54991574a,2,0,fb1ad80c6e4842c2ac554e156ce593f5,,,,,fb1ad80c6e4842c2ac554e156ce593f5,,
1,0,KEITH BRINDLEY,PERSON,"Keith Brindley is an author of the book ""Solde...",e82f9f1a4a5f409251687ca54991574a,2,1,fbb428960ebc4729b5e2e1c4d0eaabfe,,,,,fbb428960ebc4729b5e2e1c4d0eaabfe,,
2,0,NEWNES,ORGANIZATION,"Newnes is the publisher of the book ""Soldering...",e82f9f1a4a5f409251687ca54991574a,9,2,7c9524a8d3a545768248a79e49a4d846,,,,,7c9524a8d3a545768248a79e49a4d846,,
3,0,BUTTERWORTH-HEINEMANN,ORGANIZATION,Butterworth-Heinemann is the parent company of...,e82f9f1a4a5f409251687ca54991574a,2,3,24c36d4df1d645dab616505a361365fd,,,,,24c36d4df1d645dab616505a361365fd,,
4,0,REED EDUCATIONAL AND PROFESSIONAL PUBLISHING LTD,ORGANIZATION,Reed Educational and Professional Publishing L...,e82f9f1a4a5f409251687ca54991574a,2,4,38041978d9114fc9842436b762783fae,,,,,38041978d9114fc9842436b762783fae,,


In [76]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 3179


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,MIKE JUDD,NEWNES,5.0,Mike Judd is an author of a book published by ...,[e82f9f1a4a5f409251687ca54991574a],2253aca4b1544f899ab80a6bcac6a6c4,0,2,9,11
1,MIKE JUDD,SOLDERING IN ELECTRONICS ASSEMBLY,5.0,"Mike Judd is an author of the book ""Soldering ...",[e82f9f1a4a5f409251687ca54991574a],dd2c46e8c5aa42888c4011c9490cc0ab,1,2,4,6
2,KEITH BRINDLEY,NEWNES,5.0,Keith Brindley is an author of a book publishe...,[e82f9f1a4a5f409251687ca54991574a],d9b99f755a494293ac9627a0a01b0b52,2,2,9,11
3,KEITH BRINDLEY,SOLDERING IN ELECTRONICS ASSEMBLY,5.0,"Keith Brindley is an author of the book ""Solde...",[e82f9f1a4a5f409251687ca54991574a],7ecd045f2385449fba6f653a0c0920a3,3,2,4,6
4,NEWNES,BUTTERWORTH-HEINEMANN,8.0,Newnes is an imprint of Butterworth-Heinemann,[e82f9f1a4a5f409251687ca54991574a],983ea8ae7ad643f98eea079002fadcf2,4,9,2,11


In [77]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
"""covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

claims = read_indexer_covariates(covariate_df)

print(f"Claim records: {len(claims)}")
covariates = {"claims": claims}"""

'covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")\n\nclaims = read_indexer_covariates(covariate_df)\n\nprint(f"Claim records: {len(claims)}")\ncovariates = {"claims": claims}'

In [78]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 73


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,58,# Soldering and Electronics Assembly\n\nThis c...,2,7.0,Soldering and Electronics Assembly,The impact severity rating is high due to the ...,This community revolves around the process of ...,[{'explanation': 'Soldering is the central ent...,"{\n ""title"": ""Soldering and Electronics Ass...",b35de978-ca01-44d7-a012-7b6526ae3412
1,59,# Flux and Solderability\n\nThis community rev...,2,3.0,Flux and Solderability,The impact severity rating is low due to the l...,This community revolves around the concept of ...,[{'explanation': 'Flux is a key entity in this...,"{\n ""title"": ""Flux and Solderability"",\n ...",287df284-3c47-4e78-8024-7bfed14af4ee
2,60,# Assembly and its Associated Faults\n\nThis c...,2,7.0,Assembly and its Associated Faults,The impact severity rating is high due to the ...,This community revolves around the concept of ...,[{'explanation': 'Assembly is a fundamental pr...,"{\n ""title"": ""Assembly and its Associated F...",797c3220-46e5-4c6f-9a3a-83da48391eb3
3,61,# Component Assembly and Manufacturing\n\nThis...,2,7.0,Component Assembly and Manufacturing,The impact severity rating is high due to the ...,This community focuses on the assembly and man...,[{'explanation': 'Components are the fundament...,"{\n ""title"": ""Component Assembly and Manufa...",65dff29b-096d-4a55-aaef-66de20e2de64
4,62,# Electrical Connections and Circuits\n\nThis ...,2,3.0,Electrical Connections and Circuits,The impact severity rating is low due to the t...,This community revolves around the concept of ...,[{'explanation': 'Soldering is a crucial techn...,"{\n ""title"": ""Electrical Connections and Ci...",d294c6d5-ba91-42de-a01e-1aa6c4b21978


In [79]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()


Text unit records: 28


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,e82f9f1a4a5f409251687ca54991574a,"6G a®*: s +. i 2 a <, s e Ged SECOND EDITION\n...",791,[soldering.pdf_0],"[fb1ad80c6e4842c2ac554e156ce593f5, fbb428960eb...","[2253aca4b1544f899ab80a6bcac6a6c4, dd2c46e8c5a..."
1,e79b884d7685eec5915f598876065fa0,Foreword\n\nPreface\n\nAcknowledgements\n\n1 S...,830,[soldering.pdf_1],"[3bfa928fa72346a9bd36df9d18e29d46, e40202c25c8...","[18a22fbd59984239829041aaa1f09ce5, 79bcefb0932..."
2,8283f5a9bf11a066788202bdc6651675,TIME ON ITS SIDE 3\n\nmake up the circuit of a...,796,[soldering.pdf_10],"[3bfa928fa72346a9bd36df9d18e29d46, e40202c25c8...","[99835832c2f3439880b49c6c38bf68d3, 0ee567e423b..."
3,6927c3447dc3d9eeaa6e89e04ae5a84b,PRINTED CIRCUIT BOARD 5 Printed circuit board ...,773,[soldering.pdf_11],"[3bfa928fa72346a9bd36df9d18e29d46, e40202c25c8...","[18a22fbd59984239829041aaa1f09ce5, c17ec52ee3b..."
4,a666a3f859c5aa120085c1c8e136a832,6 SOLDERING PROCESS\n\nHole through board Comp...,634,[soldering.pdf_12],"[3bfa928fa72346a9bd36df9d18e29d46, 3b642c945e2...","[05f6641eec57433c913257e8a092a183, 70aa84654a9..."


In [80]:

import litellm
#api_key = os.environ["GRAPHRAG_API_KEY"]
#llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
#embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

BASE_URL = "http://localhost:8080/"
MODEL_NAME = "gpt-4o"
litellm.set_verbose = False

llm = ChatOpenAI(
    api_key="api_key",
    model=MODEL_NAME,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
    api_base=BASE_URL,
    
)
token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key="api_key",
    api_base=BASE_URL,
    api_type=OpenaiApiType.OpenAI,
    model='text',
    deployment_name='text',
    max_retries=20,
)


In [81]:
entities[:10]

[Entity(id='c9caaab7681b47a0bae2a1aaf695be2c', short_id='489', title='186 SC SOLDERING PROCESSES', type='PROCESS', description='A soldering process that uses a hot gas or air machine, typically used in rework or individual component placement', description_embedding=[-0.0075237914, -0.01027315, -0.0015119776, 0.010034217, 0.023496395, -0.037361916, -0.022355689, 0.06416811, -0.027110195, -0.014773703, -0.035126936, 0.010744071, 0.017751874, -0.0045348313, -0.07585211, 0.022411436, 0.06338846, -0.037610248, -0.059380427, -0.03047326, -0.028546648, 0.018513015, -0.04821185, -0.024791863, -0.06670405, -0.031593002, -0.003413394, -0.078411296, -0.034091175, -0.011972618, -0.0058803298, 0.023806598, -0.008523792, 0.020857617, 0.005668695, 0.029570563, -0.010683148, 0.011566746, 0.012391507, -0.038378965, -0.04134887, -0.010391751, -0.044730514, -0.01501928, 0.0018924002, -0.020704102, -0.008398382, 0.042849496, -0.073535174, 0.011375849, 0.027243104, 0.030515704, -0.01112724, 0.02405667, 0.

In [82]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    #text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    #covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,

)

In [83]:
print(context_builder.build_context("flux")[0])

-----Reports-----
id|title|content
58|Soldering and Electronics Assembly|"# Soldering and Electronics Assembly

This community revolves around the process of soldering, which is used to join metal parts together, particularly in the context of electronics assembly. The community includes various entities related to soldering, such as solder, flux, solder pots and pumps, and solder pot heaters. It also encompasses broader concepts like safety, cleanliness, and assembly variations. The community's focus on electronics assembly highlights its importance in the manufacturing of electronic devices.

## Soldering as a core process

Soldering is the central entity in this community, serving as the primary process for joining metal parts together, particularly in electronics assembly. This process involves melting and flowing solder into the joint between two metal parts, creating a strong and reliable bond. [Data: Entities (13); Relationships (58, 71, 175, 74, 86, +more)]

## Importance of so

In [10]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [20]:
from graphrag.query.structured_search.local_search.system_prompt import (
    LOCAL_SEARCH_SYSTEM_PROMPT
)
print(LOCAL_SEARCH_SYSTEM_PROMPT)


---Role---

You are a helpful assistant responding to questions about data in the tables provided.


---Goal---

Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.

If you don't know the answer, just say so. Do not make anything up.

Points supported by data should list their data references as follows:

"This is an example sentence supported by multiple data references [Data: <dataset name> (record ids); <dataset name> (record ids)]."

Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.

For example:

"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16), Reports (1), Entities (5, 7); Relationships (23); Claims (2, 7, 34, 46, 64, +more)]."

whe

In [84]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [85]:
import chromadb
client = chromadb.PersistentClient('chroma_docs')
col_questions = client.get_or_create_collection('questions')
all_json = col_questions.get()['documents']
import json_repair

all_questions =[]
for j in all_json:
    all_questions.extend( json_repair.loads(j) )

all_questions

[{'Answer': 'I apologize, but the provided text does not specify the essential materials used in the soldering process. It mainly discusses the overall process, safety aspects, and the importance of solderability.',
  'Refrences': [],
  'Answerable': False,
  'Question': 'What are some of the essential materials used in the soldering process?',
  'Graph_Answer': "Solder and flux are crucial materials. Solder is the material that melts and flows into the joint, while flux removes oxides and contaminants from the surfaces, improving the solder's wetting and flow."},
 {'Answer': "The provided text focuses on various assembly variations and soldering processes used in electronics manufacturing, but it doesn't specifically mention how soldering contributes to assembly variations. The text describes different assembly methods for through-hole and surface mounted components, including CS (components before solder) and SC (solder before components) soldering processes.",
  'Refrences': [],
  '

In [86]:
question = all_questions[-1]['Question']
#question = "A soldering process that uses a hot?"
print(question)
result = await search_engine.asearch(question)#"describe soldering metrics")
print(result.response)

What are some industries where soldering is essential?
Soldering is a crucial process in many industries, particularly those involving the assembly of electronic devices. [Data: Entities (653, 646); Relationships (160, 161, 114)] It is a key process in the manufacture of electronic assemblies and products, contributing to the development of new technologies and advancements in the field. [Data: Entities (653, 646); Relationships (160, 161, 114)] Soldering is also used in other industries, such as plumbing, where it is necessary to join metal parts together. [Data: Entities (13)] 

### Soldering in Electronics Manufacturing

Soldering is a critical process in electronics manufacturing, as it ensures the electrical conductivity and mechanical strength of the connections. [Data: Entities (13)] The quality of the solder joint is crucial for the reliability and performance of electronic devices. [Data: Entities (13)] Soldering is used to connect electronic components to a printed circuit bo

In [93]:
from utils import answer_to_quesion,electronic_collection
print(question)
output = answer_to_quesion(electronic_collection,question)
print(output)

What are some industries where soldering is essential?
Soldering is a critical process in many industries, particularly those that involve the assembly of electronic devices. 

### Electronics Assembly 

Soldering is essential in the electronics assembly industry [Data: Source unit_text soldering.pdf_8]. It provides both mechanical and electrical support, allowing components to be joined together to form electrical connections. The widespread use of soldering in electronics assembly is due to its cost-effectiveness, ease of operation, and overall high performance compared to other methods [Data: Source unit_text soldering.pdf_8].

### Plumbing

The word "plumbing" itself originates from the Latin "plumbum," meaning lead, highlighting the historical use of soldering in plumbing systems [Data: Source unit_text soldering.pdf_9].  While lead plumbing is no longer common due to health concerns, soldering is still essential in modern plumbing, often using alloys of tin and lead [Data: Source

In [14]:
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,191,SOLDERING TECHNOLOGY,Soldering technology refers to the techniques ...,3,True
1,13,SOLDERING,Soldering is a process that joins metal parts ...,120,True
2,486,SOLDERING ASSEMBLY,Soldering assembly is a key process in the ele...,3,True
3,46,HAND SOLDERING,Hand soldering is a manual process of joining ...,2,True
4,426,MACHINE SOLDERING,Machine soldering is a method of soldering tha...,3,True


In [15]:
result.context_data["relationships"].head()

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,129,SOLDERING,SOLDERING TECHNOLOGY,Soldering is a process that is part of solderi...,8.0,123,1,True
1,176,SOLDERING,ELECTRICAL CONNECTION,Soldering is used to create electrical connect...,9.0,123,1,True
2,76,SOLDERING,HAND SOLDERING,Hand soldering is a type of soldering,5.0,122,1,True
3,175,SOLDERING,COMPONENT,Soldering is used to bond components together ...,9.0,137,2,True
4,62,SOLDERING,QUALITY,Soldering quality is a crucial aspect of elect...,21.0,129,2,True


In [16]:
result.context_data["reports"].head()

Unnamed: 0,id,title,content
0,58,Soldering and Electronics Assembly,# Soldering and Electronics Assembly\n\nThis c...
1,15,Electronics Assemblies and Soldering Processes,# Electronics Assemblies and Soldering Process...


In [17]:
result.context_data["sources"].head()

KeyError: 'sources'

In [18]:
print(result.context_text)

-----Reports-----
id|title|content
58|Soldering and Electronics Assembly|"# Soldering and Electronics Assembly

This community revolves around the process of soldering, which is used to join metal parts together, particularly in the context of electronics assembly. The community includes various entities related to soldering, such as solder, flux, solder pots and pumps, and solder pot heaters. It also encompasses broader concepts like safety, cleanliness, and assembly variations. The community's focus on electronics assembly highlights its importance in the manufacturing of electronic devices.

## Soldering as a core process

Soldering is the central entity in this community, serving as the primary process for joining metal parts together, particularly in electronics assembly. This process involves melting and flowing solder into the joint between two metal parts, creating a strong and reliable bond. [Data: Entities (13); Relationships (58, 71, 175, 74, 86, +more)]

## Importance of so

In [88]:
print(result.context_data['sources'])

KeyError: 'sources'

In [20]:
from graphrag.query.question_gen.local_gen import LocalQuestionGen

question_generator = LocalQuestionGen(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
)

In [21]:
question_history = [
    "Tell me about soldering process",
    "What happens in cleaning soldering?",
]
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)

[{'role': 'system', 'content': '\n---Role---\n\nYou are a helpful assistant generating a bulleted list of 5 questions about data in the tables provided.\n\n\n---Data tables---\n\n-----Conversation History-----\nturn|content\nuser|Tell me about soldering process\n\n\n-----Reports-----\nid|title|content\n15|Electronics Assemblies and Soldering Processes|"# Electronics Assemblies and Soldering Processes\n\nThis community revolves around the creation of electronics assemblies, which are finished products made by soldering electronic components to a printed circuit board. The community includes various soldering processes, such as component/solder (CS) processes, solder/component (SC) processes, and inert atmosphere soldering, all of which are used to create different types of joints. The community also includes surface mounted components, which are used in electronics assemblies and are often soldered to the board using surface mount soldering processes.\n\n## Electronics Assemblies as the