In [5]:
# import pypdf
# print("pypdf version:", pypdf.__version__)

# import sys
# !{sys.executable} -m pip install tf-keras
import sys
!{sys.executable} -m pip install google-generativeai


Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.25.1-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.183.0-py3-none-any.whl.metadata (7.0 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting grpcio-status<2.0.0,>=1.33.2 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1->google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading grpcio_status-1.75.0-py3-none-any.whl.meta

In [1]:
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings  # <-- NEW

import os
import shutil
import glob

# Using local embeddings instead of OpenAI paid api
CHROMA_PATH = "chroma"
DATA_PATH = os.path.join("..", "..", "data", "rag_data")

def main():
    generate_data_store()

def generate_data_store():
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)

def load_documents():
    documents = []
    pdf_files = glob.glob(os.path.join(DATA_PATH, "*.pdf"))  # Load all PDFs
    for file_path in pdf_files:
        loader = PyPDFLoader(file_path)
        documents.extend(loader.load())
    print(f"Loaded {len(documents)} pages from {len(pdf_files)} PDF files.")
    return documents

def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks

def save_to_chroma(chunks: list[Document]):
    # Clear out the database first, but safely
    if os.path.exists(CHROMA_PATH):
        try:
            shutil.rmtree(CHROMA_PATH)
        except PermissionError:
            print("Chroma DB is locked or used by other kernel")
            return

    # Use free, local embedding model
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Create a new DB from the documents
    db = Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_PATH)
    db.persist()
    db = None  # Explicitly drop reference to release file handles
    print(f" Saved {len(chunks)} chunks to {CHROMA_PATH}.")


if __name__ == "__main__":
    main()


Loaded 13 pages from 1 PDF files.
Split 13 documents into 290 chunks.


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")



 Saved 290 chunks to chroma.


  db.persist()


In [2]:
import argparse
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import HuggingFaceHub  # free model from HuggingFace Hub

CHROMA_PATH = "chroma"

PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

def main():
    # CLI for passing question
    parser = argparse.ArgumentParser()
    parser.add_argument("query_text", type=str, help="The query text you want to ask.")
    args = parser.parse_args()
    query_text = args.query_text

    # Prepare the DB with local embeddings
    embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB (top 3 chunks)
    results = db.similarity_search_with_relevance_scores(query_text, k=3)
    if len(results) == 0 or results[0][1] < 0.3:
        print(f"Unable to find good matching results.")
        return

    # Prepare the context for the prompt
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _ in results])
    prompt = PromptTemplate.from_template(PROMPT_TEMPLATE)
    final_prompt = prompt.format(context=context_text, question=query_text)

    # Load a free HuggingFace LLM (e.g. flan-t5-base)
    llm = HuggingFaceHub(repo_id="google/flan-t5-base", model_kwargs={"temperature": 0, "max_length": 512})
    chain = LLMChain(llm=llm, prompt=prompt)

    response_text = chain.run({"context": context_text, "question": query_text})

    # Print results
    sources = [doc.metadata.get("source", None) for doc, _ in results]
    formatted_response = f"Response: {response_text}\n\nSources: {sources}"
    print(formatted_response)

if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] query_text
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [1]:
import json

sample_data = {
  "product_name": "Copper Wire",
  "process_route": "Mining → Smelting → Casting → Transport",
  
  "user_inputs": {
    "energy_source": "Electricity",
    "transport_mode": "Rail",
    "transport_distance_km": 407.64,
    "recycled_content_percent": 74.52,
    "location": "South America",
    "functional_unit": "1 kg Copper Wire",
    "raw_material_type": "Aluminium Scrap",
    "processing_method": "Conventional"
  },
  
  "ai_predictions": {
    "gwp_kg_co2_eq": 1082.34,
    "material_circularity_indicator": 0.51,
    "water_consumption_m3": 12.23,
    "end_of_life_recycling_rate_percent": 74.77,
    "energy_per_material_mj": 2.29,
    "total_air_emissions_kg": 39.36,
    "total_water_emissions_kg": 1.48,
    "circularity_score": 50.66,
    "potential_gwp_reduction_renewable_percent": 15.0,
    "potential_mci_improvement_recycling_percent": 10.0
  },
  
  "benchmarks": {
    "industry_average_gwp": 1200.0,
    "best_in_class_mci": 0.8,
    "sector_average_water_m3": 15.0
  }
}

# Save to a JSON file
with open("sample_context.json", "w") as f:
    json.dump(sample_data, f, indent=2)

print("✅ sample_context.json created successfully!")


✅ sample_context.json created successfully!
