In [3]:
import os 
os.chdir("../")

In [45]:
import os
import json
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document

def extract_all_strings(data, path=""):
    """
    Recursively extracts all string values from nested dictionaries/lists in the JSON structure.
    Returns a list of tuples (path, string_value).
    """
    results = []
    if isinstance(data, dict):
        for key, value in data.items():
            new_path = f"{path}.{key}" if path else key
            results.extend(extract_all_strings(value, new_path))
    elif isinstance(data, list):
        for idx, item in enumerate(data):
            new_path = f"{path}[{idx}]"
            results.extend(extract_all_strings(item, new_path))
    elif isinstance(data, str):
        results.append((path, data))
    elif isinstance(data, (str, int, float, bool)):
        # Convert everything to string, and strip if possible
        str_value = str(data).strip() if isinstance(data, str) else str(data)
        results.append((path, str_value))
    else:
        # Fallback for unexpected types: try to convert to string anyway
        try:
            string_value = str(data).strip()
            if string_value and "object at 0x" not in string_value:  # Avoid default Python object strings
                results.append((path, string_value))
        except Exception as e:
            pass  # Skip if it fails
    return results


def load_selected_documents() -> list[Document]:
    documents = []

    # --- Load PDF ---
    pdf_path = "Data/TGIM-1-4 Episodes .pdf"
    if os.path.exists(pdf_path):
        loader = PyPDFLoader(pdf_path)
        pdf_docs = loader.load()
        for i, doc in enumerate(pdf_docs):
            doc.metadata["name"] = f"TGIM Episodes PDF - Page {i+1}"
            documents.append(doc)
    else:
        print("⚠️ PDF not found")

    # --- Load JSON with full recursive scan ---
    json_path = "Data/able_data.json"
    if os.path.exists(json_path):
        with open(json_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)
            extracted = extract_all_strings(json_data)
            for i, (key_path, value) in enumerate(extracted):
                documents.append(
                    Document(
                        page_content=value.strip(),
                        metadata={
                            "name": f"Able JSON - Field {i+1}",
                            "source_path": key_path
                        }
                    )
                )
    else:
        print("⚠️ JSON not found")

    print(f"✅ Loaded {len(documents)} named documents from PDF + JSON")
    return documents


In [46]:
documents = load_selected_documents()

Ignoring wrong pointing object 494 0 (offset 0)


✅ Loaded 978 named documents from PDF + JSON


In [47]:
import fitz  # PyMuPDF

pdf_path = "Data/TGIM-1-4 Episodes .pdf"
doc = fitz.open(pdf_path)

# Try to access object 494 (zero-indexed)
try:
    obj = doc.xref_object(494, compressed=False)
    print(f"🔍 Object 494:\n{obj}")
except Exception as e:
    print(f"⚠️ Could not access object 494:\n{e}")


🔍 Object 494:
null


In [48]:
with open("Data/able_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)


In [49]:
any("494" in str(item) for item in data)


False

In [50]:
import os
import json
import re
import spacy
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.schema import Document

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Boilerplate phrases to remove
boilerplate_phrases = [
    "Let’s build together.",
    "Join our newsletter",
    "Follow us on LinkedIn"
]

# Clean basic formatting
def clean_text(text):
    text = text.strip()
    text = re.sub(r'\n+', '\n', text)  # collapse multiple newlines
    text = re.sub(r'https?://\S+', '', text)  # remove URLs
    text = re.sub(r'#\w+', '', text)  # remove hashtags
    text = re.sub(r'@\w+', '', text)  # remove mentions
    text = re.sub(r'[^A-Za-z0-9\s.,?!\'\"():;%&\-]', '', text)  # remove unwanted characters
    text = re.sub(r'\s{2,}', ' ', text)  # collapse multiple spaces
    return text

# Remove known repeated phrases
def remove_boilerplate(text):
    for phrase in boilerplate_phrases:
        text = text.replace(phrase, "")
    return text

# Keep only useful sentences using spaCy
def filter_useful_sentences(text, min_words=4):
    doc = nlp(text)
    useful = [sent.text.strip() for sent in doc.sents if len(sent.text.split()) >= min_words]
    return ' '.join(useful)

# Final text preprocessor pipeline
def preprocess_scraped_text(raw_text):
    clean = clean_text(raw_text)
    clean = remove_boilerplate(clean)
    return filter_useful_sentences(clean)

# Load both PDFs and JSONs from a directory
def load_json_and_pdf(data_dir: str) -> list[Document]:
    """
    Loads and preprocesses documents from both JSON and PDF files.

    Args:
        data_dir (str): Folder path containing .pdf and .json files

    Returns:
        list[Document]: Cleaned and combined LangChain documents
    """
    documents = []

    # Load PDFs
    pdf_loader = DirectoryLoader(data_dir, glob="*.pdf", loader_cls=PyPDFLoader)
    for doc in pdf_loader.load():
        clean_text_content = preprocess_scraped_text(doc.page_content)
        documents.append(Document(page_content=clean_text_content, metadata=doc.metadata))

    # Load JSONs
    for filename in os.listdir(data_dir):
        if filename.endswith(".json"):
            with open(os.path.join(data_dir, filename), "r", encoding="utf-8") as f:
                json_data = json.load(f)
                for entry in json_data:
                    if "text" in entry:
                        clean_text_content = preprocess_scraped_text(entry["text"])
                        documents.append(
                            Document(
                                page_content=clean_text_content,
                                metadata={"source": entry.get("url", filename)}
                            )
                        )

    print(f"✅ Loaded and cleaned {len(documents)} documents from '{data_dir}'")
    return documents


In [51]:
print(documents[0])

page_content='Speaker  1   (00:00)  You're  listening  to  TGAM.  The  podcast  that  celebrates  a  joy  of  work  innovation  in  the  exciting  
projects
 
that
 
drive
 
us
 
forward
 
here
 
and
 
evil
 
Eva
 
is
 
a
 
product
 
development
 
and
 
innovation
 
studio
 
for
 
private
 
equity
 
and
 
venture
 
back
 
to
 
companies
 
on
 
your
 
host
 
lauressia,
 
community
 
engagement
 
manager.
 
In
 
today,
 
we
 
have
 
Kyle
 
Anderson,
 
director
 
of
 
product
 
strategy
 
and
 
design
 
in
 
Joel
 
cook,
 
the
 
human
 
Central
 
design
 
lead.
 
  Speaker  1   (00:24)  okay,  great,  let's  get  started.  Can  you  introduce  yourselves  and  tell  us  a  bit  more  about  your  
level
 
that?
 
  Speaker  2   (00:32)  I'm  Joel  cook,  I  am  the  human  centre  designed  lead  at  evil,  I  joined  as  a  senior  product  designer  
and
 
I've
 
transitioned
 
over
 
into
 
this
 
new
 
rule.
 
I'm
 
really
 
excited
 
about
 
where
 
we
 
can
 
really
 
just
 
dig
 
d

In [52]:
def spacy_split_documents(documents, max_tokens=300):
    split_docs = []

    for doc in documents:
        cleaned_text = clean_text(doc.page_content)
        cleaned_text = remove_boilerplate(cleaned_text)
        spacy_doc = nlp(cleaned_text)

        current_chunk = ""
        current_tokens = 0

        for sent in spacy_doc.sents:
            sent_text = sent.text.strip()
            sent_tokens = len(sent)

            if current_tokens + sent_tokens > max_tokens:
                if current_chunk:
                    split_docs.append(Document(page_content=current_chunk.strip(), metadata=doc.metadata))
                current_chunk = sent_text
                current_tokens = sent_tokens
            else:
                current_chunk += " " + sent_text
                current_tokens += sent_tokens

        if current_chunk:
            split_docs.append(Document(page_content=current_chunk.strip(), metadata=doc.metadata))

    print(f"✅ Split into {len(split_docs)} sentence-based chunks")
    return split_docs


In [53]:
def spacy_split_documents(documents, max_tokens=300):
    split_docs = []

    for doc in documents:
        cleaned_text = clean_text(doc.page_content)
        cleaned_text = remove_boilerplate(cleaned_text)
        spacy_doc = nlp(cleaned_text)

        current_chunk = ""
        current_tokens = 0

        for sent in spacy_doc.sents:
            sent_text = sent.text.strip()
            sent_tokens = len(sent)

            if current_tokens + sent_tokens > max_tokens:
                if current_chunk:
                    split_docs.append(Document(page_content=current_chunk.strip(), metadata=doc.metadata))
                current_chunk = sent_text
                current_tokens = sent_tokens
            else:
                current_chunk += " " + sent_text
                current_tokens += sent_tokens

        if current_chunk:
            split_docs.append(Document(page_content=current_chunk.strip(), metadata=doc.metadata))

    print(f"✅ Split into {len(split_docs)} sentence-based chunks")
    return split_docs


In [54]:
# Assuming you already have the `documents` list from load_json_and_pdf()
all_sentences = []
for doc in documents:
    spacy_doc = nlp(doc.page_content)
    all_sentences.extend([sent.text.strip() for sent in spacy_doc.sents])

# Now chunk those
chunks = chunk_sentences(all_sentences, max_tokens=40)

# Show chunks
print(f"\nTotal chunks: {len(chunks)}")
for i, chunk in enumerate(chunks, start=1):
    print(f"\n🧠 Chunk {i}:\n{chunk}")


🧠 Chunked into 1175 chunks

Total chunks: 1175

🧠 Chunk 1:
Speaker  1   (00:00) You're  listening  to  TGAM.

🧠 Chunk 2:
The  podcast  that  celebrates  a  joy  of  work  innovation  in  the  exciting  
projects
 
that
 
drive
 
us
 
forward
 
here
 
and
 
evil
 
Eva
 
is
 
a
 
product
 
development
 
and
 
innovation
 
studio
 
for
 
private
 
equity
 
and
 
venture
 
back
 
to
 
companies
 
on
 
your
 
host
 
lauressia,
 
community
 
engagement
 
manager.

🧠 Chunk 3:
In
 
today,
 
we
 
have
 
Kyle
 
Anderson,
 
director
 
of
 
product
 
strategy
 
and
 
design
 
in
 
Joel
 
cook,
 
the
 
human
 
Central
 
design
 
lead.

🧠 Chunk 4:
Speaker  1   (00:24) okay,  great,  let's  get  started.

🧠 Chunk 5:
Can  you  introduce  yourselves  and  tell  us  a  bit  more  about  your  
level
 
that? Speaker  2   (00:32)

🧠 Chunk 6:
I'm  Joel  cook,  I  am  the  human  centre  designed  lead  at  evil,  I  joined  as  a  senior  product  designer  
and
 
I've
 
transitioned
 
over
 
into
 
this
 

In [14]:
from langchain.embeddings import HuggingFaceEmbeddings
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
    return embeddings


In [15]:
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [16]:
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

In [17]:

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key= PINECONE_API_KEY)
index_name = "ablechatbot"

pc.create_index(
    name=index_name,
    dimension=768, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

{
    "name": "ablechatbot",
    "metric": "cosine",
    "host": "ablechatbot-ahm1pur.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null
}

In [20]:
import os 
os.environ["PINECONE_API_KEY"]=PINECONE_API_KEY


In [22]:
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore


# Wrap each chunk string in a Document
document_chunks = [Document(page_content=chunk) for chunk in chunks]

# Push to Pinecone
docsearch = PineconeVectorStore.from_documents(
    documents=document_chunks,
    index_name=index_name,
    embedding=embeddings,
)


In [23]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [24]:
docsearch


<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1869fbd15d0>

In [25]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":2})

In [26]:
retrieved_docs = retriever.invoke("What is Able?")

In [27]:
retrieved_docs

[Document(id='45e6327b-7ca4-4a13-959e-b4485be8609e', metadata={}, page_content='Able is a go-to-market cheat code for early founder teams. Without them, we couldn’t have moved this fast (and with such high quality). Will Hudgins, VP of Engineering Syntax'),
 Document(id='b2ae7e66-366a-42ad-989e-5d7f34192463', metadata={}, page_content='At Able, we strive to infuse that same enthusiasm into our product-building process. We balance metrics with user insights to solve the right problems and deliver exceptional experiences.')]

In [28]:
from langchain_openai import OpenAI
llm = OpenAI(temperature=0.4, max_tokens=500)

In [29]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [30]:
system_prompt = (
    "You are a helpful AI assistant trained on internal company knowledge from Able.co, "
    "covering its services, projects, team culture, and technology practices. "
    "Answer each question using only the provided context below. "
    "If the answer is not available, say: 'I’m not sure based on the available information.' "
    "Keep your responses concise, clear, and no longer than 3 sentences."
    "\n\nContext:\n{context}"
)


In [31]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [32]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [33]:
response = rag_chain.invoke({"input": "What did Able build for Syntax??"})
print(response["answer"])



Able built a customer-ready, commercially viable product for Syntax as their product development partner.


In [34]:
response = rag_chain.invoke({"input": "How does Able ensure secure AI development?"})
print(response["answer"])



Able prioritizes secure AI development by embedding emerging best practices and state-of-art techniques throughout their ideation and development processes. This ensures that security, transparency, and governance are maintained throughout the development of AI solutions.


In [35]:
response = rag_chain.invoke({"input": "what is Able’s Unique Capability ?"})
print(response["answer"])



Able's unique capability is its proven expertise in 0-1 product development, with over a decade of experience. This makes them a go-to-market cheat code for early founder teams, allowing them to move quickly and produce high-quality results.


In [38]:
response = rag_chain.invoke({"input": "who is ceo of able?"})
print(response["answer"])



AI: The CEO of Able is Andy McKinney.


In [42]:
response = rag_chain.invoke({"input": " able in agri tech ?"})
print(response["answer"])




System: Yes, Able.co is discussing with agtech companies about software engineering, AI enablement, digital product strategies, and bringing the right talent and tech to build the future of farming.


In [40]:
test_questions = [
    {"question": "What did Able build for Syntax?", "expected": "Syntax and Able accelerated the zero-to-market journey using AI."},
    {"question": "What is Able’s approach to security?", "expected": "They prioritize secure, transparent, and governed AI development."},
]

for i, test in enumerate(test_questions):
    response = rag_chain.invoke({"input": test["question"]})
    print(f"Q{i+1}: {test['question']}")
    print(f"Expected: {test['expected']}")
    print(f"Actual: {response['answer']}")
    print("-" * 50)


Q1: What did Able build for Syntax?
Expected: Syntax and Able accelerated the zero-to-market journey using AI.
Actual: 

Able built a customer-ready, commercially viable product for Syntax as their product development partner.
--------------------------------------------------
Q2: What is Able’s approach to security?
Expected: They prioritize secure, transparent, and governed AI development.
Actual: 

Able prioritizes security in all aspects of our development process, with a focus on implementing best practices and regularly conducting security audits. Our team is trained and experienced in handling sensitive data and ensuring the security of our products.
--------------------------------------------------


In [41]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # same as you use

def similarity_score(answer, expected):
    embeddings = model.encode([answer, expected], convert_to_tensor=True)
    score = util.pytorch_cos_sim(embeddings[0], embeddings[1])
    return score.item()

for test in test_questions:
    response = rag_chain.invoke({"input": test["question"]})
    score = similarity_score(response["answer"], test["expected"])
    print(f"Q: {test['question']}\nScore: {score:.3f}")


Q: What did Able build for Syntax?
Score: 0.433
Q: What is Able’s approach to security?
Score: 0.471
