In [28]:
from langchain_community.document_loaders import TextLoader
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
import pandas as pd
import os
from langchain.docstore.document import Document
from transformers import AutoTokenizer, AutoModel
import torch
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import numpy as np
import pickle
from langchain.vectorstores import FAISS
from langchain.docstore.in_memory import InMemoryDocstore
import faiss

In [12]:
# Load data
pickle_file = os.path.join('/home/noaoh/chatDBG/RAG', 'first_data_batch.pkl')
data = pd.read_pickle(pickle_file)
# file_path = "bg_arc_output.xlsx"
# docs = pd.read_excel(file_path)

In [None]:
data.head()

In [14]:
# Convert dataframe rows to LangChain Documents
docs = [
    Document(page_content=row['combined'], metadata={"id": idx})
    for idx, row in data.iterrows()
]

In [15]:
# Initialize a text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,  # Maximum tokens per chunk
    chunk_overlap=50,  # Overlap between chunks to preserve context
)

# Split documents into chunks
chunked_docs = []
for doc in docs:
    chunks = text_splitter.split_text(doc.page_content)
    # Create new Document objects for each chunk and preserve metadata
    chunked_docs.extend([Document(page_content=chunk, metadata=doc.metadata) for chunk in chunks])

In [18]:
# Initialize the embeddings model
# class HuggingFaceEmbeddings:
#     def __init__(self, model_name):
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
#         self.model = AutoModel.from_pretrained(model_name)

#     def embed_text(self, text):
#         inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
#         with torch.no_grad():
#             outputs = self.model(**inputs)
#         return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# # Use AlephBERT
# hf_model = HuggingFaceEmbeddings("onlplab/alephbert-base")
# "Davlan/xlm-roberta-large-finetuned-hebrew" -- another option


# class DictaBERTEmbeddings:
#     def __init__(self, model_name="dicta-il/dictabert"):
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
#         self.model = AutoModel.from_pretrained(model_name)

#     def embed_text(self, text):
#         # Add explicit max_length and truncation
#         inputs = self.tokenizer(
#             text,
#             return_tensors="pt",
#             padding=True,
#             truncation=True,
#             max_length=512  # Explicitly set the max token length
#         )
#         with torch.no_grad():
#             outputs = self.model(**inputs)
#         # Use the mean pooling of the last hidden state for embeddings
#         return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# # Initialize DictaBERT embeddings
# dicta_emb_model = DictaBERTEmbeddings(model_name="dicta-il/dictabert")

# # Generate embeddings for all documents
# def embed_documents_with_dicta(documents, model):
#     return [model.embed_text(doc.page_content) for doc in documents]


class DictaBERTEmbeddings:
    def __init__(self, model_name="dicta-il/dictabert"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def embed_text(self, text_batch):
        inputs = self.tokenizer(
            text_batch, return_tensors="pt", padding=True, truncation=True, max_length=512
        ).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

# Initialize the embedding model
dicta_model = DictaBERTEmbeddings()

# Generate embeddings in batches
def embed_documents_in_batches(documents, model, batch_size=16):
    embeddings = []
    for i in range(0, len(documents), batch_size):
        batch_docs = documents[i:i + batch_size]
        batch_texts = [doc.page_content for doc in batch_docs]
        batch_embeddings = model.embed_text(batch_texts)
        embeddings.extend(batch_embeddings)
    return np.array(embeddings)

Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Generate embeddings
embeddings = embed_documents_in_batches(chunked_docs, dicta_model, batch_size=16)

In [21]:
# Save embeddings to a file
with open("embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)

print("Embeddings saved to embeddings.pkl")

Embeddings saved to embeddings.pkl


In [None]:
with open("embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

print("Embeddings loaded from embeddings.pkl")

In [50]:
# Step 1: Pair texts with their embeddings
text_embedding_pairs = [(doc.page_content, embedding) for doc, embedding in zip(chunked_docs, embeddings)]

# # Step 2: Create an in-memory document store
# docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(chunked_docs)})

# # Step 3: Create FAISS index from text and embeddings
# faiss_index = FAISS.from_embeddings(
#     text_embeddings=text_embedding_pairs,  # Pair texts with their embeddings
#     embedding=dicta_model.embed_text,     # Embedding function for queries
#     docstore=docstore,                    # Document store
#     index_to_docstore_id={i: str(i) for i in range(len(chunked_docs))}  # Mapping
# )

# # Step 4: Save the FAISS index
# faiss_index.save_local("faiss_index")

docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(chunked_docs)})
faiss_index = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, embedding) for doc, embedding in zip(chunked_docs, embeddings)],
    embedding=dicta_model.embed_text,
    docstore=docstore,
    index_to_docstore_id={i: str(i) for i in range(len(chunked_docs))}
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [51]:
# Load the FAISS index
faiss_index = FAISS.load_local(
    folder_path="faiss_index", 
    embeddings=dicta_model.embed_text,  # Pass the embedding function
    allow_dangerous_deserialization=True  # Enable this to load the pickle
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [55]:
query = "מה יש בנגב?"

query_embedding = dicta_model.embed_text(query)

# בצע חיפוש ישיר עם FAISS
distances, indices = faiss_index.index.search(query_embedding, k=5)  # 5 מסמכים רלוונטיים

# מיפוי התוצאות למסמכים
relevant_docs = [chunked_docs[i] for i in indices[0] if i != -1]  # התעלמות מתוצאות ריקות

# הדפסת המסמכים הרלוונטיים
for doc in relevant_docs:
    print(f"Document content: {doc.page_content}")


Document content: דבר קשה ואין דבר גדול וחיוני מאשר הפרחת המרחבים האלה, וחברי יטבתה יכולים לספר לכם מה שנעשה שם ב-6 השנים האחרונות. אבל יטבתה אינה מדבר טפוסי, זהו אואזיס. שם יש כל הנתונים למשק פורח חלוצי - לא אומר כמו דגניה, אבל לא פחות מאשר בתל-יוסף. אבל ראיתי מה שעשו צעירים אחרים במדבר ממש, שלא היתה בה אף טפת מים, אף סימן של ירק, אף עץ אחד. רק שממה ערה וערומה. ועכשיו יש שם מרעה, יש שם נטיעות, יש שם לולים, ועדר צאן וסוסים - וגם מלאכת בחשבת של שטיחים. כי הדרום והנגב לא יבנו על חקלאות בלבד, אלא רק על משק מעורב, על חקלאות,
Document content: נדמה לי שאנחנו עושים הסיסמאות שלנו פלסתר, אנחנו קוראים לצעירים מן העיר שילכו אל הכפר, יש צעירים שנשמעים והולכים, אבל מה היא הדוגמה שאנחנו מראים להם, כשהמרכז החקלאי יושב בתל אביב, מה הוא עושה בתל אביב? יהיה איש אחד בתל אביב - גם תל אביב זה משהו, יש בה "רבבות אדם ובהמה רבה" כמו שכתוב ביונה, יש גם משקים סביב תל אביב ונחוץ שגם פה יהיה למי לפנות, נחוץ שגם בירושלים יהיה למי לפנות, אבל מדוע לא יוכלו אנשי הגליל לפנות למוסד בנצרת, ואנשי הדרום לפנות לאיש בבאר ש

In [None]:
# Define a simple prompt for the RAG chain
prompt_template = """
Use the following documents to answer the question.
If you don't know the answer, say "I don't know."

{context}

Question: {question}
Answer:
"""
retrieval_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [20]:
# Split text into chunks 
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)
# Define the embedding model
embeddings = MistralAIEmbeddings(model="mistral-embed", mistral_api_key=api_key)
# Create the vector store 
vector = FAISS.from_documents(documents, embeddings)
# Define a retriever interface
retriever = vector.as_retriever()
# Define LLM
model = ChatMistralAI(mistral_api_key=api_key)
# Define prompt template
prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}""")

# Create a retrieval chain to answer questions
document_chain = create_stuff_documents_chain(model, prompt)
retrieval_chain = create_retrieval_chain(retriever, document_chain)
response = retrieval_chain.invoke({"input": "What were the two main things the author worked on before college?"})
print(response["answer"])

AttributeError: 'str' object has no attribute 'page_content'