In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import string
import pickle

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA, LLMChain
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma


from DictaBERTEmbeddings import DictaBERTEmbeddings

In [None]:
# Notebook-wide configuration.
# Use the GPU when torch reports one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())
# Directory that holds the input pickle (and other data artifacts) for this notebook.
DATA_PATH = './data/'
# Do not commit a real key: take it from the environment when present and keep
# the original placeholder as the fallback.
# NOTE(review): the variable name OPENAI_API_KEY is assumed from the OpenAI
# imports in this notebook — confirm it matches how api_key is consumed.
api_key = os.environ.get("OPENAI_API_KEY", "YOUR-API-KEY")

In [None]:
# Read the pickled dataset stored under DATA_PATH into `data`
pickle_file = os.path.join(DATA_PATH, 'prepd_data.pkl')
data = pd.read_pickle(filepath_or_buffer=pickle_file)

In [None]:
# Wrap every row of `data` in a LangChain Document: the 'clean_combined'
# column becomes the text and 'book_id' is kept in the metadata under "id".
docs = [
    Document(
        page_content=record['clean_combined'],
        metadata={"id": record['book_id']},
    )
    for _row_label, record in data.iterrows()
]

In [None]:
def split_text_by_words(text, max_words, overlap_words):
    """Split ``text`` into chunks of at most ``max_words`` whitespace-separated words.

    Consecutive chunks start ``max_words - overlap_words`` words apart, so each
    chunk repeats the last ``overlap_words`` words of the previous one. Words are
    re-joined with single spaces, so the original whitespace is not preserved.

    Args:
        text: The string to split.
        max_words: Maximum number of words per chunk; must be positive.
        overlap_words: Number of words shared between neighbouring chunks; must
            be smaller than ``max_words``. A negative value makes the step
            larger than ``max_words``, i.e. words between chunks are skipped.

    Returns:
        A list of chunk strings in document order; empty if ``text`` contains
        no words.

    Raises:
        ValueError: If ``max_words`` is not positive, or if ``overlap_words`` is
            not smaller than ``max_words`` (the window would never advance).
    """
    if max_words <= 0:
        raise ValueError("max_words must be a positive integer")
    if overlap_words >= max_words:
        raise ValueError("overlap_words must be smaller than max_words")

    words = text.split()
    step = max_words - overlap_words

    chunks = []
    for start_idx in range(0, len(words), step):
        end_idx = start_idx + max_words
        # Slicing clamps end_idx to len(words), so the last chunk may be shorter.
        chunks.append(' '.join(words[start_idx:end_idx]))

        # Once a chunk reaches the end of the text, any further window would be
        # a pure suffix of it, so stop instead of emitting duplicate tails.
        if end_idx >= len(words):
            break

    return chunks


# Split every document into overlapping word-based chunks (at most 260 words
# per chunk, 35 words shared between neighbours) and wrap each chunk in its
# own Document.
chunks = []
for doc in docs:
    temp_chunks = split_text_by_words(
        doc.page_content, max_words=260, overlap_words=35
    )
    # Give each chunk its own copy of the metadata: passing doc.metadata
    # directly would make all chunks of a document share one dict, so a later
    # per-chunk metadata edit would silently change every sibling chunk.
    chunks.extend(
        Document(page_content=chunk, metadata=dict(doc.metadata))
        for chunk in temp_chunks
    )

In [None]:
# Report how many chunk Documents are in `chunks`
print(
    f"Number of chunks: {len(chunks)}"
)

In [None]:
# Create the DictaBERT embedding wrapper for the "dicta-il/dictabert" model name
embedding_model = DictaBERTEmbeddings(
    model_name="dicta-il/dictabert",
)

In [None]:
# Pickle the embedding wrapper into the data directory.
# Use the DATA_PATH variable here: the quoted string 'DATA_PATH' would point at
# a directory literally named "DATA_PATH" instead of the configured data folder.
EMB_PATH = os.path.join(DATA_PATH, 'embedding_dictaBERT.pkl')
with open(EMB_PATH, 'wb') as f:
    pickle.dump(embedding_model, f)

In [None]:
# Build a Chroma vector store from the chunk Documents, embedding them with
# `embedding_model`; "chroma_db_dicta_emb" is passed as the persist directory.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="chroma_db_dicta_emb",
)