In [8]:
from docx import Document
import os
from pprint import pprint

In [9]:
doc_dir_path = 'docs/'
os.listdir(doc_dir_path)

['credit card.docx',
 'Current Account.docx',
 'Derivada Account.docx',
 'Direct Debit.docx',
 'e-account.docx',
 'Funds.docx',
 'Guarantees.docx',
 'Home Account.docx',
 'Jonior Account.docx',
 'loans.docx',
 'Long-term deposits.docx',
 'Medium-term deposits.docx',
 'Mortgage.docx',
 'Más particular Account.docx',
 'Particular Account.docx',
 'particular Plus Account.docx',
 'Payroll Account.docx',
 'Payroll.docx',
 'Pensions.docx',
 'Saving account.docx',
 'Securities.docx',
 'Short-Term Deposits.docx',
 'Taxes.docx']

In [16]:
import re


def clean_text(text):
    # Replace two or more consecutive empty lines with a single empty line
    cleaned_text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
    return cleaned_text


def word_wrap(text, width=80):
    """
    Wraps the input text to fit within a specified width.

    :param text: The text to be wrapped.
    :param width: The maximum width of each line. Default is 80 characters.
    :return: The wrapped text.
    """
    wrapped_text = []
    words = text.split()
    current_line = []

    for word in words:
        # If adding the new word exceeds the width, start a new line
        if sum(len(w) + 1 for w in current_line) + len(word) > width:
            wrapped_text.append(' '.join(current_line))
            current_line = [word]
        else:
            current_line.append(word)

    # Add the last line
    if current_line:
        wrapped_text.append(' '.join(current_line))

    return '\n'.join(wrapped_text)

In [17]:
def read_docs(doc_dir_path):
    docs = []
    for filename in os.listdir(doc_dir_path):
        filepath = os.path.join(doc_dir_path, filename)
        content = ''
        if os.path.isfile(filepath):
            doc = Document(filepath)
            full_text = []
            for para in doc.paragraphs:
                full_text.append(para.text)

            content = '\n'.join(full_text)

        docs.append({'id': filename.replace('.docx', '').title(),
                    'content': clean_text(content)})
    return docs

In [18]:
res_docs = read_docs(doc_dir_path)
res_docs

[{'id': 'Credit Card',
  'content': '                                                                 Product Type: Credit Card\nProduct Name: Classic Credit Card\nProduct Description: The Classic Credit Card offers a range of benefits including the ability to make both local and international transactions, with a 100% credit limit available for cash withdrawals. Cardholders can enjoy a grace period of up to 56 days and access to supplementary cards. The card includes features such as contactless payment, installment options for purchases, and international usage after two months of issuance. Fees include issuance and renewal charges of EGP 250 each, with supplementary cards costing EGP 100. Interest rates are 4% per month, and penalties for delays or exceeding credit limits are EGP 75. Additional charges apply for cash withdrawals and transactions outside Egypt. The card also provides access to discounts and promotions and allows online and contactless purchases.\n\nProduct Type: Cred

In [21]:
print(res_docs[5]['content'])

Product Type: Funds
Product Name: Banque Misr First Mutual Fund - First Issuance - Quarterly Periodic Income
Product Description: The Banque Misr First Mutual Fund - First Issuance is designed to achieve and distribute quarterly investment returns through a diversified portfolio that includes listed shares, governmental and non-governmental bonds, and other financial instruments. Launched on February 1, 1995, with an initial fund size of EGP 300 million, the fund has grown over time. Subscriptions and redemptions can be made daily until 1 PM at Banque Misr branches. The fund's fees include management fees, performance fees, and a bank fee, with specific rates based on fund size and performance. The Net Asset Value (NAV) is published weekly in Al-Ahram Newspaper. The minimum subscription amount is one certificate, and there is a 0.75% fee for redemptions.

Product Type: Funds
Product Name: Banque Misr Mutual Fund - Second Issuance - Capital Growth
Product Description: The Banque Misr Mu

In [22]:
all_content = '\n\n'.join([doc['content'] for doc in res_docs])

In [23]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

In [24]:
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", ",", ";", " ", ""],
    chunk_size=1000,
    chunk_overlap=100
)

In [25]:
character_split_texts = character_splitter.split_text(
    text=all_content
)

In [26]:
print(character_split_texts[5])
print(f"\nTotal chunks: {len(character_split_texts)}")

Product Type: Credit Card
Product Name: Titanium Credit Card
Product Description:The Titanium Credit Card provides a range of benefits, including the ability to use the card for both local and international purchases, with cash withdrawals permitted up to 100% of the credit limit. Cardholders enjoy a grace period of up to 56 days, a low monthly payment requirement of 5%, and access to exclusive VIP lounges at select international airports. The card also supports online and contactless transactions. Key transaction limits include a daily cash withdrawal limit of EGP 30,000 within Egypt and EGP 3,000 internationally. The card incurs fees such as EGP 350 for issuance and renewal, a 4% monthly interest rate, and additional charges for exceeding credit limits or delays in payment.

Total chunks: 143


In [56]:
token_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=0,
    tokens_per_chunk=256
)



In [57]:
token_split_texts = []
for text in character_split_texts:
    token_split_texts += token_splitter.split_text(text)

In [58]:
print(token_split_texts[2])
print(f"\nTotal chunks: {len(token_split_texts)}")

product description : the gold credit card offers a wide range of benefits including the ability to use the card for both local and international transactions, with up to 100 % of the credit limit available for cash withdrawals. it features the longest grace period of up to 56 days, and cardholders can enjoy a low payment limit of 5 % of monthly usage. the card can be used online, and supplementary cards can be issued. it provides various transaction limits, including a maximum of 400, 000 egp per month for online purchases within egypt and 7, 500 egp for international transactions. cash withdrawals have daily and monthly limits, and transactions can be managed through bm atms, branches, and internet banking. the card also offers promotions, discounts, and rewards, including 10, 000 welcome points and a range of redemption options. key charges include a 250 egp issuance fee, renewal fees, and interest rates of 4 % monthly

Total chunks: 143


In [79]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

embedding_function = SentenceTransformerEmbeddingFunction()

In [80]:
print(embedding_function([token_split_texts[10]]))

[[0.044968701899051666, -0.041686687618494034, -0.0019362138118594885, -0.01619979552924633, -0.030518412590026855, -0.011449359357357025, 0.012699919752776623, 0.06593688577413559, 0.00037584188976325095, 0.016124987974762917, -0.0682782381772995, -0.06864910572767258, -0.032926127314567566, -0.008229885250329971, -0.04538830369710922, -0.01234001386910677, 0.03183666989207268, -0.029664721339941025, -0.046771541237831116, 0.00440929876640439, 0.046151742339134216, 0.021561261266469955, 0.03583572432398796, 0.007808781694620848, -0.040788501501083374, -0.06365097314119339, 0.018461717292666435, -0.0544910691678524, 0.0327330008149147, -0.01216596458107233, 0.05421872437000275, 0.04341695457696915, 0.06638655811548233, 0.03434853255748749, -0.01940423995256424, -0.026096457615494728, -0.04305668920278549, -0.013785967603325844, -0.027396531775593758, 0.0037273878697305918, 0.023166701197624207, 0.02667490765452385, -0.03650318831205368, 0.006177032832056284, 0.08655451238155365, -0.029

In [81]:
chroma_client = chromadb.Client()

chroma_collection = chroma_client.get_or_create_collection(
    name="banking-products-chroma",
    embedding_function=embedding_function
)

new_ids = [str(id_) for id_ in range(len(token_split_texts))]

chroma_collection.add(
    ids=new_ids,
    documents=token_split_texts
)
chroma_collection.count()

Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 15
Add of existing embedding ID: 16
Add of existing embedding ID: 17
Add of existing embedding ID: 18
Add of existing embedding ID: 19
Add of existing embedding ID: 20
Add of existing embedding ID: 21
Add of existing embedding ID: 22
Add of existing embedding ID: 23
Add of existing embedding ID: 24
Add of existing embedding ID: 25
Add of existing embedding ID: 26
Add of existing embedding ID: 27
Add of existing embedding ID: 28
Add of existing embedding ID: 29
Add of existing embe

143

In [62]:
import pickle

# Assuming 'token_split_texts' contains the documents and 'new_ids' contains the IDs

# Save the data to disk
with open('chroma_data/collection_data.pkl', 'wb') as file:
    pickle.dump({
        'ids': new_ids,
        'documents': token_split_texts
    }, file)

In [63]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import pickle


def load_chroma_collection():

    chroma_client = chromadb.Client()
    embedding_function = SentenceTransformerEmbeddingFunction()

    chroma_collection = chroma_client.get_or_create_collection(
        name="banking-products-chroma",
        embedding_function=embedding_function
    )

    with open('chroma_data/collection_data.pkl', 'rb') as file:
        data = pickle.load(file)

    # Extract IDs and documents
    new_ids = data['ids']
    token_split_texts = data['documents']

    chroma_collection.add(
        ids=new_ids,
        documents=token_split_texts
    )
    return chroma_collection

In [64]:
chroma_collection = load_chroma_collection()

Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 15
Add of existing embedding ID: 16
Add of existing embedding ID: 17
Add of existing embedding ID: 18
Add of existing embedding ID: 19
Add of existing embedding ID: 20
Add of existing embedding ID: 21
Add of existing embedding ID: 22
Add of existing embedding ID: 23
Add of existing embedding ID: 24
Add of existing embedding ID: 25
Add of existing embedding ID: 26
Add of existing embedding ID: 27
Add of existing embedding ID: 28
Add of existing embedding ID: 29
Add of existing embe

In [65]:
# query = '''
# What are the loan options available, and what are their interest rates?
# '''
query = '''
do you have Individual Mortgage for Clients on Payroll provides financing up to 80%?
'''

In [66]:

res_docs = chroma_collection.query(
    query_texts=[query],
    n_results=10
)['documents'][0]  # type: ignore

In [67]:
for res in res_docs:
    print(word_wrap(res))
    print('\n')

product type : mortgage product name : individual mortgage for clients on
payroll product description : the individual mortgage for clients on payroll
provides financing up to 80 % of the housing unit's value with a loan term
extending up to 20 years. it includes a life and total disability insurance
policy and features low administrative expenses. to qualify, monthly income must
be at least egp 3, 000, and the maximum monthly premium should not exceed 40 %
of the monthly income. the maximum age for loan maturation is 65 years, and the
loan must end before the termination of service. required documents include
various housing unit documents such as the registered deed title, building
license, and real estate tax certification, or equivalent documents for non -
registered units. client documents needed are a loan application, valid national
id, recent income certificate, utility bill, and a pledge from the employer to
transfer the salary.


product description : home mortgage accounts i

In [68]:
from sentence_transformers import CrossEncoder
import numpy as np
import warnings
warnings.filterwarnings('ignore')
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [69]:
pairs = [[query, doc] for doc in res_docs]
scores = cross_encoder.predict(pairs)
scores

array([  9.535616  ,  -4.97031   ,  -0.14170337,   4.8246384 ,
         3.616743  ,   2.3038297 ,  -8.782955  ,  -5.876096  ,
         0.6303551 , -10.775097  ], dtype=float32)

In [70]:
new_sorted_indexes = np.argsort(scores)[::-1]
new_sorted_indexes

array([0, 3, 4, 5, 8, 2, 1, 7, 6, 9], dtype=int64)

In [71]:
new_sorted_docs = np.array(res_docs)[new_sorted_indexes]
new_sorted_docs

array(["product type : mortgage product name : individual mortgage for clients on payroll product description : the individual mortgage for clients on payroll provides financing up to 80 % of the housing unit's value with a loan term extending up to 20 years. it includes a life and total disability insurance policy and features low administrative expenses. to qualify, monthly income must be at least egp 3, 000, and the maximum monthly premium should not exceed 40 % of the monthly income. the maximum age for loan maturation is 65 years, and the loan must end before the termination of service. required documents include various housing unit documents such as the registered deed title, building license, and real estate tax certification, or equivalent documents for non - registered units. client documents needed are a loan application, valid national id, recent income certificate, utility bill, and a pledge from the employer to transfer the salary.",
       "product type : mortgage produc

In [72]:
for doc in new_sorted_docs:
    print(word_wrap(doc))
    print('\n')

product type : mortgage product name : individual mortgage for clients on
payroll product description : the individual mortgage for clients on payroll
provides financing up to 80 % of the housing unit's value with a loan term
extending up to 20 years. it includes a life and total disability insurance
policy and features low administrative expenses. to qualify, monthly income must
be at least egp 3, 000, and the maximum monthly premium should not exceed 40 %
of the monthly income. the maximum age for loan maturation is 65 years, and the
loan must end before the termination of service. required documents include
various housing unit documents such as the registered deed title, building
license, and real estate tax certification, or equivalent documents for non -
registered units. client documents needed are a loan application, valid national
id, recent income certificate, utility bill, and a pledge from the employer to
transfer the salary.


product type : mortgage product name : individ

In [73]:
from sentence_transformers import CrossEncoder
import numpy as np
import warnings
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import pickle

warnings.filterwarnings('ignore')


class InfoRetrial:
    def __init__(self, chroma_data_path: str = 'chroma_data/collection_data.pkl') -> None:
        self.is_collection_loaded = False
        self.chroma_collection: chromadb.Collection | None = None
        self.cross_encoder = None
        self.chroma_data_path = chroma_data_path

    def load_chroma_collection(self):

        chroma_client = chromadb.Client()
        embedding_function = SentenceTransformerEmbeddingFunction()

        chroma_collection = chroma_client.get_or_create_collection(
            name="banking-products-chroma",
            embedding_function=embedding_function
        )

        with open(self.chroma_data_path, 'rb') as file:
            data = pickle.load(file)

        # Extract IDs and documents
        new_ids = data['ids']
        token_split_texts = data['documents']

        chroma_collection.add(
            ids=new_ids,
            documents=token_split_texts
        )

        self.chroma_collection = chroma_collection
        self.cross_encoder = CrossEncoder(
            'cross-encoder/ms-marco-MiniLM-L-6-v2')

        self.is_collection_loaded = True

    def query(self, query_texts: str, n_results: int = 10):
        if not self.is_collection_loaded:
            self.load_chroma_collection()

        res_docs = self.chroma_collection.query(  # type: ignore
            query_texts=[query_texts],
            n_results=n_results
        )['documents'][0]

        pairs = [[query, doc] for doc in res_docs]
        scores = self.cross_encoder.predict(pairs)   # type: ignore

        new_sorted_indexes = np.argsort(scores)[::-1]
        new_sorted_docs = np.array(res_docs)[new_sorted_indexes]

        return new_sorted_docs

In [74]:
info = InfoRetrial()
info.load_chroma_collection()

Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 15
Add of existing embedding ID: 16
Add of existing embedding ID: 17
Add of existing embedding ID: 18
Add of existing embedding ID: 19
Add of existing embedding ID: 20
Add of existing embedding ID: 21
Add of existing embedding ID: 22
Add of existing embedding ID: 23
Add of existing embedding ID: 24
Add of existing embedding ID: 25
Add of existing embedding ID: 26
Add of existing embedding ID: 27
Add of existing embedding ID: 28
Add of existing embedding ID: 29
Add of existing embe

In [77]:
query = '''
do you have Individual Mortgage for Clients on Payroll provides financing up to 80% ?
'''

In [78]:
info.query(query)

array(["product type : mortgage product name : individual mortgage for clients on payroll product description : the individual mortgage for clients on payroll provides financing up to 80 % of the housing unit's value with a loan term extending up to 20 years. it includes a life and total disability insurance policy and features low administrative expenses. to qualify, monthly income must be at least egp 3, 000, and the maximum monthly premium should not exceed 40 % of the monthly income. the maximum age for loan maturation is 65 years, and the loan must end before the termination of service. required documents include various housing unit documents such as the registered deed title, building license, and real estate tax certification, or equivalent documents for non - registered units. client documents needed are a loan application, valid national id, recent income certificate, utility bill, and a pledge from the employer to transfer the salary.",
       "product type : mortgage produc