<a href="https://colab.research.google.com/github/midhun-james/val-mod-with-gliner/blob/main/rag_gliner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
!pip install sentence-transformers faiss-cpu pymupdf gliner  -qq

In [22]:
!pip install textdistance

Collecting textdistance
  Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)
Downloading textdistance-4.6.3-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.6.3


In [26]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gzip
import json
from gliner import GLiNER
import re
from collections import defaultdict
import textdistance
g_model=GLiNER.from_pretrained("urchade/gliner_medium-v2.1")
patterns = {
    'email': r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b',
    'phone': r'\b(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{4}\b',
    'id': r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
}

labels = ["name of a person", "name of an organization","IT service"]
with gzip.open("faker_dataset_v3.json.gz", "rt", encoding="utf-8") as f:
    fake_list= json.load(f)
fake_data={}
for d in fake_list:
    fake_data.update(d)
forward_mapping=defaultdict(dict)
backward_mapping=defaultdict(dict)
entity_to_fake={}
used_fakes={"names": set(), "company": set()}
SIMILARITY_THRESHOLD=0.7
def chunk_text_no_split(text, max_len=390, overlap_words=3):
    """
    Chunk text into pieces of max_len, ensuring no word is split between chunks.
    Overlap is handled by number of words, not characters.
    Returns a list of (chunk, start_char_index) tuples.
    """
    words = re.findall(r'\S+', text)
    chunks = []
    start = 0
    while start < len(words):
        end = start
        char_count = 0
        chunk_words = []
        while end < len(words) and char_count + len(words[end]) + (1 if chunk_words else 0) <= max_len:
            chunk_words.append(words[end])
            char_count += len(words[end]) + (1 if chunk_words else 0)
            end += 1
        chunk_text = ' '.join(chunk_words)
        # Find the character index in the original text
        if chunk_words:
            first_word = chunk_words[0]
            start_char = text.find(first_word, 0 if not chunks else chunks[-1][1] + 1)
        else:
            start_char = 0
        chunks.append((chunk_text, start_char))
        if end == len(words):
            break
        # Overlap by words
        start = end - overlap_words if end - overlap_words > 0 else end
    return chunks
# --- Step 1: Prepare Documents ---
def extract_entities_no_spacy(text):
    regex_entities = []
    for label, pattern in patterns.items():
        for match in re.finditer(pattern, text):
            regex_entities.append({
                'text': match.group(),
                'start': match.start(),
                'end': match.end(),
                'label': label
            })
    # Directly use the full text for GLiNER
    chunks = chunk_text_no_split(text, max_len=384, overlap_words=3)
    all_entities = []
    for chunk_text_part, offset in chunks:
        ents = g_model.predict_entities(chunk_text_part, labels, threshold=0.5)
        # If you want to map positions, you can do so here, but since we're not filtering, just offset
        for ent in ents:
            all_entities.append({
                'text': ent['text'],
                'start': ent['start'] + offset,
                'end': ent['end'] + offset,
                'label': ent['label']
            })
    all_entities.sort(key=lambda x: x['start'])
    return all_entities
def normalize_value(value: str) -> str:
    # Convert to lowercase and remove extra spaces
    return " ".join(value.lower().split())
def find_existing_match(key, norm_value):
    """Find a previously stored value that is very similar to norm_value."""
    for existing in forward_mapping.get(key, {}):
        similarity = textdistance.jaro_winkler.normalized_similarity(existing, norm_value)
        if similarity >= SIMILARITY_THRESHOLD:
            return existing
    return None
def get_fake_value(label, real_value):
    # Map label to key
    if label.lower() == "name of a person":
        key = "names"
    elif label.lower() == "name of an organization":
        key = "company"
    else:
        return real_value  # No masking for unknown labels
    norm_value=normalize_value(real_value)
    pool = fake_data.get(key, [])
    forward_mapping.setdefault(key, {})
    backward_mapping.setdefault(key, {})
    used_fakes.setdefault(key, set())
    existing_match = find_existing_match(key, real_value)
    if existing_match:
        fake = forward_mapping[key][existing_match]
        # Map this variation to same fake value
        forward_mapping[key][real_value] = fake
        backward_mapping[key][fake] = existing_match  # Keep original reference
        return fake
    # Check if already mapped in forward_mapping
    if norm_value in forward_mapping[key]:
        return forward_mapping[key][norm_value]

    # Find unused fake value
    for fake in pool:
        if fake not in used_fakes[key]:
            used_fakes[key].add(fake)

            # Update forward and backward mappings
            forward_mapping[key][norm_value] = fake
            backward_mapping[key][fake] = real_value

            return fake

    return real_value
def mask_sentence(sentence):

        flat_map = {}
        for entity, value_map in forward_mapping.items():
            for original, fake in value_map.items():
                flat_map[original] = fake
        # Pre-lowercased lookup for fast replacement
        flat_map_lower = {k.lower(): v for k, v in flat_map.items()}
        # Identify which keys are present in the sentence (case-insensitive)
        sentence_lower = sentence.lower()
        matched_keys = [k for k in flat_map if k.lower() in sentence_lower]

        if matched_keys:
            # Sort matched keys by length (longest first) to avoid partial replacement
            matched_keys.sort(key=len, reverse=True)

            # Build regex pattern with alternation

            pattern = re.compile(
                r'(?<!\w)([\{\(\["\'\*\_]*?)(' +
                '|'.join(re.escape(k) for k in matched_keys) +
                r')([\}\)\]"\'\*\_]*?)(?!\w)',flags=re.IGNORECASE
            )
            def replace_match(match):
                prefix = match.group(1)  # e.g., '{' or '**'
                core = match.group(2)    # e.g., 'abc'
                suffix = match.group(3)  # e.g., '}' or '**'

                replaced = flat_map_lower.get(core.lower(), core)
                # print(f'{match.group(0)} => {prefix}{replaced}{suffix}')
                return f"{prefix}{replaced}{suffix}"
            sentence = pattern.sub(replace_match, sentence)

        return sentence
def unmask_summary(sentence):
        flat_map = {}
        for entity, value_map in backward_mapping.items():
            for original, fake in value_map.items():
                flat_map[original] = fake
        # Pre-lowercased lookup for fast replacement
        flat_map_lower = {}
        for fake,original in flat_map.items():
            fake_lower=fake.lower()
            flat_map_lower[fake_lower]=original

            core=re.sub(r'\b(co|llc|inc|group|international|corporation|ltd|)\.?$', '', fake_lower, flags=re.IGNORECASE).strip()
            if core and core!= fake_lower:
                flat_map_lower[core]=original
        # Identify which keys are present in the sentence (case-insensitive)
        sentence_lower = sentence.lower()
        matched_keys = [k for k in flat_map_lower if k in sentence_lower]

        if matched_keys:
            # Sort matched keys by length (longest first) to avoid partial replacement
            matched_keys.sort(key=len, reverse=True)

            # Build regex pattern with alternation
            pattern = re.compile(
                r'(?<!\w)([\{\(\["\'\*\_]*?)(' +
                '|'.join(re.escape(k) for k in matched_keys) +
                r')([\}\)\]"\'\*\_]*?)(?!\w)',flags=re.IGNORECASE
            )

            def replace_match(match):
                prefix = match.group(1)  # e.g., '{' or '**'
                core = match.group(2)    # e.g., 'abc'
                suffix = match.group(3)  # e.g., '}' or '**'

                replaced = flat_map_lower.get(core.lower(), core)
                # print(f'{match.group(0)} => {prefix}{replaced}{suffix}')
                return f"{prefix}{replaced}{suffix}"

            sentence = pattern.sub(replace_match, sentence)

        return sentence


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [13]:
import fitz  # PyMuPDF
import time

# --- Step 1: Extract text from PDF ---
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text() + "\n"
    return text

pdf_path = "test.pdf"
text = extract_text_from_pdf(pdf_path)

# --- Step 2: Split text into chunks ---
def chunk_text(text, chunk_size=500):
    words = text.split()
    chunks = [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks
start=time.time()
entities=extract_entities_no_spacy(text)
end=time.time()
print(f'time taken to extract entities is {end-start}')




Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


time taken to extract entities is 123.10105228424072


In [27]:
#example usage

for ent in entities:
  label=ent["label"].lower()
  if label in ["name of a person", "name of an organization"]:
    get_fake_value(label,ent["text"])
  print(f'entity: {ent["text"]}, label: {ent["label"]}')
mapping={
    "forward_mapping":forward_mapping,
    "backward_mapping":backward_mapping,

}

with open("descriptive_mapping_docs.json","w", encoding="utf-8") as f:
    json.dump(mapping, f, ensure_ascii=False, indent=4)

entity: Company, label: name of an organization
entity: UST Global Inc, label: name of an organization
entity: information technology services, label: IT service
entity: UST Global Inc, label: name of an organization
entity: Customer, label: name of a person
entity: Customer, label: name of a person
entity: Customer, label: name of a person
entity: Customer, label: name of a person
entity: Customer, label: name of a person
entity: Protected Personnel, label: name of a person
entity: Protected Personnel, label: name of a person
entity: UST Global Inc, label: name of an organization
entity: UST Global, label: IT service
entity: UST Global Inc, label: name of an organization
entity: Discloser, label: name of a person
entity: Discloser, label: name of a person
entity: UST Global Inc, label: name of an organization
entity: UST, label: IT service
entity: UST GLOBAL INC, label: name of an organization
entity: ANAND PAG INC, label: name of an organization
entity: Samuel Ramzy, label: name of a

In [28]:
masked_string=mask_sentence(text)
print(masked_string)


  
Teaming Agreement (Santiago-Lester LLC) v4October2018  
  
Page 1 of 12  
         Proprietary & Confidential  
TEAMING AGREEMENT  
  
This TEAMING AGREEMENT (this “Agreement”) is made as of   14 August 2020 (the “Effective Date"), by and 
between Santiago-Lester LLC, a Delaware corporation with offices located at 5 Polaris Way, Aliso Viejo, CA 92656 (“UST 
Global”) and  Harris-Roman Group,(“Williams-Waller Co”),  a  Texas corporation with offices locate at 1300 W Walnut Hill Lane, 
Suite 111, Irving. TX 75038. Williams-Waller Co and UST Global are also referred to individually as a “Party” and collectively 
as the “Parties”.  
 
BACKGROUND  
  
A. 
The Parties, because of their unique and complementary capabilities, have determined that they would 
benefit from a teaming arrangement in order to develop and secure contracts with potential Customers (as defined below) 
in certain geographic markets for each other’s information technology services, solutions and products (as may be mo

In [29]:
print(unmask_summary(masked_string))

  
Teaming Agreement (ust global inc) v4October2018  
  
Page 1 of 12  
         Proprietary & Confidential  
TEAMING AGREEMENT  
  
This TEAMING AGREEMENT (this “Agreement”) is made as of   14 August 2020 (the “Effective Date"), by and 
between ust global inc, a Delaware corporation with offices located at 5 Polaris Way, Aliso Viejo, CA 92656 (“UST 
Global”) and  ANAND PAG INC,(“Company”),  a  Texas corporation with offices locate at 1300 W Walnut Hill Lane, 
Suite 111, Irving. TX 75038. Company and UST Global are also referred to individually as a “Party” and collectively 
as the “Parties”.  
 
BACKGROUND  
  
A. 
The Parties, because of their unique and complementary capabilities, have determined that they would 
benefit from a teaming arrangement in order to develop and secure contracts with potential Customers (as defined below) 
in certain geographic markets for each other’s information technology services, solutions and products (as may be more 
specifically identified in Exhibi

In [17]:
documents = masked_string.split("\n")
# --- Step 2: Load Embedding Model ---
model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Step 3: Convert Documents to Embeddings ---
embeddings = model.encode(documents)
embeddings = np.array(embeddings, dtype=np.float32)

# --- Step 4: Build FAISS Vector Store ---
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

# --- Step 5: Retrieval Function ---
def retrieve(query, k=3):
    query_vector = model.encode([query])
    query_vector = np.array(query_vector, dtype=np.float32)
    distances, indices = index.search(query_vector, k)
    return [documents[i] for i in indices[0]]

# --- Step 6: Simple LLM Simulation ---
def generate_answer(query, context):
    # Simulate by concatenating context (since no LLM)
    return f"Context used:\n{unmask_summary(context)}\n\nAnswer: Based on the context, {query}"

# --- Step 7: RAG Query ---
def rag_query(query):
    masked_query=mask_sentence(query)
    print(f'masked query is :{masked_query}')
    retrieved_docs = retrieve(masked_query)
    context = "\n".join(retrieved_docs)
    return generate_answer(query, context)


In [18]:

# --- Test ---
print(rag_query("details about Anand PAG Inc "))

masked query is :details about Bryan-Randall Inc 
Context used:
ANAND PAG INC  
ANAND PAG INC 
ANAND PAG INC 

Answer: Based on the context, details about Anand PAG Inc 


In [10]:
import fitz  # PyMuPDF


# --- Step 1: Extract text from PDF ---
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text() + "\n"
    return text

pdf_path = "test.pdf"
text = extract_text_from_pdf(pdf_path)

# --- Step 2: Split text into chunks ---
def chunk_text(text, chunk_size=500):
    words = text.split()
    chunks = [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

# documents = chunk_text(text)

# # --- Step 3: Create embeddings ---
# model = SentenceTransformer('all-MiniLM-L6-v2')
# embeddings = model.encode(documents)
# embeddings = np.array(embeddings, dtype=np.float32)

# # --- Step 4: Build FAISS index ---
# dim = embeddings.shape[1]
# index = faiss.IndexFlatL2(dim)
# index.add(embeddings)

# # --- Step 5: Retrieval ---
# def retrieve(query, k=2):
#     query_vector = model.encode([query])
#     query_vector = np.array(query_vector, dtype=np.float32)
#     distances, indices = index.search(query_vector, k)
#     return [documents[i] for i in indices[0]]

# # --- Step 6: RAG Query ---
# def rag_query(query):
#     retrieved_docs = retrieve(query)
#     context = "\n".join(retrieved_docs)
#     return f"Context:\n{context}\n\nAnswer: Based on this, {query}"

# # --- Test ---
# print(rag_query("Who is Maya Patel?"))


In [11]:
print(text)

  
Teaming Agreement (UST Global Inc) v4October2018  
  
Page 1 of 12  
         Proprietary & Confidential  
TEAMING AGREEMENT  
  
This TEAMING AGREEMENT (this “Agreement”) is made as of   14 August 2020 (the “Effective Date"), by and 
between UST Global Inc, a Delaware corporation with offices located at 5 Polaris Way, Aliso Viejo, CA 92656 (“UST 
Global”) and  Anand PAG Inc,(“Company”),  a  Texas corporation with offices locate at 1300 W Walnut Hill Lane, 
Suite 111, Irving. TX 75038. Company and UST Global are also referred to individually as a “Party” and collectively 
as the “Parties”.  
 
BACKGROUND  
  
A. 
The Parties, because of their unique and complementary capabilities, have determined that they would 
benefit from a teaming arrangement in order to develop and secure contracts with potential Customers (as defined below) 
in certain geographic markets for each other’s information technology services, solutions and products (as may be more 
specifically identified in Exhibi