<a href="https://colab.research.google.com/github/midhun-james/val-mod-with-gliner/blob/main/gliner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install gliner rapidfuzz

Collecting gliner
  Downloading gliner-0.2.19-py3-none-any.whl.metadata (8.8 kB)
Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting onnxruntime (from gliner)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->gliner)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->gliner)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->gliner)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->gliner)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux201

In [None]:
pip install polars

In [24]:
import pandas as pd
import json
from collections import defaultdict
import os
import time
import gzip
import sqlite3
import sqlparse
from sqlparse.sql import Token
from sqlparse.tokens import Literal,String
from datetime import datetime
import random
import re
import string
import polars as pl
from openpyxl import load_workbook
from gliner import GLiNER
from rapidfuzz import process, fuzz

class DataMaskerCSV:
    def __init__(self, file_path, faker_data_path='faker_dataset_v3.json.gz',
                 entity_column_map=None):
        """Set up a masker for one input file.

        Creates an output directory named after the input file (without its
        extension), loads the GLiNER model, and reads the pools of fake values
        from a gzip-compressed JSON file.

        Args:
            file_path: Path of the CSV/XLSX file that will be masked.
            faker_data_path: Path of the gzip-compressed JSON file holding the
                fake-value pools. The file must contain a list of dicts; the
                dicts are merged into one ``{entity: [values, ...]}`` mapping.
            entity_column_map: Maps source column names to entity types
                (keys of the fake-value pools). Defaults to
                ``{'name': 'company', 'domain': 'url', 'locality': 'location'}``.
        """
        self.file_path = file_path
        self.base_name = os.path.splitext(os.path.basename(self.file_path))[0]
        self.output_dir = self.base_name
        os.makedirs(self.output_dir, exist_ok=True)

        if entity_column_map is None:
            entity_column_map = {
                'name': 'company',
                'domain': 'url',
                'locality': 'location',
            }
        # Copy so later changes to the caller's dict do not affect this instance.
        self.entity_column_map = dict(entity_column_map)

        self.model = GLiNER.from_pretrained("urchade/gliner_base")
        self.sensitive_columns = self.entity_column_map.keys()

        start = time.time()
        self.faker_data_path = faker_data_path
        with gzip.open(self.faker_data_path, 'rt', encoding='utf-8') as f:
            faker_list = json.load(f)
        end = time.time()
        print(f"⏳ Faker data loaded in {end-start:.6f} seconds")

        # Flatten the list of single-purpose dicts into one pool dictionary.
        self.faker_data = {}
        for d in faker_list:
            self.faker_data.update(d)

        # Words combined pairwise to build fake URLs; empty when the dataset
        # ships no 'url' pool.
        self.domain_pool = self.faker_data.get('url', [])

        # original -> fake and fake -> original, both keyed by entity type.
        self.forward_mapping = defaultdict(dict)
        self.backward_mapping = defaultdict(dict)
        self.mapping = defaultdict(dict)
        # Next unread position in each entity's fake-value pool.
        self.fake_data_index = defaultdict(int)
        # Fake values already handed out, per entity type.
        self.used_fakes = defaultdict(set)
        self.used_urls = set()
        self.url_extensions = [
            ".com", ".net", ".org", ".edu", ".gov", ".co", ".us", ".uk", ".in", ".ru",
            ".jp", ".cn", ".de", ".fr", ".it", ".nl", ".es", ".br", ".au", ".ca",
            ".ch", ".se", ".no", ".za", ".mx", ".ar", ".be", ".kr", ".pl", ".tr",
            ".ua", ".ir", ".sa", ".ae", ".my", ".sg", ".hk", ".tw", ".nz", ".id",
            ".th", ".ph", ".vn", ".bd", ".lk", ".np", ".pk", ".cz", ".gr", ".hu",
            ".fi", ".dk", ".il", ".ie", ".pt", ".sk", ".si", ".ro", ".bg", ".rs",
            ".lt", ".lv", ".ee", ".hr", ".ba", ".md", ".ge", ".kz", ".by", ".tm",
            ".uz", ".af", ".qa", ".om", ".kw", ".bh", ".ye", ".jo", ".lb", ".sy",
            ".iq", ".ps", ".az", ".am", ".kg", ".mn", ".bt", ".mv", ".mm", ".kh",
            ".la", ".tl", ".sb", ".fj", ".pg", ".to", ".tv", ".ws", ".fm", ".ki",
        ]


    @staticmethod
    def time_it(func):
        def wrapper(*args, **kwargs):
            start = time.time()
            result = func(*args, **kwargs)
            end = time.time()
            print(f'\n⏳ Execution time {func.__name__}: {end-start:.6f} seconds')
            return result
        return wrapper

    @time_it
    def csv_extraction(self):

        output_csv_path=os.path.join(self.output_dir,f'new_{self.base_name}.csv')
        if self.file_path.endswith('.xlsx'):
            sheet_names = pd.ExcelFile(self.file_path).sheet_names
            df=pl.read_excel(self.file_path,engine='calamine',sheet_name=sheet_names)
            combined_df = pl.concat(df.values(), how="diagonal")
            combined_df.write_csv('intermediate.csv')
            self.file_path='intermediate.csv'

        all_data={}
        for col in self.sensitive_columns:
            entity=self.entity_column_map.get(col)
            if entity:
                all_data.setdefault(entity, [])
        df=pd.read_csv(self.file_path)
        for col in self.sensitive_columns:
            if col in df.columns:
                entity=self.entity_column_map.get(col)
                if entity:
                    values=df[col].dropna().to_list()
                    all_data[entity].extend(values)
                else:
                    entity=self.entity_column_map.get(col.lower())
                    if entity:
                        all_data[entity].extend([None]*len(df))
        max_len=max([len(v) for v in all_data.values()])
        for entity in all_data:
            all_data[entity].extend([None]*(max_len-len(all_data[entity])))
        final_df=pd.DataFrame(all_data)
        final_df.to_csv(output_csv_path,index=False)

        if self.file_path == 'intermediate.csv': os.remove(self.file_path)
        self.anonymize_csv(output_csv_path)

    def _get_fake_value(self, entity, original_value):
        """Return consistent fake value for an original value."""
        col_key =  entity  # default fallback if column not passed


        if original_value in self.forward_mapping[col_key]:
            return self.forward_mapping[col_key][original_value]
        if entity =='url':
            while True:
                domain1,domain2=random.sample(self.domain_pool,2)
                fake_value=f"https://{domain1.lower()}.{domain2.lower()}.co"
                if fake_value not in self.used_fakes[entity]:
                    break
            self.used_fakes[entity].add(fake_value)
            self.forward_mapping[col_key][original_value] = fake_value
            self.backward_mapping[col_key][fake_value] = original_value
            return fake_value

        while self.fake_data_index[entity] < len(self.faker_data[entity]):
            fake_value = self.faker_data[entity][self.fake_data_index[entity]]
            self.fake_data_index[entity] += 1

            if fake_value not in self.used_fakes[entity]:
                self.used_fakes[entity].add(fake_value)
                self.forward_mapping[col_key][original_value] = fake_value
                self.backward_mapping[col_key][fake_value] = original_value
                return fake_value

        counter=1
        base_fake_value=original_value
        while True:
            fallback_value= self.modify_fake_value(entity, base_fake_value,  counter=counter)
            if fallback_value not in self.used_fakes[entity]:
                self.used_fakes[entity].add(fallback_value)
                self.forward_mapping[col_key][original_value] = fallback_value
                self.backward_mapping[col_key][fallback_value] = original_value
                return fallback_value
            counter+=1


    def modify_fake_value(self,entity,original_value,counter=1):
        """Modify the fake value to ensure uniqueness."""
        if entity=="names":
            base=random.choice(self.faker_data['names'])
            return base+f"{string.ascii_lowercase[counter % 26]}"
        elif entity=="emails":
            base=random.choice(self.faker_data['emails'])
            name,domain=base.split('@')
            return f"{name}{counter}@{domain}"
        elif entity=="url":
            fake_value=original_value
            while fake_value in self.used_urls:
                ext=random.choice(self.url_extensions)
                if not fake_value.endswith(ext):
                    fake_value=fake_value+ext
            self.used_urls.add(fake_value)
            return fake_value
        elif entity=="phone":
            base=random.choice(self.faker_data['phone'])
            return f"{base[:-2]}{counter % 100:02d}"
        elif entity == "company":
            base=random.choice(self.faker_data['company'])
            return f"{base} Group {counter % 100_000_000 + 1}"
        elif entity == "credit":
            return f"{original_value[:-4]}{counter % 10000:04d}"
        else:
            return f"{original_value}-{counter}"

    @time_it
    def anonymize_csv(self, input_csv_path):
        df = pd.read_csv(input_csv_path)
        for entity in df.columns:

            if entity not in self.faker_data:
                print(f"Warning: No fake data available for entity type '{entity}' '.")
                continue

            df[entity] = df[entity].apply(lambda val: self._get_fake_value(entity, val) if pd.notna(val) else val)

        output_csv_path=os.path.join(self.output_dir,f'{self.base_name}_masked.csv')
        df.to_csv(output_csv_path, index=False)

        combined_mapping = {
            "metadata": {
                "timestamp": datetime.now().isoformat(),
                "columns_anonymized": list(self.forward_mapping.keys()),
                "total_entries": {
                    col: len(self.forward_mapping[col]) for col in self.forward_mapping
                }
            },
            "forward_mapping": self.forward_mapping,
            "backward_mapping": self.backward_mapping,
        }
        map_path =f'{self.base_name}_mapping.json'
        with open(map_path, 'w') as f:
            json.dump(combined_mapping, f, indent=2)


        # print(f"Anonymized CSV saved to: {output_csv_path}")
        print(f" mapping saved to: {map_path}")


    @time_it
    def deanonymize_csv(self,anonymized_csv_path,map_path,deanonymized_csv_path):
        df = pd.read_csv(anonymized_csv_path)

        with open(map_path, 'r') as f:
            self.backward_mapping = json.load(f).get("backward_mapping", {})

        for col in self.sensitive_columns:
            entity= self.entity_column_map.get(col.lower())
            if col not in df.columns:
                continue
            backward_map = self.backward_mapping.get(entity, {})

            df[col]=df[col].apply(lambda val:backward_map.get(val,entity) if pd.notna(val) else val )
        df.to_csv(deanonymized_csv_path,index=False)
        print(f"Deanonymized CSV saved to: {deanonymized_csv_path}")
    @time_it
    def csv_to_sql(self,csv_path,_db_path,table_name):
        try:
            df=pd.read_csv(csv_path)
            conn=sqlite3.connect(_db_path)
            df.to_sql(table_name,conn,if_exists='replace',index=False)
            conn.close()
        except Exception as e:
            print(f"❌ Failed to import CSV: {e}")

    def correct_word(self, word, threshold=65):
        """Snap ``word`` to the closest known original company name.

        The candidates are the lowercased keys of
        ``self.forward_mapping['company']``; the query is lowercased as well
        so that capitalization does not lower the similarity score.

        Args:
            word: Text to correct.
            threshold: Minimum ``fuzz.ratio`` score for a candidate to be
                accepted.

        Returns:
            The best-matching lowercased company name when its score reaches
            ``threshold``; otherwise ``word`` unchanged.
        """
        valid_list = [company.lower() for company in self.forward_mapping['company']]
        match = process.extractOne(word.lower(), valid_list, scorer=fuzz.ratio)
        # extractOne returns None when there are no candidates at all.
        if match and match[1] >= threshold:
            return match[0]
        return word

    def find_and_correct_entities(self,text):
      valid_company=[company.lower() for company in self.forward_mapping['company'].keys()]
      entities=self.model.predict_entities(text,labels=["person","organization"])

      corrected_entities=[]
      for ent in entities:
        entity_text=ent['text']
        start=ent['start']
        end=ent['end']
        label=ent['label']
        if label=="organization":
          corrected = self.correct_word(entity_text)
        else: corrected=entity_text
        corrected_entities.append({
          'original':entity_text,
          'corrected':corrected,
          'start':start,
          'end':end,
          'label':label
        })
      return corrected_entities

    def replace_entities_in_text(self,text,entities):
      entities=sorted(entities,key=lambda x:x['start'], reverse=True)
      for ent in entities:
        text= text[:ent['start']] + ent['corrected'] + text[ent['end']:]
      return text

# Build the masker for the sample dataset and create the masked CSV + mapping.
file_path = 'companies_100k.csv'
masker = DataMaskerCSV(file_path)
masker.csv_extraction()

# Demo: detect misspelled company names in a sentence and correct them.
sample_text = "companies are accenure and wlrmart and they are doing fine"
entities = masker.find_and_correct_entities(sample_text)
print(entities)
corrected_text = masker.replace_entities_in_text(sample_text, entities)
print(corrected_text)


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]



⏳ Faker data loaded in 2.107525 seconds


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


 mapping saved to: companies_100k_mapping.json

⏳ Execution time anonymize_csv: 1.747187 seconds

⏳ Execution time csv_extraction: 2.853902 seconds
[{'original': 'accenure', 'corrected': 'accenture', 'start': 14, 'end': 22, 'label': 'organization'}, {'original': 'wlrmart', 'corrected': 'walmart', 'start': 27, 'end': 34, 'label': 'organization'}]
companies are accenure and walmart and they are doing fine
companies are accenture and walmart and they are doing fine
companies are accenture and walmart and they are doing fine


In [None]:
from gliner import GLiNER
from rapidfuzz import process, fuzz

# Module-level GLiNER model, loaded when this cell runs; find_and_correct_entities
# below reads it as a global.
model = GLiNER.from_pretrained("urchade/gliner_base")

def correct_word(word, valid_list, threshold=65):
    """Return the closest entry of ``valid_list`` for ``word``.

    Args:
        word: Text to correct.
        valid_list: Candidate strings to match against.
        threshold: Minimum ``fuzz.ratio`` score for a candidate to be accepted.

    Returns:
        The best candidate when its score reaches ``threshold``; otherwise
        ``word`` unchanged.
    """
    best = process.extractOne(word, valid_list, scorer=fuzz.ratio)
    if best and best[1] >= threshold:
        return best[0]
    # No candidate, or none similar enough: keep the input.
    return word

def find_and_correct_entities(text, valid_names, valid_company):
    """Detect person/organization mentions in ``text`` and spell-correct them.

    Person mentions are corrected word by word against ``valid_names``;
    organization mentions are corrected as a whole against ``valid_company``.
    Corrected text is lowercased for matching and then capitalized. Mentions
    with any other label are kept as they are.

    Args:
        text: The text to scan with the module-level ``model``.
        valid_names: Candidate person-name words.
        valid_company: Candidate company names.

    Returns:
        A list with one dict per predicted entity, holding the keys
        ``original``, ``corrected``, ``start``, ``end`` and ``label``.
    """
    results = []
    for ent in model.predict_entities(text, labels=["person", "organization"]):
        mention, begin, stop, kind = ent['text'], ent['start'], ent['end'], ent['label']

        if kind == "person":
            # Correct each word of the name on its own.
            fixed_words = []
            for word in mention.split():
                fixed_words.append(correct_word(word.lower(), valid_names).capitalize())
            fixed = " ".join(fixed_words)
        elif kind == "organization":
            fixed = correct_word(mention.lower(), valid_company).capitalize()
        else:
            fixed = mention

        results.append({
            'original': mention,
            'corrected': fixed,
            'start': begin,
            'end': stop,
            'label': kind,
        })

    return results

def replace_entities_in_text(text, entities):
    """Return ``text`` with each entity span replaced by its correction.

    Args:
        text: The text the entity offsets refer to.
        entities: Dicts with ``start``, ``end`` and ``corrected`` keys.

    Returns:
        The text after all replacements.
    """
    # Replace from the last span to the first so that earlier offsets remain
    # valid when a replacement changes the text length.
    for span in sorted(entities, key=lambda item: item['start'], reverse=True):
        head = text[:span['start']]
        tail = text[span['end']:]
        text = head + span['corrected'] + tail
    return text

def main():
    """Run the correction demo on a sample paragraph and print the result."""
    text = """Theressa is a manager working for Wiliams. Ruthu Jenkinz is the mother of Fransis Waner and Kaitlyy Meeeers.
  Donaa Gavln hasn't been seen for a while now by Nickol. THis is concerning!"""
    # NOTE(review): `forward_mapping` is read as a global but is not defined in
    # this cell — presumably it has to exist in the notebook session already;
    # confirm before running.
    known_names = [entry.lower() for entry in forward_mapping['names']]
    known_companies = [entry.lower() for entry in forward_mapping['company']]

    found = find_and_correct_entities(text, known_names, known_companies)
    corrected_text = replace_entities_in_text(text, found)

    print("\nCorrected Text:\n", corrected_text)


main()



To learn more about accelerating pandas on Colab, see the [10 minute guide](https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/getting_started_tutorials/cudf_pandas_colab_demo.ipynb) or
 [US stock market data analysis demo](https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/getting_started_tutorials/cudf_pandas_stocks_demo.ipynb).