In [2]:
! pip install sentence-transformers tqdm

Defaulting to user installation because normal site-packages is not writeable
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-5.1.2


In [3]:
import json
import os
import re
import hashlib
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from collections import defaultdict

# -------------------------------
# 1. Utility functions
# -------------------------------

def normalize_text(text: str) -> str:
    """Normalize text for hashing and comparison.

    The text is lower-cased, punctuation (anything that is neither a word
    character nor whitespace) is removed, runs of whitespace are collapsed to
    a single space, and leading/trailing whitespace is stripped.

    Args:
        text: Raw text. ``None`` is tolerated and treated as empty text.

    Returns:
        The normalized string (possibly empty).
    """
    if text is None:
        return ""
    text = text.lower()
    # Strip punctuation BEFORE collapsing whitespace: removing a free-standing
    # punctuation mark ("a - b") leaves two adjacent spaces behind, which would
    # otherwise survive and make equivalent texts normalize differently.
    text = re.sub(r'[^\w\s]', '', text)         # remove punctuation
    text = re.sub(r'\s+', ' ', text)            # collapse whitespace
    return text.strip()

def compute_hash(text: str) -> str:
    """Return the MD5 hex digest of the normalized form of ``text``.

    The digest serves as a dedup key: two inputs that normalize to the same
    string produce the same digest.
    """
    canonical = normalize_text(text)
    digest = hashlib.md5(canonical.encode("utf-8"))
    return digest.hexdigest()

# -------------------------------
# 2. Load and group by category
# -------------------------------




  from .autonotebook import tqdm as notebook_tqdm





In [6]:
# Path to the JSON file of query records. Set the LEGAL_ASSIST_INPUT environment
# variable to use a different file; the default is the original local path.
input_file = os.environ.get(
    "LEGAL_ASSIST_INPUT",
    "C:/Users/madha/vsCode/Github/legal-assist-rag/data/test.json",
)
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)  # assumed to be a list of dict records -- TODO confirm

# Group all items by their category.
# A missing, null, or blank "query-category" is filed under "Unknown" instead of
# raising on None.strip() or creating an empty-named group.
category_data = defaultdict(list)
for item in data:
    category = str(item.get("query-category") or "Unknown").strip() or "Unknown"
    category_data[category].append(item)

print(f"Found {len(category_data)} categories:")
for cat, items in category_data.items():
    print(f" - {cat}: {len(items)} records")

# -------------------------------
# 3. Deduplication setup
# -------------------------------

# Sentence-embedding model used to encode query text for similarity comparison
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Directory that receives the per-category output files; created if absent
output_dir = "deduplicated_data"
os.makedirs(output_dir, exist_ok=True)

# -------------------------------
# 4. Process category-wise
# -------------------------------

# Cosine-similarity cutoff above which two queries are treated as duplicates.
# Tune this based on test results.
threshold = 0.85

for category, items in category_data.items():
    print(f"\nüîπ Processing category: {category}")

    # Step 1: remove exact duplicates (identical text after normalization).
    # The first occurrence of each hash is kept.
    seen_hashes = set()
    unique_items = []
    for item in items:
        # A missing or null "query-text" is treated as empty text.
        qtext = str(item.get("query-text") or "")
        qhash = compute_hash(qtext)
        if qhash not in seen_hashes:
            unique_items.append(item)
            seen_hashes.add(qhash)

    print(f"   ‚ûú After exact dedup: {len(unique_items)} items")

    # Step 2: semantic deduplication
    # Read the text the same way as step 1 so a record without "query-text"
    # cannot pass step 1 and then raise KeyError here.
    query_texts = [str(item.get("query-text") or "") for item in unique_items]
    embeddings = model.encode(query_texts, convert_to_tensor=True, show_progress_bar=True)

    # Compute every pairwise cosine similarity in one call instead of one
    # library call per (i, j) pair. NOTE: this holds an n x n matrix in memory,
    # which is fine for categories of a few thousand items.
    similarity = util.cos_sim(embeddings, embeddings)

    to_remove = set()
    item_count = len(unique_items)

    # Greedy pass: each surviving query removes every later query that is too
    # similar to it.
    for i in tqdm(range(item_count), desc=f"Semantic dedup ({category})"):
        if i in to_remove:
            continue
        row = similarity[i].tolist()
        for j in range(i + 1, item_count):
            if j not in to_remove and row[j] > threshold:
                # Mark j as duplicate (keep the first one)
                to_remove.add(j)

    final_items = [item for idx, item in enumerate(unique_items) if idx not in to_remove]

    print(f"   ‚ûú After semantic dedup: {len(final_items)} items")

    # Step 3: save category-wise cleaned data
    # Replace every character that is not a word character or hyphen with "_"
    # so a category such as "Tax/GST" cannot escape output_dir or produce an
    # invalid filename. Spaces still map one-to-one to "_".
    safe_name = re.sub(r"[^\w\-]", "_", category)
    output_path = os.path.join(output_dir, f"{safe_name}_clean.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_items, f, ensure_ascii=False, indent=4)

    print(f"   ‚úÖ Saved cleaned file: {output_path}")

Found 9 categories:
 - Civil Law: 87 records
 - Family Law: 307 records
 - Criminal Law: 134 records
 - Labour: 32 records
 - Property Law: 224 records
 - Business Law: 17 records
 - Consumer Law: 2 records
 - Constitutional Law: 5 records
 - Taxation: 4 records

üîπ Processing category: Civil Law
   ‚ûú After exact dedup: 87 items


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:08<00:00,  2.69s/it]
Semantic dedup (Civil Law): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 87/87 [00:00<00:00, 196.77it/s]


   ‚ûú After semantic dedup: 87 items
   ‚úÖ Saved cleaned file: deduplicated_data\Civil_Law_clean.json

üîπ Processing category: Family Law
   ‚ûú After exact dedup: 307 items


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:25<00:00,  2.60s/it]
Semantic dedup (Family Law): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 307/307 [00:05<00:00, 52.92it/s] 


   ‚ûú After semantic dedup: 307 items
   ‚úÖ Saved cleaned file: deduplicated_data\Family_Law_clean.json

üîπ Processing category: Criminal Law
   ‚ûú After exact dedup: 134 items


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:12<00:00,  2.42s/it]
Semantic dedup (Criminal Law): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 134/134 [00:01<00:00, 121.02it/s]


   ‚ûú After semantic dedup: 134 items
   ‚úÖ Saved cleaned file: deduplicated_data\Criminal_Law_clean.json

üîπ Processing category: Labour
   ‚ûú After exact dedup: 32 items


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:04<00:00,  4.26s/it]
Semantic dedup (Labour): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 32/32 [00:00<00:00, 484.17it/s]


   ‚ûú After semantic dedup: 32 items
   ‚úÖ Saved cleaned file: deduplicated_data\Labour_clean.json

üîπ Processing category: Property Law
   ‚ûú After exact dedup: 224 items


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:19<00:00,  2.78s/it]
Semantic dedup (Property Law): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 224/224 [00:03<00:00, 71.27it/s] 


   ‚ûú After semantic dedup: 224 items
   ‚úÖ Saved cleaned file: deduplicated_data\Property_Law_clean.json

üîπ Processing category: Business Law
   ‚ûú After exact dedup: 17 items


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:02<00:00,  2.60s/it]
Semantic dedup (Business Law): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 17/17 [00:00<00:00, 3397.82it/s]


   ‚ûú After semantic dedup: 17 items
   ‚úÖ Saved cleaned file: deduplicated_data\Business_Law_clean.json

üîπ Processing category: Consumer Law
   ‚ûú After exact dedup: 2 items


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  3.96it/s]
Semantic dedup (Consumer Law): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<?, ?it/s]


   ‚ûú After semantic dedup: 2 items
   ‚úÖ Saved cleaned file: deduplicated_data\Consumer_Law_clean.json

üîπ Processing category: Constitutional Law
   ‚ûú After exact dedup: 5 items


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.74it/s]
Semantic dedup (Constitutional Law): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 3042.44it/s]


   ‚ûú After semantic dedup: 5 items
   ‚úÖ Saved cleaned file: deduplicated_data\Constitutional_Law_clean.json

üîπ Processing category: Taxation
   ‚ûú After exact dedup: 4 items


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  2.03it/s]
Semantic dedup (Taxation): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 3123.09it/s]

   ‚ûú After semantic dedup: 4 items
   ‚úÖ Saved cleaned file: deduplicated_data\Taxation_clean.json



