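# Exact Deduplication

Drop patent records whose `claim1` text (and then whose `description` text) exactly matches an earlier record after trimming surrounding whitespace, write the result to a new JSONL file, and push it to the Hugging Face Hub.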

In [2]:
import json
from collections import Counter

input_file = "coarse_cleaned_patents_filtered.jsonl"
output_file_dedup = "coarse_cleaned_patents_filtered_dedup.jsonl"


def _claim1_key(record):
    """Return the whitespace-stripped ``claim1`` text used as the dedup key.

    ``dict.get`` only applies its default when the key is absent, so a record
    carrying an explicit JSON ``null`` would yield ``None`` and crash on
    ``.strip()``. Both the missing and the ``null`` case map to ``""`` here.
    """
    return (record.get("claim1") or "").strip()


# Load all records. Whitespace-only lines are skipped because json.loads
# raises on an empty document.
records = []
with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            records.append(json.loads(line))

print(f"Original number of documents: {len(records)}")

# Compute each record's dedup key once; the same list drives both the
# statistics and the filtering pass below.
# NOTE(review): records whose claim1 is missing or empty all share the key ""
# and are therefore collapsed to a single record - confirm this is intended.
claim_texts = [_claim1_key(r) for r in records]
counter = Counter(claim_texts)

# Texts that occur more than once, and how many surplus copies they account for
duplicates = {text for text, count in counter.items() if count > 1}
num_duplicates = sum(counter[text] - 1 for text in duplicates)
print(f"Number of duplicate claim1 texts: {len(duplicates)}")
print(f"Total duplicate instances removed: {num_duplicates}")

# Keep only the first occurrence of each claim1 text (input order preserved)
seen = set()
filtered_records = []
for r, claim_text in zip(records, claim_texts):
    if claim_text not in seen:
        filtered_records.append(r)
        seen.add(claim_text)

# The reported statistics and the actual filtering must agree.
assert len(filtered_records) == len(records) - num_duplicates

print(f"Number of documents after exact claim1 deduplication: {len(filtered_records)}")

# Save filtered JSONL (one JSON object per line, non-ASCII text kept verbatim)
with open(output_file_dedup, "w", encoding="utf-8") as f_out:
    for r in filtered_records:
        f_out.write(json.dumps(r, ensure_ascii=False) + "\n")


Original number of documents: 74002
Number of duplicate claim1 texts: 16
Total duplicate instances removed: 16
Number of documents after exact claim1 deduplication: 73986


In [3]:
import json
from collections import Counter

input_file = "coarse_cleaned_patents_filtered.jsonl"
output_file_dedup = "coarse_cleaned_patents_filtered_dedup.jsonl"


def _dedup_on_field(docs, field):
    """Drop records whose stripped ``field`` text repeats an earlier record.

    Args:
        docs: list of dict records, in the order they should be considered.
        field: name of the text field whose stripped value is the dedup key.

    Returns:
        A tuple ``(kept, repeated, n_extra)`` where ``kept`` is the list of
        first occurrences in input order, ``repeated`` is the set of key
        texts seen more than once, and ``n_extra`` is the number of records
        dropped (occurrences beyond the first, summed over ``repeated``).

    A missing field and an explicit JSON ``null`` both map to the key ``""``;
    ``dict.get`` alone would return ``None`` for ``null`` and ``.strip()``
    would then raise.
    NOTE(review): all records with a missing/empty field share the key ""
    and collapse to a single record - confirm this is intended.
    """
    keys = [(doc.get(field) or "").strip() for doc in docs]
    key_counts = Counter(keys)
    repeated = {key for key, n in key_counts.items() if n > 1}
    n_extra = sum(key_counts[key] - 1 for key in repeated)

    seen_keys = set()
    kept = []
    for doc, key in zip(docs, keys):
        if key not in seen_keys:
            seen_keys.add(key)
            kept.append(doc)

    # The statistics and the filtering are derived from the same keys and
    # must agree.
    assert len(kept) == len(docs) - n_extra
    return kept, repeated, n_extra


# Load all records. Whitespace-only lines are skipped because json.loads
# raises on an empty document.
records = []
with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            records.append(json.loads(line))

print(f"Original number of documents: {len(records)}")

# --- Dedup on claim1 ---
records_claim_dedup, duplicates_claim, num_duplicates_claim = _dedup_on_field(records, "claim1")
print(f"Claim1 duplicates found: {len(duplicates_claim)}, total duplicate instances: {num_duplicates_claim}")
print(f"Documents after claim1 deduplication: {len(records_claim_dedup)}")

# --- Dedup on description (applied to the claim1-deduplicated records) ---
records_final, duplicates_desc, num_duplicates_desc = _dedup_on_field(records_claim_dedup, "description")
print(f"Description duplicates found: {len(duplicates_desc)}, total duplicate instances: {num_duplicates_desc}")
print(f"Documents after description deduplication: {len(records_final)}")

# Save filtered JSONL (one JSON object per line, non-ASCII text kept verbatim)
with open(output_file_dedup, "w", encoding="utf-8") as f_out:
    for r in records_final:
        f_out.write(json.dumps(r, ensure_ascii=False) + "\n")


Original number of documents: 74002
Claim1 duplicates found: 16, total duplicate instances: 16
Documents after claim1 deduplication: 73986
Description duplicates found: 0, total duplicate instances: 0
Documents after description deduplication: 73986


In [4]:
from datasets import load_dataset

# Load the JSONL file
# Where the deduplicated records live locally, and where they are published.
dedup_jsonl_path = "coarse_cleaned_patents_filtered_dedup.jsonl"
hub_repo_id = "mhurhangee/ep-patents-coarse-cleaned"

# Parse the JSONL with the "json" builder and select its "train" split, so
# `dataset` ends up bound to that split rather than to the whole mapping.
dataset = load_dataset("json", data_files=dedup_jsonl_path)["train"]

# Publish to the Hugging Face Hub. Left as the cell's final expression so the
# returned commit info is displayed.
dataset.push_to_hub(hub_repo_id, commit_message="with exact deduping")

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/8 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  52%|#####2    | 83.9MB /  161MB            

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  29%|##8       | 45.5MB /  159MB            

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|2         | 3.67MB /  162MB            

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|2         | 3.67MB /  162MB            

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|2         | 3.67MB /  159MB            

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|2         | 3.73MB /  159MB            

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|2         | 3.67MB /  161MB            

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|2         | 3.67MB /  162MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mhurhangee/ep-patents-coarse-cleaned/commit/8b7987e97173685207775b2871954dd76cebe5dd', commit_message='with exact deduping', commit_description='', oid='8b7987e97173685207775b2871954dd76cebe5dd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mhurhangee/ep-patents-coarse-cleaned', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mhurhangee/ep-patents-coarse-cleaned'), pr_revision=None, pr_num=None)