In [4]:
import json
import re

input_path = 'data/ConceptNet/test.jsonl'
output_path = 'data/ConceptNet/test_filtered.jsonl'

# Matches a sentence-final [MASK]: optional whitespace, the literal token,
# at most one trailing punctuation mark (. ? ! , ; : "), then optional
# whitespace up to the end of the string. It is compiled once and used for
# both detection and removal, so the two steps cannot drift apart.
pattern = re.compile(r'\s*\[MASK\]\s*[\.\?\!\,\;\:\"]?\s*$')

kept = 0
with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Skip blank lines (for instance a stray empty line at the end of
        # the file); json.loads would raise on them.
        if not line.strip():
            continue
        item = json.loads(line)

        # Only the first masked sentence is inspected. Records whose
        # "masked_sentences" is missing, null or empty are dropped rather
        # than raising on the [0] lookup.
        masked_sentences = item.get("masked_sentences")
        if not masked_sentences:
            continue
        sentence = masked_sentences[0]

        # subn returns the substitution count, so one regex pass tells us
        # both whether the sentence ends in [MASK] and what remains once
        # the token and its trailing punctuation are removed.
        cleaned, n_removed = pattern.subn('', sentence)
        if n_removed:
            item["masked_sentences"][0] = cleaned.rstrip()
            outfile.write(json.dumps(item) + '\n')
            kept += 1

print(f"✅ Done. Kept {kept} items with cleaned masked_sentences.")


✅ Done. Kept 18796 items with cleaned masked_sentences.


In [5]:
import json
import re

input_path = 'data/Squad/test.jsonl'
output_path = 'data/Squad/test_filtered.jsonl'

# Matches [MASK] at the very end of a sentence: optional whitespace, the
# literal token, at most one punctuation mark (. ? ! , ; : "), and optional
# trailing whitespace. The same compiled pattern drives both the filter and
# the clean-up, so there is a single definition to maintain.
pattern = re.compile(r'\s*\[MASK\]\s*[\.\?\!\,\;\:\"]?\s*$')

kept = 0
with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Blank lines are not valid JSON; skip them instead of crashing.
        if not line.strip():
            continue
        item = json.loads(line)

        # Only the first masked sentence is considered. A missing, null or
        # empty "masked_sentences" value means there is nothing to filter,
        # so the record is skipped instead of failing on the [0] lookup.
        masked_sentences = item.get("masked_sentences")
        if not masked_sentences:
            continue
        sentence = masked_sentences[0]

        # One pass: the substitution count says whether the sentence ended
        # with [MASK], and the result is the sentence without the token and
        # its trailing punctuation/whitespace.
        cleaned, n_removed = pattern.subn('', sentence)
        if n_removed:
            item["masked_sentences"][0] = cleaned.rstrip()
            outfile.write(json.dumps(item) + '\n')
            kept += 1

print(f"✅ Done. Kept {kept} entries in filtered Squad file.")


✅ Done. Kept 213 entries in filtered Squad file.


In [6]:
import json
import re
import glob

# glob returns matches in arbitrary order; sorting makes the order of
# records in the combined output file reproducible from run to run.
input_files = sorted(glob.glob('data/TREx/P*.jsonl'))
output_path = 'data/TREx/combined_filtered.jsonl'

# Matches a sentence-final [MASK]: optional whitespace, the literal token,
# at most one punctuation mark (. ? ! , ; : "), then optional whitespace up
# to the end of the string. Used for both detection and removal.
pattern = re.compile(r'\s*\[MASK\]\s*[\.\?\!\,\;\:\"]?\s*$')

total_files = 0
total_objects = 0
total_sentences = 0

with open(output_path, 'w', encoding='utf-8') as outfile:
    for file_path in input_files:
        total_files += 1
        with open(file_path, 'r', encoding='utf-8') as infile:
            for line in infile:
                # Blank lines are not valid JSON; skip them.
                if not line.strip():
                    continue
                item = json.loads(line)
                seen = set()
                cleaned_evidences = []

                # `or []` also covers an explicit null "evidences" value.
                for ev in item.get("evidences") or []:
                    sentence = ev.get("masked_sentence", "")
                    # De-duplicate on the raw sentence within one object.
                    # NOTE(review): two raw sentences that differ only in
                    # their trailing punctuation still produce the same
                    # cleaned text twice - confirm whether that is intended.
                    if sentence in seen:
                        continue
                    seen.add(sentence)

                    # The substitution count tells us whether the sentence
                    # ended with [MASK]; the result is the cleaned text.
                    cleaned, n_removed = pattern.subn('', sentence)
                    if n_removed:
                        # Only the cleaned sentence is kept; every other
                        # field of the evidence entry is discarded.
                        cleaned_evidences.append({"masked_sentence": cleaned.rstrip()})
                        total_sentences += 1

                # Objects with no qualifying evidence are dropped entirely.
                if cleaned_evidences:
                    item["evidences"] = cleaned_evidences
                    outfile.write(json.dumps(item) + '\n')
                    total_objects += 1

print(f"✅ Finished {total_files} files. Kept {total_objects} objects and {total_sentences} cleaned sentences in total.")


✅ Finished 41 files. Kept 11635 objects and 153907 cleaned sentences in total.
