In [1]:
import json
#  '/media/melissa/EXTERNAL_USB/DocRED'
from collections import Counter
docred_path='/media/melissa/EXTERNAL_USB/DocRED'
redocred_path='/media/melissa/EXTERNAL_USB/Re-DocRED'
def load_dataset(path):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)




In [2]:
na_label = "Na"
test_label="P17"

In [31]:
docred = load_dataset('/media/melissa/EXTERNAL_USB/DocRED/train_distant.json')
word2id = load_dataset('/media/melissa/EXTERNAL_USB/DocRED/DocRED_baseline_metadata/word2id.json')
redocred = load_dataset('/media/melissa/EXTERNAL_USB/Re-DocRED/data/train_revised.json')

In [32]:
len(docred)

101873

In [30]:
len(redocred)

3053

In [4]:
# Check for NA labels
na_count = 0
total_relations = 0

for sample in docred:
    for label in sample['labels']:
        total_relations += 1
        if label['r'] == 'Na':
            na_count += 1

print(f"Total relations: {total_relations}")
print(f"NA-labeled relations: {na_count}")

Total relations: 38180
NA-labeled relations: 0


In [5]:
rel2id_path ='/media/melissa/EXTERNAL_USB/DocRED/DocRED_baseline_metadata/rel2id.json'
with open(rel2id_path, "r", encoding="utf-8") as f:
    rel2id = json.load(f)


In [6]:
all_keys_docred = set()

for doc in docred:
    all_keys_docred.update(doc.keys())
print(all_keys_docred)


{'vertexSet', 'sents', 'labels', 'title'}


In [7]:
all_keys_redocred = set()

for re in redocred:
    all_keys_redocred.update(re.keys())
print(all_keys_redocred)

{'vertexSet', 'sents', 'labels', 'title'}


In [20]:
pprint(docred[120])
pprint(redocred[120])

{'labels': [{'evidence': [0], 'h': 6, 'r': 'P131', 't': 8},
            {'evidence': [0], 'h': 8, 'r': 'P150', 't': 6},
            {'evidence': [0], 'h': 8, 'r': 'P150', 't': 5},
            {'evidence': [0], 'h': 8, 'r': 'P150', 't': 0},
            {'evidence': [0], 'h': 5, 'r': 'P131', 't': 8},
            {'evidence': [0], 'h': 0, 'r': 'P131', 't': 8}],
 'sents': [['Zigong',
            'dialect',
            '(',
            ')',
            'is',
            'a',
            'branch',
            'of',
            'Southwestern',
            'Mandarin',
            ',',
            'spoken',
            'mainly',
            'in',
            'Zigong',
            ',',
            'Fushun',
            ',',
            'Weiyuan',
            ',',
            'east',
            'Rongxian',
            'and',
            'some',
            'parts',
            'of',
            'Yibin',
            ',',
            'Neijiang',
            ',',
            'Longchang',
          

In [9]:
def basic_stats(dataset, name=""):
    print(f"Dataset: {name}")
    print("Total documents:", len(dataset))
    
    total_entities = sum(len(d['vertexSet']) for d in dataset)
    print("Total unique entities:", total_entities)
    
    total_rels = sum(len(d.get('labels', [])) for d in dataset)
    print("Total annotated relations:", total_rels)

    rel_dist = {}
    for d in dataset:
        for rel in d.get('labels', []):
            rel_type = rel['r']
            rel_dist[rel_type] = rel_dist.get(rel_type, 0) + 1

    print("Top 5 frequent relations:")
    for rel, count in sorted(rel_dist.items(), key=lambda x: -x[1])[:5]:
        print(f"  {rel}: {count}")

basic_stats(docred, "DocRED")
basic_stats(redocred, "Re-DocRED")


Dataset: DocRED
Total documents: 3053
Total unique entities: 59493
Total annotated relations: 38180
Top 5 frequent relations:
  P17: 8921
  P131: 4193
  P27: 2689
  P150: 2004
  P577: 1142
Dataset: Re-DocRED
Total documents: 3053
Total unique entities: 59359
Total annotated relations: 85932
Top 5 frequent relations:
  P131: 20402
  P17: 14401
  P27: 4665
  P150: 3369
  P800: 3055


In [10]:
def cross_sentence_stats(dataset):
    cross = 0
    intra = 0
    for doc in dataset:
        sentences = doc['sents']
        sent_map = {}  # map token index to sentence id
        idx = 0
        for i, sent in enumerate(sentences):
            for _ in sent:
                sent_map[idx] = i
                idx += 1

        for rel in doc.get('labels', []):
            h_mention = doc['vertexSet'][rel['h']][0]
            t_mention = doc['vertexSet'][rel['t']][0]
            h_sent = sent_map[h_mention['pos'][0]]
            t_sent = sent_map[t_mention['pos'][0]]
            if h_sent == t_sent:
                intra += 1
            else:
                cross += 1
    total = cross + intra
    print(f"Cross-sentence: {cross} ({cross/total:.2%})")
    print(f"Intra-sentence: {intra} ({intra/total:.2%})")

cross_sentence_stats(docred)
cross_sentence_stats(redocred)


Cross-sentence: 6420 (16.82%)
Intra-sentence: 31760 (83.18%)
Cross-sentence: 16158 (18.80%)
Intra-sentence: 69774 (81.20%)


In [11]:
def compare_relation_counts(docred, redocred):
    redocred_map = {d['title']: d for d in redocred if 'title' in d}
    mismatch_count = 0

    for d in docred:
        title = d.get('title', None)
        if title and title in redocred_map:
            rel1 = {(r['h'], r['t'], r['r']) for r in d.get('labels', [])}
            rel2 = {(r['h'], r['t'], r['r']) for r in redocred_map[title].get('labels', [])}
            if rel1 != rel2:
                mismatch_count += 1

    print(f"Documents with different labels between DocRED and Re-DocRED: {mismatch_count}")

compare_relation_counts(docred, redocred)


Documents with different labels between DocRED and Re-DocRED: 3025


In [12]:
def find_label(mylabel,data_name):   
    na_count = 0
    golden_count = 0
    rel_counter = Counter()
    
    for doc in data_name:
        labels = doc.get("labels", []) + doc.get("labels2_annotator_id", [])
        for label in labels:
            r = label["r"]
            if r == mylabel:
                na_count += 1
            else:
                golden_count += 1
            rel_counter[r] += 1
            
    print(f"Total positive (golden) labels: {golden_count}")
    print(f"Total NA labels: {na_count}")
    print(f"Ratio NA : Golden = {na_count} : {golden_count} = {na_count / golden_count:.2f}")

    print("\nTop 10 relations:")
    for r, c in rel_counter.most_common(10):
        print(f"{r:10} : {c}")

In [13]:
#test the function
find_label(test_label,docred)

Total positive (golden) labels: 29259
Total NA labels: 8921
Ratio NA : Golden = 8921 : 29259 = 0.30

Top 10 relations:
P17        : 8921
P131       : 4193
P27        : 2689
P150       : 2004
P577       : 1142
P175       : 1052
P569       : 1044
P570       : 805
P527       : 632
P161       : 621


In [14]:
#this is only for checking to see if any na sample available
#normally only positive relations are saved.
#na relations are implicit soit is normal to get zero for negatives
find_label(na_label,docred)

Total positive (golden) labels: 38180
Total NA labels: 0
Ratio NA : Golden = 0 : 38180 = 0.00

Top 10 relations:
P17        : 8921
P131       : 4193
P27        : 2689
P150       : 2004
P577       : 1142
P175       : 1052
P569       : 1044
P570       : 805
P527       : 632
P161       : 621


In [15]:
 find_label(na_label,redocred)

Total positive (golden) labels: 85932
Total NA labels: 0
Ratio NA : Golden = 0 : 85932 = 0.00

Top 10 relations:
P131       : 20402
P17        : 14401
P27        : 4665
P150       : 3369
P800       : 3055
P527       : 2313
P361       : 2112
P175       : 1773
P577       : 1621
P463       : 1299


In [16]:
def find_possible_pair(data_name):
    total_pairs = 0
    positive_pairs = 0

    for doc in data_name:
        num_entities = len(doc["vertexSet"])
        all_pairs = set((h, t) for h in range(num_entities) for t in range(num_entities) if h != t)

        # Get positive pairs from labels and labels2_annotator_id
        raw_labels = doc.get("labels", []) + doc.get("labels2_annotator_id", [])
        pos_pairs = set((label["h"], label["t"]) for label in raw_labels)

        total_pairs += len(all_pairs)
        positive_pairs += len(pos_pairs)

    print(f"Total entity pairs: {total_pairs}")
    print(f"Positive pairs (golden labels): {positive_pairs}")
    print(f"Negative pairs (NA): {total_pairs - positive_pairs}")
    print(f"Ratio (neg/pos): {(total_pairs - positive_pairs) / positive_pairs:.2f}")


In [17]:
find_possible_pair(docred)

Total entity pairs: 1198650
Positive pairs (golden labels): 35615
Negative pairs (NA): 1163035
Ratio (neg/pos): 32.66


In [18]:
find_possible_pair(redocred)

Total entity pairs: 1193092
Positive pairs (golden labels): 67808
Negative pairs (NA): 1125284
Ratio (neg/pos): 16.60


In [26]:
from pprint import pprint
testnumber=780
pprint(docred[testnumber]['labels'])
print('----------------------')
pprint(redocred[testnumber]['labels'])


[{'evidence': [0], 'h': 4, 'r': 'P35', 't': 5},
 {'evidence': [0, 1, 2], 'h': 5, 'r': 'P26', 't': 3},
 {'evidence': [2], 'h': 5, 'r': 'P140', 't': 8},
 {'evidence': [0, 1, 2], 'h': 5, 'r': 'P27', 't': 1},
 {'evidence': [0, 1, 2], 'h': 3, 'r': 'P27', 't': 4},
 {'evidence': [0, 1, 2], 'h': 3, 'r': 'P26', 't': 5},
 {'evidence': [2], 'h': 3, 'r': 'P140', 't': 8},
 {'evidence': [0, 1, 2], 'h': 3, 'r': 'P27', 't': 1},
 {'evidence': [0], 'h': 0, 'r': 'P577', 't': 2},
 {'evidence': [2], 'h': 7, 'r': 'P1001', 't': 4},
 {'evidence': [2], 'h': 7, 'r': 'P577', 't': 2},
 {'evidence': [2], 'h': 7, 'r': 'P1001', 't': 1},
 {'evidence': [0], 'h': 5, 'r': 'P27', 't': 4}]
----------------------
[{'evidence': [0], 'h': 4, 'r': 'P35', 't': 5},
 {'evidence': [0, 1, 2], 'h': 5, 'r': 'P26', 't': 3},
 {'evidence': [2], 'h': 5, 'r': 'P140', 't': 8},
 {'evidence': [0, 1, 2], 'h': 5, 'r': 'P27', 't': 1},
 {'evidence': [0, 1, 2], 'h': 3, 'r': 'P27', 't': 4},
 {'evidence': [0, 1, 2], 'h': 3, 'r': 'P26', 't': 5},
 {

In [21]:
def generate_key(sample):
    rels = {(r["h"], r["t"]) for r in sample.get("labels", [])}
    return sample["title"], frozenset(rels)

# First, index DocRED
unique = {}
for sample in docred:
    key = sample["title"]
    unique[key] = sample

# Then overwrite with Re-DocRED if duplicate title exists
for sample in redocred:
    key = sample["title"]
    unique[key] = sample  # this will overwrite DocRED version

merged_data = list(unique.values())


In [27]:
merged_data[testnumber]['labels']

[{'r': 'P35', 'h': 4, 't': 5, 'evidence': [0]},
 {'r': 'P26', 'h': 5, 't': 3, 'evidence': [0, 1, 2]},
 {'r': 'P140', 'h': 5, 't': 8, 'evidence': [2]},
 {'r': 'P27', 'h': 5, 't': 1, 'evidence': [0, 1, 2]},
 {'r': 'P27', 'h': 3, 't': 4, 'evidence': [0, 1, 2]},
 {'r': 'P26', 'h': 3, 't': 5, 'evidence': [0, 1, 2]},
 {'r': 'P140', 'h': 3, 't': 8, 'evidence': [2]},
 {'r': 'P27', 'h': 3, 't': 1, 'evidence': [0, 1, 2]},
 {'r': 'P577', 'h': 0, 't': 2, 'evidence': [0]},
 {'r': 'P1001', 'h': 7, 't': 4, 'evidence': [2]},
 {'r': 'P577', 'h': 7, 't': 2, 'evidence': [2]},
 {'r': 'P1001', 'h': 7, 't': 1, 'evidence': [2]},
 {'r': 'P27', 'h': 5, 't': 4, 'evidence': [0]},
 {'h': 4, 't': 6, 'r': 'P194', 'evidence': []},
 {'h': 7, 't': 2, 'r': 'P585', 'evidence': []},
 {'h': 5, 't': 4, 'r': 'P1001', 'evidence': [0]},
 {'h': 6, 't': 4, 'r': 'P1001', 'evidence': []}]