In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from tenacity import retry, stop_after_attempt, wait_fixed
from time import sleep
from datetime import datetime
# from tqdm import tqdm
from tqdm.notebook import tqdm  # <--- Use notebook version for Jupyter
import requests
import os

from src.paths import LOCAL_RAW_DATA_PATH
import requests
import json
import os
import threading
import time
from functools import wraps
tqdm.pandas()

In [13]:
output_dir = LOCAL_RAW_DATA_PATH / "Babelscape/SREDFM"


for set_ in ('train', 'test', 'validation'): 
    response = requests.get(f"https://huggingface.co/api/datasets/Babelscape/SREDFM/parquet/all_languages/{set_}")

    # Check status and print output
    if response.status_code == 200:
        urls = response.json()
    else:
        print(f"Failed to get data: {response.status_code}")

    os.makedirs(f"{output_dir}/{set_}", exist_ok=True)

    for i, url in enumerate(urls):
        response = requests.get(url)
        if response.status_code == 200:
            output_path = f"{output_dir}/{set_}/file_{i}.parquet"
            with open(output_path, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded `{url}` to `{output_path}`")
        else:
            print(f"Failed to download {url}")


Downloaded `https://huggingface.co/api/datasets/Babelscape/SREDFM/parquet/all_languages/train/0.parquet` to `/home/murilo/RelNetCare/data/raw/Babelscape/SREDFM/train/file_0.parquet`
Downloaded `https://huggingface.co/api/datasets/Babelscape/SREDFM/parquet/all_languages/train/1.parquet` to `/home/murilo/RelNetCare/data/raw/Babelscape/SREDFM/train/file_1.parquet`
Downloaded `https://huggingface.co/api/datasets/Babelscape/SREDFM/parquet/all_languages/train/2.parquet` to `/home/murilo/RelNetCare/data/raw/Babelscape/SREDFM/train/file_2.parquet`
Downloaded `https://huggingface.co/api/datasets/Babelscape/SREDFM/parquet/all_languages/train/3.parquet` to `/home/murilo/RelNetCare/data/raw/Babelscape/SREDFM/train/file_3.parquet`
Downloaded `https://huggingface.co/api/datasets/Babelscape/SREDFM/parquet/all_languages/train/4.parquet` to `/home/murilo/RelNetCare/data/raw/Babelscape/SREDFM/train/file_4.parquet`
Downloaded `https://huggingface.co/api/datasets/Babelscape/SREDFM/parquet/all_languages/tr

In [8]:
import pandas as pd

# Specify the directory
full_df = pd.DataFrame()
for set_ in tqdm(('test', 'validation',
                  'train'
                  )):
    parquet_dir = f"/home/murilo/RelNetCare/data/raw/Babelscape/SREDFM/{set_}"

    # Load all parquet files in the directory into a single DataFrame
    set_data = pd.read_parquet(parquet_dir)

    set_data['set'] = set_
    
    full_df = pd.concat([full_df, set_data], axis=0)
    
full_df['set'] = full_df['set'].replace({'validation': 'dev'})
    
full_df.head()

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,docid,title,uri,lan,text,entities,relations,set
0,2205750,نظام تصنيف إيلو,Q105955,ar,تصغير|أرياد إيلو مخترع النظام المسمى على اسمه\...,"[{'uri': '', 'surfaceform': 'تصغير|أرياد إيلو ...","[{'subject': 8, 'predicate': 'sport', 'object'...",test
1,2163926,مطار تجكجة,Q6588191,ar,مطار تجكجة (إياتا: TIY، إيكاو: GQND) هو مطار ي...,"[{'uri': 'Q1248784', 'surfaceform': 'مطار', 't...","[{'subject': 4, 'predicate': 'instance of', 'o...",test
2,2164075,آيل مونتي (تشيلي),Q13038,ar,آيل مونتي (تشيلي) هي بلدية تقع في تشيلي في محا...,"[{'uri': 'Q15284', 'surfaceform': 'بلدية', 'ty...","[{'subject': 3, 'predicate': 'country', 'objec...",test
3,2303966,جزيرة جانا,Q20383893,ar,جزيرة جانا هي جزيرة عائمة فوق المياه الإقليمية...,"[{'uri': 'Q851', 'surfaceform': 'المملكة العرب...","[{'subject': 4, 'predicate': 'country', 'objec...",test
4,2210670,حرب ترانسنيستريا,Q285280,ar,حرب ترانسنيستريا وهو نزاع مسلح كان بين إنفصالي...,"[{'uri': 'Q907112', 'surfaceform': 'ترانسنيستر...","[{'subject': 11, 'predicate': 'participant', '...",test


In [9]:
full_df.reset_index(drop=True, inplace=True)
mask = full_df['lan'] == "en"
df = full_df[mask]

In [10]:
relation_counts = df.relations.explode().apply(lambda x: x['predicate']).value_counts()
relation_counts

relations
country                                             1203526
located in the administrative territorial entity     891611
date of birth                                        466929
sport                                                369147
point in time                                        347785
                                                     ...   
gens                                                    171
parent club                                             169
top-level Internet domain                               169
afflicts                                                154
points/goal scored by                                   128
Name: count, Length: 400, dtype: int64

In [11]:
focus_relation_dict = {
    'per:spouse': 'spouse',
    'per:siblings': 'sibling',
    'per:parents': 'child',  
    'per:children': 'child',
    'per:visitors_of_place': None,
    'per:visited_place': None,  # A bit of a stretch
    'per:residents_of_place': None,
    'per:place_of_residence': None, # 'residence',
    'per:pet': None, 
    'per:acquaintance': None,  # Close?
    'per:other_family': 'family'  
}
[i for i in relation_counts.index if i in focus_relation_dict.values()]
mask = relation_counts.index.isin(focus_relation_dict.values())
display(relation_counts[mask])
tgt_rels = relation_counts[mask].index

relations
child      37936
spouse     28051
sibling    23906
family      5078
Name: count, dtype: int64

In [12]:
df['tgt_relations'] = df.relations.progress_apply(lambda rels: [r for r in rels if r['predicate'] in tgt_rels])
df['tgt_relations_cnt'] = df.tgt_relations.progress_apply(len)
df['text_turn_count'] = df.text.progress_apply(lambda x: x.split('. ')).progress_apply(len)
df['text_turn_count'].value_counts()


  0%|          | 0/1911309 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tgt_relations'] = df.relations.progress_apply(lambda rels: [r for r in rels if r['predicate'] in tgt_rels])


  0%|          | 0/1911309 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tgt_relations_cnt'] = df.tgt_relations.progress_apply(len)


  0%|          | 0/1911309 [00:00<?, ?it/s]

  0%|          | 0/1911309 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_turn_count'] = df.text.progress_apply(lambda x: x.split('. ')).progress_apply(len)


text_turn_count
2      477410
1      392220
3      326421
4      215397
5      145340
        ...  
120         1
89          1
65          1
78          1
96          1
Name: count, Length: 85, dtype: int64

In [13]:
mask = ((df['tgt_relations_cnt'] < 5) & (df['tgt_relations_cnt'] > 0) & (df['text_turn_count'] < 12))
df[mask]

Unnamed: 0,docid,title,uri,lan,text,entities,relations,set,tgt_relations,tgt_relations_cnt,text_turn_count
13503,1822594,Hans Ulrik Gyldenløve,Q672470,en,Hans Ulrik Gyldenløve (10 March 1615 – 31 Janu...,"[{'uri': 'Q153586', 'surfaceform': 'Christian ...","[{'subject': 0, 'predicate': 'child', 'object'...",test,"[{'subject': 0, 'predicate': 'child', 'object'...",2,2
13532,3213263,Rip Hunter,Q965001,en,Rip Hunter is a fictional superhero appearing ...,"[{'uri': 'Q188784', 'surfaceform': 'superhero'...","[{'subject': 12, 'predicate': 'occupation', 'o...",test,"[{'subject': 7, 'predicate': 'child', 'object'...",1,5
13570,28846426,Fyodor Petrovich Komissarzhevsky,Q4229603,en,Fyodor Petrovich Komissarzhevsky () (1832 – 14...,"[{'uri': 'Q27914', 'surfaceform': 'tenor', 'ty...","[{'subject': 10, 'predicate': 'voice type', 'o...",test,"[{'subject': 10, 'predicate': 'child', 'object...",2,5
13585,165554,"James Scott, 1st Duke of Monmouth",Q140235,en,"James Scott, 1st Duke of Monmouth, 1st Duke of...","[{'uri': 'Q215248', 'surfaceform': 'KG', 'type...","[{'subject': 19, 'predicate': 'date of birth',...",test,"[{'subject': 5, 'predicate': 'child', 'object'...",1,5
13607,48803105,Maggie Murdock,Q23306601,en,"Margaret Grace ""Maggie"" Murdock is a fictional...","[{'uri': 'Q1114502', 'surfaceform': 'American ...","[{'subject': 13, 'predicate': 'spouse', 'objec...",test,"[{'subject': 13, 'predicate': 'spouse', 'objec...",3,4
...,...,...,...,...,...,...,...,...,...,...,...
3917908,5583461,Archibald Campbell Fraser of Lovat,Q4786255,en,"Archibald Campbell Fraser of Lovat, (16 August...","[{'uri': 'Q5964135', 'surfaceform': 'Simon Fra...","[{'subject': 3, 'predicate': 'sibling', 'objec...",train,"[{'subject': 3, 'predicate': 'sibling', 'objec...",1,2
3918034,5749332,Francis Nixon,Q5482055,en,Francis Nixon may refer to:\n\n*Francis A. Nix...,"[{'uri': 'Q5479914', 'surfaceform': 'Francis A...","[{'subject': 0, 'predicate': 'child', 'object'...",train,"[{'subject': 0, 'predicate': 'child', 'object'...",1,2
3918047,5749529,Hannah Milhous Nixon,Q5648899,en,"Hannah Milhous Nixon (March 7, 1885 – Septembe...","[{'uri': 'Q11696', 'surfaceform': 'President',...","[{'subject': 10, 'predicate': 'child', 'object...",train,"[{'subject': 10, 'predicate': 'child', 'object...",1,10
3918139,5751250,Anders Martin-Löf,Q4753868,en,Anders Martin-Löf (born 16 March 1940) is a Sw...,"[{'uri': 'Q34', 'surfaceform': 'Swedish', 'typ...","[{'subject': 26, 'predicate': 'country of citi...",train,"[{'subject': 26, 'predicate': 'sibling', 'obje...",1,11


In [14]:
df[mask]['tgt_relations'].explode().progress_apply(lambda x: x['predicate']).value_counts()


  0%|          | 0/53713 [00:00<?, ?it/s]

tgt_relations
child      22914
spouse     17717
sibling    10022
family      3060
Name: count, dtype: int64

In [15]:
tgt_rels_ctn = df[mask]['tgt_relations'].shape[0]
tgt_rels_df = df[mask]
no_rel_sample_cnt = int(0.35 * tgt_rels_ctn)
mask = df['tgt_relations'].apply(lambda x: len(x) == 0)
no_rel_df = df[mask].sample(no_rel_sample_cnt, random_state=42)
out_df = pd.concat([tgt_rels_df, no_rel_df], axis=0).sort_values('docid')
out_df

Unnamed: 0,docid,title,uri,lan,text,entities,relations,set,tgt_relations,tgt_relations_cnt,text_turn_count
2314871,100041,Sargon II,Q208336,en,"Sargon II (Neo-Assyrian cuneiform: ""Šarru-kīn""...","[{'uri': 'Q401', 'surfaceform': 'Neo-Assyrian ...","[{'subject': 17, 'predicate': 'sibling', 'obje...",train,"[{'subject': 17, 'predicate': 'sibling', 'obje...",4,9
2216763,1000508,Haute-Provence Observatory,Q2013015,en,"The Haute-Provence Observatory (OHP, ) is an a...","[{'uri': 'Q62832', 'surfaceform': 'astronomica...","[{'subject': 20, 'predicate': 'country', 'obje...",train,"[{'subject': 6, 'predicate': 'spouse', 'object...",2,11
2314874,100061,Wario,Q536138,en,is a fictional character and antagonist in Nin...,"[{'uri': 'Q245204', 'surfaceform': 'antagonist...","[{'subject': 2, 'predicate': 'characters', 'ob...",train,"[{'subject': 24, 'predicate': 'sibling', 'obje...",1,7
2216771,1000626,Paranal Observatory,Q10914152,en,Paranal Observatory is an astronomical observa...,"[{'uri': 'Q62832', 'surfaceform': 'astronomica...","[{'subject': 1, 'predicate': 'item operated', ...",train,[],0,6
2314881,100096,Edwardian era,Q185852,en,The Edwardian era or Edwardian period of Briti...,"[{'uri': 'Q1379246', 'surfaceform': 'British h...","[{'subject': 19, 'predicate': 'named after', '...",train,"[{'subject': 13, 'predicate': 'child', 'object...",1,8
...,...,...,...,...,...,...,...,...,...,...,...
2216607,999111,Guy Auguste de Rohan-Chabot,Q3121456,en,"Guy Auguste de Rohan-Chabot known as the ""comt...","[{'uri': 'Q9068', 'surfaceform': 'Voltaire', '...","[{'subject': 6, 'predicate': 'date of birth', ...",train,"[{'subject': 1, 'predicate': 'child', 'object'...",1,3
2216623,999297,George Washington Whistler,Q4470194,en,"George Washington Whistler (May 19, 1800 – Apr...","[{'uri': 'Q13582652', 'surfaceform': 'civil en...","[{'subject': 9, 'predicate': 'occupation', 'ob...",train,"[{'subject': 9, 'predicate': 'child', 'object'...",1,4
78673,999379,.ar,Q38300,en,.ar is the Internet country code top-level dom...,"[{'uri': 'Q75', 'surfaceform': 'Internet', 'ty...","[{'subject': 6, 'predicate': 'instance of', 'o...",dev,[],0,4
2314865,99970,Ernie Kovacs,Q990890,en,"Ernest Edward Kovacs (January 23, 1919 – Janua...","[{'uri': 'Q310819', 'surfaceform': 'Johnny Car...","[{'subject': 30, 'predicate': 'date of birth',...",train,"[{'subject': 30, 'predicate': 'spouse', 'objec...",1,10


In [16]:
out_df['tgt_relations']

2314871    [{'subject': 17, 'predicate': 'sibling', 'obje...
2216763    [{'subject': 6, 'predicate': 'spouse', 'object...
2314874    [{'subject': 24, 'predicate': 'sibling', 'obje...
2216771                                                   []
2314881    [{'subject': 13, 'predicate': 'child', 'object...
                                 ...                        
2216607    [{'subject': 1, 'predicate': 'child', 'object'...
2216623    [{'subject': 9, 'predicate': 'child', 'object'...
78673                                                     []
2314865    [{'subject': 30, 'predicate': 'spouse', 'objec...
2216711                                                   []
Name: tgt_relations, Length: 44101, dtype: object

In [17]:
tmp1 = out_df['tgt_relations'].explode().progress_apply(lambda x: x['predicate'] if isinstance(x, dict)  else 'Empty-Dialogue').value_counts()
tmp2 = out_df['tgt_relations'].explode().progress_apply(lambda x: x['predicate'] if isinstance(x, dict)  else 'Empty-Dialogue').value_counts(normalize=True)
tmp = pd.concat([tmp1, tmp2], axis=1)
display(tmp)
tmp.sum()

  0%|          | 0/65146 [00:00<?, ?it/s]

  0%|          | 0/65146 [00:00<?, ?it/s]

Unnamed: 0_level_0,count,proportion
tgt_relations,Unnamed: 1_level_1,Unnamed: 2_level_1
child,22914,0.351733
spouse,17717,0.271958
Empty-Dialogue,11433,0.175498
sibling,10022,0.153839
family,3060,0.046971


count         65146.0
proportion        1.0
dtype: float64

In [18]:
out_df.shape

(44101, 11)

In [41]:

babel_to_dialogre_rel_dict = {v: k for k,v in focus_relation_dict.items() if v is not None}
babel_to_dialogre_rel_dict

{'spouse': 'per:spouse',
 'sibling': 'per:siblings',
 'child': 'per:children',
 'family': 'per:other_family'}

In [44]:
out_df['tgt_relations_out_format'] = out_df.progress_apply(lambda row: [{
                'x': row['entities'][r['subject']]['surfaceform'],
                'r': [babel_to_dialogre_rel_dict[r['predicate']]],
                'y': row['entities'][r['object']]['surfaceform']
            } for r in row['tgt_relations']], axis=1)

  0%|          | 0/44101 [00:00<?, ?it/s]

In [36]:
idx = 7
out_df['text'].iloc[idx], out_df['tgt_relations_out_format'].iloc[idx]

('Hiranyakashipu (; also known as Hiranyakashyap) was an Asura and king of the "daityas" from the Puranic scriptures of Hinduism. His name literally translates to "clothed in gold" ("hiranya" "gold" "kashipu" "soft cushion"), and is often interpreted as depicting one who is fond of wealth and sensual comforts. In the Puranas, however, it is also stated the name was derived from a golden throne called \'Hiranyakashipu\' the asura sat in or nearby during the "Atiratra" (Soma) sacrifice. \n\nHiranyakashipu\'s elder brother, Hiranyaksha was slain by Varaha avatar of Lord Vishnu. Angered by this, Hiranyakashipu decided to gain magical powers by performing a penance for Lord Brahma. He is subsequently killed by the Narasimha avatara of Lord Vishnu. His tale depicts the futility of desiring power over others and the strength of God\'s protection over his fully surrendered devotees (in the case of his son Prahlada).\nHiranyakashipu, according to legend, earned a boon from Lord Brahma that made

In [37]:
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize

out_df['sentences'] = out_df['text'].progress_apply(sent_tokenize)


[nltk_data] Downloading package punkt to /home/murilo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  0%|          | 0/44101 [00:00<?, ?it/s]

In [38]:
result = out_df.progress_apply(lambda row: (row['sentences'], row['tgt_relations_out_format']), axis=1).tolist()
result

  0%|          | 0/44101 [00:00<?, ?it/s]

[(['Sargon II (Neo-Assyrian cuneiform: "Šarru-kīn", probably meaning "the faithful king" or "the legitimate king") was the king of the Neo-Assyrian Empire from the downfall of his predecessor Shalmaneser V in 722 BC to his death in battle in 705 BC.',
   "Though Sargon claimed to be the son of the previous king Tiglath-Pileser III (745–727 BC), this is uncertain and he probably gained the throne through usurping it from Shalmaneser V. Sargon is recognized as one of the most important Neo-Assyrian kings due to his role in founding the Sargonid dynasty, which would rule the Neo-Assyrian Empire until its fall less than a century after Sargon's death.",
   'The king probably took the name Sargon from the legendary ruler Sargon of Akkad, who had founded the Akkadian Empire and ruled most of Mesopotamia almost two thousand years prior.',
   'Through his military campaigns aimed at world conquest, Sargon II aspired to follow in the footsteps of his ancient namesake.',
   'Sargon sought to pro

In [47]:
out_df.shape

(44101, 13)

In [45]:
from src.paths import LOCAL_PROCESSED_DATA_PATH
import json  # Don't forget to import json if you haven't

for set_ in out_df['set'].unique():
    mask = out_df['set'] == set_
    data = out_df[mask].progress_apply(lambda row: (row['sentences'], row['tgt_relations_out_format']), axis=1).tolist()
    output_dir = LOCAL_PROCESSED_DATA_PATH / f'dialog-re-babelscape-sredfm'
    os.makedirs(output_dir, exist_ok=True)
    output_path = output_dir / f'{set_}.json'
    
    with open(output_path, mode='w', encoding='utf-8') as fp:
        json.dump(data, fp)


  0%|          | 0/43412 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/464 [00:00<?, ?it/s]

In [53]:
def extract_target_relationships(data, target_rels=None):
    if target_rels is None:
        target_rels = {'child', 'children', 'parent', 'parents', 'father', 'mother',
                       'sister', 'brother', 'sibling', 'siblings', 'pet', 'pets', 'relative', 'relatives'}

    output_data = []

    for row in data['rows']:
        text = row['row']['text']
        relations = row['row']['relations']
        entities = row['row']['entities']

        output_rels = [
            {
                'subject': entities[rel['subject']]['surfaceform'],
                'relation': rel['predicate'],
                'object': entities[rel['object']]['surfaceform']
            }
            for rel in relations if rel['predicate'] in target_rels
        ]

        if output_rels:
            output_data.append((text, output_rels))

    return output_data

# Usage example
# data = {'rows': [...]}  # Your input data
target_relationships = extract_target_relationships(data)
target_relationships

[('Flavius Severianus (died 313) was the son of the Roman Emperor Flavius Valerius Severus.',
  [{'subject': 'Flavius Valerius Severus',
    'relation': 'child',
    'object': 'Flavius Severianus'}])]

In [52]:
data['rows']

[{'row_idx': 0,
  'row': {'docid': '49316154',
   'title': 'Émilie Andéol',
   'uri': 'Q14775089',
   'text': "Émilie Andéol (born 30 October 1987) is a French judoka competing in the women's +78\xa0kg division. She won gold at the 2014 European Judo Championships in Montpellier, and bronze in the 2014 World Judo Championships in Chelyabinsk and gold at the 2015 European Judo Championships in Baku. She won the gold medal at the 2016 Summer Olympics in Rio de Janeiro.",
   'entities': [{'uri': 'Q11420',
     'surfaceform': 'judo',
     'type': 'Concept',
     'start': 49,
     'end': 53},
    {'uri': 'Q16540914',
     'surfaceform': '2014 European Judo Championships',
     'type': 'EVE',
     'start': 118,
     'end': 150},
    {'uri': 'Q18012542',
     'surfaceform': '2014 World Judo Championships',
     'type': 'Concept',
     'start': 185,
     'end': 214},
    {'uri': 'Q18701815',
     'surfaceform': '2015 European Judo Championships',
     'type': 'EVE',
     'start': 246,
     'en

In [21]:
data['rows'][0]['row']['text']
data['rows'][0]['row']['entities']
# data['rows'][0]['row']['relations']



[{'subject': 5, 'predicate': 'sport', 'object': 0},
 {'subject': 5, 'predicate': 'date of birth', 'object': 6},
 {'subject': 1, 'predicate': 'participant', 'object': 5},
 {'subject': 2, 'predicate': 'participant', 'object': 5},
 {'subject': 4, 'predicate': 'participant', 'object': 5},
 {'subject': 1, 'predicate': 'point in time', 'object': 7},
 {'subject': 1, 'predicate': 'point in time', 'object': 8},
 {'subject': 2, 'predicate': 'point in time', 'object': 7},
 {'subject': 2, 'predicate': 'point in time', 'object': 8},
 {'subject': 3, 'predicate': 'point in time', 'object': 9}]