# Check GPU

In [1]:
import torch

# Ask PyTorch whether a CUDA device is usable and report the answer.
gpu_available = torch.cuda.is_available()
print(f"GPU Available: {gpu_available}")

# Only query the device name when CUDA is available; device index 0 is the one inspected.
if gpu_available:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU Name: {gpu_name}")

GPU Available: False


# Load cleaned annotated SWiPE dataset

In [2]:
from datasets import load_from_disk

# Load the saved dataset from the project-relative data directory; the bare
# name on the last line makes the notebook display its repr.
swipe_clean_dataset = load_from_disk("../data/swipe_clean_annotated")
swipe_clean_dataset

DatasetDict({
    train: Dataset({
        features: ['r_content', 's_content', 'annotations', 'edits'],
        num_rows: 3861
    })
    validation: Dataset({
        features: ['r_content', 's_content', 'annotations', 'edits'],
        num_rows: 479
    })
    test_id: Dataset({
        features: ['r_content', 's_content', 'annotations', 'edits'],
        num_rows: 483
    })
    test_ood: Dataset({
        features: ['r_content', 's_content', 'annotations', 'edits'],
        num_rows: 368
    })
})

# Inspect elaborative simplification samples

## Count number of elaborations in the dataset

In [5]:
from collections import defaultdict

def count_specific_annotations(dataset, specific_categories):
    """Tally how often each requested annotation category occurs in a dataset.

    Args:
        dataset: Iterable of examples; each example is indexed with
            ``'annotations'`` and yields either a falsy value (``None``, empty
            list) or an iterable of mappings that have a ``'category'`` key.
        specific_categories: Container of category names to count.

    Returns:
        A ``defaultdict(int)`` mapping category name to its number of
        occurrences. Categories that never occur are absent (and read as 0).
    """
    tallies = defaultdict(int)
    for record in dataset:
        # `or ()` skips examples whose annotations are None or empty.
        for entry in record['annotations'] or ():
            label = entry['category']
            if label in specific_categories:
                tallies[label] += 1
    return tallies

# elaboration categories
specific_categories = [
    'semantic_elaboration_background',
    'semantic_elaboration_example',
    'semantic_elaboration_generic',
]

# Count for each dataset: one counter per split, unpacked in split order.
(
    train_specific_counts,
    val_specific_counts,
    test_id_specific_counts,
    test_ood_specific_counts,
) = (
    count_specific_annotations(swipe_clean_dataset[split_name], specific_categories)
    for split_name in ('train', 'validation', 'test_id', 'test_ood')
)

In [6]:
# Per-split number of 'semantic_elaboration_generic' annotations.
category = 'semantic_elaboration_generic'
print(category)
for label, counts in (
    ("Train :", train_specific_counts),
    ("Validation:", val_specific_counts),
    ("Test ID:", test_id_specific_counts),
    ("Test OOD:", test_ood_specific_counts),
):
    print(label, counts[category])

semantic_elaboration_generic
Train : 2260
Validation: 270
Test ID: 295
Test OOD: 356


In [7]:
# Per-split number of 'semantic_elaboration_background' annotations.
category = 'semantic_elaboration_background'
print(category)
for label, counts in (
    ("Train :", train_specific_counts),
    ("Validation:", val_specific_counts),
    ("Test ID:", test_id_specific_counts),
    ("Test OOD:", test_ood_specific_counts),
):
    print(label, counts[category])

semantic_elaboration_background
Train : 604
Validation: 76
Test ID: 81
Test OOD: 39


In [8]:
# Per-split number of 'semantic_elaboration_example' annotations.
category = 'semantic_elaboration_example'
print(category)
for label, counts in (
    ("Train :", train_specific_counts),
    ("Validation:", val_specific_counts),
    ("Test ID:", test_id_specific_counts),
    ("Test OOD:", test_ood_specific_counts),
):
    print(label, counts[category])

semantic_elaboration_example
Train : 98
Validation: 7
Test ID: 20
Test OOD: 13


In [29]:
# Show the annotation records attached to the training example at index 1.
swipe_clean_dataset['train'][1]['annotations']

[{'category': 'lexical_generic', 'gi': 0, 'opis': [1, 2]},
 {'category': 'lexical_generic', 'gi': 1, 'opis': [4, 5, 6]},
 {'category': 'syntactic_deletion', 'gi': 2, 'opis': [8]},
 {'category': 'syntactic_sentence_fusion', 'gi': 3, 'opis': [12, 11, 10]},
 {'category': 'lexical_generic', 'gi': 4, 'opis': [14, 15]},
 {'category': 'syntactic_deletion', 'gi': 5, 'opis': [17]},
 {'category': 'nonsim_format', 'gi': None, 'opis': [19, 20]}]

## Save each elaboration category in dataframe

In [30]:
import pandas as pd
from tqdm.notebook import tqdm

def get_texts_with_specific_annotations(dataset_dict, specific_categories):
    """Collect, per category, one row for every matching annotation in every split.

    Args:
        dataset_dict: Mapping of split name to a sized iterable of examples.
            Each example is read through the keys ``'annotations'``,
            ``'r_content'`` and ``'s_content'``.
        specific_categories: Iterable of category names to extract.

    Returns:
        A dict mapping each requested category to a DataFrame with the columns
        ``id`` (position of the example inside its split), ``split``, ``text``
        (``r_content``), ``reference`` (``s_content``) and ``annotation``
        (the category name). An example contributes one row per matching
        annotation, so it can appear several times in the same frame.
    """
    column_order = ('id', 'split', 'text', 'reference', 'annotation')
    rows_by_category = {label: [] for label in specific_categories}

    for split_name, split_data in dataset_dict.items():
        for row_idx, record in tqdm(enumerate(split_data), total=len(split_data)):
            # Examples whose annotations are None or empty contribute nothing.
            for entry in record['annotations'] or ():
                label = entry['category']
                if label not in specific_categories:
                    continue
                rows_by_category[label].append(
                    (row_idx, split_name, record['r_content'], record['s_content'], label)
                )

    # Transpose each category's row tuples into named columns, one frame per category.
    return {
        label: pd.DataFrame({
            column: [row[position] for row in rows]
            for position, column in enumerate(column_order)
        })
        for label, rows in rows_by_category.items()
    }

# elaboration categories
# Rebinds specific_categories to the same three labels used by the counting cell earlier in the notebook.
specific_categories = ['semantic_elaboration_background', 'semantic_elaboration_example', 'semantic_elaboration_generic']

In [31]:
# Build one DataFrame per elaboration category, scanning every split of the dataset.
dataframes = get_texts_with_specific_annotations(swipe_clean_dataset, specific_categories)

  0%|          | 0/3861 [00:00<?, ?it/s]

  0%|          | 0/479 [00:00<?, ?it/s]

  0%|          | 0/483 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

In [32]:
# Pull the three per-category frames out of the lookup in one unpacking step.
background_df, example_df, generic_df = (
    dataframes[category_name]
    for category_name in (
        'semantic_elaboration_background',
        'semantic_elaboration_example',
        'semantic_elaboration_generic',
    )
)

In [33]:
# Report how many rows (one per matching annotation) each category frame holds.
for label, frame in (
    ('Semantic Elaboration background: ', background_df),
    ('Semantic Elaboration example: ', example_df),
    ('Semantic Elaboration generic: ', generic_df),
):
    print(label, len(frame))

Semantic Elaboration background:  800
Semantic Elaboration example:  138
Semantic Elaboration generic:  3181


In [35]:
from pathlib import Path

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a fresh checkout may not have it).
elaborations_dir = Path('../data/elaborations')
elaborations_dir.mkdir(parents=True, exist_ok=True)

# One CSV per elaboration category; index=False keeps the row index out of the file.
background_df.to_csv(elaborations_dir / 'background.csv', index=False)
example_df.to_csv(elaborations_dir / 'example.csv', index=False)
generic_df.to_csv(elaborations_dir / 'generic.csv', index=False)

# Investigate main features of elaboration texts

In [54]:
from IPython.core.display import HTML

def highlight_elaborations(text, annotation, edits):
    """Render `text` as HTML with the words of one annotation category highlighted.

    The text is split on whitespace; every word whose index appears in the
    ``'opis'`` list of an entry whose ``'category'`` equals `annotation` is
    wrapped in a green ``<span>``. The result is shown through the notebook's
    ``display(HTML(...))``; nothing is returned.

    Args:
        text: String to render.
        annotation: Category name whose positions should be highlighted.
        edits: Iterable of mappings with ``'category'`` and ``'opis'`` keys,
            or ``None``/empty for "nothing to highlight".

    NOTE(review): ``opis`` values are treated here as whitespace-token indices
    into `text`. Confirm against the dataset schema that they are not indices
    into some other sequence (e.g. an edit-operation list) — TODO verify.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    # split into words
    words = text.split()

    # positions of all words that belong to the requested category
    elaboration_positions = {
        position
        for edit in edits or ()
        if edit['category'] == annotation
        for position in edit['opis']
    }

    # Escape every word so that '<', '>' and '&' in the source text are shown
    # literally instead of being interpreted as markup.
    highlighted_words = []
    for idx, word in enumerate(words):
        safe_word = escape(word, quote=False)
        if idx in elaboration_positions:
            # Wrap the word in a span with green background
            highlighted_words.append(f'<span style="background-color: #90EE90">{safe_word}</span>')
        else:
            highlighted_words.append(safe_word)

    # join the words back into a single text
    highlighted_text = ' '.join(highlighted_words)

    # display as HTML (`display` is provided by IPython, `HTML` by the cell's import)
    display(HTML(highlighted_text))

In [63]:
import random

random.seed(42)  # seeds Python's `random` module only; the pandas sampling below is seeded via random_state

def get_random_samples(df, num_samples=10, dataset_dict=None):
    """Print a reproducible random selection of rows together with their annotations.

    For every sampled row the source text, the reference text and the full
    annotation list of the underlying dataset record are printed, and the
    text is rendered through `highlight_elaborations` for the row's category.

    Args:
        df: DataFrame with the columns ``id``, ``split``, ``text``,
            ``reference`` and ``annotation``.
        num_samples: Maximum number of rows to show. When the frame has fewer
            rows, all of them are shown instead of raising.
        dataset_dict: Mapping of split name to an indexable dataset used to
            look up each record. Defaults to the notebook-level
            ``swipe_clean_dataset``.

    Returns:
        None. Output is printed / displayed.
    """
    if dataset_dict is None:
        # Fall back to the notebook-level dataset, as the function originally did.
        dataset_dict = swipe_clean_dataset

    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so clamp the request to the frame size.
    samples = df.sample(n=min(num_samples, len(df)), random_state=42)
    for _, sample in samples.iterrows():

        # Cast to a plain int so the lookup does not depend on the dataset
        # accepting NumPy integer scalars as an index.
        idx = int(sample['id'])
        ds_split = sample['split']
        category = sample['annotation']
        record = dataset_dict[ds_split][idx]

        print('Text: ', sample['text'],end  ='\n\n')
        print('Ref: ', sample['reference'],end='\n\n')
        print('Edits :',record['annotations'], end='\n\n')
        highlight_elaborations(sample['text'],category, record['annotations'])
        print('-' * 130, end='\n\n')

## Example elaborations

In [42]:
import pandas as pd

# Reload the example-category rows from the CSV exported earlier in this notebook,
# rebinding example_df to the on-disk copy.
# NOTE(review): the recorded output of this cell shows an 'Unnamed: 0' column, which a
# file written with index=False would not contain — the output may predate the current
# export cell; re-run both cells to confirm.
example_df = pd.read_csv('../data/elaborations/example.csv')
example_df.head()

Unnamed: 0,id,split,text,reference,annotation
0,21,train,Ferdinand Alexander Araneta Marcos III (Englis...,Ferdinand Alexander Araneta Marcos III (Tagalo...,semantic_elaboration_example
1,24,train,"Louis Antoine of France, Duke of Angoulême (6 ...","Louis Antoine of France, Duke of Angoulême (6 ...",semantic_elaboration_example
2,25,train,"In musical terminology, tempo (Italian for 'ti...","Tempo (Italian for 'time, movement') is the sp...",semantic_elaboration_example
3,49,train,"A land bridge, in biogeography,what history’en...",A land bridge is a term in biogeography. It is...,semantic_elaboration_example
4,71,train,The Dytiscidae – based on the Greek dytikos (δ...,"The Dytiscidae (Greek dytikos (δυτικός), ""able...",semantic_elaboration_example


In [64]:
# NOTE(review): this cell sits under the "Example elaborations" heading and follows the
# reload of example_df, yet it samples background_df — confirm which frame is intended.
get_random_samples(background_df)

Text:  Shigeo Sugimoto (杉本 茂雄, Sugimoto Shigeo, December 4, 1926 – April 2, 2002) was a Japanese football player. He played for Japan national team.

Ref:  Shigeo Sugimoto (born 4 December 1926 - died 2 April 2002) is a former Japanese football player. He has played for Japan national team.

Edits : [{'category': 'discourse_reordering', 'gi': 0, 'opis': [1, 4]}, {'category': 'lexical_entity', 'gi': 1, 'opis': [2]}, {'category': 'discourse_reordering', 'gi': 2, 'opis': [6, 8]}, {'category': 'syntactic_generic', 'gi': 3, 'opis': [10, 11]}, {'category': 'semantic_elaboration_background', 'gi': 4, 'opis': [13]}, {'category': 'syntactic_generic', 'gi': None, 'opis': [15]}]



----------------------------------------------------------------------------------------------------------------------------------

Text:  Yutakayama Hiromitsu (豊山広光) (born 22 October 1947 as Hiromitsu Nagahama) is a former sumo wrestler from Shibata, Niigata, Japan. A former amateur champion, he turned professional in 1970. His highest rank was komusubi. He wrestled for Tokitsukaze stable and took his shikona or fighting name from the head coach who recruited him, former ozeki Yutakayama Katsuo. After his retirement in 1981 he became an elder of the Japan Sumo Association, and founded the Minato stable which he led from 1982 until 2010.

Ref:  Yutakayama Hiromitsu (豊山広光, (22 October 1947 – 19 September 2020), born Hiromitsu Nagahama (長濱廣光, Nagahama Hiromitsu)) was a sumo wrestler. He was born in Shibata, Niigata. He turned professional in 1970. His highest rank was komusubi. He wrestled for Tokitsukaze stable. After his retirement in 1981 he became an elder of the Japan Sumo Associati

----------------------------------------------------------------------------------------------------------------------------------

Text:  The Paris Institute of Political Studies (French: Institut d'études politiques de Paris, French pronunciation: ​[(l)ɛ̃stity detyd pɔlitik dəpaˈʁi];), simply referred to as Sciences Po (French pronunciation: ​[sjɑ̃sˈpo]), is a public research and higher education institution in Paris, France. Established in 1872 as the École libre des sciences politiques, Sciences Po has educated France's political elites and is considered one of the most prestigious universities in the world for the social sciences. Sciences Po maintains faculties in political science as well as in economics, history, sociology, law, finance, business, communication, social and urban policy, management and journalism. It is ranked 5th in the world for Politics and International Studies in 2015 (1st in France). The School has produced many notable alumni in the fields of law, economi

----------------------------------------------------------------------------------------------------------------------------------

Text:  Isabelle Geneviève Marie Anne "France" Gall (9 October 1947 – 7 January 2018) was a French yé-yé singer. She won the Eurovision Song Contest in 1965. She was married to, and had a successful singing career in partnership with, the late French singer-songwriter Michel Berger, until his death. The couple had two children.

Ref:  Isabelle Geneviève Marie Anne "France" Gall (9 October 1947 – 7 January 2018) was a French singer. She was born in Paris. Gall was known for her songs "Laisse tomber les filles", "Poupée de cire, poupée de son", and "Ella, elle l'a". She was a Eurovision Song Contest contestant for the 1965 contest. Gall was married to Michel Berger, until his death. Gall died of an infection complicated from cancer at a Paris hospital on 7 January 2018 at the age of 70.

Edits : [{'category': 'semantic_deletion', 'gi': None, 'opis': [1]}, {'c

----------------------------------------------------------------------------------------------------------------------------------

Text:  Kilner jar is a rubber-sealed, screw-topped jar used for the storage of food, which was invented by the Kilner family and produced by John Kilner & Co., Yorkshire, England. Classically, it was a glass plug with a rubber seal attached to it in the top, with the whole being secured with a metal screw-top lid. Contemporary "Kilner-style" jars usually have a lid made entirely of metal. Kilner jars are used for storing and preserving home-made jams, marmalades and other relishes. They are also used for pickling food such as eggs, onions and garlic. In 2000 The Rayware Group purchased the design, patent and trademark for the original Kilner Jar and remains committed to developing the much loved and respected brand. John Kilner was found to be the great-great-great grandfather of Jeremy Clarkson.

Ref:  A Kilner jar is a jar that has a rubber top. It does 

----------------------------------------------------------------------------------------------------------------------------------

Text:  Ivica Kralj (Serbian Cyrillic: Ивица Краљ) (born March 26, 1973 in Tivat, Montenegro, SFR Yugoslavia) is a professional football goalkeeper. His height is 197 cm and he weighs 90 kg. His previous clubs are Arsenal Tivat, Jastrebac Proleter Niš, FK Partizan Belgrade, FC Porto and PSV Eindhoven. He made his international debut in a 3-2 victory away at Argentina, and went on to play 41 times for Yugoslavia. He played in all of Yugoslavia's France 98 World cup games, and in 6 of Yugoslavia's 8 Euro 2000 Qualifying Games, where he conceded 6 goals. His surname "Kralj" means "King" in Serbian. In early August 2007, Kralj and Russian side FC Rostov agreed to a contract that ties the goalkeeper to the club until the end of the Russian league season in December 2007. At the time of Kralj's signing the club was already in a bad position with slim chances of a

----------------------------------------------------------------------------------------------------------------------------------

Text:  Mark Steven Kirk (born September 15, 1959) is the junior United States Senator from Illinois and a member of the Republican Party. Previously, Kirk was a member of the U.S. House of Representatives, representing Illinois's 10th congressional district. Kirk is a social moderate and fiscal conservative. Born in Champaign, Illinois, he graduated from Cornell University, the London School of Economics, and Georgetown University Law Center. He practiced law throughout the 1980s and 1990s. He joined the United States Navy Reserve as a Direct Commission Officer in the Intelligence career field in 1989 and was recalled to active duty for the 1999 NATO bombing of Yugoslavia. He participated in Operation Northern Watch in Iraq the following year. He attained the rank of Commander and retired from the Navy Reserve in 2013. Kirk was elected to the House in 2000

----------------------------------------------------------------------------------------------------------------------------------

Text:  Herne is a village in South East England, divided by the Thanet Way from the seaside resort of Herne Bay. Administratively it is in the civil parish of Herne and Broomfield in Kent. Between Herne and Broomfield is the former hamlet of Hunters Forstal; Herne Common lies to the south. The hamlet of Bullockstone is about one mile to the west.

Ref:  Herne is a small historical village, near Herne Bay in Kent, South East England. The hamlet of Bullockstone is about one mile to the west.

Edits : [{'category': 'semantic_elaboration_background', 'gi': 0, 'opis': [1]}, {'category': 'discourse_reordering', 'gi': 1, 'opis': [3, 7]}, {'category': 'discourse_reordering', 'gi': 2, 'opis': [5, 8]}, {'category': 'semantic_deletion', 'gi': None, 'opis': [9]}]



----------------------------------------------------------------------------------------------------------------------------------

Text:  Pernik (Bulgarian: Перник [ˈpɛrnik]) is a town in western Bulgaria (about 20 kilometres (12 miles) south-west of Sofia) with a population of 75,964 as of 2020[update]. Pernik is the most populated town in western Bulgaria after Sofia. It is the main town of Pernik Province and lies on both banks of the Struma River in the Pernik Valley between the Golo Bardo Mountain, Vitosha Mountain, Lyulin and Viskyar mountains. Pernik is the principal town of Pernik Province – a province in western Bulgaria, which is next to the Serbian border. Originally the site of a Thracian fortress founded in the 4th century BC, and later a Roman settlement, Pernik became part of the Bulgarian Empire in the early 9th century as an important fortress. The medieval town was a key Bulgarian stronghold during Bulgarian tsar Samuil's wars against the Byzantine Empire in the 11th

----------------------------------------------------------------------------------------------------------------------------------

Text:  Andre Tyler Iguodala (Also Known as king Slaw)(ig-wə-DAH-lə; born January 28, 1984) is an American professional basketball player who currently plays for the Golden State Warriors of the National Basketball Association (NBA). The swingman was an NBA All-Star in 2012 and has been named to the NBA All-Defensive Team twice. Iguodala won an NBA championship with the Warriors in 2015, when he was named the NBA Finals Most Valuable Player (MVP). He was also a member of the United States national team at the 2010 FIBA World Championship and 2012 Summer Olympics, winning the gold medal both times. Iguodala went to Lanphier High School in Springfield, Illinois before attending the University of Arizona. After completing two years, he declared for the 2004 NBA draft where he was drafted with the ninth overall pick by the Philadelphia 76ers. Iguodala played in

----------------------------------------------------------------------------------------------------------------------------------

