# SemEval-2016 Task 2: Interpretable Semantic Textual Similarity

- https://aclanthology.org/S16-1082/
- https://alt.qcri.org/semeval2016/task2/

In [1]:
from pathlib import Path

# Root folder for downloaded corpora (relative path, resolved against the working directory).
data_dir = Path('../data')

# Hugging Face Hub account under which the dataset repository is published.
hub_username = 'marcov'

corpus_name = 'SEMEVAL_2016_TASK_2'
# Use https directly: the plain-http address only answers with a 302 redirect to this URL.
corpus_url = 'https://alt.qcri.org/semeval2016/task2/data/uploads/train_2015_10_22.utf-8.tar.gz'
corpus_filename = 'train_2015_10_22.utf-8.tar.gz'
corpus_dir = data_dir / corpus_name
# Create the target folder up front so the download/extract steps can write into it.
corpus_dir.mkdir(parents=True, exist_ok=True)

# Word-alignment files; the archive unpacks into a 'train_2015_10_22.utf-8' sub-folder.
headlines_path = corpus_dir / 'train_2015_10_22.utf-8' / 'STSint.input.headlines.wa'
images_path = corpus_dir / 'train_2015_10_22.utf-8' / 'STSint.input.images.wa'

In [2]:
# Download the corpus archive into corpus_dir (-P); -nc (no-clobber) skips the
# download when the file is already present, so re-running the cell is cheap.
!wget -nc {corpus_url} -P {corpus_dir}
# Unpack the gzip-compressed archive inside corpus_dir (-C), listing each extracted file (v).
!tar xzvf {corpus_dir}/{corpus_filename} -C {corpus_dir}

--2024-05-08 11:25:17--  http://alt.qcri.org/semeval2016/task2/data/uploads/train_2015_10_22.utf-8.tar.gz
Resolving alt.qcri.org (alt.qcri.org)... 37.186.61.205
Connecting to alt.qcri.org (alt.qcri.org)|37.186.61.205|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://alt.qcri.org/semeval2016/task2/data/uploads/train_2015_10_22.utf-8.tar.gz [following]
--2024-05-08 11:25:18--  https://alt.qcri.org/semeval2016/task2/data/uploads/train_2015_10_22.utf-8.tar.gz
Connecting to alt.qcri.org (alt.qcri.org)|37.186.61.205|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 322368 (315K) [application/x-gzip]
Saving to: ‘../data/SEMEVAL_2016_TASK_2/train_2015_10_22.utf-8.tar.gz’


2024-05-08 11:25:22 (372 KB/s) - ‘../data/SEMEVAL_2016_TASK_2/train_2015_10_22.utf-8.tar.gz’ saved [322368/322368]

x train_2015_10_22.utf-8/STSint.input.headlines.wa
x train_2015_10_22.utf-8/00-readme.txt
x train_2015_10_22.utf-8/STSint.input.images.wa


In [3]:
import re
from datasets import Dataset

# Matches one <sentence ...> ... </sentence> record and exposes its parts as the
# named groups: id, status, sent1, sent2, source, translation, alignment.
# - re.VERBOSE: literal whitespace inside the pattern is ignored, so real
#   whitespace in the input is matched explicitly with \s.
# - re.DOTALL: '.' also matches newlines, letting the lazy (.*?) groups span the
#   multi-line bodies of <source>, <translation> and <alignment>.
# NOTE(review): sent1 is matched lazily up to the next '//', so a sentence that
# itself contains '//' would presumably be cut short — confirm against the data.
sentence_pattern = re.compile(r"""
<sentence \s id="(?P<id>[^"]*)" \s status="(?P<status>[^"]*)"> \s*
// (?P<sent1>.*?) \s*
// (?P<sent2>.*?) \s*
<source>(?P<source>.*?)</source> \s*
<translation>(?P<translation>.*?)</translation> \s*
<alignment>(?P<alignment>.*?)</alignment> \s*
</sentence>
""", re.DOTALL | re.VERBOSE)

def load_dataset(path):
    """Read a word-alignment file and return its sentence pairs as a Dataset.

    Note: this notebook-local helper has the same name as the ``datasets``
    library's ``load_dataset`` function, which is not imported here.

    Args:
        path: A ``pathlib.Path`` pointing at the file to parse.

    Returns:
        A ``Dataset`` with one row (as built by ``parse_match``) for every
        ``sentence_pattern`` match found in the file, in file order.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 everywhere.
    text = path.read_text(encoding='utf-8')
    return Dataset.from_list([
        parse_match(m)
        for m in sentence_pattern.finditer(text)
    ])

def parse_match(m):
    """Turn one ``sentence_pattern`` match into a dataset record.

    Args:
        m: A match object indexable by group name, providing the groups
            'id', 'sent1', 'sent2', 'source', 'translation' and 'alignment'.

    Returns:
        A dict with keys:
            'id': the sentence id as an int,
            'sent1' / 'sent2': the two sentences with surrounding whitespace removed,
            'sent1_tokens' / 'sent2_tokens': token lists from ``parse_tokens``,
            'alignment': alignment entries from ``parse_alignment``.

    Raises:
        ValueError: if the 'id' group is not an integer literal.
    """
    return {
        'id': int(m['id']),
        'sent1': m['sent1'].strip(),
        'sent2': m['sent2'].strip(),
        'sent1_tokens': parse_tokens(m['source']),
        'sent2_tokens': parse_tokens(m['translation']),
        'alignment': parse_alignment(m['alignment']),
    }

def parse_tokens(text):
    """Parse a block of ``<id> <token> ...`` lines into a token list.

    Args:
        text: Lines whose first whitespace-separated field is the token id
            and whose second field is the token; further fields are ignored.

    Returns:
        A list whose element 0 is ``None`` and whose element ``i`` is the
        token with id ``i``, so list positions coincide with the 1-based ids.

    Raises:
        AssertionError: if the ids are not consecutive starting from 1.
        ValueError: if a line has fewer than two fields or a non-integer id.
    """
    # Slot 0 is a placeholder: token ids in the file start at 1.
    result = [None]
    for expected_id, row in enumerate(text.strip().splitlines(), start=1):
        fields = row.split()
        raw_id, word = fields[:2]
        assert int(raw_id) == expected_id
        result.append(word)
    return result

def parse_alignment(text):
    """Parse the lines of an alignment block into a list of alignment dicts.

    Each line is expected to have the form::

        <sent1 ids> <==> <sent2 ids> // <label> // <score> // <free text>

    Args:
        text: The alignment lines, one alignment per line.

    Returns:
        A list with one dict per line, with keys:
            'label': the label field as a string,
            'score': the score as an int ('NIL' is stored as 0),
            'sent1' / 'sent2': lists of int token ids on each side of '<==>'.

    Raises:
        ValueError: if a line has fewer than four ' // '-separated fields,
            does not contain exactly one ' <==> ' in its first field, or
            holds a non-integer score or token id.
    """
    alignments = []
    for line in text.strip().splitlines():
        # Split on the first three separators only: the trailing field is free
        # text and may itself contain ' // ', which must not break parsing.
        token_ids, label, score, _ = line.split(' // ', 3)
        sent1, sent2 = token_ids.split(' <==> ')
        alignments.append({
            'label': label,
            'score': 0 if score == 'NIL' else int(score),
            'sent1': [int(n) for n in sent1.split()],
            'sent2': [int(n) for n in sent2.split()],
        })
    return alignments

In [4]:
from datasets import DatasetDict

# One split per alignment file, keyed by split name (headlines first, then images).
ds = DatasetDict({
    split: load_dataset(wa_path)
    for split, wa_path in (
        ('headlines', headlines_path),
        ('images', images_path),
    )
})

ds

DatasetDict({
    headlines: Dataset({
        features: ['id', 'sent1', 'sent2', 'sent1_tokens', 'sent2_tokens', 'alignment'],
        num_rows: 756
    })
    images: Dataset({
        features: ['id', 'sent1', 'sent2', 'sent1_tokens', 'sent2_tokens', 'alignment'],
        num_rows: 750
    })
})

In [5]:
# Convert the 'headlines' split to a pandas DataFrame; as the cell's last
# expression it is rendered as the cell output for a visual sanity check.
ds['headlines'].to_pandas()

Unnamed: 0,id,sent1,sent2,sent1_tokens,sent2_tokens,alignment
0,1,Former Nazi death camp guard Demjanjuk dead at 91,"John Demjanjuk , convicted Nazi death camp gua...","[None, Former, Nazi, death, camp, guard, Demja...","[None, John, Demjanjuk, ,, convicted, Nazi, de...","[{'label': 'EQUI', 'score': 5, 'sent1': [8, 9]..."
1,2,Saudis to permit women to compete in Olympics,Saudi Women Allowed To Compete At Olympics,"[None, Saudis, to, permit, women, to, compete,...","[None, Saudi, Women, Allowed, To, Compete, At,...","[{'label': 'EQUI', 'score': 5, 'sent1': [5, 6]..."
2,3,US drone strike kills 5 militants in Pakistan,Drone strike kills four suspected militants in...,"[None, US, drone, strike, kills, 5, militants,...","[None, Drone, strike, kills, four, suspected, ...","[{'label': 'EQUI', 'score': 5, 'sent1': [4], '..."
3,4,"Syria peace plan conditions "" unacceptable , ""...",Syria peace dashed as deadline passes,"[None, Syria, peace, plan, conditions, "", unac...","[None, Syria, peace, dashed, as, deadline, pas...","[{'label': 'NOALI', 'score': 0, 'sent1': [10],..."
4,5,Syrian regime denies role in Houla massacre,Syria blames rebels for Houla massacre of over...,"[None, Syrian, regime, denies, role, in, Houla...","[None, Syria, blames, rebels, for, Houla, mass...","[{'label': 'NOALI', 'score': 0, 'sent1': [3], ..."
...,...,...,...,...,...,...
751,752,US Senator McCain meets opposition leaders in ...,SFG meeting reviews situation in Mali,"[None, US, Senator, McCain, meets, opposition,...","[None, SFG, meeting, reviews, situation, in, M...","[{'label': 'NOALI', 'score': 0, 'sent1': [4], ..."
752,753,Tens of Thousands of Ukrainians Protest in Kyiv,Tens of thousands line up to cast votes in Bhutan,"[None, Tens, of, Thousands, of, Ukrainians, Pr...","[None, Tens, of, thousands, line, up, to, cast...","[{'label': 'NOALI', 'score': 0, 'sent1': [4, 5..."
753,754,Obama pledges to reignite economy,Obama to press Congress to act on economy,"[None, Obama, pledges, to, reignite, economy]","[None, Obama, to, press, Congress, to, act, on...","[{'label': 'NOALI', 'score': 0, 'sent1': [0], ..."
754,755,Pak missions in UAE gear up for elections,Islamist parties in Egypt unite for elections,"[None, Pak, missions, in, UAE, gear, up, for, ...","[None, Islamist, parties, in, Egypt, unite, fo...","[{'label': 'SIMI', 'score': 2, 'sent1': [3, 4]..."


In [6]:
# Upload all splits to the Hub repository '<hub_username>/<corpus_name>';
# private=True requests a private repository.
# NOTE(review): presumably requires being authenticated with the Hub beforehand — confirm.
ds.push_to_hub(
    repo_id=f'{hub_username}/{corpus_name}',
    private=True,
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/marcov/SEMEVAL_2016_TASK_2/commit/35a9118f7b6b9db6c1a11a57cb9b18a0c374287d', commit_message='Upload dataset', commit_description='', oid='35a9118f7b6b9db6c1a11a57cb9b18a0c374287d', pr_url=None, pr_revision=None, pr_num=None)