# 0. Setup

In [7]:
from datasets import load_dataset
from pprint import pprint

# 1. Swiss Judgement Prediction

Link: https://huggingface.co/datasets/rcds/swiss_judgment_prediction

## Original languages

In [3]:
# Load the dataset in its original languages (the 'all' configuration)
swiss = load_dataset(
    path='swiss_judgment_prediction',
    name='all',
    trust_remote_code=True,
)

Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/668M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/29.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/59709 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8208 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17357 [00:00<?, ? examples/s]

In [4]:
# Summarise the loaded DatasetDict: its splits, column names and row counts
print(swiss)

DatasetDict({
    train: Dataset({
        features: ['id', 'year', 'text', 'label', 'language', 'region', 'canton', 'legal area', 'source_language'],
        num_rows: 59709
    })
    validation: Dataset({
        features: ['id', 'year', 'text', 'label', 'language', 'region', 'canton', 'legal area', 'source_language'],
        num_rows: 8208
    })
    test: Dataset({
        features: ['id', 'year', 'text', 'label', 'language', 'region', 'canton', 'legal area', 'source_language'],
        num_rows: 17357
    })
})


In [8]:
# Pretty-print the first example of every split, in the order
# training -> validation -> test
for split_name in ('train', 'validation', 'test'):
    pprint(swiss[split_name][0])

{'canton': 'zh',
 'id': 2,
 'label': 0,
 'language': 'de',
 'legal area': 'insurance law',
 'region': 'Zürich',
 'source_language': 'n/a',
 'text': 'A.- Der 1955 geborene V._ war seit 1. September 1986 hauptberuflich '
         'als technischer Kaufmann bei der Firma A._ AG tätig und im Rahmen '
         'einer Nebenbeschäftigung (Nachtarbeit) ab Mai 1990 bei einem '
         'Bewachungsdienst angestellt gewesen, als er am 10. Februar 1991 in '
         'Norwegen beim Hundeschlittenfahren eine Muskelruptur im Bereich des '
         'linken Oberschenkels erlitt. Die Verletzung wurde am 26. Februar '
         '1991 mittels Muskelnaht operativ versorgt (Bericht des Dr. med. B._, '
         'Oberarzt, Chirurgische Klinik X._ vom 28. Februar 1991). '
         'Beweglichkeits- und Sensibilitätsausfälle führten zum Beizug des Dr. '
         'med. W._, Spezialarzt FMH Neurologie, welcher eine Ischiadicusparese '
         'links, wahrscheinlich traumatisch bedingt, diagnostizierte (Bericht '
  

## English translation

In [10]:
# Load the variant that also contains translated documents (the 'all+mt' configuration)
swiss_trans = load_dataset(
    path='swiss_judgment_prediction',
    name='all+mt',
    trust_remote_code=True,
)

Generating train split:   0%|          | 0/238818 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8208 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17357 [00:00<?, ? examples/s]

In [11]:
# Summarise the translated DatasetDict: its splits, column names and row counts
print(swiss_trans)

DatasetDict({
    train: Dataset({
        features: ['id', 'year', 'text', 'label', 'language', 'region', 'canton', 'legal area', 'source_language'],
        num_rows: 238818
    })
    validation: Dataset({
        features: ['id', 'year', 'text', 'label', 'language', 'region', 'canton', 'legal area', 'source_language'],
        num_rows: 8208
    })
    test: Dataset({
        features: ['id', 'year', 'text', 'label', 'language', 'region', 'canton', 'legal area', 'source_language'],
        num_rows: 17357
    })
})


In [12]:
# Pretty-print the first example of every split of the translated dataset,
# in the order training -> validation -> test
for split_name in ('train', 'validation', 'test'):
    pprint(swiss_trans[split_name][0])

{'canton': 'zh',
 'id': 2,
 'label': 0,
 'language': 'de',
 'legal area': 'insurance law',
 'region': 'Zürich',
 'source_language': 'n/a',
 'text': 'A.- Der 1955 geborene V._ war seit 1. September 1986 hauptberuflich '
         'als technischer Kaufmann bei der Firma A._ AG tätig und im Rahmen '
         'einer Nebenbeschäftigung (Nachtarbeit) ab Mai 1990 bei einem '
         'Bewachungsdienst angestellt gewesen, als er am 10. Februar 1991 in '
         'Norwegen beim Hundeschlittenfahren eine Muskelruptur im Bereich des '
         'linken Oberschenkels erlitt. Die Verletzung wurde am 26. Februar '
         '1991 mittels Muskelnaht operativ versorgt (Bericht des Dr. med. B._, '
         'Oberarzt, Chirurgische Klinik X._ vom 28. Februar 1991). '
         'Beweglichkeits- und Sensibilitätsausfälle führten zum Beizug des Dr. '
         'med. W._, Spezialarzt FMH Neurologie, welcher eine Ischiadicusparese '
         'links, wahrscheinlich traumatisch bedingt, diagnostizierte (Bericht '
  

In [14]:
# Collect the distinct values of the "source_language" column in the training set
train_source_column = swiss_trans['train']['source_language']
source_languages = {value for value in train_source_column}
print(source_languages)

{'fr', 'de', 'it', 'n/a'}


In [15]:
# Keep only the training rows whose source language is set (anything but "n/a")
# and show the first one
rows_with_source = swiss_trans['train'].filter(
    lambda example: example['source_language'] != 'n/a'
)
pprint(rows_with_source[0])

Filter:   0%|          | 0/238818 [00:00<?, ? examples/s]

{'canton': 'n/a',
 'id': 0,
 'label': 0,
 'language': 'de',
 'legal area': 'insurance law',
 'region': 'n/a',
 'source_language': 'fr',
 'text': 'A.-a) L._ arbeitet seit 1981 als Gesundheitsinstaller in der '
         'Schweiz. Am 20. Februar 1992, während seiner Arbeit, fiel er von '
         'einer Skala. Konsultiert am Tag nach dem Vorfall stellte der Arzt '
         'D._ die Diagnose von cervikalischer Entorse, Lombakontusionen und '
         'Verzerrung des rechten Schulters mit mehreren Hematomen fest; er '
         'zeigte ab dem Tag der Konsultation eine vollständige '
         'Arbeitslosigkeit (Bericht vom 4. März 1992). Die schweizerische '
         'nationale Unfallkasse (CNA) hat den Fall übernommen. Seitdem klagt '
         'L._ sich über cervikale Schmerzen, Gehirnschmerzen, Schmerzen sowie '
         'kontinuierliche Wärmegefühle und nimmt keine berufliche Aktivität '
         'mehr auf. Er hat mehrere Untersuchungen unterzogen, deren Ergebnisse '
         'innerhalb de

In [None]:
# Collect the distinct values of the "language" column in the training set
train_language_column = swiss_trans['train']['language']
target_languages = {value for value in train_language_column}
print(target_languages)

{'fr', 'en', 'de', 'it'}


In [17]:
# Keep only the training rows whose language is "en" and show the first one
english_rows = swiss_trans['train'].filter(
    lambda example: example['language'] == 'en'
)
pprint(english_rows[0])

Filter:   0%|          | 0/238818 [00:00<?, ? examples/s]

{'canton': 'sg',
 'id': 2,
 'label': 1,
 'language': 'en',
 'legal area': 'insurance law',
 'region': 'Eastern Switzerland',
 'source_language': 'de',
 'text': 'A. The 1945-born S._ signed on the 20th. In January 1997, the '
         'Invalid Insurance due to pain in the back, neck and right leg to the '
         'service transfer. The fourth place of St. Gallen took among other '
         'things. Doctors from Dr. L. 1. April 1997 as well as Dr. H._, head '
         'doctor of the Clinic for Neurosurgery, Canton Hospital X._, of 29. '
         'October 1996, 9 and 20. in June 1997. On the basis of this, the '
         'IV-point refused the right to a disability pension (arrest of 4. in '
         'August 1997). B. The Court of Justice of the Canton. Decision of '
         '17th. September 1999 good, he raised the order from 4. In August '
         '1997, the case was submitted back to the IV position, so that it '
         'understood in the sense of the considerations and then regain

In [32]:
# Look up the untranslated German row (source_language == 'n/a') that has id 2,
# the same id as the English sample shown above, and print the first match.
# NOTE(review): the row printed here has canton 'zh' / label 0, whereas the English
# sample has canton 'sg' / label 1 -- so `id` alone may not identify the same judgment
# across original and translated rows. Verify the pairing before comparing translations.
original_language = swiss_trans['train'].filter(lambda x: (x['id'] == 2) and (x['language'] == 'de') and (x['source_language'] == 'n/a'))
pprint(original_language[0])

{'canton': 'zh',
 'id': 2,
 'label': 0,
 'language': 'de',
 'legal area': 'insurance law',
 'region': 'Zürich',
 'source_language': 'n/a',
 'text': 'A.- Der 1955 geborene V._ war seit 1. September 1986 hauptberuflich '
         'als technischer Kaufmann bei der Firma A._ AG tätig und im Rahmen '
         'einer Nebenbeschäftigung (Nachtarbeit) ab Mai 1990 bei einem '
         'Bewachungsdienst angestellt gewesen, als er am 10. Februar 1991 in '
         'Norwegen beim Hundeschlittenfahren eine Muskelruptur im Bereich des '
         'linken Oberschenkels erlitt. Die Verletzung wurde am 26. Februar '
         '1991 mittels Muskelnaht operativ versorgt (Bericht des Dr. med. B._, '
         'Oberarzt, Chirurgische Klinik X._ vom 28. Februar 1991). '
         'Beweglichkeits- und Sensibilitätsausfälle führten zum Beizug des Dr. '
         'med. W._, Spezialarzt FMH Neurologie, welcher eine Ischiadicusparese '
         'links, wahrscheinlich traumatisch bedingt, diagnostizierte (Bericht '
  

Translation by DeepL:

'A.- V._, born in 1955, had been working full-time '
 'as a technical salesman for the company A._ AG since September 1, 1986 and had been employed as part of '
 'a secondary occupation (night work) by a '
 'security service from May 1990 when he suffered a muscle rupture in the area of his '
 'left thigh while dog sledding in '
 'Norway on February 10, 1991. The injury was surgically treated on February 26, '
 '1991 by means of muscle suturing (report by Dr. med. B._, '
 'Senior Physician, Surgical Clinic X._ dated February 28, 1991). '
 'Loss of mobility and sensitivity led to the consultation of Dr. '
 'med. W._, specialist in FMH Neurology, who diagnosed a left ischiadicus palsy '
 'probably caused by trauma (report '
 'dated March 5, 1991). Dr. med. S._, specialist in surgery FMH, '
 'Chief Physician Hospital X._ (report dated October 28, 1992) confirmed that '
 'pressure damage '
 'to the sciatic nerve had occurred during the operation on February 10, 1991. With a splint and '
 ' support shoe, V._ was, however, able to walk without pain or a limp; his '
 ' ability to work was neither restricted with regard to his activity as a businessman '
 ' nor with regard to the secondary occupation as a '
 ' guard, which he had now given up. With the uncontested and legally binding '
 'decision of 16 December...

**Conclusion**: the English translation shipped with the dataset is of very poor quality. Translation APIs that we could use instead:
1. DeepL: https://www.deepl.com/en/pro#developer (€5 per month + €20 per 1,000,000 characters)
2. Google Cloud Translation API (Google Translate API): https://cloud.google.com/translate/docs ($300 free credit, but very expensive after that)
3. LibreTranslate (free, but the translation quality is not very good): https://libretranslate.com/

## How much would translating the original dataset cost? Counting the number of characters of the original dataset

In [35]:
def count_total_characters(data, field='text'):
    """Return the total number of characters stored under ``field`` across ``data``.

    Args:
        data: Iterable of mapping-like records (for example one split of a
            dataset); each record must expose ``field`` as a sized value
            such as a string.
        field: Key of the value to measure in every record. Defaults to
            ``'text'``, so existing single-argument calls behave as before.

    Returns:
        int: Sum of ``len(record[field])`` over all records; ``0`` when
        ``data`` is empty.

    Raises:
        KeyError: If a dict record does not contain ``field``.
    """
    return sum(len(record[field]) for record in data)

# Character totals of the individual splits
total_characters_train = count_total_characters(swiss['train'])
total_characters_validation = count_total_characters(swiss['validation'])
total_characters_test = count_total_characters(swiss['test'])

# Report them in the order training -> validation -> test
for split_total in (total_characters_train, total_characters_validation, total_characters_test):
    print(f"Total characters: {split_total}")

# Total number of characters over all three splits
total_characters = sum((total_characters_train, total_characters_validation, total_characters_test))
print(f"Total characters in all sets: {total_characters}")

Total characters: 203219455
Total characters: 25056077
Total characters: 59217191
Total characters in all sets: 287492723


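That is about 287 million characters in total. At the DeepL rate listed above (€20 per 1,000,000 characters), translating everything would cost roughly €5,750 in usage fees. Even if we only translated, say, the French judgements (approx. 35% of the total), we would still have to translate about 100M characters (roughly €2,000).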

# 2. PUBHEALTH

Link: https://huggingface.co/datasets/ImperialCollegeLondon/health_fact

In [36]:
# Load the PUBHEALTH fact-checking dataset (all splits)
pubhealth = load_dataset(
    path='health_fact',
    trust_remote_code=True,
)

README.md:   0%|          | 0.00/8.61k [00:00<?, ?B/s]

health_fact.py:   0%|          | 0.00/7.08k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1235 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1225 [00:00<?, ? examples/s]

In [37]:
# Summarise the PUBHEALTH DatasetDict: its splits, column names and row counts
print(pubhealth)

DatasetDict({
    train: Dataset({
        features: ['claim_id', 'claim', 'date_published', 'explanation', 'fact_checkers', 'main_text', 'sources', 'label', 'subjects'],
        num_rows: 9832
    })
    test: Dataset({
        features: ['claim_id', 'claim', 'date_published', 'explanation', 'fact_checkers', 'main_text', 'sources', 'label', 'subjects'],
        num_rows: 1235
    })
    validation: Dataset({
        features: ['claim_id', 'claim', 'date_published', 'explanation', 'fact_checkers', 'main_text', 'sources', 'label', 'subjects'],
        num_rows: 1225
    })
})


In [38]:
# Pretty-print the first example of every split, in the order
# training -> validation -> test
for split_name in ('train', 'validation', 'test'):
    pprint(pubhealth[split_name][0])

{'claim': '"The money the Clinton Foundation took from from foreign '
          'governments while Hillary Clinton was secretary of state ""is '
          'clearly illegal. … The Constitution says you can’t take this '
          'stuff."',
 'claim_id': '15661',
 'date_published': 'April 26, 2015',
 'explanation': '"Gingrich said the Clinton Foundation ""took money from from '
                'foreign governments while (Hillary Clinton) was secretary of '
                'state. It is clearly illegal. … The Constitution says you '
                'can’t take this stuff."" A clause in the Constitution does '
                'prohibit U.S. officials such as former Secretary of State '
                'Hillary Clinton from receiving gifts, or emoluments, from '
                'foreign governments. But the gifts in this case were '
                'donations from foreign governments that went to the Clinton '
                'Foundation, not Hillary Clinton. She was not part of the '
     