# Annotation Setup for DE-EN

We are going to create an annotation batch that consists of 500 samples, such that:
* 450 samples come from the batch of the 5000 lowest-scoring COMET-QE translations (after the filtering below)
* 50 samples come from a control group (20 good translations and 30 hallucinations for which we know the correct annotation)

In [1]:
import pandas as pd 
import numpy as np 
import pickle
from tqdm import tqdm

## Low COMET-QE Translations

In [127]:
# Load the held-out translations of the low COMET-QE subset (with bicleaner scores).
# NOTE(review): absolute, machine-specific path — the notebook only runs on the original host.
df_lowcomet = pd.read_pickle(
    filepath_or_buffer="/home/nunomg/mt-hallucinations/HALO/fairseq/data-bin/wmt18_de-en_heldout/checkpoint_best/dataframes/heldout_lowcomet_w_bicleaner.pkl"
)

In [129]:
# Load the held-out translations of the low LASER subset (with bicleaner scores).
# NOTE(review): absolute, machine-specific path — the notebook only runs on the original host.
df_laser = pd.read_pickle(
    filepath_or_buffer="/home/nunomg/mt-hallucinations/HALO/fairseq/data-bin/wmt18_de-en_heldout/checkpoint_best/dataframes/heldout_lowlaser_w_bicleaner.pkl"
)

In [135]:
# Count index labels that repeat once the 5000 lowest "comet-qe" rows and the
# 5000 lowest "laser" rows are stacked (presumably the overlap between the two
# subsets, provided each frame's own index is unique — verify).
(
    pd.concat(
        [
            df_lowcomet.sort_values(by="comet-qe").head(5000),
            df_laser.sort_values(by="laser").head(5000),
        ]
    )
    .index.duplicated()
    .sum()
)

474

### Further finer-grained filtering

Remove source segments that the sentence tokenizer does not split into exactly one sentence (segmentation errors)

In [95]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Flag every row whose German source is NOT tokenized into exactly one sentence
# (this also flags sources for which the tokenizer returns no sentence at all).
# One pass over the "src" column replaces the former two loops, whose
# `i in idxs_to_drop` list lookup made the second loop quadratic.
mult_sentence_flags = [
    int(len(sent_tokenize(src_sentence, language="german")) != 1)
    for src_sentence in tqdm(df_lowcomet["src"])
]

# `assign` builds a new frame instead of writing into a possibly sliced one, and
# `.copy()` makes the filtered result independent so that later column
# assignments do not trigger SettingWithCopyWarning.
df_lowcomet = df_lowcomet.assign(mult_sentence_flag=mult_sentence_flags)
df_lowcomet = df_lowcomet.loc[df_lowcomet["mult_sentence_flag"] == 0].copy()

[nltk_data] Downloading package punkt to /home/nunomg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 10000/10000 [00:01<00:00, 6722.74it/s]
100%|██████████| 10000/10000 [00:00<00:00, 246021.88it/s]


Remove source sentences that contain unexpected characters (Cyrillic letters and a few stray symbols)

In [96]:
# Characters that should not occur in a German source sentence.
# NOTE(review): the last entry is "\x92" followed by a space, so a "\x92" that is
# not followed by a space is not matched — confirm this is intended.
list_of_characters = ["√", "】", "【", "Ћ", "Џ", "ờ", "Ī", "\x92 "]
cyrillic_letters = list("абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ")
special_characters = list_of_characters + cyrillic_letters

# Flag rows whose source contains at least one of the special characters.
# One pass over the "src" column replaces the former two loops, whose
# `i in idxs_to_drop` list lookup made the second loop quadratic.
special_char_flags = [
    int(any(char in src_sentence for char in special_characters))
    for src_sentence in tqdm(df_lowcomet["src"])
]

# `assign` + `.copy()` avoid writing a column into a sliced frame
# (SettingWithCopyWarning) and keep the filtered result independent.
df_lowcomet = df_lowcomet.assign(special_char_flag=special_char_flags)
df_lowcomet = df_lowcomet.loc[df_lowcomet["special_char_flag"] == 0].copy()

100%|██████████| 9383/9383 [00:00<00:00, 10473.79it/s]
100%|██████████| 9383/9383 [00:00<00:00, 335757.59it/s]


Remove samples whose reference translation might contain profanity/adult content

In [97]:
from nltk.stem import *
# Word list used to detect profanity/adult content; entries are stemmed before matching.
list_of_words = [
    "porn", "anal", "masturbation", "oral", "sex", "ass", "cock", "dick", "fuck", "suck",
    "interracial", "orgasm", "amateur", "nipples", "pussy", "lesbian", "strapon", "gay",
    "cunt", "fucked", "fucking", "fucks", "cunts", "cocks", "dicks", "fucking", "suckin",
    "bitch", "bitches", "stud", "studs", "shagging", "horny", "erotic", "load", "unload",
    "horned", "piss", "pissed", "pissing",
]

ps = PorterStemmer()
list_of_stems = [ps.stem(word) for word in list_of_words]
# A set gives constant-time membership tests and collapses duplicate stems.
banned_stems = set(list_of_stems)

# Flag rows whose lower-cased, whitespace-split and stemmed reference shares at
# least one token with the banned stems. Tokens are split on whitespace only, so
# a word with attached punctuation is stemmed together with that punctuation.
# One pass over the "ref" column replaces the former two loops, whose
# `i in idxs_to_drop` list lookup made the second loop quadratic.
adult_content_flags = [
    int(not banned_stems.isdisjoint(ps.stem(token) for token in ref_sentence.lower().split()))
    for ref_sentence in tqdm(df_lowcomet["ref"])
]

# `assign` + `.copy()` avoid writing a column into a sliced frame
# (SettingWithCopyWarning) and keep the filtered result independent.
df_lowcomet = df_lowcomet.assign(adult_content_flag=adult_content_flags)
df_lowcomet = df_lowcomet.loc[df_lowcomet["adult_content_flag"] == 0].copy()

100%|██████████| 8969/8969 [00:02<00:00, 3869.17it/s]
100%|██████████| 8969/8969 [00:00<00:00, 611805.76it/s]


In [98]:
# Remove rows whose source sentence starts with a lowercase character.
# `src_sentence[:1]` (instead of `src_sentence[0]`) keeps an empty source string
# from raising IndexError; "".islower() is False, so such a row is not flagged.
# One pass over the "src" column replaces the former two loops, whose
# `i in idxs_to_drop` list lookup made the second loop quadratic.
lowercase_first_token_flags = [
    int(src_sentence[:1].islower())
    for src_sentence in tqdm(df_lowcomet["src"])
]

# `assign` + `.copy()` avoid writing a column into a sliced frame
# (SettingWithCopyWarning) and keep the filtered result independent.
df_lowcomet = df_lowcomet.assign(lowercase_first_token_flag=lowercase_first_token_flags)
df_lowcomet = df_lowcomet.loc[df_lowcomet["lowercase_first_token_flag"] == 0].copy()

100%|██████████| 8761/8761 [00:00<00:00, 10758.08it/s]
100%|██████████| 8761/8761 [00:00<00:00, 331711.15it/s]


In [99]:
# Keep only the 5000 rows with the lowest "comet-qe" value among the filtered rows.
df_lowcomet = df_lowcomet.sort_values(by="comet-qe").head(5000)

## Define samples for annotation

In [100]:
# Draw 450 rows with a fixed seed, then order them by the "idx" column.
df_samples_for_annotation = (
    df_lowcomet
    .sample(n=450, random_state=1)
    .sort_values(by="idx")
)

## Good Translations

In [101]:
# Load the full held-out set with statistics (and bicleaner scores).
# NOTE(review): absolute, machine-specific path — the notebook only runs on the original host.
df_all = pd.read_pickle(
    filepath_or_buffer="/home/nunomg/mt-hallucinations/HALO/fairseq/data-bin/wmt18_de-en_heldout/checkpoint_best/dataframes/heldoutwstats_w_bicleaner.pkl"
)

In [102]:
# The 500 rows with the highest "comet-qe" value form the pool of good translations.
df_all_topcomet_qe = df_all.sort_values(by="comet-qe", ascending=False).head(500)

# Draw 20 of them with a fixed seed and order them by the "idx" column.
df_goodtranslations_for_annotation = (
    df_all_topcomet_qe
    .sample(n=20, random_state=1)
    .sort_values(by="idx")
)

## Honey Hallucinations

In [103]:
# Rows of the low COMET-QE batch that were NOT drawn for annotation.
df_lowcomet_nonsampled = df_lowcomet.drop(index=df_samples_for_annotation.index)

In [104]:
# Values of the "idx" column for the 30 control hallucinations.
idxs_honey = [
    1170887, 455127, 956979, 672999, 1743002, 655196,
    147363, 766274, 1353331, 1502803, 608532, 1450030,
    220206, 1027125, 575742, 522781, 786981, 1148666,
    1783556, 717862, 1728538, 982673, 1661325, 945866,
    1474903, 118944, 31402, 753861, 1440578, 1155396,
]

In [105]:
# Resolve each control-hallucination id (a value of the "idx" column) to the
# index label of its first matching row among the non-sampled rows.
# The column is scanned once instead of once per id.
honey_idx_column = df_lowcomet_nonsampled["idx"]
first_label_by_idx = {}
for label, sample_idx in honey_idx_column[honey_idx_column.isin(idxs_honey)].items():
    # setdefault keeps the first occurrence, matching the former `.index.values[0]`.
    first_label_by_idx.setdefault(sample_idx, label)

# Fail with an explicit message (same exception type as before) when an id is
# absent, e.g. because it was filtered out or drawn into the annotation sample.
missing_honey_idxs = [idx for idx in idxs_honey if idx not in first_label_by_idx]
if missing_honey_idxs:
    raise IndexError(
        f"idxs_honey entries not found in df_lowcomet_nonsampled['idx']: {missing_honey_idxs}"
    )

# Preserve the order of idxs_honey; `.copy()` lets a column be added later
# without touching df_lowcomet_nonsampled.
df_idxs = [first_label_by_idx[idx] for idx in idxs_honey]
honey_halls = df_lowcomet_nonsampled.loc[df_idxs].copy()

## Join everything

In [106]:
# Tag every row with the group it belongs to (a scalar is broadcast to all rows).
df_samples_for_annotation["type"] = "TO ANNOTATE"
df_goodtranslations_for_annotation["type"] = "GOOD"
honey_halls["type"] = "CONTROL HALL"

In [109]:
# Stack the three groups and shuffle the rows with a fixed seed.
annotation_parts = [df_samples_for_annotation, df_goodtranslations_for_annotation, honey_halls]
final_df = pd.concat(annotation_parts).sample(frac=1, random_state=1)

# Same stacking and shuffling, restricted to the "idx", "src" and "mt" columns.
csv_columns = ["idx", "src", "mt"]
final_df_for_csv = (
    pd.concat([part[csv_columns] for part in annotation_parts])
    .sample(frac=1, random_state=1)
)

In [35]:
# Write the shuffled annotation batch (all columns) to CSV.
# NOTE(review): this export includes every column of the batch, among them
# "type" and "ref"; `final_df_for_csv` (idx/src/mt only) is built but never
# written. Confirm which of the two is meant to be handed to annotators.
final_df.to_csv("annotation_phase_1_de-en.csv")

In [115]:
# Persist the full shuffled batch (all columns) as a pickle.
final_df.to_pickle(path="df_annotation_phase_1_de-en.pkl")

In [2]:
# Reload the saved batch and display it as the cell output.
pd.read_pickle(filepath_or_buffer="df_annotation_phase_1_de-en.pkl")

Unnamed: 0,idx,src,mt,ref,src_ids,mt_ids,ref_ids,score,comet-qe,f1_bpe,...,f2_word,repscore_word,chrf2,unk_flag,bicleaner_score,mult_sentence_flag,special_char_flag,adult_content_flag,lowercase_first_token_flag,type
361086,1366719,Eine Coursage bringt den Ganzanzug erst richti...,A Coursage takes the whole course in the right...,A corset gives the suit the right shape.,"[687, 11441, 23, 688, 3601, 20, 10058, 1692, 8...","[131, 11441, 23, 688, 2047, 6, 1156, 682, 7, 6...","[131, 390, 736, 359, 2953, 6, 11073, 6, 431, 7...",-0.84412,-0.904549,0,...,0,9,51.033658,0,0.585,0.0,0.0,0.0,0.0,TO ANNOTATE
597777,1487862,"Nicht wringen oder bügeln, nicht zusammengerol...","Don't rush or wear, not rolled together or let...",Never wring out nor iron. Never leave rolled u...,"[2136, 541, 10195, 83, 28621, 5273, 4, 49, 728...","[5180, 38, 133, 605, 3782, 75, 18220, 4, 58, 1...","[26086, 541, 4081, 202, 2897, 9632, 5, 26086, ...",-0.79799,-0.550274,0,...,0,11,29.153174,0,0.537,0.0,0.0,0.0,0.0,TO ANNOTATE
257051,219906,Diese Schlucht zieht dich sofort in deinen Bann.,This ravine will take you straight into your b...,This canyon will enthrall you from the very fi...,"[360, 4350, 2447, 11619, 9880, 3375, 7, 20176,...","[119, 1830, 389, 557, 52, 307, 69, 8883, 219, ...","[119, 86, 144, 225, 52, 1164, 1288, 1027, 69, ...",-0.71318,-0.717811,0,...,0,10,20.360183,0,0.500,0.0,0.0,0.0,0.0,TO ANNOTATE
583859,300172,Doch Achtung: Seeleoparden sind Fleischfresser...,But beware: souls are meat buffers and robberies.,But divers beware: leopard seals are meat eate...,"[1874, 5465, 35, 15832, 770, 229, 263, 67, 898...","[664, 33, 3489, 35, 94, 6402, 37, 7942, 296, 6...","[664, 7734, 33, 3489, 35, 838, 27645, 1066, 10...",-0.82310,-0.783460,0,...,0,6,36.218615,0,0.517,0.0,0.0,0.0,0.0,TO ANNOTATE
282744,1502803,Bei Beugungen kommt es zum Sprödbruch.,There is a explosion when it comes to denials.,It breaks with crisp fracture when bent.,"[843, 516, 805, 276, 1273, 60, 126, 9387, 1175...","[492, 16, 15, 13595, 520, 267, 54, 2028, 12, 2...","[130, 22755, 36, 6305, 6145, 6545, 1873, 630, ...",-0.94981,-0.669801,0,...,0,5,16.500485,0,0.575,0.0,0.0,0.0,0.0,CONTROL HALL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723129,1123933,Die Qualit t Produkt Ihnen erm glicht es uns w...,The quality of the product gives us the opport...,The quality product you provide allows us to f...,"[44, 11159, 306, 423, 2024, 417, 8975, 600, 86...","[26, 724, 8, 6, 1417, 2953, 254, 6, 1324, 12, ...","[26, 724, 1417, 69, 642, 2336, 254, 12, 1997, ...",-0.70877,-0.533898,0,...,0,18,41.229053,0,0.525,0.0,0.0,0.0,0.0,TO ANNOTATE
306448,314033,Fischerei - Maßnahmen zur Wiederauffüllung der...,Fisheries - cod recovery measures 9,Fisheries - Recovery measures for cod 9,"[3821, 31, 370, 81, 14426, 20500, 10, 8255, 33...","[8636, 31, 13994, 4434, 467, 803, 2]","[8636, 31, 25506, 467, 18, 13994, 803, 2]",-0.23178,-0.515588,0,...,0,1,73.476191,0,0.605,0.0,0.0,0.0,0.0,TO ANNOTATE
1835257,1714382,"Dublin Flughafen-Berechtigung , Hauptmitte der...","Dublin Airport Authorisation , the main centre...","Dublin Airport Authority, main center of the c...","[6635, 3202, 14, 210, 13197, 917, 142, 1496, 2...","[6635, 3746, 17991, 1507, 142, 6, 734, 998, 8,...","[6635, 3746, 8045, 4, 734, 2908, 8, 6, 785, 14...",-0.37625,-0.908761,0,...,0,14,73.378879,0,0.685,0.0,0.0,0.0,0.0,TO ANNOTATE
176517,1020427,Andernfalls wird es nicht mit WINS ...,Otherwise it will not use WINS ...,"Otherwise, it will not be using WINS ...","[24993, 64, 60, 49, 32, 278, 2438, 200, 497, 2]","[15238, 54, 52, 58, 256, 278, 2438, 200, 497, 2]","[15238, 4, 54, 52, 58, 33, 977, 278, 2438, 200...",-0.27093,-0.722248,0,...,0,2,60.611791,0,0.647,0.0,0.0,0.0,0.0,TO ANNOTATE
