# Import

In [2]:
import numpy as np
import pandas as pd
import re
import math
import pickle as pkl
from googletrans import Translator
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from datasets import load_from_disk, Dataset, concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# tokenized_train = train_dataset.map(tokenize_function, batched=True)

# Utils

In [4]:
def tokenize_function(tokenizer, examples, max_length=9):
    """Tokenize the ``"text"`` field of ``examples`` to a fixed length.

    Args:
        tokenizer: Callable tokenizer; it is invoked with the texts plus
            ``padding``, ``truncation`` and ``max_length`` keyword arguments.
        examples: Mapping with a ``"text"`` entry holding the input text(s).
        max_length: Length every sequence is padded/truncated to. Defaults to
            9, the value that was previously hard-coded.

    Returns:
        Whatever ``tokenizer`` returns for the given texts.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

_TRANSLATOR = None  # shared googletrans client, created on first use


def translate_text(str, src="en", dest="eo"):
    """Translate one string with googletrans and return the translated text.

    Args:
        str: Text to translate. The name shadows the builtin but is kept so
            that existing keyword callers keep working.
        src: Source language code handed to the translator (default "en").
        dest: Target language code handed to the translator (default "eo").

    Returns:
        The ``.text`` attribute of the translation result.
    """
    global _TRANSLATOR
    if _TRANSLATOR is None:
        # Build the client once: this function is called once per dataset row,
        # and a fresh Translator per call was never released.
        _TRANSLATOR = Translator()
    return _TRANSLATOR.translate(str, src=src, dest=dest).text

# OOD (55 classes)

## Classifiers

In [5]:
# One (initially empty) slot per classifier; later cells fill in the
# "tokenizer", "model" and "classifier" entries.
model_names = ["multi", "xlm", "xln"]
models = {name: dict() for name in model_names}

model_names

['multi', 'xlm', 'xln']

In [6]:
models_dir = "./classifiers/ood_finetuned"
for m in model_names:
  # Each model folder holds sibling "<name>_tokenizer" and "<name>_model" directories.
  entry = models[m]
  entry["tokenizer"] = AutoTokenizer.from_pretrained(f"{models_dir}/{m}/{m}_tokenizer")
  entry["model"] = AutoModelForSequenceClassification.from_pretrained(f"{models_dir}/{m}/{m}_model")
  # function_to_apply="none" leaves the scores un-normalised; all class scores are returned.
  entry["classifier"] = TextClassificationPipeline(
      model=entry["model"],
      tokenizer=entry["tokenizer"],
      function_to_apply="none",
      return_all_scores=True,
  )



### Count number of parameters

In [7]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is truthy are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [12]:
# Trainable-parameter count of the "multi" classifier (the function needs a model argument).
count_parameters(models["multi"]["model"])

117674935

In [15]:
# Print the trainable-parameter count of every loaded classifier.
for model_name in model_names:
  n_trainable = count_parameters(models[model_name]["model"])
  print(f"{model_name}     \t: {n_trainable}")

multi     	: 117674935
xlm     	: 278085943
xln     	: 117351223


## Datasets

### Massive

In [41]:
# MASSIVE english test
# Forward slashes: "\m" is not a valid escape sequence and a backslash
# separator only resolves on Windows.
en_dataset = load_from_disk("datasets/massive_en_test.hf")

# # Esperanto english test
# es_dataset = load_from_disk("datasets/esperanto.hf")

In [42]:
# Names of the 60 intent categories.
# NOTE(review): presumably position i is the name of integer label i — confirm against the dataset.
# The identifier is misspelled ("categroies") but other cells reference it under this name.
categroies_list = [
    'datetime_query',
    'iot_hue_lightchange',
    'transport_ticket',
    'takeaway_query',
    'qa_stock',
    'general_greet',
    'recommendation_events',
    'music_dislikeness',
    'iot_wemo_off',
    'cooking_recipe',
    'qa_currency',
    'transport_traffic',
    'general_quirky',
    'weather_query',
    'audio_volume_up',
    'email_addcontact',
    'takeaway_order',
    'email_querycontact',
    'iot_hue_lightup',
    'recommendation_locations',
    'play_audiobook',
    'lists_createoradd',
    'news_query',
    'alarm_query',
    'iot_wemo_on',
    'general_joke',
    'qa_definition',
    'social_query',
    'music_settings',
    'audio_volume_other',
    'calendar_remove',
    'iot_hue_lightdim',
    'calendar_query',
    'email_sendemail',
    'iot_cleaning',
    'audio_volume_down',
    'play_radio',
    'cooking_query',
    'datetime_convert',
    'qa_maths',
    'iot_hue_lightoff',
    'iot_hue_lighton',
    'transport_query',
    'music_likeness',
    'email_query',
    'play_music',
    'audio_volume_mute',
    'social_post',
    'alarm_set',
    'qa_factoid',
    'calendar_set',
    'play_game',
    'alarm_remove',
    'lists_remove',
    'transport_taxi',
    'recommendation_movies',
    'iot_coffee',
    'music_query',
    'play_podcasts',
    'lists_query']

#### Full

In [43]:
# Distinct intent ids present in the test set and the number of rows for each.
labels_list, counts = np.unique(en_dataset["intent"], return_counts=True)
labels_list, counts

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
        52, 53, 54, 55, 56, 57, 58, 59]),
 array([ 88,  36,  35,  35,  26,   1,  43,   4,  18,  72,  39,  15, 169,
        156,  13,  12,  22,  26,  27,  31,  41,  39, 124,  34,  10,  19,
         57,  25,   6,   6,  67,  21, 126, 114,  26,  11,  72,  15,  25,
         43,   3,  51,  36, 119, 176,  32,  81,  41, 141, 209,  35,  21,
         52,  23,  20,  36,  35,  63,  51], dtype=int64))

In [44]:
# The last `num_ood` entries of `labels_list` are the classes held out as OOD.
num_ood = 5
# Number of held-out classes and the fraction of test rows they account for.
len(counts[-num_ood:]), np.sum(counts[-num_ood:])/np.sum(counts)

(5, 0.06893073301950235)

In [45]:
# Label ids treated as out-of-distribution: the last `num_ood` entries of `labels_list`.
ood_labels = set(labels_list[-num_ood:])

# In-distribution class count = all category names minus the held-out ones.
num_labels = len(categroies_list)-len(ood_labels)
num_labels

55

In [46]:
# Utterances ("utt") and integer intent labels ("intent") of the test set as NumPy arrays.
en_X_test = np.array(en_dataset["utt"])
en_Y_test = np.array(en_dataset["intent"])

len(en_X_test), len(en_Y_test)

(2974, 2974)

In [47]:
# 1.0 marks an in-distribution row, 0.0 a row whose label is one of the OOD labels.
test_ID = np.array([0.0 if y in ood_labels else 1.0 for y in en_Y_test])

In [48]:
# Boolean masks derived from the ID/OOD indicator array.
id_mask = test_ID == 1
ood_mask = test_ID == 0

en_X_test_ID, en_Y_test_ID = en_X_test[id_mask], en_Y_test[id_mask]
en_X_test_OOD, en_Y_test_OOD = en_X_test[ood_mask], en_Y_test[ood_mask]

len(en_X_test_ID), len(en_Y_test_ID), len(en_X_test_OOD), len(en_Y_test_OOD)

(2769, 2769, 205, 205)

In [49]:
# Sanity check: the ID and OOD parts together cover every test row.
len(en_X_test_ID) + len(en_X_test_OOD) == len(en_X_test)

True

In [50]:
# Sanity check: the rows removed from the ID split equal the total row count of the
# held-out classes. Sizes are derived from the arrays instead of hard-coded numbers.
len(en_X_test) - len(en_X_test_ID) == np.sum(counts[-num_ood:])

True

In [51]:
# Wrap the in-distribution rows in a Hugging Face Dataset with "label" and "text" columns.
en_ID = Dataset.from_pandas(
    pd.DataFrame(data={"label": en_Y_test_ID.copy(), "text": en_X_test_ID.copy()})
)

# Same for the out-of-distribution rows.
en_OOD = Dataset.from_pandas(
    pd.DataFrame(data={"label": en_Y_test_OOD.copy(), "text": en_X_test_OOD.copy()})
)

In [52]:
# Persist the English ID and OOD test splits.
en_ID.save_to_disk("./datasets/en_ID_test.hf")
en_OOD.save_to_disk("./datasets/en_OOD_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 2769/2769 [00:00<00:00, 236817.99 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 205/205 [00:00<00:00, 68322.00 examples/s] 


#### Short

In [53]:
# Maximum number of whitespace-separated words for a query to be kept in the "short" subsets.
short_query = 10

##### ID

In [54]:
# Keep only the in-distribution queries with at most `short_query` words.
short_rows = [
    (sample["text"], sample["label"])
    for sample in en_ID
    if len(sample["text"].split()) <= short_query
]
texts = [text for text, _ in short_rows]
labels = [label for _, label in short_rows]
len(texts), len(labels)

(2435, 2435)

In [55]:
# Preview the first five labels and texts of the short ID subset.
labels[:5], texts[:5]

([48, 46, 1, 41, 40],
 ['wake me up at five am this week',
  'quiet',
  'pink is all we need',
  'and the darkness has fallen',
  'olly turn the lights off in the bedroom'])

In [56]:
# Turn the filtered rows into a Dataset and persist it.
en_ID_short = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
en_ID_short.save_to_disk("./datasets/en_ID_short_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 2435/2435 [00:00<00:00, 116978.17 examples/s]


##### OOD

In [57]:
# Keep only the out-of-distribution queries with at most `short_query` words.
short_rows = [
    (sample["text"], sample["label"])
    for sample in en_OOD
    if len(sample["text"].split()) <= short_query
]
texts = [text for text, _ in short_rows]
labels = [label for _, label in short_rows]
len(texts), len(labels)

(190, 190)

In [58]:
# Preview the first five labels and texts of the short OOD subset.
labels[:5], texts[:5]

([56, 57, 57, 57, 56],
 ['run coffee maker',
  'who is the singer of song',
  'i want to know about this song',
  'who sings the song about a long black train',
  'i want a coffee'])

In [59]:
# Turn the filtered rows into a Dataset and persist it.
en_OOD_short = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
en_OOD_short.save_to_disk("./datasets/en_OOD_short_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 190/190 [00:00<00:00, 19700.82 examples/s]


#### Long

In [61]:
# Target word count: short queries are repeated until they reach at least this many words.
long_query = 30

##### ID

In [62]:
texts, labels = [], []

# Lengthen every short ID query by repeating it, comma-separated.
for sample in en_ID_short:
  n_words = len(sample["text"].split())
  # ceil => the repeated text holds at least `long_query` words
  copies = math.ceil(long_query / n_words)
  texts.append(", ".join([sample["text"]] * copies))
  labels.append(sample["label"])

len(texts), len(labels)

(2435, 2435)

In [63]:
# Preview the first five labels and repeated texts of the long ID subset.
labels[:5], texts[:5]

([48, 46, 1, 41, 40],
 ['wake me up at five am this week, wake me up at five am this week, wake me up at five am this week, wake me up at five am this week',
  'quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet, quiet',
  'pink is all we need, pink is all we need, pink is all we need, pink is all we need, pink is all we need, pink is all we need',
  'and the darkness has fallen, and the darkness has fallen, and the darkness has fallen, and the darkness has fallen, and the darkness has fallen, and the darkness has fallen',
  'olly turn the lights off in the bedroom, olly turn the lights off in the bedroom, olly turn the lights off in the bedroom, olly turn the lights off in the bedroom'])

In [64]:
# Turn the lengthened rows into a Dataset and persist it.
en_ID_long = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
en_ID_long.save_to_disk("./datasets/en_ID_long_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 2435/2435 [00:00<00:00, 155804.34 examples/s]


##### OOD

In [65]:
texts, labels = [], []

# Lengthen every short OOD query by repeating it, comma-separated.
for sample in en_OOD_short:
  n_words = len(sample["text"].split())
  # ceil => the repeated text holds at least `long_query` words
  copies = math.ceil(long_query / n_words)
  texts.append(", ".join([sample["text"]] * copies))
  labels.append(sample["label"])

len(texts), len(labels)

(190, 190)

In [66]:
# Preview the first five labels and repeated texts of the long OOD subset.
labels[:5], texts[:5]

([56, 57, 57, 57, 56],
 ['run coffee maker, run coffee maker, run coffee maker, run coffee maker, run coffee maker, run coffee maker, run coffee maker, run coffee maker, run coffee maker, run coffee maker',
  'who is the singer of song, who is the singer of song, who is the singer of song, who is the singer of song, who is the singer of song',
  'i want to know about this song, i want to know about this song, i want to know about this song, i want to know about this song, i want to know about this song',
  'who sings the song about a long black train, who sings the song about a long black train, who sings the song about a long black train, who sings the song about a long black train',
  'i want a coffee, i want a coffee, i want a coffee, i want a coffee, i want a coffee, i want a coffee, i want a coffee, i want a coffee'])

In [67]:
# Turn the lengthened rows into a Dataset and persist it.
en_OOD_long = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
en_OOD_long.save_to_disk("./datasets/en_OOD_long_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 190/190 [00:00<00:00, 12848.95 examples/s]


### Esperanto

#### Full

##### ID

In [68]:
texts, labels = [], []

# Translate every ID query with translate_text's defaults (en -> eo); labels are copied as-is.
for sample in tqdm(en_ID):
  texts.append(translate_text(sample["text"]))
  labels.append(sample["label"])

len(texts), len(labels)

100%|██████████| 2769/2769 [08:09<00:00,  5.66it/s]


(2769, 2769)

In [69]:
# Preview the first five labels and translated texts.
labels[:5], texts[:5]

([48, 46, 1, 41, 40],
 ['veku min je la kvina matene ĉi-semajne',
  'trankvila',
  'rozo estas ĉio, kion ni bezonas',
  'kaj la mallumo falis',
  'ol estingu la lumojn en la dormoĉambro'])

In [70]:
# NOTE(review): variables and files use the "es" prefix although the texts were translated
# with translate_text's default target "eo" (Esperanto) — confirm the naming is intended.
es_ID = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
es_ID.save_to_disk("./datasets/es_ID_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 2769/2769 [00:00<00:00, 396938.64 examples/s]


##### OOD

In [71]:
texts, labels = [], []

# Translate every OOD query with translate_text's defaults (en -> eo); labels are copied as-is.
for sample in tqdm(en_OOD):
  texts.append(translate_text(sample["text"]))
  labels.append(sample["label"])

len(texts), len(labels)

100%|██████████| 205/205 [00:36<00:00,  5.61it/s]


(205, 205)

In [72]:
# Preview the first five labels and translated texts.
labels[:5], texts[:5]

([56, 57, 57, 57, 56],
 ['kuri kafomaŝinon',
  'kiu estas la kantisto de kanto',
  'Mi volas scii pri ĉi tiu kanto',
  'kiu kantas la kanton pri longa nigra trajno',
  'mi volas kafon'])

In [73]:
# Store the translated OOD rows as a Dataset and persist it.
es_OOD = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
es_OOD.save_to_disk("./datasets/es_OOD_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 205/205 [00:00<00:00, 41207.34 examples/s]


#### Short

##### ID

In [74]:
texts, labels = [], []

# Translate the short ID queries with translate_text's defaults (en -> eo).
for sample in tqdm(en_ID_short):
  texts.append(translate_text(sample["text"]))
  labels.append(sample["label"])

len(texts), len(labels)

100%|██████████| 2435/2435 [05:40<00:00,  7.15it/s]


(2435, 2435)

In [75]:
# Preview the first five labels and translated texts.
labels[:5], texts[:5]

([48, 46, 1, 41, 40],
 ['veku min je la kvina matene ĉi-semajne',
  'trankvila',
  'rozo estas ĉio, kion ni bezonas',
  'kaj la mallumo falis',
  'ol estingu la lumojn en la dormoĉambro'])

In [76]:
# Store the translated short ID rows as a Dataset and persist it.
es_ID_short = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
es_ID_short.save_to_disk("./datasets/es_ID_short_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 2435/2435 [00:00<00:00, 487081.76 examples/s]


##### OOD

In [77]:
texts, labels = [], []

# Translate the short OOD queries with translate_text's defaults (en -> eo).
for sample in tqdm(en_OOD_short):
  texts.append(translate_text(sample["text"]))
  labels.append(sample["label"])

len(texts), len(labels)

100%|██████████| 190/190 [00:25<00:00,  7.38it/s]


(190, 190)

In [78]:
# Preview the first five labels and translated texts.
labels[:5], texts[:5]

([56, 57, 57, 57, 56],
 ['kuri kafomaŝinon',
  'kiu estas la kantisto de kanto',
  'Mi volas scii pri ĉi tiu kanto',
  'kiu kantas la kanton pri longa nigra trajno',
  'mi volas kafon'])

In [79]:
# Store the translated short OOD rows as a Dataset and persist it.
es_OOD_short = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
es_OOD_short.save_to_disk("./datasets/es_OOD_short_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 190/190 [00:00<00:00, 45894.83 examples/s]


#### Long

##### ID

In [80]:
texts, labels = [], []

# Translate the long (repeated) ID queries with translate_text's defaults (en -> eo).
for sample in tqdm(en_ID_long):
  texts.append(translate_text(sample["text"]))
  labels.append(sample["label"])

len(texts), len(labels)

100%|██████████| 2435/2435 [09:03<00:00,  4.48it/s]


(2435, 2435)

In [81]:
# Preview the first five labels and translated long texts.
labels[:5], texts[:5]

([48, 46, 1, 41, 40],
 ['veku min je la kvina matene ĉi-semajne, veku min je la kvina matene ĉi-semajne, veku min je la kvina matene ĉi-semajne, veku min je la kvina matene ĉi-semajne',
  'kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, kvieta, trankvila quiet, quiet, quiet, quiet, quiet',
  'rozkolora estas ĉio, kion ni bezonas, rozkolora estas ĉio, kion ni bezonas, rozkolora estas ĉio, kion ni bezonas, rozkolora estas ĉio, kion ni bezonas, rozkolora estas ĉio, kion ni bezonas.',
  'kaj la mallumo falis, kaj la mallumo falis, kaj la mallumo falis, kaj la mallumo falis, kaj la mallumo falis, kaj la mallumo falis.',
  'olly malŝaltu la lumojn en la dormoĉambro, olly malŝaltu la lumojn en la dormoĉambro, olly malŝaltu la lumojn en la dormoĉambro, olly malŝaltu la lumojn en la dormoĉambro'])

In [82]:
# Store the translated long ID rows as a Dataset and persist it.
es_ID_long = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
es_ID_long.save_to_disk("./datasets/es_ID_long_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 2435/2435 [00:00<00:00, 405637.07 examples/s]


##### OOD

In [83]:
texts, labels = [], []

# Translate the long (repeated) OOD queries with translate_text's defaults (en -> eo).
for sample in tqdm(en_OOD_long):
  texts.append(translate_text(sample["text"]))
  labels.append(sample["label"])

len(texts), len(labels)

100%|██████████| 190/190 [00:41<00:00,  4.54it/s]


(190, 190)

In [84]:
# Preview the first five labels and translated long texts.
labels[:5], texts[:5]

([56, 57, 57, 57, 56],
 ['kuri kafomaŝinon, kuri kafmaŝinon, kuri kafomaŝinon, funkciigu kafmaŝinon, kuru kafmaŝinon, kuru kafmaŝinon, kuru kafmaŝinon, kuru kafmaŝinon, kuru kafomaŝinon, kuru kafomaŝinon',
  'kiu estas la kantisto de kanto, kiu estas la kantisto de kanto, kiu estas la kantisto de kanto, kiu estas la kantisto de kanto, kiu estas la kantisto de kanto',
  'mi volas scii pri ĉi tiu kanto, mi volas scii pri ĉi tiu kanto, mi volas scii pri ĉi tiu kanto, mi volas scii pri ĉi tiu kanto, mi volas scii pri ĉi tiu kanto',
  'kiu kantas la kanton pri longa nigra trajno, kiu kantas la kanton pri longa nigra trajno, kiu kantas la kanton pri longa nigra trajno, kiu kantas la kanton pri longa nigra trajno',
  'mi volas kafon, mi volas kafon, mi volas kafon, mi volas kafon, mi volas kafon, mi volas kafon, mi volas kafon, mi volas kafon'])

In [85]:
# Store the translated long OOD rows as a Dataset and persist it.
es_OOD_long = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
es_OOD_long.save_to_disk("./datasets/es_OOD_long_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 190/190 [00:00<00:00, 31523.65 examples/s]


### Punjabi

In [90]:
# Language codes passed to translate_text by the Punjabi cells: English source, "pa" target.
src="en"
dest="pa"

#### Full

##### ID

In [86]:
texts, labels = [], []

# Translate every ID query from `src` to `dest`; labels are copied as-is.
for sample in tqdm(en_ID):
  texts.append(translate_text(sample["text"], src=src, dest=dest))
  labels.append(sample["label"])

len(texts), len(labels)

100%|██████████| 2769/2769 [08:44<00:00,  5.28it/s]


(2769, 2769)

In [87]:
# Preview the first five labels and translated texts.
labels[:5], texts[:5]

([48, 46, 1, 41, 40],
 ['ਮੈਨੂੰ ਇਸ ਹਫ਼ਤੇ ਪੰਜ ਵਜੇ ਜਗਾਓ',
  'ਸ਼ਾਂਤ',
  'ਗੁਲਾਬੀ ਸਿਰਫ ਸਾਨੂੰ ਲੋੜ ਹੈ',
  'ਅਤੇ ਹਨੇਰਾ ਡਿੱਗ ਗਿਆ ਹੈ',
  'ਓਲੀ ਨੇ ਬੈੱਡਰੂਮ ਦੀਆਂ ਲਾਈਟਾਂ ਬੰਦ ਕਰ ਦਿੱਤੀਆਂ'])

In [88]:
# Store the translated ID rows as a Dataset and persist it.
pa_ID = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
pa_ID.save_to_disk("./datasets/pa_ID_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 2769/2769 [00:00<00:00, 289402.90 examples/s]


##### OOD

In [91]:
texts, labels = [], []

# Translate every OOD query from `src` to `dest`; labels are copied as-is.
for sample in tqdm(en_OOD):
  texts.append(translate_text(sample["text"], src=src, dest=dest))
  labels.append(sample["label"])

len(texts), len(labels)

100%|██████████| 205/205 [00:37<00:00,  5.51it/s]


(205, 205)

In [92]:
# Preview the first five labels and translated texts.
labels[:5], texts[:5]

([56, 57, 57, 57, 56],
 ['ਕੌਫੀ ਮੇਕਰ ਚਲਾਓ',
  'ਗੀਤ ਦਾ ਗਾਇਕ ਕੌਣ ਹੈ',
  'ਮੈਂ ਇਸ ਗੀਤ ਬਾਰੇ ਜਾਣਨਾ ਚਾਹੁੰਦਾ ਹਾਂ',
  'ਜੋ ਇੱਕ ਲੰਬੀ ਕਾਲੀ ਰੇਲਗੱਡੀ ਬਾਰੇ ਗੀਤ ਗਾਉਂਦਾ ਹੈ',
  'ਮੈਨੂੰ ਇੱਕ ਕੌਫੀ ਚਾਹੀਦੀ ਹੈ'])

In [93]:
# Store the translated OOD rows as a Dataset and persist it.
pa_OOD = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
pa_OOD.save_to_disk("./datasets/pa_OOD_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 205/205 [00:00<00:00, 67549.09 examples/s]


#### Short

##### ID

In [95]:
texts, labels = [], []

# Translate the short ID queries from `src` to `dest`.
for sample in tqdm(en_ID_short):
  texts.append(translate_text(sample["text"], src=src, dest=dest))
  labels.append(sample["label"])

len(texts), len(labels)

100%|██████████| 2435/2435 [05:42<00:00,  7.10it/s]


(2435, 2435)

In [96]:
# Preview the first five labels and translated texts.
labels[:5], texts[:5]

([48, 46, 1, 41, 40],
 ['ਮੈਨੂੰ ਇਸ ਹਫ਼ਤੇ ਪੰਜ ਵਜੇ ਜਗਾਓ',
  'ਸ਼ਾਂਤ',
  'ਗੁਲਾਬੀ ਸਿਰਫ ਸਾਨੂੰ ਲੋੜ ਹੈ',
  'ਅਤੇ ਹਨੇਰਾ ਡਿੱਗ ਗਿਆ ਹੈ',
  'ਓਲੀ ਨੇ ਬੈੱਡਰੂਮ ਦੀਆਂ ਲਾਈਟਾਂ ਬੰਦ ਕਰ ਦਿੱਤੀਆਂ'])

In [97]:
# Store the translated short ID rows as a Dataset and persist it.
pa_ID_short = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
pa_ID_short.save_to_disk("./datasets/pa_ID_short_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 2435/2435 [00:00<00:00, 303926.03 examples/s]


##### OOD

In [98]:
texts, labels = [], []

# Translate the short OOD queries from `src` to `dest`.
for sample in tqdm(en_OOD_short):
  texts.append(translate_text(sample["text"], src=src, dest=dest))
  labels.append(sample["label"])

len(texts), len(labels)

100%|██████████| 190/190 [00:27<00:00,  6.79it/s]


(190, 190)

In [99]:
# Preview the first five labels and translated texts.
labels[:5], texts[:5]

([56, 57, 57, 57, 56],
 ['ਕੌਫੀ ਮੇਕਰ ਚਲਾਓ',
  'ਗੀਤ ਦਾ ਗਾਇਕ ਕੌਣ ਹੈ',
  'ਮੈਂ ਇਸ ਗੀਤ ਬਾਰੇ ਜਾਣਨਾ ਚਾਹੁੰਦਾ ਹਾਂ',
  'ਜੋ ਇੱਕ ਲੰਬੀ ਕਾਲੀ ਰੇਲਗੱਡੀ ਬਾਰੇ ਗੀਤ ਗਾਉਂਦਾ ਹੈ',
  'ਮੈਨੂੰ ਇੱਕ ਕੌਫੀ ਚਾਹੀਦੀ ਹੈ'])

In [100]:
# Store the translated short OOD rows as a Dataset and persist it.
pa_OOD_short = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
pa_OOD_short.save_to_disk("./datasets/pa_OOD_short_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 190/190 [00:00<00:00, 47531.78 examples/s]


#### Long

##### ID

In [101]:
texts, labels = [], []

# Translate the long (repeated) ID queries from `src` to `dest`.
for sample in tqdm(en_ID_long):
  texts.append(translate_text(sample["text"], src=src, dest=dest))
  labels.append(sample["label"])

len(texts), len(labels)

100%|██████████| 2435/2435 [09:01<00:00,  4.50it/s]


(2435, 2435)

In [102]:
# Preview the first five labels and translated long texts.
labels[:5], texts[:5]

([48, 46, 1, 41, 40],
 ['ਮੈਨੂੰ ਇਸ ਹਫਤੇ ਪੰਜ ਵਜੇ ਜਗਾਓ, ਇਸ ਹਫਤੇ ਮੈਨੂੰ ਪੰਜ ਵਜੇ ਜਗਾਓ, ਇਸ ਹਫਤੇ ਮੈਨੂੰ ਪੰਜ ਵਜੇ ਜਗਾਓ, ਇਸ ਹਫਤੇ ਮੈਨੂੰ ਪੰਜ ਵਜੇ ਜਗਾਓ',
  'ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ, ਸ਼ਾਂਤ',
  'ਗੁਲਾਬੀ ਉਹ ਸਭ ਕੁਝ ਹੈ ਜਿਸਦੀ ਸਾਨੂੰ ਲੋੜ ਹੈ, ਗੁਲਾਬੀ ਉਹ ਸਭ ਕੁਝ ਹੈ ਜਿਸਦੀ ਸਾਨੂੰ ਲੋੜ ਹੈ, ਗੁਲਾਬੀ ਉਹ ਹੈ ਜਿਸਦੀ ਸਾਨੂੰ ਲੋੜ ਹੈ, ਗੁਲਾਬੀ ਉਹ ਸਭ ਕੁਝ ਹੈ ਜਿਸਦੀ ਸਾਨੂੰ ਲੋੜ ਹੈ, ਗੁਲਾਬੀ ਉਹ ਸਭ ਕੁਝ ਹੈ ਜਿਸਦੀ ਸਾਨੂੰ ਲੋੜ ਹੈ',
  'ਅਤੇ ਹਨੇਰਾ ਡਿੱਗ ਗਿਆ ਹੈ, ਅਤੇ ਹਨੇਰਾ ਡਿੱਗ ਗਿਆ ਹੈ, ਅਤੇ ਹਨੇਰਾ ਡਿੱਗ ਗਿਆ ਹੈ, ਅਤੇ ਹਨੇਰਾ ਡਿੱਗ ਗਿਆ ਹੈ, ਅਤੇ ਹਨੇਰਾ ਡਿੱਗ ਗਿਆ ਹੈ, ਅਤੇ ਹਨੇਰਾ ਡਿੱਗ ਗਿਆ ਹੈ',
  'ਓਲੀ ਬੈੱਡਰੂਮ ਦੀਆਂ ਲਾਈਟਾਂ ਬੰਦ ਕਰ ਦਿਓ, ਓਲੀ ਬੈੱਡਰੂਮ ਦੀਆਂ ਲਾਈਟਾਂ ਬੰਦ ਕਰੋ, ਓਲੀ ਬੈੱਡਰੂਮ ਦੀਆਂ ਲਾਈਟਾਂ ਬੰਦ ਕਰੋ, ਓਲੀ ਬੈੱਡਰੂਮ ਦੀਆਂ ਲਾਈਟਾਂ ਬੰਦ ਕਰੋ'])

In [103]:
# Store the translated long ID rows as a Dataset and persist it.
pa_ID_long = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
pa_ID_long.save_to_disk("./datasets/pa_ID_long_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 2435/2435 [00:00<00:00, 303123.21 examples/s]


##### OOD

In [104]:
texts, labels = [], []

# Translate the long (repeated) OOD queries from `src` to `dest`.
for sample in tqdm(en_OOD_long):
  texts.append(translate_text(sample["text"], src=src, dest=dest))
  labels.append(sample["label"])

len(texts), len(labels)

100%|██████████| 190/190 [00:43<00:00,  4.39it/s]


(190, 190)

In [105]:
# Preview the first five labels and translated long texts.
labels[:5], texts[:5]

([56, 57, 57, 57, 56],
 ['ਕੌਫੀ ਮੇਕਰ ਚਲਾਓ, ਕੌਫੀ ਮੇਕਰ ਚਲਾਓ, ਕੌਫੀ ਮੇਕਰ ਚਲਾਓ, ਕੌਫੀ ਮੇਕਰ ਚਲਾਓ, ਕੌਫੀ ਮੇਕਰ ਚਲਾਓ, ਕੌਫੀ ਮੇਕਰ ਚਲਾਓ, ਕੌਫੀ ਮੇਕਰ ਚਲਾਓ, ਕੌਫੀ ਮੇਕਰ ਚਲਾਓ, ਕੌਫੀ ਮੇਕਰ ਚਲਾਓ, ਕੌਫੀ ਮੇਕਰ ਚਲਾਓ',
  'ਗੀਤ ਦਾ ਗਾਇਕ ਕੌਣ ਹੈ, ਗੀਤ ਦਾ ਗਾਇਕ ਕੌਣ ਹੈ, ਗੀਤ ਦਾ ਗਾਇਕ ਕੌਣ ਹੈ, ਗੀਤ ਦਾ ਗਾਇਕ ਕੌਣ ਹੈ, ਗੀਤ ਦਾ ਗਾਇਕ ਕੌਣ ਹੈ',
  'ਮੈਂ ਇਸ ਗੀਤ ਬਾਰੇ ਜਾਣਨਾ ਚਾਹੁੰਦਾ ਹਾਂ, ਮੈਂ ਇਸ ਗੀਤ ਬਾਰੇ ਜਾਣਨਾ ਚਾਹੁੰਦਾ ਹਾਂ, ਮੈਂ ਇਸ ਗੀਤ ਬਾਰੇ ਜਾਣਨਾ ਚਾਹੁੰਦਾ ਹਾਂ, ਮੈਂ ਇਸ ਗੀਤ ਬਾਰੇ ਜਾਣਨਾ ਚਾਹੁੰਦਾ ਹਾਂ, ਮੈਂ ਇਸ ਗੀਤ ਬਾਰੇ ਜਾਣਨਾ ਚਾਹੁੰਦਾ ਹਾਂ',
  'ਕੌਣ ਇੱਕ ਲੰਬੀ ਕਾਲੀ ਰੇਲਗੱਡੀ ਬਾਰੇ ਗੀਤ ਗਾਉਂਦਾ ਹੈ, ਕੌਣ ਇੱਕ ਲੰਬੀ ਕਾਲੀ ਰੇਲਗੱਡੀ ਬਾਰੇ ਗੀਤ ਗਾਉਂਦਾ ਹੈ, ਕੌਣ ਇੱਕ ਲੰਬੀ ਕਾਲੀ ਰੇਲਗੱਡੀ ਬਾਰੇ ਗੀਤ ਗਾਉਂਦਾ ਹੈ, ਕੌਣ ਇੱਕ ਲੰਬੀ ਕਾਲੀ ਰੇਲਗੱਡੀ ਬਾਰੇ ਗੀਤ ਗਾਉਂਦਾ ਹੈ',
  'ਮੈਨੂੰ ਕੌਫ਼ੀ ਚਾਹੀਦੀ ਹੈ, ਮੈਨੂੰ ਕੌਫ਼ੀ ਚਾਹੀਦੀ ਹੈ, ਮੈਨੂੰ ਕੌਫ਼ੀ ਚਾਹੀਦੀ ਹੈ, ਮੈਨੂੰ ਕੌਫ਼ੀ ਚਾਹੀਦੀ ਹੈ, ਮੈਨੂੰ ਕੌਫ਼ੀ ਚਾਹੀਦੀ ਹੈ, ਮੈਨੂੰ ਕੌਫ਼ੀ ਚਾਹੀਦੀ ਹੈ, ਮੈਨੂੰ ਕੌਫ਼ੀ ਚਾਹੀਦੀ ਹੈ, ਮੈਨੂੰ ਕੌਫ਼ੀ ਚਾਹੀਦੀ ਹੈ'])

In [106]:
# Store the translated long OOD rows as a Dataset and persist it.
pa_OOD_long = Dataset.from_pandas(pd.DataFrame({"label": list(labels), "text": list(texts)}))
pa_OOD_long.save_to_disk("./datasets/pa_OOD_long_test.hf")

Saving the dataset (1/1 shards): 100%|██████████| 190/190 [00:00<00:00, 10322.23 examples/s]


## Save all datasets

In [109]:
languages = ["en", "es", "pa"]
split = ["ID", "OOD"]
versions = ["", "short", "long"]

# Every "<lang>_<split>[_<version>]" name; the empty version contributes no suffix.
ks = [
    f"{lang}_{s}_{v}" if v else f"{lang}_{s}"
    for lang in languages
    for s in split
    for v in versions
]
len(ks), ks


(18,
 ['en_ID',
  'en_ID_short',
  'en_ID_long',
  'en_OOD',
  'en_OOD_short',
  'en_OOD_long',
  'es_ID',
  'es_ID_short',
  'es_ID_long',
  'es_OOD',
  'es_OOD_short',
  'es_OOD_long',
  'pa_ID',
  'pa_ID_short',
  'pa_ID_long',
  'pa_OOD',
  'pa_OOD_short',
  'pa_OOD_long'])

In [111]:
# Name -> Dataset lookup covering every language / split / length variant.
all_datasets = dict(
    en_ID=en_ID,
    en_ID_short=en_ID_short,
    en_ID_long=en_ID_long,
    en_OOD=en_OOD,
    en_OOD_short=en_OOD_short,
    en_OOD_long=en_OOD_long,
    es_ID=es_ID,
    es_ID_short=es_ID_short,
    es_ID_long=es_ID_long,
    es_OOD=es_OOD,
    es_OOD_short=es_OOD_short,
    es_OOD_long=es_OOD_long,
    pa_ID=pa_ID,
    pa_ID_short=pa_ID_short,
    pa_ID_long=pa_ID_long,
    pa_OOD=pa_OOD,
    pa_OOD_short=pa_OOD_short,
    pa_OOD_long=pa_OOD_long,
)



In [112]:
# with open("./datasets/all_datasets.pkl","wb") as f:
#     pkl.dump(all_datasets, f)


In [113]:
# with open("./datasets/all_test_datasets.pkl", "rb") as f:
#     pkl_datasets = pkl.load(f)
# type(pkl_datasets)

dict

## Predictions

In [None]:
# preds[model name][dataset name] -> classifier output for that dataset's texts.
preds = dict()

for m in model_names:
  classify = models[m]["classifier"]
  preds[m] = {
      d_name: classify(data["text"])
      for d_name, data in tqdm(all_datasets.items())
  }

In [137]:
# with open("./predictions/all_test_predictions.pkl","wb") as f:
#     pkl.dump(preds, f)

In [138]:
# with open("./predictions/all_test_predictions.pkl", "rb") as f:
#     pkl_predictions = pkl.load(f)
# type(pkl_predictions)

dict

# Full classification (60 classes)

## Classifiers

In [4]:
# Fresh (empty) slots for the 60-class classifiers; this rebinds `models`.
model_names = ["multi", "xlm", "xln"]
models = {name: dict() for name in model_names}

model_names

['multi', 'xlm', 'xln']

In [5]:
models_dir = "./classifiers/full_en_60"
for m in model_names:
  tokenizer_link = f"{models_dir}/{m}/{m}_tokenizer"
  model_link = f"{models_dir}/{m}/{m}_model"
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_link)
  model = AutoModelForSequenceClassification.from_pretrained(model_link)
  classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, function_to_apply="none", return_all_scores=True)
  models[m]["tokenizer"] = tokenizer
  models[m]["model"] = model
  models[m]["classifier"] = classifier



## Datasets

In [6]:
# Names of the 60 intent categories (this appears to repeat the list defined earlier in the notebook).
# NOTE(review): presumably position i is the name of integer label i — confirm against the dataset.
categroies_list = [
    'datetime_query',
    'iot_hue_lightchange',
    'transport_ticket',
    'takeaway_query',
    'qa_stock',
    'general_greet',
    'recommendation_events',
    'music_dislikeness',
    'iot_wemo_off',
    'cooking_recipe',
    'qa_currency',
    'transport_traffic',
    'general_quirky',
    'weather_query',
    'audio_volume_up',
    'email_addcontact',
    'takeaway_order',
    'email_querycontact',
    'iot_hue_lightup',
    'recommendation_locations',
    'play_audiobook',
    'lists_createoradd',
    'news_query',
    'alarm_query',
    'iot_wemo_on',
    'general_joke',
    'qa_definition',
    'social_query',
    'music_settings',
    'audio_volume_other',
    'calendar_remove',
    'iot_hue_lightdim',
    'calendar_query',
    'email_sendemail',
    'iot_cleaning',
    'audio_volume_down',
    'play_radio',
    'cooking_query',
    'datetime_convert',
    'qa_maths',
    'iot_hue_lightoff',
    'iot_hue_lighton',
    'transport_query',
    'music_likeness',
    'email_query',
    'play_music',
    'audio_volume_mute',
    'social_post',
    'alarm_set',
    'qa_factoid',
    'calendar_set',
    'play_game',
    'alarm_remove',
    'lists_remove',
    'transport_taxi',
    'recommendation_movies',
    'iot_coffee',
    'music_query',
    'play_podcasts',
    'lists_query']

### Massive

In [9]:
datasets_versions = ["train", "valid", "test", "train_valid"]
folder_path = "datasets/full_en_eo_pa"

ds_name = "en"
t = "short"

# One on-disk dataset per version, stored as "<ds_name>_<version>_<t>.hf".
en_datasets = {
    v: load_from_disk(f"{folder_path}/{ds_name}_{v}_{t}.hf")
    for v in datasets_versions
}

In [10]:
# Display the loaded datasets (features and row counts).
en_datasets

{'train': Dataset({
     features: ['label', 'text'],
     num_rows: 11514
 }),
 'valid': Dataset({
     features: ['label', 'text'],
     num_rows: 2033
 }),
 'test': Dataset({
     features: ['label', 'text'],
     num_rows: 2974
 }),
 'train_valid': Dataset({
     features: ['label', 'text'],
     num_rows: 13547
 })}

#### Full

In [None]:
# Distinct label ids in the combined train+valid data and the number of rows for each.
labels_list, counts = np.unique(en_datasets["train_valid"]["label"], return_counts=True)
labels_list, counts

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59]),
 array([414, 147, 152, 146, 176,  27, 216,  16,  57, 248, 174, 139, 660,
        699, 122,  59, 155, 143,  88, 204, 185, 202, 585, 149,  55,  87,
        322, 126,  59,  18, 359,  93, 668, 417, 112,  60, 329,   6,  61,
         91, 170,  27, 263, 129, 491, 762, 125, 333, 213, 634, 941, 134,
         92, 201, 127,  82, 138, 184, 227, 248], dtype=int64))

In [None]:
# Number of distinct classes found in the train+valid labels.
num_labels = len(labels_list)
num_labels

60

#### English (en)

#### Long

In [78]:
# Length budget for the long variants in this section; this rebinds `long_query`
# (set to 30 in the OOD section) to 512.
query_len_thr = 512
long_query = query_len_thr

In [79]:
# Every version except the last one ("train_valid").
datasets_versions[:-1]

['train', 'valid', 'test']

In [189]:
ds_name = "en"

batch_size = 100

### loading a tokenizer for the workaround; e.g. multi
m = "multi"
tokenizer_link = f"{models_dir}/{m}/{m}_tokenizer"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_link)


def _repeat_within_token_budget(text, tokenizer, max_tokens):
    """Repeat ``text`` (comma-separated) up to about ``max_tokens`` words, then trim by tokenized length.

    Workaround kept from the original cell: truncation / model_max_length were
    reported as useless when loading the tokenizer, so trailing words are
    dropped in proportion to how far the tokenized length exceeds
    ``max_tokens`` (plus a margin of 5 words).

    Args:
        text: The query to repeat.
        tokenizer: Callable returning a mapping with an ``"input_ids"`` entry.
        max_tokens: Word budget for the repetition and token budget for trimming.

    Returns:
        The repeated and trimmed query, words joined by single spaces.
    """
    n_words = len(text.split())
    # Always keep at least one copy: floor(max_tokens / n_words) is 0 for inputs
    # longer than the budget and a division by zero for empty text.
    repeat = max(1, max_tokens // n_words) if n_words else 1
    words = ", ".join([text] * repeat).split()
    tokenized_len = len(tokenizer(" ".join(words))["input_ids"])
    red_perc = (tokenized_len - max_tokens) / tokenized_len if tokenized_len else 0.0
    num_red_tokens = int(red_perc * len(words)) + 5  # +5 to make sure that it is less than 512
    # Trim only for a positive count: words[:-0] would drop everything and a
    # negative count would keep just a prefix.
    kept = words[:-num_red_tokens] if num_red_tokens > 0 else words
    return " ".join(kept)


for v in datasets_versions[:-1]:
    print(f"Start > {v}")
    ds = en_datasets[v]
    batch_datasets = list()
    for start in tqdm(range(0, ds.num_rows, batch_size)):
        # Slice the rows of this batch only instead of materializing the whole column each time.
        batch = ds[start:start + batch_size]
        ds_list = [
            {"label": label, "text": _repeat_within_token_budget(text, tokenizer, long_query)}
            for text, label in zip(batch["text"], batch["label"])
        ]
        batch_datasets.append(Dataset.from_list(ds_list))

    combined_ds = concatenate_datasets(batch_datasets)

    print(f"{v}: len of ds > {ds.num_rows}")
    print(f"{v}: len of {v} ds > {combined_ds.num_rows}")

    combined_ds.save_to_disk(f"{folder_path}/{ds_name}_{v}_long.hf")

    print(f"Created > {v}")


Start > train


100%|██████████| 116/116 [00:55<00:00,  2.10it/s]


train: len of ds > 11514
train: len of train ds > 11514


Saving the dataset (1/1 shards): 100%|██████████| 11514/11514 [00:00<00:00, 564687.64 examples/s]


Created > train
Start > valid


100%|██████████| 21/21 [00:08<00:00,  2.54it/s]


valid: len of ds > 2033
valid: len of valid ds > 2033


Saving the dataset (1/1 shards): 100%|██████████| 2033/2033 [00:00<?, ? examples/s]


Created > valid
Start > test


100%|██████████| 30/30 [00:13<00:00,  2.23it/s]


test: len of ds > 2974
test: len of test ds > 2974


Saving the dataset (1/1 shards): 100%|██████████| 2974/2974 [00:00<?, ? examples/s]

Created > test





In [190]:
# Reload the long train and valid splits that were just written.
train, valid = (
    Dataset.load_from_disk(f"{folder_path}/{ds_name}_{split_name}_long.hf")
    for split_name in ("train", "valid")
)

train, valid

(Dataset({
     features: ['label', 'text'],
     num_rows: 11514
 }),
 Dataset({
     features: ['label', 'text'],
     num_rows: 2033
 }))

In [191]:
# Stack the long train and valid splits into one dataset.
combined_ds = concatenate_datasets([train, valid])
combined_ds

Dataset({
    features: ['label', 'text'],
    num_rows: 13547
})

In [192]:
# Persist the combined long train+valid dataset.
combined_ds.save_to_disk(f"{folder_path}/{ds_name}_train_valid_long.hf")

Saving the dataset (1/1 shards): 100%|██████████| 13547/13547 [00:00<00:00, 292041.80 examples/s]


In [11]:
# Load every long English split listed in `datasets_versions` from disk.
ds_name = "en"
t = "long"

# Keys carry a "_long" suffix (e.g. "train_long") so they differ from the short-split keys.
en_datasets_long = {f"{v}_long":Dataset.load_from_disk(f"{folder_path}/{ds_name}_{v}_{t}.hf") for v in datasets_versions}

en_datasets_long

{'train_long': Dataset({
     features: ['label', 'text'],
     num_rows: 11514
 }),
 'valid_long': Dataset({
     features: ['label', 'text'],
     num_rows: 2033
 }),
 'test_long': Dataset({
     features: ['label', 'text'],
     num_rows: 2974
 }),
 'train_valid_long': Dataset({
     features: ['label', 'text'],
     num_rows: 13547
 })}

### Esperanto

#### Short

In [8]:
# ds_name = "eo" #Esperanto

# batch_size = 100

# for v in datasets_versions[1:-1]:
#     print(f"Start > {v}")
#     ds = en_datasets[v]
#     ds_list = list()
#     texts = list()
#     labels = list()
#     combined_ds = None
#     index = 0
#     batches = math.ceil(ds.num_rows/batch_size)
#     for i in tqdm(range(batches)):
#         last_index = index+batch_size
#         if last_index>=len(ds): last_index=None
#         texts_batch = ds["text"][index:last_index]
#         labels_batch = ds["label"][index:last_index]
#         index = (i+1)*batch_size
#         ds_list = list()
#         for text, label in zip(texts_batch, labels_batch):
#             # query = item["text"]
#             # label = item["label"]
#             query = translate_text(text, src="en", dest=ds_name)
#             # texts.append(query)
#             # labels.append(label)
#             ds_list.append({"label": label, "text":query})

#         batch_ds = Dataset.from_list(ds_list)

#         if not combined_ds:
#             combined_ds = batch_ds
#         else:
#             combined_ds = concatenate_datasets([combined_ds, batch_ds])

    
#     print(f"{v}: len of ds > {ds.num_rows}")
#     print(f"{v}: len of {ds_name} ds > {combined_ds.num_rows}")
#     # print(f"example len of instance: {len(ds_list[1]["text"].split())}")

#     combined_ds.save_to_disk(f"{folder_path}/{ds_name}_{v}.hf")

#     print(f"Created > {v}")
#     del texts, labels

In [9]:
# eo_train = Dataset.load_from_disk(f"{folder_path}/{ds_name}_train.hf")
# eo_valid = Dataset.load_from_disk(f"{folder_path}/{ds_name}_valid.hf")
# eo_test = Dataset.load_from_disk(f"{folder_path}/{ds_name}_test.hf")
# print(eo_train, eo_valid, eo_test)

# eo_combined_ds = concatenate_datasets([eo_train, eo_valid])
# print(eo_combined_ds)

# eo_combined_ds.save_to_disk(f"{folder_path}/{ds_name}_train_valid.hf")

In [12]:
# Load the short Esperanto splits from disk into a dict keyed by split name.
folder_path = "datasets/full_en_eo_pa"
datasets_versions = ["train", "valid", "test", "train_valid"]

ds_name, t = "eo", "short"

eo_datasets = {}
for v in datasets_versions:
    # One saved dataset directory per split: <folder>/<lang>_<split>_<type>.hf
    eo_datasets[v] = load_from_disk(f"{folder_path}/{ds_name}_{v}_{t}.hf")

In [13]:
eo_datasets

{'train': Dataset({
     features: ['label', 'text'],
     num_rows: 11514
 }),
 'valid': Dataset({
     features: ['label', 'text'],
     num_rows: 2033
 }),
 'test': Dataset({
     features: ['label', 'text'],
     num_rows: 2974
 }),
 'train_valid': Dataset({
     features: ['label', 'text'],
     num_rows: 13547
 })}

In [21]:
# ds_name = "eo" #Esperanto

# batch_size = 100

# for v in datasets_versions[:1]: # Only first item for now
#     print(f"Start > {v}")
#     ds = en_datasets[v]
#     ds_list = list()
#     texts = list()
#     labels = list()
#     combined_ds = None
#     index = 0
#     batches = math.ceil(ds.num_rows/batch_size)
#     for i in tqdm(range(batches)):
#         last_index = index+batch_size
#         if last_index>=len(ds): last_index=None
#         texts_batch = ds["text"][index:last_index]
#         labels_batch = ds["label"][index:last_index]
#         ds_list = list()
#         for text, label in zip(texts_batch, labels_batch):
#             # query = item["text"]
#             # label = item["label"]
#             query = translate_text(text, src="en", dest=ds_name)
#             # texts.append(query)
#             # labels.append(label)
#             ds_list.append({"label": label, "text":query})

#         batch_ds = Dataset.from_list(ds_list)

#         batch_ds.save_to_disk(f"{folder_path}/{ds_name}_{v}_batches/full_{ds_name}_{v}_{index}.hf")    
#         print(f"{v}: len of ds {index}> {ds.num_rows}")
#         print(f"{v}: len of eo ds {index}> {batch_ds.num_rows}")
        
#         index = (i+1)*batch_size

#         # if not combined_ds:
#         #     combined_ds = batch_ds
#         # else:
#         #     combined_ds = concatenate_datasets([combined_ds, batch_ds])

    
#         # print(f"{v}: len of ds {index}> {ds.num_rows}")
#         # print(f"{v}: len of eo ds {index}> {combined_ds.num_rows}")
#     # print(f"example len of instance: {len(ds_list[1]["text"].split())}")

#     # combined_ds.save_to_disk(f"{folder_path}/full_{ds_name}_{v}.hf")

#     print(f"Created > {v}")
#     del texts, labels

#### Long

In [218]:
# Build the "long" Esperanto splits from the short ones: each query is repeated
# (joined with ", ") until it has at least `long_query` whitespace-separated words,
# then trimmed from the end until its tokenized length fits in `long_query` tokens.
query_len_thr = 512
long_query = query_len_thr

ds_name = "eo"

batch_size = 100

### loading a tokenizer for the workaround; e.g. multi
m = "multi"
tokenizer_link = f"{models_dir}/{m}/{m}_tokenizer"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_link)

for v in datasets_versions[:-1]:  # the last entry ("train_valid") is not built here
    print(f"Start > {v}")
    ds = eo_datasets[v]
    # Materialize each column once per split instead of once per batch.
    all_texts = list(ds["text"])
    all_labels = list(ds["label"])
    combined_ds = None
    batches = math.ceil(ds.num_rows/batch_size)
    for i in tqdm(range(batches)):
        index = i*batch_size
        texts_batch = all_texts[index:index+batch_size]
        labels_batch = all_labels[index:index+batch_size]
        ds_list = list()
        for text, label in zip(texts_batch, labels_batch):
            if not text.split():
                # An empty query cannot be lengthened by repetition, and its zero
                # word count would divide by zero below; keep the row unchanged.
                ds_list.append({"label": label, "text": text})
                continue
            repeat = math.ceil(long_query/len(text.split()))
            query = ", ".join([text]*repeat)
            ##########################################
            # This part is a workaround to avoid longer tokenized sequences, truncation and model_max_length were useless when loading the tokenizer
            query_words = query.split()
            # Count tokens via ["input_ids"]: len() of the tokenizer output itself is
            # the number of returned fields, not the number of tokens.
            tokenized_len = len(tokenizer(query)["input_ids"])
            red_perc = (tokenized_len-long_query)/tokenized_len
            num_red_tokens = int(red_perc*len(query_words))+5 # +5 to make sure that it is less than 512
            # Trim only for a positive count: the slice [:-0] would empty the query
            # and a negative count would keep just the first few words.
            if num_red_tokens > 0:
                query_words = query_words[:-num_red_tokens]
            query = " ".join(query_words)
            # The proportional estimate can undershoot, so verify the real token
            # count and drop trailing words until the query fits.
            while query_words and len(tokenizer(query)["input_ids"]) > long_query:
                query_words.pop()
                query = " ".join(query_words)
            ##########################################
            ds_list.append({"label": label, "text":query})

        batch_ds = Dataset.from_list(ds_list)

        # Explicit None check: `not combined_ds` is also true for a Dataset with zero rows.
        if combined_ds is None:
            combined_ds = batch_ds
        else:
            combined_ds = concatenate_datasets([combined_ds, batch_ds])

    print(f"{v}: len of ds > {ds.num_rows}")
    print(f"{v}: len of {v} ds > {combined_ds.num_rows}")

    combined_ds.save_to_disk(f"{folder_path}/{ds_name}_{v}_long.hf")

    print(f"Created > {v}")


Start > train


100%|██████████| 116/116 [00:36<00:00,  3.17it/s]


train: len of ds > 11514
train: len of train ds > 11514


Saving the dataset (1/1 shards): 100%|██████████| 11514/11514 [00:00<00:00, 681770.54 examples/s]


Created > train
Start > valid


100%|██████████| 21/21 [00:06<00:00,  3.39it/s]


valid: len of ds > 2033
valid: len of valid ds > 2033


Saving the dataset (1/1 shards): 100%|██████████| 2033/2033 [00:00<00:00, 199831.74 examples/s]


Created > valid
Start > test


100%|██████████| 30/30 [00:11<00:00,  2.72it/s]


test: len of ds > 2974
test: len of test ds > 2974


Saving the dataset (1/1 shards): 100%|██████████| 2974/2974 [00:00<00:00, 369103.72 examples/s]

Created > test





In [219]:
# Reload the long train/valid splits that the previous cell saved under
# `{folder_path}/{ds_name}_{split}_long.hf` (uses the current value of `ds_name`).
train = Dataset.load_from_disk(f"{folder_path}/{ds_name}_train_long.hf")
valid = Dataset.load_from_disk(f"{folder_path}/{ds_name}_valid_long.hf")

# Display both datasets as the cell output.
train, valid

(Dataset({
     features: ['label', 'text'],
     num_rows: 11514
 }),
 Dataset({
     features: ['label', 'text'],
     num_rows: 2033
 }))

In [220]:
# Stack train and valid row-wise into a single "train_valid" dataset.
combined_ds = concatenate_datasets([train, valid])
combined_ds

Dataset({
    features: ['label', 'text'],
    num_rows: 13547
})

In [221]:
# Persist the merged train+valid dataset next to the individual long splits.
combined_ds.save_to_disk(f"{folder_path}/{ds_name}_train_valid_long.hf")

Saving the dataset (1/1 shards): 100%|██████████| 13547/13547 [00:00<00:00, 379449.17 examples/s]


In [14]:
# Load every long Esperanto split listed in `datasets_versions` from disk.
ds_name = "eo"
t = "long"

# Keys carry a "_long" suffix (e.g. "train_long") so they differ from the short-split keys.
eo_datasets_long = {f"{v}_long":Dataset.load_from_disk(f"{folder_path}/{ds_name}_{v}_{t}.hf") for v in datasets_versions}

eo_datasets_long

{'train_long': Dataset({
     features: ['label', 'text'],
     num_rows: 11514
 }),
 'valid_long': Dataset({
     features: ['label', 'text'],
     num_rows: 2033
 }),
 'test_long': Dataset({
     features: ['label', 'text'],
     num_rows: 2974
 }),
 'train_valid_long': Dataset({
     features: ['label', 'text'],
     num_rows: 13547
 })}

### Punjabi

#### Short

In [24]:
# ds_name = "pa" #Punjabi

# batch_size = 100

# for v in datasets_versions[:1]: # Only first item for now (TRAIN)
#     print(f"Start > {v}")
#     ds = en_datasets[v]
#     ds_list = list()
#     texts = list()
#     labels = list()
#     combined_ds = None
#     i = 0 + 7700
#     batches = math.ceil(ds.num_rows/batch_size)
#     while tqdm(i<ds.num_rows): # tqdm(range(batches)):
#         last_index = i+batch_size
#         if last_index>=len(ds): last_index=None
#         texts_batch = ds["text"][i:last_index]
#         labels_batch = ds["label"][i:last_index]
#         ds_list = list()
#         for text, label in zip(texts_batch, labels_batch):
#             # query = item["text"]
#             # label = item["label"]
#             query = translate_text(text, src="en", dest=ds_name)
#             # texts.append(query)
#             # labels.append(label)
#             ds_list.append({"label": label, "text":query})

#         batch_ds = Dataset.from_list(ds_list)

#         batch_ds.save_to_disk(f"{folder_path}/{ds_name}_{v}_batches/_{ds_name}_{v}_{i}.hf")
#         print(f"{i} of {ds.num_rows} ds are saved!")    
#         # print(f"{v}: len of ds {index}> {ds.num_rows}")
#         # print(f"{v}: len of {ds_name} ds {index}> {batch_ds.num_rows}")
        
#         i += batch_size

#         # if not combined_ds:
#         #     combined_ds = batch_ds
#         # else:
#         #     combined_ds = concatenate_datasets([combined_ds, batch_ds])

    
#         # print(f"{v}: len of ds {index}> {ds.num_rows}")
#         # print(f"{v}: len of eo ds {index}> {combined_ds.num_rows}")
#     # print(f"example len of instance: {len(ds_list[1]["text"].split())}")

#     # combined_ds.save_to_disk(f"{folder_path}/full_{ds_name}_{v}.hf")

#     print(f"Created > {v}")
#     del texts, labels

In [25]:
# ds_name = "pa" #Punjabi
# batch_size = 100

# v = "train"
# batches_path = f"{folder_path}/{ds_name}_{v}_batches"

# combined_ds = None

# for i in tqdm(range(0, en_datasets[v].num_rows, batch_size)):
#     ds_path = f"{batches_path}/_{ds_name}_{v}_{i}.hf"
#     batch_ds = Dataset.load_from_disk(f"{ds_path}")

#     if not combined_ds:
#         combined_ds = batch_ds
#     else:
#         combined_ds = concatenate_datasets([combined_ds, batch_ds])

# print(combined_ds.num_rows)
# combined_ds.save_to_disk(f"{folder_path}/{ds_name}_{v}.hf")

In [None]:
# ds_name = "pa" #Punjabi

# batch_size = 100

# for v in datasets_versions[:1]:
#     print(f"Start > {v}")
#     ds = en_datasets[v]
#     ds_list = list()
#     texts = list()
#     labels = list()
#     combined_ds = None
#     index = 0
#     batches = math.ceil(ds.num_rows/batch_size)
#     for i in tqdm(range(batches)):
#         last_index = index+batch_size
#         if last_index>=len(ds): last_index=None
#         texts_batch = ds["text"][index:last_index]
#         labels_batch = ds["label"][index:last_index]
#         index = (i+1)*batch_size
#         ds_list = list()
#         for text, label in zip(texts_batch, labels_batch):
#             # query = item["text"]
#             # label = item["label"]
#             query = translate_text(text, src="en", dest=ds_name)
#             # texts.append(query)
#             # labels.append(label)
#             ds_list.append({"label": label, "text":query})

#         batch_ds = Dataset.from_list(ds_list)

#         if not combined_ds:
#             combined_ds = batch_ds
#         else:
#             combined_ds = concatenate_datasets([combined_ds, batch_ds])

    
#     print(f"{v}: len of ds > {ds.num_rows}")
#     print(f"{v}: len of {ds_name} ds > {combined_ds.num_rows}")
#     # print(f"example len of instance: {len(ds_list[1]["text"].split())}")

#     combined_ds.save_to_disk(f"{folder_path}/{ds_name}_{v}.hf")

#     print(f"Created > {v}")
#     del texts, labels

In [26]:
# Punjabi short splits: reload train/valid/test, then build and save train+valid.
# Pin the language code here. The cells above that used to set it are commented
# out, so on a top-to-bottom run `ds_name` would still be "eo" from the Esperanto
# section and this cell would read/write the wrong files.
ds_name = "pa"

# NOTE(review): the loader cell further down reads `{ds_name}_{v}_short.hf`;
# confirm whether these un-suffixed files were renamed afterwards.
pa_train = Dataset.load_from_disk(f"{folder_path}/{ds_name}_train.hf")
pa_valid = Dataset.load_from_disk(f"{folder_path}/{ds_name}_valid.hf")
pa_test = Dataset.load_from_disk(f"{folder_path}/{ds_name}_test.hf")
print(pa_train, pa_valid, pa_test)

# Merge train and valid row-wise; test stays separate.
pa_combined_ds = concatenate_datasets([pa_train, pa_valid])
print(pa_combined_ds)

pa_combined_ds.save_to_disk(f"{folder_path}/{ds_name}_train_valid.hf")

Dataset({
    features: ['label', 'text'],
    num_rows: 11514
}) Dataset({
    features: ['label', 'text'],
    num_rows: 2033
}) Dataset({
    features: ['label', 'text'],
    num_rows: 2974
})
Dataset({
    features: ['label', 'text'],
    num_rows: 13547
})


Saving the dataset (1/1 shards): 100%|██████████| 13547/13547 [00:00<00:00, 1231261.08 examples/s]


In [15]:
# Load the short Punjabi splits from disk into a dict keyed by split name.
folder_path = "datasets/full_en_eo_pa"
datasets_versions = ["train", "valid", "test", "train_valid"]

ds_name, t = "pa", "short"

pa_datasets = {}
for v in datasets_versions:
    # One saved dataset directory per split: <folder>/<lang>_<split>_<type>.hf
    pa_datasets[v] = load_from_disk(f"{folder_path}/{ds_name}_{v}_{t}.hf")

In [16]:
pa_datasets

{'train': Dataset({
     features: ['label', 'text'],
     num_rows: 11514
 }),
 'valid': Dataset({
     features: ['label', 'text'],
     num_rows: 2033
 }),
 'test': Dataset({
     features: ['label', 'text'],
     num_rows: 2974
 }),
 'train_valid': Dataset({
     features: ['label', 'text'],
     num_rows: 13547
 })}

#### Long

In [223]:
# Build the "long" Punjabi splits from the short ones: each query is repeated
# (joined with ", ") until it has at least `long_query` whitespace-separated words,
# then trimmed from the end until its tokenized length fits in `long_query` tokens.
query_len_thr = 512
long_query = query_len_thr

ds_name = "pa"

batch_size = 100

### loading a tokenizer for the workaround; e.g. multi
m = "multi"
tokenizer_link = f"{models_dir}/{m}/{m}_tokenizer"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_link)

for v in datasets_versions[:-1]:  # the last entry ("train_valid") is not built here
    print(f"Start > {v}")
    ds = pa_datasets[v]
    # Materialize each column once per split instead of once per batch.
    all_texts = list(ds["text"])
    all_labels = list(ds["label"])
    combined_ds = None
    batches = math.ceil(ds.num_rows/batch_size)
    for i in tqdm(range(batches)):
        index = i*batch_size
        texts_batch = all_texts[index:index+batch_size]
        labels_batch = all_labels[index:index+batch_size]
        ds_list = list()
        for text, label in zip(texts_batch, labels_batch):
            if not text.split():
                # An empty query cannot be lengthened by repetition, and its zero
                # word count would divide by zero below; keep the row unchanged.
                ds_list.append({"label": label, "text": text})
                continue
            repeat = math.ceil(long_query/len(text.split()))
            query = ", ".join([text]*repeat)
            ##########################################
            # This part is a workaround to avoid longer tokenized sequences, truncation and model_max_length were useless when loading the tokenizer
            query_words = query.split()
            # Count tokens via ["input_ids"]: len() of the tokenizer output itself is
            # the number of returned fields, not the number of tokens.
            tokenized_len = len(tokenizer(query)["input_ids"])
            red_perc = (tokenized_len-long_query)/tokenized_len
            num_red_tokens = int(red_perc*len(query_words))+5 # +5 to make sure that it is less than 512
            # Trim only for a positive count: the slice [:-0] would empty the query
            # and a negative count would keep just the first few words.
            if num_red_tokens > 0:
                query_words = query_words[:-num_red_tokens]
            query = " ".join(query_words)
            # The proportional estimate can undershoot, so verify the real token
            # count and drop trailing words until the query fits.
            while query_words and len(tokenizer(query)["input_ids"]) > long_query:
                query_words.pop()
                query = " ".join(query_words)
            ##########################################
            ds_list.append({"label": label, "text":query})

        batch_ds = Dataset.from_list(ds_list)

        # Explicit None check: `not combined_ds` is also true for a Dataset with zero rows.
        if combined_ds is None:
            combined_ds = batch_ds
        else:
            combined_ds = concatenate_datasets([combined_ds, batch_ds])

    print(f"{v}: len of ds > {ds.num_rows}")
    print(f"{v}: len of {v} ds > {combined_ds.num_rows}")

    combined_ds.save_to_disk(f"{folder_path}/{ds_name}_{v}_long.hf")

    print(f"Created > {v}")


Start > train


100%|██████████| 116/116 [00:54<00:00,  2.14it/s]


train: len of ds > 11514
train: len of train ds > 11514


Saving the dataset (1/1 shards): 100%|██████████| 11514/11514 [00:00<00:00, 254046.46 examples/s]


Created > train
Start > valid


100%|██████████| 21/21 [00:13<00:00,  1.57it/s]


valid: len of ds > 2033
valid: len of valid ds > 2033


Saving the dataset (1/1 shards): 100%|██████████| 2033/2033 [00:00<00:00, 100619.74 examples/s]


Created > valid
Start > test


100%|██████████| 30/30 [00:15<00:00,  1.99it/s]


test: len of ds > 2974
test: len of test ds > 2974


Saving the dataset (1/1 shards): 100%|██████████| 2974/2974 [00:00<00:00, 205757.79 examples/s]

Created > test





In [224]:
# Reload the long train/valid splits that the previous cell saved under
# `{folder_path}/{ds_name}_{split}_long.hf` (uses the current value of `ds_name`).
train = Dataset.load_from_disk(f"{folder_path}/{ds_name}_train_long.hf")
valid = Dataset.load_from_disk(f"{folder_path}/{ds_name}_valid_long.hf")

# Display both datasets as the cell output.
train, valid

(Dataset({
     features: ['label', 'text'],
     num_rows: 11514
 }),
 Dataset({
     features: ['label', 'text'],
     num_rows: 2033
 }))

In [225]:
# Stack train and valid row-wise into a single "train_valid" dataset.
combined_ds = concatenate_datasets([train, valid])
combined_ds

Dataset({
    features: ['label', 'text'],
    num_rows: 13547
})

In [226]:
# Persist the merged train+valid dataset next to the individual long splits.
combined_ds.save_to_disk(f"{folder_path}/{ds_name}_train_valid_long.hf")

Saving the dataset (1/1 shards): 100%|██████████| 13547/13547 [00:00<00:00, 309353.73 examples/s]


In [17]:
# Load every long Punjabi split listed in `datasets_versions` from disk.
ds_name = "pa"
t = "long"

# Keys carry a "_long" suffix (e.g. "train_long") so they differ from the short-split keys.
pa_datasets_long = {f"{v}_long":Dataset.load_from_disk(f"{folder_path}/{ds_name}_{v}_{t}.hf") for v in datasets_versions}

pa_datasets_long

{'train_long': Dataset({
     features: ['label', 'text'],
     num_rows: 11514
 }),
 'valid_long': Dataset({
     features: ['label', 'text'],
     num_rows: 2033
 }),
 'test_long': Dataset({
     features: ['label', 'text'],
     num_rows: 2974
 }),
 'train_valid_long': Dataset({
     features: ['label', 'text'],
     num_rows: 13547
 })}

## Predictions

In [18]:
# Axes of the evaluation grid: language code x query length type x split.
datasets_names = ["en", "eo", "pa"]
datasets_types = ["short", "long"]
datasets_versions = ["train", "valid", "test", "train_valid"]

# language -> type -> dict of splits loaded in the sections above
datasets_main_dict = {
    "en": dict(short=en_datasets, long=en_datasets_long),
    "eo": dict(short=eo_datasets, long=eo_datasets_long),
    "pa": dict(short=pa_datasets, long=pa_datasets_long),
}

In [19]:
datasets_versions[:-1]

['train', 'valid', 'test']

In [20]:
# Flatten the nested language/type/split dicts into one dict keyed
# "<lang>_<split>_<type>"; the merged "train_valid" split (last entry) is left out.
all_datasets = {}

for n in datasets_names:
    for t in datasets_types:
        for v in datasets_versions[:-1]:
            # The long dicts store their splits under "<split>_long".
            key = v + "_long" if t == "long" else v
            all_datasets[f"{n}_{v}_{t}"] = datasets_main_dict[n][t][key]

In [21]:
len(all_datasets), all_datasets

(18,
 {'en_train_short': Dataset({
      features: ['label', 'text'],
      num_rows: 11514
  }),
  'en_valid_short': Dataset({
      features: ['label', 'text'],
      num_rows: 2033
  }),
  'en_test_short': Dataset({
      features: ['label', 'text'],
      num_rows: 2974
  }),
  'en_train_long': Dataset({
      features: ['label', 'text'],
      num_rows: 11514
  }),
  'en_valid_long': Dataset({
      features: ['label', 'text'],
      num_rows: 2033
  }),
  'en_test_long': Dataset({
      features: ['label', 'text'],
      num_rows: 2974
  }),
  'eo_train_short': Dataset({
      features: ['label', 'text'],
      num_rows: 11514
  }),
  'eo_valid_short': Dataset({
      features: ['label', 'text'],
      num_rows: 2033
  }),
  'eo_test_short': Dataset({
      features: ['label', 'text'],
      num_rows: 2974
  }),
  'eo_train_long': Dataset({
      features: ['label', 'text'],
      num_rows: 11514
  }),
  'eo_valid_long': Dataset({
      features: ['label', 'text'],
      num_rows

In [22]:
model_names

['multi', 'xlm', 'xln']

In [26]:
# Run the selected model's classifier over the chosen dataset and pickle the raw
# pipeline output to one file per (model, dataset) pair.
pred_path = "predictions/full_en_eo_pa/separate"

preds = {}

for m in model_names[1:2]:
  print(f">>>> {m}")
  preds[m] = {}
  for d_name, data in tqdm(all_datasets.items()):
    # Only this one dataset is processed in the current run; all others are skipped.
    if d_name != "eo_test_long":
      continue
    print(f"\t{d_name} starts...")
    # Single-entry dict keyed "<model>_<dataset>", matching the pickle file name.
    pred = {f"{m}_{d_name}": models[m]["classifier"](data["text"])}
    with open(f"{pred_path}/{m}_{d_name}_preds.pkl", "wb") as f:
      pkl.dump(pred, f)
    print(f"\t{d_name} is dumped.")

>>>> xlm


  0%|          | 0/18 [00:00<?, ?it/s]

	eo_test_long starts...


100%|██████████| 18/18 [41:54<00:00, 139.69s/it]

	eo_test_long is dumped.





In [None]:
####### Merge train and valid after finishing
####### Be careful: "multi_en_train_valid_short_preds" is already done!!!

In [None]:
# Merge the per-(model, dataset) prediction pickles into one flat dict.
all_preds = dict()

pred_path = "predictions/full_en_eo_pa"

# Files that have not been produced yet are collected and reported at the end,
# so the merge does not abort on the first missing file.
missing_files = list()

# Iterate over the model *names*: `model` is a single loaded model object,
# not the list of names used in the pickle file names.
for m in model_names:
    for n in datasets_names:
        for t in datasets_types:
            for v in datasets_versions:
                ds_pkl_path = f"{pred_path}/separate/{m}_{n}_{v}_{t}_preds.pkl"
                try:
                    with open(ds_pkl_path, "rb") as f:
                        # pickle can execute arbitrary code on load; only read
                        # files written by the prediction cell of this notebook.
                        ds_pkl = pkl.load(f)
                except FileNotFoundError:
                    missing_files.append(ds_pkl_path)
                    continue
                all_preds.update(ds_pkl)

if missing_files:
    print(f"Skipped {len(missing_files)} missing prediction file(s):")
    for missing_path in missing_files:
        print(f"\t{missing_path}")

In [None]:
# with open(f"{pred_path}/all_predictions.pkl", "wb") as f:
#     pkl.dump(preds, f)

In [None]:
# with open(f"{pred_path}/all_predictions.pkl", "rb") as f:
#     pkl_predictions = pkl.load(f)
# type(pkl_predictions)

dict