### Dependencies

In [None]:
# Install/upgrade the core libraries: PyTorch, pandas, Hugging Face transformers,
# farasapy (Farasa segmenter used during tokenisation) and PyArabic.
# NOTE(review): PyArabic is not imported directly in this notebook; presumably it
# is needed by the AraBERT preprocessing code — confirm.
! pip install --upgrade torch pandas transformers farasapy PyArabic

In [None]:
# Install AdapterHub's adapter-transformers directly from GitHub (unpinned).
# NOTE(review): the adapter classes used later (AutoModelWithHeads, AdapterType)
# are imported from `transformers`, so this package presumably supersedes the
# PyPI `transformers` installed in the previous cell — confirm.
! pip install git+https://github.com/adapter-hub/adapter-transformers.git

**RESTART RUNTIME** before continuing, so that the packages installed above are the ones imported by the cells below.

### Dataset Prep

In [None]:
# Download the shuffled multi-dialect training CSV from the UBC-NLP aoc_id repository
# into the current working directory.
! wget "https://github.com/UBC-NLP/aoc_id/raw/master/data/train/MultiTrain.Shuffled.csv"

In [None]:
import pandas as pd

# Load the UBC-NLP training set (the CSV's first column is used as the row index)
# and keep only the sentence text and its dialect label.
df_ubc = pd.read_csv("./MultiTrain.Shuffled.csv", index_col=0).loc[:, ["text", "label"]]
df_ubc.head()

Unnamed: 0,text,label
0,بالإضافة لقيام معلمو الجيزة للذهاب إلي جريدة ا...,MSA
1,بعدين والله حرام تجي تلقى الي واقف عند الاشاره...,MSA
2,لمسه اليد مرتين واضحة جدا والحكم,DIAL_LEV
3,بخصوص الهاتريك عمرها ما راح تصير,DIAL_LEV
4,الله يجبر كسرهم ويرجع و لدهم اليوم قبل بكرى ،,DIAL_GLF


In [None]:
# Clone the arabic_dialect_annotation repository and unpack its annotated_data
# archive; the following cells read the per-dialect files from annotated_data/.
! git clone https://github.com/ryancotterell/arabic_dialect_annotation
! gunzip arabic_dialect_annotation/annotated_data.tar.gz
! tar -xvf arabic_dialect_annotation/annotated_data.tar

In [None]:
# Egyptian rows: read the tab-separated file, drop rows with any missing value,
# attach the constant dialect label and keep only the two columns used later.
df_egy = (
    pd.read_csv("annotated_data/egyptian", sep="\t")
    .dropna()
    .assign(label="DIAL_EGY")
    [["text", "label"]]
)
df_egy.head(3)

Unnamed: 0,text,label
1,والله الاخلاق في ها الزمن ضاعت وصارت النساء مث...,DIAL_EGY
2,بصراحة الوحدات الان بدفع ثمن تفريطو بأكثر من ل...,DIAL_EGY
3,الكل ضد المارد الاخضر الجميع فرحان بخساره الوح...,DIAL_EGY


In [None]:
def _load_dialect(path, label):
    """Read one tab-separated annotation file and tag every row with `label`.

    Rows containing any missing value are dropped, and only the `text` and
    `label` columns are returned.
    """
    frame = pd.read_csv(path, delimiter="\t").dropna()
    frame["label"] = label
    return frame[["text", "label"]]


df_lev = _load_dialect("annotated_data/levantine", "DIAL_LEV")

# Extra MSA rows from this corpus are currently disabled:
# df_msa = pd.read_csv("annotated_data/msa", delimiter="\t").dropna()
# df_msa["label"] = "MSA"
# df_msa = df_msa[["text", "label"]].sample(20000)

# Maghrebi examples come from two files that share one label.
# DataFrame.append was removed in pandas 2.0, so the two frames are joined with
# pd.concat, which keeps each frame's original index exactly as append did.
df_mgh = _load_dialect("annotated_data/maghrebi", "MAGHREBI")
df_mgh2 = _load_dialect("annotated_data/twitter-maghrebi", "MAGHREBI")
df_mgh = pd.concat([df_mgh, df_mgh2])

df_glf = _load_dialect("annotated_data/gulf", "DIAL_GLF")

In [None]:
# Stack every source into one training frame (no additional MSA rows are added).
# DataFrame.append was removed in pandas 2.0; a single pd.concat produces the same
# frame (original row indices are kept, so index values repeat across sources).
df = pd.concat([df_ubc, df_egy, df_lev, df_mgh, df_glf])
print(len(df))
df.head()

113191


Unnamed: 0,text,label
0,بالإضافة لقيام معلمو الجيزة للذهاب إلي جريدة ا...,MSA
1,بعدين والله حرام تجي تلقى الي واقف عند الاشاره...,MSA
2,لمسه اليد مرتين واضحة جدا والحكم,DIAL_LEV
3,بخصوص الهاتريك عمرها ما راح تصير,DIAL_LEV
4,الله يجبر كسرهم ويرجع و لدهم اليوم قبل بكرى ،,DIAL_GLF


In [None]:
# Class balance of the combined data, by string label.
df["label"].value_counts()

MSA         50845
DIAL_GLF    22898
DIAL_EGY    18946
DIAL_LEV    13890
MAGHREBI     6612
Name: label, dtype: int64

In [None]:
# Swap the string labels for their integer category codes in one step.
df["label"] = df["label"].astype("category").cat.codes

In [None]:
# Label key: 0=Egyptian, 1=Gulf, 2=Levantine, 3=Maghrebi, 4=MSA
df["label"].value_counts()

4    50845
1    22898
0    18946
2    13890
3     6612
Name: label, dtype: int64

### Train/test split combined data
Tokenize / encode

In [None]:
# Clone the AraBERT repository into ./arabert so that its preprocessing module
# (arabert.preprocess_arabert) can be imported by the tokenisation cell.
! git clone https://github.com/aub-mind/arabert

In [None]:
from sklearn.model_selection import train_test_split

# A fixed seed makes the split, and every metric computed on it, reproducible
# across kernel restarts. The split proportions stay at scikit-learn's defaults.
# NOTE: `eval` shadows the Python builtin of the same name; the name is kept
# because later cells refer to it.
train, eval = train_test_split(df, random_state=42)

In [None]:
from farasa.segmenter import FarasaSegmenter
from arabert.preprocess_arabert import preprocess
from transformers import AutoTokenizer

farasa_segmenter = FarasaSegmenter(interactive=True)
tokenizer = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabert')


def encode_texts(texts, max_length=128):
    """Preprocess and encode a pandas Series of sentences.

    Each value is cast to `str`, passed through AraBERT's `preprocess` with
    Farasa segmentation enabled, and encoded with the AraBERT tokenizer,
    truncated/padded to `max_length` token ids.

    Returns a list with one list of token ids per input row, in input order.
    """
    encoded = []
    for line in list(texts.astype('str')):
        line_preprocessed = preprocess( line,
                                        do_farasa_tokenization = True,
                                        farasa = farasa_segmenter,
                                        use_farasapy = True)
        encoded.append( tokenizer.encode(line_preprocessed, truncation=True, max_length=max_length, pad_to_max_length=True) )
    return encoded


# Both splits go through exactly the same pipeline.
train_str = encode_texts(train["text"])
eval_str = encode_texts(eval["text"])

[2020-09-24 03:14:34,763 - DEBUG]: perform system check...
[2020-09-24 03:14:34,765 - DEBUG]: check java version...
[2020-09-24 03:14:35,459 - DEBUG]: Your java version is 11.0 which is compatiple with Farasa 
[2020-09-24 03:14:35,460 - DEBUG]: check toolkit binaries...
[2020-09-24 03:14:35,464 - INFO]: some binaries are not existed.
[2020-09-24 03:14:35,467 - INFO]: downloading zipped binaries...
[2020-09-24 03:14:35,473 - DEBUG]: Starting new HTTPS connection (1): farasa-api.qcri.org:443
[2020-09-24 03:14:36,944 - DEBUG]: https://farasa-api.qcri.org:443 "GET /farasapy/releases/download/toolkit-bins-released/farasa_bin.zip HTTP/1.1" 200 200394706


100%|██████████| 200M/200M [00:18<00:00, 13.0MiB/s]

[2020-09-24 03:14:55,216 - DEBUG]: extracting...
[2020-09-24 03:14:55,963 - DEBUG]: toolkit binaries are downloaded and extracted.
[2020-09-24 03:14:55,978 - INFO]: Dependencies seem to be satisfied..
[2020-09-24 03:14:55,981 - INFO]: [37minitializing [SEGMENT] task in [32mINTERACTIVE [37mmode...
[2020-09-24 03:15:00,795 - INFO]: task [SEGMENT] is initialized interactively.
[2020-09-24 03:15:00,806 - DEBUG]: Starting new HTTPS connection (1): s3.amazonaws.com:443
[2020-09-24 03:15:01,585 - DEBUG]: https://s3.amazonaws.com:443 "HEAD /models.huggingface.co/bert/aubmindlab/bert-base-arabert/config.json HTTP/1.1" 200 0
[2020-09-24 03:15:01,588 - DEBUG]: Attempting to acquire lock 139710643437072 on /root/.cache/torch/transformers/91c3e98e149f6e88215bffd705e4ef9bd8a355f4c317973e4f3868c6f93fa24a.87c61215f57298a5ff1f7680910adeb70154f00bf4e7a0fe7d5ab21ee115c7cf.lock
[2020-09-24 03:15:01,593 - INFO]: Lock 139710643437072 acquired on /root/.cache/torch/transformers/91c3e98e149f6e88215bffd705e

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=572.0, style=ProgressStyle(description_…

[2020-09-24 03:15:02,395 - INFO]: storing https://s3.amazonaws.com/models.huggingface.co/bert/aubmindlab/bert-base-arabert/config.json in cache at /root/.cache/torch/transformers/91c3e98e149f6e88215bffd705e4ef9bd8a355f4c317973e4f3868c6f93fa24a.87c61215f57298a5ff1f7680910adeb70154f00bf4e7a0fe7d5ab21ee115c7cf
[2020-09-24 03:15:02,396 - INFO]: creating metadata file for /root/.cache/torch/transformers/91c3e98e149f6e88215bffd705e4ef9bd8a355f4c317973e4f3868c6f93fa24a.87c61215f57298a5ff1f7680910adeb70154f00bf4e7a0fe7d5ab21ee115c7cf
[2020-09-24 03:15:02,399 - DEBUG]: Attempting to release lock 139710643437072 on /root/.cache/torch/transformers/91c3e98e149f6e88215bffd705e4ef9bd8a355f4c317973e4f3868c6f93fa24a.87c61215f57298a5ff1f7680910adeb70154f00bf4e7a0fe7d5ab21ee115c7cf.lock
[2020-09-24 03:15:02,401 - INFO]: Lock 139710643437072 released on /root/.cache/torch/transformers/91c3e98e149f6e88215bffd705e4ef9bd8a355f4c317973e4f3868c6f93fa24a.87c61215f57298a5ff1f7680910adeb70154f00bf4e7a0fe7d5ab21e




[2020-09-24 03:15:03,350 - DEBUG]: https://s3.amazonaws.com:443 "HEAD /models.huggingface.co/bert/aubmindlab/bert-base-arabert/vocab.txt HTTP/1.1" 200 0
[2020-09-24 03:15:03,353 - DEBUG]: Attempting to acquire lock 139708571119568 on /root/.cache/torch/transformers/e00d70bd70387a7cefb7b3f960c2b4bff22c254c680dabca3c09336b98c42396.a17ce51bb78b7fa46a5b28baee670bca2f9bf6bc93608fc37438db58382e5bc0.lock
[2020-09-24 03:15:03,354 - INFO]: Lock 139708571119568 acquired on /root/.cache/torch/transformers/e00d70bd70387a7cefb7b3f960c2b4bff22c254c680dabca3c09336b98c42396.a17ce51bb78b7fa46a5b28baee670bca2f9bf6bc93608fc37438db58382e5bc0.lock
[2020-09-24 03:15:03,355 - INFO]: https://s3.amazonaws.com/models.huggingface.co/bert/aubmindlab/bert-base-arabert/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpg7bv_7ff
[2020-09-24 03:15:03,362 - DEBUG]: Starting new HTTPS connection (1): s3.amazonaws.com:443
[2020-09-24 03:15:04,168 - DEBUG]: https

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=717153.0, style=ProgressStyle(descripti…

[2020-09-24 03:15:05,100 - INFO]: storing https://s3.amazonaws.com/models.huggingface.co/bert/aubmindlab/bert-base-arabert/vocab.txt in cache at /root/.cache/torch/transformers/e00d70bd70387a7cefb7b3f960c2b4bff22c254c680dabca3c09336b98c42396.a17ce51bb78b7fa46a5b28baee670bca2f9bf6bc93608fc37438db58382e5bc0
[2020-09-24 03:15:05,101 - INFO]: creating metadata file for /root/.cache/torch/transformers/e00d70bd70387a7cefb7b3f960c2b4bff22c254c680dabca3c09336b98c42396.a17ce51bb78b7fa46a5b28baee670bca2f9bf6bc93608fc37438db58382e5bc0
[2020-09-24 03:15:05,103 - DEBUG]: Attempting to release lock 139708571119568 on /root/.cache/torch/transformers/e00d70bd70387a7cefb7b3f960c2b4bff22c254c680dabca3c09336b98c42396.a17ce51bb78b7fa46a5b28baee670bca2f9bf6bc93608fc37438db58382e5bc0.lock
[2020-09-24 03:15:05,104 - INFO]: Lock 139708571119568 released on /root/.cache/torch/transformers/e00d70bd70387a7cefb7b3f960c2b4bff22c254c680dabca3c09336b98c42396.a17ce51bb78b7fa46a5b28baee670bca2f9bf6bc93608fc37438db5838




[2020-09-24 03:15:05,957 - DEBUG]: https://s3.amazonaws.com:443 "HEAD /models.huggingface.co/bert/aubmindlab/bert-base-arabert/added_tokens.json HTTP/1.1" 404 0
[2020-09-24 03:15:05,964 - DEBUG]: Starting new HTTPS connection (1): s3.amazonaws.com:443
[2020-09-24 03:15:06,708 - DEBUG]: https://s3.amazonaws.com:443 "HEAD /models.huggingface.co/bert/aubmindlab/bert-base-arabert/special_tokens_map.json HTTP/1.1" 200 0
[2020-09-24 03:15:06,711 - DEBUG]: Attempting to acquire lock 139708571118112 on /root/.cache/torch/transformers/4b95797216e163eea8d9cb1922733c29b4b3ba1036fc48b291cfe660b4240c51.275045728fbf41c11d3dae08b8742c054377e18d92cc7b72b6351152a99b64e4.lock
[2020-09-24 03:15:06,714 - INFO]: Lock 139708571118112 acquired on /root/.cache/torch/transformers/4b95797216e163eea8d9cb1922733c29b4b3ba1036fc48b291cfe660b4240c51.275045728fbf41c11d3dae08b8742c054377e18d92cc7b72b6351152a99b64e4.lock
[2020-09-24 03:15:06,716 - INFO]: https://s3.amazonaws.com/models.huggingface.co/bert/aubmindlab/be

100%|██████████| 200M/200M [00:29<00:00, 13.0MiB/s]

[2020-09-24 03:15:07,506 - DEBUG]: https://s3.amazonaws.com:443 "GET /models.huggingface.co/bert/aubmindlab/bert-base-arabert/special_tokens_map.json HTTP/1.1" 200 112


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…

[2020-09-24 03:15:07,544 - INFO]: storing https://s3.amazonaws.com/models.huggingface.co/bert/aubmindlab/bert-base-arabert/special_tokens_map.json in cache at /root/.cache/torch/transformers/4b95797216e163eea8d9cb1922733c29b4b3ba1036fc48b291cfe660b4240c51.275045728fbf41c11d3dae08b8742c054377e18d92cc7b72b6351152a99b64e4
[2020-09-24 03:15:07,546 - INFO]: creating metadata file for /root/.cache/torch/transformers/4b95797216e163eea8d9cb1922733c29b4b3ba1036fc48b291cfe660b4240c51.275045728fbf41c11d3dae08b8742c054377e18d92cc7b72b6351152a99b64e4
[2020-09-24 03:15:07,548 - DEBUG]: Attempting to release lock 139708571118112 on /root/.cache/torch/transformers/4b95797216e163eea8d9cb1922733c29b4b3ba1036fc48b291cfe660b4240c51.275045728fbf41c11d3dae08b8742c054377e18d92cc7b72b6351152a99b64e4.lock
[2020-09-24 03:15:07,549 - INFO]: Lock 139708571118112 released on /root/.cache/torch/transformers/4b95797216e163eea8d9cb1922733c29b4b3ba1036fc48b291cfe660b4240c51.275045728fbf41c11d3dae08b8742c054377e18d92cc




[2020-09-24 03:15:08,310 - DEBUG]: https://s3.amazonaws.com:443 "HEAD /models.huggingface.co/bert/aubmindlab/bert-base-arabert/tokenizer_config.json HTTP/1.1" 200 0
[2020-09-24 03:15:08,313 - DEBUG]: Attempting to acquire lock 139708571119120 on /root/.cache/torch/transformers/7f3845bf9305a0617f5c8bed56fae2122d82c8f1e2fb5daea826606bcff59b32.1ddbe932d1da5efd2703149c0507e2f0c6ef863b1470aecbc2275f5edf984bea.lock
[2020-09-24 03:15:08,314 - INFO]: Lock 139708571119120 acquired on /root/.cache/torch/transformers/7f3845bf9305a0617f5c8bed56fae2122d82c8f1e2fb5daea826606bcff59b32.1ddbe932d1da5efd2703149c0507e2f0c6ef863b1470aecbc2275f5edf984bea.lock
[2020-09-24 03:15:08,314 - INFO]: https://s3.amazonaws.com/models.huggingface.co/bert/aubmindlab/bert-base-arabert/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp7zpx_g06
[2020-09-24 03:15:08,318 - DEBUG]: Starting new HTTPS connection (1): s3.amazonaws.com:443
[2020-09-24 03:1

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=406.0, style=ProgressStyle(description_…

[2020-09-24 03:15:09,138 - INFO]: storing https://s3.amazonaws.com/models.huggingface.co/bert/aubmindlab/bert-base-arabert/tokenizer_config.json in cache at /root/.cache/torch/transformers/7f3845bf9305a0617f5c8bed56fae2122d82c8f1e2fb5daea826606bcff59b32.1ddbe932d1da5efd2703149c0507e2f0c6ef863b1470aecbc2275f5edf984bea
[2020-09-24 03:15:09,140 - INFO]: creating metadata file for /root/.cache/torch/transformers/7f3845bf9305a0617f5c8bed56fae2122d82c8f1e2fb5daea826606bcff59b32.1ddbe932d1da5efd2703149c0507e2f0c6ef863b1470aecbc2275f5edf984bea
[2020-09-24 03:15:09,141 - DEBUG]: Attempting to release lock 139708571119120 on /root/.cache/torch/transformers/7f3845bf9305a0617f5c8bed56fae2122d82c8f1e2fb5daea826606bcff59b32.1ddbe932d1da5efd2703149c0507e2f0c6ef863b1470aecbc2275f5edf984bea.lock
[2020-09-24 03:15:09,143 - INFO]: Lock 139708571119120 released on /root/.cache/torch/transformers/7f3845bf9305a0617f5c8bed56fae2122d82c8f1e2fb5daea826606bcff59b32.1ddbe932d1da5efd2703149c0507e2f0c6ef863b1470ae




### Prepare AdapterHub

In [None]:
# Load the AraBERT checkpoint into AdapterHub's heads-capable model class.
from transformers import AutoModelWithHeads
model = AutoModelWithHeads.from_pretrained('aubmindlab/bert-base-arabert')

from transformers import AdapterType
# Register a new text-task adapter named "dialect-arabic".
model.add_adapter("dialect-arabic", AdapterType.text_task)
# NOTE(review): train_adapter presumably freezes the base model so that only the
# adapter weights are updated — confirm against the adapter-transformers docs.
model.train_adapter(["dialect-arabic"])
# Five output classes, one per dialect label code (0-4).
model.add_classification_head("dialect-arabic", num_labels=5)
# Make the new adapter the active one.
model.set_active_adapters([["dialect-arabic"]])

[2020-09-24 03:17:30,076 - DEBUG]: Starting new HTTPS connection (1): s3.amazonaws.com:443
[2020-09-24 03:17:30,847 - DEBUG]: https://s3.amazonaws.com:443 "HEAD /models.huggingface.co/bert/aubmindlab/bert-base-arabert/config.json HTTP/1.1" 200 0
[2020-09-24 03:17:30,850 - INFO]: loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/aubmindlab/bert-base-arabert/config.json from cache at /root/.cache/torch/transformers/91c3e98e149f6e88215bffd705e4ef9bd8a355f4c317973e4f3868c6f93fa24a.87c61215f57298a5ff1f7680910adeb70154f00bf4e7a0fe7d5ab21ee115c7cf
[2020-09-24 03:17:30,852 - INFO]: Model config BertConfig {
  "adapters": {
    "adapters": {}
  },
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "ber

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=543450661.0, style=ProgressStyle(descri…

[2020-09-24 03:17:39,642 - INFO]: storing https://cdn.huggingface.co/aubmindlab/bert-base-arabert/pytorch_model.bin in cache at /root/.cache/torch/transformers/26f8720a79d80f7ae70d448d93e09fed364a6b0393c95a026519402dbd1313b2.8867d4172d26dd413ba8029e13bbe7eb5b9ae301b62c259f8be84ee78c0beb0c
[2020-09-24 03:17:39,650 - INFO]: creating metadata file for /root/.cache/torch/transformers/26f8720a79d80f7ae70d448d93e09fed364a6b0393c95a026519402dbd1313b2.8867d4172d26dd413ba8029e13bbe7eb5b9ae301b62c259f8be84ee78c0beb0c
[2020-09-24 03:17:39,652 - DEBUG]: Attempting to release lock 139708570739880 on /root/.cache/torch/transformers/26f8720a79d80f7ae70d448d93e09fed364a6b0393c95a026519402dbd1313b2.8867d4172d26dd413ba8029e13bbe7eb5b9ae301b62c259f8be84ee78c0beb0c.lock
[2020-09-24 03:17:39,653 - INFO]: Lock 139708570739880 released on /root/.cache/torch/transformers/26f8720a79d80f7ae70d448d93e09fed364a6b0393c95a026519402dbd1313b2.8867d4172d26dd413ba8029e13bbe7eb5b9ae301b62c259f8be84ee78c0beb0c.lock
[2020




[2020-09-24 03:17:44,504 - INFO]: Weights from pretrained model not used in BertModelWithHeads: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
[2020-09-24 03:17:44,507 - INFO]: Adding adapter 'dialect-arabic' of type 'text_task'.
[2020-09-24 03:17:44,547 - INFO]: Adding head 'dialect-arabic' with config {'head_type': 'classification', 'num_labels': 5, 'layers': 2, 'activation_function': 'tanh'}.


In [None]:
import torch
from torch.utils.data import Dataset

class Result():
    """One example, exposing its token ids as `input_ids` and its class as `label`."""

    def __init__(self, x, y):
        self.input_ids, self.label = x, y


class DatasetHelper(Dataset):
    """Map-style dataset pairing pre-encoded token-id sequences with their labels."""

    def __init__(self, encodings, labels):
        self.input_ids, self.labels = encodings, labels

    def __len__(self):
        # The number of examples is taken from the labels container.
        return len(self.labels)

    def __getitem__(self, idx):
        # Look up both halves of the example, then wrap them in a Result.
        token_ids = self.input_ids[idx]
        target = self.labels[idx]
        return Result(token_ids, target)

### Run training

In [None]:
from transformers import Trainer, TrainingArguments, EvalPrediction, glue_compute_metrics

# def compute_metrics(p: EvalPrediction):
#     preds = np.argmax(p.predictions, axis=1)
#     return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

# Trainer configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints are written (an existing directory is overwritten).
    output_dir="./models/dialect-arabic",
    overwrite_output_dir=True,
    # Which phases are enabled.
    do_train=True,
    do_eval=True,
    do_predict=True,
    evaluate_during_training=True,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Log and checkpoint on the same 1000-step cadence.
    logging_steps=1000,
    save_steps=1000,
)

In [None]:
# Pair each encoded split with its integer labels, then hand both to the Trainer.
train_dt, eval_dt = (
    DatasetHelper(train_str, list(train["label"])),
    DatasetHelper(eval_str, list(eval["label"])),
)

# compute_metrics and tokenizer are not passed (left at their defaults).
trainer = Trainer(model=model, args=training_args, train_dataset=train_dt, eval_dataset=eval_dt)

[2020-09-24 03:17:53,267 - INFO]: PyTorch: setting up devices
[2020-09-24 03:18:07,541 - INFO]: You are instantiating a Trainer but W&B is not installed. To use wandb logging, run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface.


In [None]:
# Fine-tune, then run one final evaluation pass on the held-out split.
# evaluate() is the last expression, so its metrics dict is the cell's output.
trainer.train()
trainer.evaluate()

[2020-09-24 03:18:07,572 - INFO]: ***** Running training *****
[2020-09-24 03:18:07,573 - INFO]:   Num examples = 84893
[2020-09-24 03:18:07,574 - INFO]:   Num Epochs = 3
[2020-09-24 03:18:07,577 - INFO]:   Instantaneous batch size per device = 32
[2020-09-24 03:18:07,580 - INFO]:   Total train batch size (w. parallel, distributed & accumulation) = 32
[2020-09-24 03:18:07,581 - INFO]:   Gradient Accumulation steps = 1
[2020-09-24 03:18:07,582 - INFO]:   Total optimization steps = 7959


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=2653.0, style=ProgressStyle(description_w…

[2020-09-24 03:20:43,811 - INFO]: ***** Running Evaluation *****
[2020-09-24 03:20:43,812 - INFO]:   Num examples = 28298
[2020-09-24 03:20:43,816 - INFO]:   Batch size = 64


{"loss": 0.8311975573599338, "learning_rate": 8.743560748837794e-05, "epoch": 0.3769317753486619, "step": 1000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=443.0, style=ProgressStyle(description_w…

[2020-09-24 03:21:45,653 - INFO]: Saving model checkpoint to ./models/dialect-arabic/checkpoint-1000
[2020-09-24 03:21:45,655 - INFO]: Configuration saved in ./models/dialect-arabic/checkpoint-1000/config.json



{"eval_loss": 0.6661830785984918, "epoch": 0.3769317753486619, "step": 1000}


[2020-09-24 03:21:47,597 - INFO]: Model weights saved in ./models/dialect-arabic/checkpoint-1000/pytorch_model.bin
[2020-09-24 03:24:24,030 - INFO]: ***** Running Evaluation *****
[2020-09-24 03:24:24,031 - INFO]:   Num examples = 28298
[2020-09-24 03:24:24,036 - INFO]:   Batch size = 64


{"loss": 0.6577505039870739, "learning_rate": 7.487121497675587e-05, "epoch": 0.7538635506973238, "step": 2000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=443.0, style=ProgressStyle(description_w…

[2020-09-24 03:25:26,056 - INFO]: Saving model checkpoint to ./models/dialect-arabic/checkpoint-2000
[2020-09-24 03:25:26,059 - INFO]: Configuration saved in ./models/dialect-arabic/checkpoint-2000/config.json



{"eval_loss": 0.614043429594815, "epoch": 0.7538635506973238, "step": 2000}


[2020-09-24 03:25:28,121 - INFO]: Model weights saved in ./models/dialect-arabic/checkpoint-2000/pytorch_model.bin





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=2653.0, style=ProgressStyle(description_w…

[2020-09-24 03:28:04,433 - INFO]: ***** Running Evaluation *****
[2020-09-24 03:28:04,434 - INFO]:   Num examples = 28298
[2020-09-24 03:28:04,438 - INFO]:   Batch size = 64


{"loss": 0.6304260551184416, "learning_rate": 6.230682246513382e-05, "epoch": 1.1307953260459858, "step": 3000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=443.0, style=ProgressStyle(description_w…

[2020-09-24 03:29:06,328 - INFO]: Saving model checkpoint to ./models/dialect-arabic/checkpoint-3000
[2020-09-24 03:29:06,330 - INFO]: Configuration saved in ./models/dialect-arabic/checkpoint-3000/config.json



{"eval_loss": 0.5975804443284028, "epoch": 1.1307953260459858, "step": 3000}


[2020-09-24 03:29:08,190 - INFO]: Model weights saved in ./models/dialect-arabic/checkpoint-3000/pytorch_model.bin
[2020-09-24 03:31:44,199 - INFO]: ***** Running Evaluation *****
[2020-09-24 03:31:44,200 - INFO]:   Num examples = 28298
[2020-09-24 03:31:44,205 - INFO]:   Batch size = 64


{"loss": 0.6004857460558415, "learning_rate": 4.974242995351175e-05, "epoch": 1.5077271013946476, "step": 4000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=443.0, style=ProgressStyle(description_w…

[2020-09-24 03:32:46,085 - INFO]: Saving model checkpoint to ./models/dialect-arabic/checkpoint-4000
[2020-09-24 03:32:46,087 - INFO]: Configuration saved in ./models/dialect-arabic/checkpoint-4000/config.json



{"eval_loss": 0.5864283250766736, "epoch": 1.5077271013946476, "step": 4000}


[2020-09-24 03:32:48,003 - INFO]: Model weights saved in ./models/dialect-arabic/checkpoint-4000/pytorch_model.bin
[2020-09-24 03:35:23,931 - INFO]: ***** Running Evaluation *****
[2020-09-24 03:35:23,933 - INFO]:   Num examples = 28298
[2020-09-24 03:35:23,933 - INFO]:   Batch size = 64


{"loss": 0.584248293057084, "learning_rate": 3.7178037441889686e-05, "epoch": 1.8846588767433095, "step": 5000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=443.0, style=ProgressStyle(description_w…

[2020-09-24 03:36:25,792 - INFO]: Saving model checkpoint to ./models/dialect-arabic/checkpoint-5000
[2020-09-24 03:36:25,795 - INFO]: Configuration saved in ./models/dialect-arabic/checkpoint-5000/config.json



{"eval_loss": 0.5868301301118214, "epoch": 1.8846588767433095, "step": 5000}


[2020-09-24 03:36:27,710 - INFO]: Model weights saved in ./models/dialect-arabic/checkpoint-5000/pytorch_model.bin





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=2653.0, style=ProgressStyle(description_w…

[2020-09-24 03:39:03,744 - INFO]: ***** Running Evaluation *****
[2020-09-24 03:39:03,745 - INFO]:   Num examples = 28298
[2020-09-24 03:39:03,748 - INFO]:   Batch size = 64


{"loss": 0.5719429815411567, "learning_rate": 2.4613644930267624e-05, "epoch": 2.2615906520919715, "step": 6000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=443.0, style=ProgressStyle(description_w…

[2020-09-24 03:40:05,697 - INFO]: Saving model checkpoint to ./models/dialect-arabic/checkpoint-6000
[2020-09-24 03:40:05,700 - INFO]: Configuration saved in ./models/dialect-arabic/checkpoint-6000/config.json



{"eval_loss": 0.5721590425104374, "epoch": 2.2615906520919715, "step": 6000}


[2020-09-24 03:40:07,749 - INFO]: Model weights saved in ./models/dialect-arabic/checkpoint-6000/pytorch_model.bin
[2020-09-24 03:42:43,778 - INFO]: ***** Running Evaluation *****
[2020-09-24 03:42:43,779 - INFO]:   Num examples = 28298
[2020-09-24 03:42:43,779 - INFO]:   Batch size = 64


{"loss": 0.5603452992886305, "learning_rate": 1.204925241864556e-05, "epoch": 2.638522427440633, "step": 7000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=443.0, style=ProgressStyle(description_w…

[2020-09-24 03:43:45,633 - INFO]: Saving model checkpoint to ./models/dialect-arabic/checkpoint-7000
[2020-09-24 03:43:45,636 - INFO]: Configuration saved in ./models/dialect-arabic/checkpoint-7000/config.json



{"eval_loss": 0.5618560506581722, "epoch": 2.638522427440633, "step": 7000}


[2020-09-24 03:43:47,667 - INFO]: Model weights saved in ./models/dialect-arabic/checkpoint-7000/pytorch_model.bin
[2020-09-24 03:46:17,274 - INFO]: 

Training completed. Do not forget to share your model on huggingface.co/models =)


[2020-09-24 03:46:17,276 - INFO]: ***** Running Evaluation *****
[2020-09-24 03:46:17,279 - INFO]:   Num examples = 28298
[2020-09-24 03:46:17,280 - INFO]:   Batch size = 64






HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=443.0, style=ProgressStyle(description_w…


{"eval_loss": 0.5657353865037384, "epoch": 3.0, "step": 7959}


{'epoch': 3.0, 'eval_loss': 0.5657353865037384}

### Export Adapter

In [None]:
# Export the adapter named "dialect-arabic" to the ./dialect-ar directory.
model.save_adapter("./dialect-ar", "dialect-arabic")

[2020-09-24 03:47:20,145 - INFO]: Configuration saved in ./dialect-ar/adapter_config.json
[2020-09-24 03:47:20,161 - INFO]: Module weights saved in ./dialect-ar/pytorch_adapter.bin
[2020-09-24 03:47:20,163 - INFO]: Configuration saved in ./dialect-ar/head_config.json
[2020-09-24 03:47:20,176 - INFO]: Module weights saved in ./dialect-ar/pytorch_model_head.bin


# Testing the completed adapter



In [None]:
# Dependencies for the testing section, which is written to run from a fresh runtime.
# NOTE(review): adapter-transformers is installed after `transformers`, presumably
# so that its adapter-enabled version is the one imported below — confirm.
! pip install --upgrade torch pandas transformers
! pip install git+https://github.com/adapter-hub/adapter-transformers.git

In [None]:
from transformers import BertModelWithHeads, AdapterConfig, AutoTokenizer

# Base AraBERT tokenizer and heads-capable model.
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabert")
model = BertModelWithHeads.from_pretrained("aubmindlab/bert-base-arabert")
# Load the adapter configuration identified by "pfeiffer".
config = AdapterConfig.load("pfeiffer")
# Load the adapter identified by "dialect/arabic@mapmeld".
# NOTE(review): presumably this is fetched from AdapterHub — confirm.
adapter = model.load_adapter("dialect/arabic@mapmeld", "text_task", config=config)
model.set_active_adapters(adapter)
# Switch to evaluation mode; as the last expression this also prints the module tree.
model.eval()

BertModelWithHeads(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [None]:
# Arabic dialect data unused in training
# NOTE(review): DART.zip is not downloaded anywhere in this notebook — presumably
# it has to be placed in the working directory first; confirm.
! unzip DART.zip

In [None]:
import pandas as pd
import torch

In [None]:
# Key is 0=Egyptian, 1=Gulf, 2=Levantine, 3=Maghrebi, 4=MSA

In [None]:
egy_eval = pd.read_csv("DART/eval-acc/EGY.txt", delimiter='\t')
# Spot-check the first 11 rows; per the label key, Egyptian should print 0.
# NOTE(review): training text was run through AraBERT's Farasa `preprocess`
# before encoding, but raw sentences are encoded here — confirm this is intended.
with torch.no_grad():  # inference only, so no autograd graph is needed
  for _, row in egy_eval.head(11).iterrows():
    sentence = row.iloc[1]  # second column, selected by position
    input_tensor = torch.tensor([tokenizer.encode(sentence)])
    outputs = model(input_tensor)
    predicted = torch.argmax(outputs[0]).item()
    print(predicted)

0
0
0
0
0
0
0
0
0
2
0


In [None]:
lev_eval = pd.read_csv("DART/eval-acc/LEV.txt", delimiter='\t')
# Spot-check the first 11 rows; per the label key, Levantine should print 2.
# NOTE(review): training text was run through AraBERT's Farasa `preprocess`
# before encoding, but raw sentences are encoded here — confirm this is intended.
with torch.no_grad():  # inference only, so no autograd graph is needed
  for _, row in lev_eval.head(11).iterrows():
    sentence = row.iloc[1]  # second column, selected by position
    input_tensor = torch.tensor([tokenizer.encode(sentence)])
    outputs = model(input_tensor)
    predicted = torch.argmax(outputs[0]).item()
    print(predicted)

2
0
2
2
2
2
2
2
0
2
2


In [None]:
mgh_eval = pd.read_csv("DART/eval-acc/MGH.txt", delimiter='\t')
# Spot-check the first 11 rows; per the label key, Maghrebi should print 3.
# NOTE(review): training text was run through AraBERT's Farasa `preprocess`
# before encoding, but raw sentences are encoded here — confirm this is intended.
with torch.no_grad():  # inference only, so no autograd graph is needed
  for _, row in mgh_eval.head(11).iterrows():
    sentence = row.iloc[1]  # second column, selected by position
    input_tensor = torch.tensor([tokenizer.encode(sentence)])
    outputs = model(input_tensor)
    predicted = torch.argmax(outputs[0]).item()
    print(predicted)

3
3
3
3
0
0
3
1
1
2
1


In [None]:
# Clone the arabic_dialect_annotation repository and unpack its annotated_data
# archive; the next cell reads annotated_data/msa from it.
! git clone https://github.com/ryancotterell/arabic_dialect_annotation
! gunzip arabic_dialect_annotation/annotated_data.tar.gz
! tar -xvf arabic_dialect_annotation/annotated_data.tar

In [None]:
msa_eval = pd.read_csv("annotated_data/msa", delimiter='\t')
# The first data row is skipped and the next ten are classified; per the label
# key, MSA should print 4.
# NOTE(review): training text was run through AraBERT's Farasa `preprocess`
# before encoding, but raw sentences are encoded here — confirm this is intended.
with torch.no_grad():  # inference only, so no autograd graph is needed
  for _, row in msa_eval.iloc[1:11].iterrows():
    sentence = row.iloc[1]  # second column, selected by position
    input_tensor = torch.tensor([tokenizer.encode(sentence)])
    outputs = model(input_tensor)
    predicted = torch.argmax(outputs[0]).item()
    print(predicted)

3
4
4
4
2
4
4
4
4
4
