### Phase I: Classification per language, with a separate model for each language

#### Create a dataframe for each language from our original dataset

In [1]:
import random

import numpy as np
import pandas as pd

# Seed BOTH random number generators. The train/test split in prepare() draws
# from np.random.rand, which random.seed() alone does not influence, so without
# the NumPy seed every run produces a different split.
SEED = 30
random.seed(SEED)
np.random.seed(SEED)

# NOTE(review): absolute, user-specific path; consider a configurable DATA_DIR.
# The file is read as tab-separated with no header row and every cell is
# converted to str.
dataset = pd.read_csv('/home/nesma/SemesterII/Neural Networks/Project/multilingual-text-categorization-dataset/data/dataset.csv', sep='\t', header=None).applymap(str)
dataset.columns = ["language","label","text"]
languagesData=[]  # not used in the cells shown here; kept so the name still exists
languages = dataset["language"].unique()
for language in languages:
    # Publish one frame per language as a module-level variable named
    # "<language>Data" (e.g. englishData); later cells refer to these names.
    # .copy() gives each frame its own data, independent of `dataset`.
    globals()[language + "Data"] = dataset[dataset.language == language].copy()

#### Data preprocessing applied to all the languages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
def preprocessing(text):
    """Normalise a pandas Series of raw text.

    Steps, in order: lowercase; cut each entry at the first ``https://`` and
    then at the first ``http://`` (everything from the URL to the end of that
    line is discarded and only the part before it is kept); delete digit runs;
    delete every character that is neither a word character nor whitespace;
    strip the ends; collapse whitespace runs to a single space.

    Parameters
    ----------
    text : pandas.Series
        Series of strings. Non-string entries are passed through ``str()``
        by the URL step.

    Returns
    -------
    pandas.Series
        A new Series with the same index; the input is not modified.
    """
    text = text.str.lower()                                              #Lower
    text = text.apply(lambda x: re.split(r'https://.*', str(x))[0])      #Remove Secured URL
    text = text.apply(lambda x: re.split(r'http://.*', str(x))[0])       #Remove URL
    # regex=True is passed explicitly: since pandas 2.0 Series.str.replace
    # treats the pattern as a literal string by default, which would leave
    # digits and punctuation in place without any error.
    text = text.str.replace(r'\d+', '', regex=True)                      #Remove_numbers
    text = text.str.replace(r'[^\w\s]', '', regex=True)                  #Remove_punctuations
    text = text.str.strip()                                              #remove_blank_space
    text = text.replace(r'\s+', ' ', regex=True)                         #Collapse whitespace runs
    return text

#### AllenNLP classes for dataset reading, the model, and predictions

In [3]:
from typing import Dict
import numpy as np
import torch
import torch.optim as optim
from allennlp.data.iterators import BucketIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.training.trainer import Trainer
from typing import Dict
import logging
import csv
from overrides import overrides
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
# Size of each token embedding vector; also passed as the LSTM input size
# where the encoders are built.
EMBEDDING_DIM = 128
# Hidden-state size passed to torch.nn.LSTM where the encoders are built.
HIDDEN_DIM = 128

# Module-level logger; used by MultilingualDatasetReader._read.
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
# @DatasetReader.register("data-reader")
class MultilingualDatasetReader(DatasetReader):
    """Dataset reader for comma-separated ``label,text`` files.

    Each two-column row becomes an ``Instance`` with a ``tokens`` TextField
    and, when a label is given, a ``label`` LabelField.

    Parameters
    ----------
    lazy : bool
        Forwarded unchanged to ``DatasetReader.__init__``.
    tokenizer : Tokenizer, optional
        Defaults to ``WordTokenizer(JustSpacesWordSplitter())``.
    token_indexers : Dict[str, TokenIndexer], optional
        Defaults to ``{"tokens": SingleIdTokenIndexer()}``.
    """
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or WordTokenizer(JustSpacesWordSplitter())
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path):
        """Yield one Instance per well-formed (exactly two-column) CSV row."""
        logger.info("Reading instances from lines in file at: %s", file_path)
        skipped = 0
        # Explicit UTF-8 so non-ASCII text (e.g. Arabic) is read the same way
        # regardless of the platform's locale default; newline="" is what the
        # csv module documents for files handed to csv.reader.
        with open(file_path, "r", encoding="utf-8", newline="") as data_file:
            for row in csv.reader(data_file, delimiter=','):
                if len(row) != 2:
                    # Rows are still skipped, as before, but are now counted
                    # so the loss is visible in the log.
                    skipped += 1
                    continue
                label, article = row
                yield self.text_to_instance(article=article, label=label)
        if skipped:
            logger.warning("Skipped %d row(s) without exactly 2 columns in %s",
                           skipped, file_path)

    @overrides
    def text_to_instance(self,  # type: ignore
                 article: str,
                 label: str = None) -> Instance:
        """Tokenize ``article`` and wrap it (plus ``label``, if given) in an Instance."""
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        tokenized_article = self._tokenizer.tokenize(article)
        fields["tokens"] = TextField(tokenized_article, self._token_indexers)
        # The label is optional so the same method can build unlabeled
        # instances (TextClassifierPredictor calls it with the text only).
        if label is not None:
            fields['label'] = LabelField(label)
        return Instance(fields)
    
    
class LstmClassifier(Model):
    """Text classifier: embed tokens, encode to one vector, project to label logits.

    Parameters
    ----------
    word_embeddings : TextFieldEmbedder
        Applied to the ``tokens`` input to obtain embeddings.
    encoder : Seq2VecEncoder
        Called with the embeddings and the padding mask; its
        ``get_output_dim()`` sets the input size of the final linear layer.
    vocab : Vocabulary
        The size of its ``'labels'`` namespace sets the number of output classes.
    """
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        # We need the embeddings to convert word IDs to their vector representations
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

        # We use the cross-entropy loss because this is a classification task.
        self.loss_function = torch.nn.CrossEntropyLoss()

    # forward() must be a method of the class, defined at method indentation.
    # It was previously indented inside __init__ with its body at the same
    # level, which is an IndentationError.
    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Run the classifier on a batch.

        Returns a dict with ``"logits"`` and, when ``label`` is given, ``"loss"``.
        """
        # In deep NLP, when sequences of tensors in different lengths are batched together,
        # shorter sequences get padded with zeros to make them of equal length.
        # Masking is the process to ignore extra zeros added by padding
        mask = get_text_field_mask(tokens)

        # Forward pass
        embeddings = self.word_embeddings(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.hidden2tag(encoder_out)

        # In AllenNLP, the output of forward() is a dictionary.
        # Your output dictionary must contain a "loss" key for your model to be trained.
        output = {"logits": logits}
        if label is not None:
            # Update the running accuracy exactly once per batch. The previous
            # code called the metric twice and stored the second call's return
            # value under "acc"; the value to report comes from get_metrics().
            self.accuracy(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the accumulated accuracy; ``reset`` is forwarded to the metric."""
        metrics = {'accuracy': self.accuracy.get_metric(reset)}
        return metrics
    
    
from overrides import overrides

from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors.predictor import Predictor


# @Predictor.register('text_classifier')
class TextClassifierPredictor(Predictor):
    """
    Predictor for models that map one sentence to one class.

    ``predict`` wraps the sentence in a JSON dict under the ``"sentence"`` key
    and delegates to ``predict_json``; ``_json_to_instance`` turns that dict
    into an Instance via the dataset reader's ``text_to_instance``.
    """
    def predict(self, sentence: str) -> JsonDict:
        """Return the prediction output for a single raw sentence."""
        payload = {"sentence": sentence}
        return self.predict_json(payload)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """Build an unlabeled Instance from ``json_dict["sentence"]``."""
        return self._dataset_reader.text_to_instance(json_dict["sentence"])
    

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


#### Dataset preparation: split the dataset into train and test sets, and save both to CSV so they can be read later by the AllenNLP reader.

In [4]:
def prepare(ds, lang, train_fraction=0.8):
    """Preprocess ``ds``, split it at random, and write train/test CSV files.

    Writes ``<lang>train.csv`` and ``<lang>test.csv`` to the current working
    directory, without header or index, with columns ``label,text``. Existing
    files with those names are overwritten.

    Parameters
    ----------
    ds : pandas.DataFrame
        Must contain ``label`` and ``text`` columns. It is NOT modified; the
        cleaned text is kept in a local Series instead of being written back
        into the caller's frame.
    lang : str
        Prefix for the output file names.
    train_fraction : float, default 0.8
        Each row is assigned to the training set when an independent uniform
        draw from ``np.random.rand`` falls below this value, so the realised
        split is only approximately this fraction.

    Returns
    -------
    None
    """
    clean_text = preprocessing(ds["text"])
    # NOTE(review): the slice [1:100] keeps the words at positions 1..99, so
    # the first word of every text is dropped. Kept as-is to preserve existing
    # results; confirm whether [:100] was intended.
    df = ds[["label"]].assign(
        text=clean_text.str.split().str[1:100].str.join(" "))
    msk = np.random.rand(len(df)) < train_fraction
    df[msk].to_csv(str(lang)+"train.csv",index=False,header=False)
    df[~msk].to_csv(str(lang)+"test.csv",index=False,header=False)

## English DS

In [5]:
def trainLang(DS,lang):
    """Split ``DS``, then train and validate an LSTM classifier on the split.

    Calls ``prepare(DS, lang)`` to write ``<lang>train.csv`` and
    ``<lang>test.csv``, reads both back with ``MultilingualDatasetReader``,
    builds the vocabulary from train and validation instances together, and
    trains for at most 50 epochs with patience 10. The model, the optimizer
    and the final metrics are printed.

    Parameters
    ----------
    DS : pandas.DataFrame
        Frame with ``label`` and ``text`` columns (passed on to ``prepare``).
    lang : str
        Prefix for the CSV files written and read.

    Returns
    -------
    tuple
        ``(model, reader, metrics)``. Previously nothing was returned, so the
        trained model could not be used for prediction without copy-pasting
        this body into a cell; callers that ignore the return value are
        unaffected.
    """
    prepare(DS,lang)
    reader = MultilingualDatasetReader()
    train_dataset = reader.read(str(lang) + 'train.csv')
    dev_dataset = reader.read(str(lang)+'test.csv')

    # Vocabulary is built over train + validation instances, with a
    # min_count of 3 for the 'tokens' namespace.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    print(optimizer)
    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=50)
    metrics = trainer.train()
    print(metrics)
    return model, reader, metrics

In [6]:
# Split the English subset and train/validate a classifier on it; this writes
# englishtrain.csv and englishtest.csv.
trainLang(englishData,"english")

3594it [00:01, 3505.70it/s]
925it [00:00, 7123.64it/s]
100%|██████████| 4519/4519 [00:00<00:00, 13557.34it/s]
  0%|          | 0/113 [00:00<?, ?it/s]

LstmClassifier(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (encoder): PytorchSeq2VecWrapper(
    (_module): LSTM(128, 128, batch_first=True)
  )
  (hidden2tag): Linear(in_features=128, out_features=45, bias=True)
  (loss_function): CrossEntropyLoss()
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 1e-05
)


accuracy: 0.0434, loss: 3.7941 ||: 100%|██████████| 113/113 [00:24<00:00,  4.23it/s]
accuracy: 0.0843, loss: 3.7487 ||: 100%|██████████| 29/29 [00:01<00:00, 22.97it/s]
accuracy: 0.0879, loss: 3.5959 ||: 100%|██████████| 113/113 [00:20<00:00,  5.00it/s]
accuracy: 0.0930, loss: 3.4621 ||: 100%|██████████| 29/29 [00:01<00:00, 26.11it/s]
accuracy: 0.1249, loss: 3.2155 ||: 100%|██████████| 113/113 [00:20<00:00,  5.67it/s]
accuracy: 0.1049, loss: 3.3729 ||: 100%|██████████| 29/29 [00:01<00:00, 26.41it/s]
accuracy: 0.1945, loss: 2.8350 ||: 100%|██████████| 113/113 [00:17<00:00,  7.50it/s]
accuracy: 0.1330, loss: 3.3942 ||: 100%|██████████| 29/29 [00:00<00:00, 43.67it/s]
accuracy: 0.2679, loss: 2.4563 ||: 100%|██████████| 113/113 [00:15<00:00,  6.88it/s]
accuracy: 0.1557, loss: 3.4278 ||: 100%|██████████| 29/29 [00:00<00:00, 42.94it/s]
accuracy: 0.3497, loss: 2.1849 ||: 100%|██████████| 113/113 [00:15<00:00,  6.65it/s]
accuracy: 0.1697, loss: 3.5414 ||: 100%|██████████| 29/29 [00:00<00:00, 41.

{'best_epoch': 2, 'peak_cpu_memory_MB': 790.72, 'training_duration': '00:03:49', 'training_start_epoch': 0, 'training_epochs': 11, 'epoch': 11, 'training_accuracy': 0.8091263216471898, 'training_loss': 0.5794810180642963, 'training_cpu_memory_MB': 790.72, 'validation_accuracy': 0.1827027027027027, 'validation_loss': 4.642499874378073, 'best_validation_accuracy': 0.10486486486486486, 'best_validation_loss': 3.372904399345661}





In [32]:
# This cell repeats the body of trainLang at module level, so that `model` and
# `reader` stay available as globals for the prediction cell that follows.
# NOTE: prepare() is called again here, which overwrites englishtrain.csv and
# englishtest.csv with a new random split.
DS = englishData
lang = "english"
prepare(DS,lang)
reader = MultilingualDatasetReader()
train_dataset = reader.read(str(lang) + 'train.csv')
dev_dataset = reader.read(str(lang)+'test.csv')

# Vocabulary is built over train + validation instances, with a min_count of 3
# for the 'tokens' namespace.
vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                  min_count={'tokens': 3})
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
encoder = PytorchSeq2VecWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

model = LstmClassifier(word_embeddings, encoder, vocab)
print(model)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
print(optimizer)
# Batches of 32, sorted by the number of tokens in the "tokens" field.
iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(vocab)

# At most 50 epochs, with a patience of 10.
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=dev_dataset,
                  patience=10,
                  num_epochs=50)
print(trainer.train())
    


3644it [00:01, 2483.97it/s]
875it [00:00, 7443.04it/s]
100%|██████████| 4519/4519 [00:00<00:00, 10867.81it/s]
  0%|          | 0/114 [00:00<?, ?it/s]

LstmClassifier(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (encoder): PytorchSeq2VecWrapper(
    (_module): LSTM(128, 128, batch_first=True)
  )
  (hidden2tag): Linear(in_features=128, out_features=45, bias=True)
  (loss_function): CrossEntropyLoss()
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 1e-05
)


accuracy: 0.0467, loss: 3.7929 ||: 100%|██████████| 114/114 [00:27<00:00,  4.09it/s]
accuracy: 0.0640, loss: 3.7530 ||: 100%|██████████| 28/28 [00:01<00:00, 21.21it/s]
accuracy: 0.0735, loss: 3.5898 ||: 100%|██████████| 114/114 [00:25<00:00,  4.30it/s]
accuracy: 0.0811, loss: 3.5179 ||: 100%|██████████| 28/28 [00:01<00:00, 21.85it/s]
accuracy: 0.1142, loss: 3.2544 ||: 100%|██████████| 114/114 [00:25<00:00,  5.12it/s]
accuracy: 0.1177, loss: 3.4445 ||: 100%|██████████| 28/28 [00:01<00:00, 24.25it/s]
accuracy: 0.1745, loss: 2.8891 ||: 100%|██████████| 114/114 [00:26<00:00,  4.30it/s]
accuracy: 0.1383, loss: 3.4180 ||: 100%|██████████| 28/28 [00:01<00:00, 23.43it/s]
accuracy: 0.2549, loss: 2.5245 ||: 100%|██████████| 114/114 [00:26<00:00,  3.92it/s]
accuracy: 0.1714, loss: 3.3984 ||: 100%|██████████| 28/28 [00:01<00:00, 21.63it/s]
accuracy: 0.3573, loss: 2.1293 ||: 100%|██████████| 114/114 [00:27<00:00,  4.02it/s]
accuracy: 0.1726, loss: 3.4508 ||: 100%|██████████| 28/28 [00:01<00:00, 15.

{'best_epoch': 4, 'peak_cpu_memory_MB': 1461.52, 'training_duration': '00:06:31', 'training_start_epoch': 0, 'training_epochs': 13, 'epoch': 13, 'training_accuracy': 0.8600439077936334, 'training_loss': 0.3739910206773825, 'training_cpu_memory_MB': 1461.52, 'validation_accuracy': 0.21257142857142858, 'validation_loss': 4.628362476825714, 'best_validation_accuracy': 0.17142857142857143, 'best_validation_loss': 3.398360252380371}





In [35]:
# Input sentence to classify (despite the name, a plain string, not a token list).
tokens = 'available in itunes about bad wolves a los angeles californiabased heavy metal outfit with an impressive hard rock'
# Uses the module-level `model` and `reader` left behind by the inline
# English training cell.
predictor = TextClassifierPredictor(model, dataset_reader=reader)
logits = predictor.predict(tokens)['logits']
# Index of the highest-scoring class.
label_id = np.argmax(logits)

# Map the class index back to its label string through the model's vocabulary.
print(model.vocab.get_token_from_index(label_id, 'labels'))

jewelry


## Estonian Language

In [8]:
# Split the Estonian subset and train/validate a classifier on it; this writes
# estoniantrain.csv and estoniantest.csv.
trainLang(estonianData,"estonian")

578it [00:00, 4258.49it/s]
133it [00:00, 4912.18it/s]
100%|██████████| 711/711 [00:00<00:00, 9673.20it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

LstmClassifier(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (encoder): PytorchSeq2VecWrapper(
    (_module): LSTM(128, 128, batch_first=True)
  )
  (hidden2tag): Linear(in_features=128, out_features=20, bias=True)
  (loss_function): CrossEntropyLoss()
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 1e-05
)


accuracy: 0.1125, loss: 2.8812 ||: 100%|██████████| 19/19 [00:02<00:00,  8.90it/s]
accuracy: 0.1955, loss: 2.4912 ||: 100%|██████████| 5/5 [00:00<00:00, 43.81it/s]
accuracy: 0.1419, loss: 2.4112 ||: 100%|██████████| 19/19 [00:02<00:00,  7.41it/s]
accuracy: 0.1429, loss: 2.3065 ||: 100%|██████████| 5/5 [00:00<00:00, 48.07it/s]
accuracy: 0.1713, loss: 2.3331 ||: 100%|██████████| 19/19 [00:02<00:00,  6.99it/s]
accuracy: 0.1880, loss: 2.3465 ||: 100%|██████████| 5/5 [00:00<00:00, 49.18it/s]
accuracy: 0.2630, loss: 2.2167 ||: 100%|██████████| 19/19 [00:02<00:00,  7.88it/s]
accuracy: 0.2632, loss: 2.3031 ||: 100%|██████████| 5/5 [00:00<00:00, 48.09it/s]
accuracy: 0.2976, loss: 2.0351 ||: 100%|██████████| 19/19 [00:02<00:00,  7.80it/s]
accuracy: 0.3910, loss: 2.0688 ||: 100%|██████████| 5/5 [00:00<00:00, 49.04it/s]
accuracy: 0.3426, loss: 1.9690 ||: 100%|██████████| 19/19 [00:02<00:00,  7.19it/s]
accuracy: 0.3985, loss: 1.9051 ||: 100%|██████████| 5/5 [00:00<00:00, 47.91it/s]
accuracy: 0.3806

{'best_epoch': 15, 'peak_cpu_memory_MB': 809.384, 'training_duration': '00:01:13', 'training_start_epoch': 0, 'training_epochs': 24, 'epoch': 24, 'training_accuracy': 0.7716262975778547, 'training_loss': 0.5849381755841406, 'training_cpu_memory_MB': 809.232, 'validation_accuracy': 0.43609022556390975, 'validation_loss': 1.9164239645004273, 'best_validation_accuracy': 0.5037593984962406, 'best_validation_loss': 1.5504174709320069}





## Arabic Language

In [9]:
# Split the Arabic subset and train/validate a classifier on it.
# `lang` and `DS` are module-level names that other cells reassign as well.
lang = "arabic"
DS = arabicData
trainLang(DS,lang)

1609it [00:00, 2254.28it/s]
383it [00:00, 6246.43it/s]
100%|██████████| 1992/1992 [00:00<00:00, 14365.29it/s]
  0%|          | 0/51 [00:00<?, ?it/s]

LstmClassifier(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (encoder): PytorchSeq2VecWrapper(
    (_module): LSTM(128, 128, batch_first=True)
  )
  (hidden2tag): Linear(in_features=128, out_features=28, bias=True)
  (loss_function): CrossEntropyLoss()
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 1e-05
)


accuracy: 0.0740, loss: 3.2149 ||: 100%|██████████| 51/51 [00:08<00:00,  5.94it/s]
accuracy: 0.0888, loss: 3.1456 ||: 100%|██████████| 12/12 [00:00<00:00, 31.00it/s]
accuracy: 0.0901, loss: 3.0734 ||: 100%|██████████| 51/51 [00:07<00:00,  6.84it/s]
accuracy: 0.0914, loss: 3.0363 ||: 100%|██████████| 12/12 [00:00<00:00, 33.05it/s]
accuracy: 0.1175, loss: 2.8930 ||: 100%|██████████| 51/51 [00:07<00:00,  6.99it/s]
accuracy: 0.1175, loss: 2.8830 ||: 100%|██████████| 12/12 [00:00<00:00, 38.58it/s]
accuracy: 0.2039, loss: 2.5863 ||: 100%|██████████| 51/51 [00:07<00:00,  8.67it/s]
accuracy: 0.1540, loss: 2.8339 ||: 100%|██████████| 12/12 [00:00<00:00, 39.55it/s]
accuracy: 0.3132, loss: 2.2385 ||: 100%|██████████| 51/51 [00:07<00:00,  6.40it/s]
accuracy: 0.2089, loss: 2.7449 ||: 100%|██████████| 12/12 [00:00<00:00, 37.74it/s]
accuracy: 0.4786, loss: 1.8214 ||: 100%|██████████| 51/51 [00:08<00:00,  6.80it/s]
accuracy: 0.2402, loss: 2.7302 ||: 100%|██████████| 12/12 [00:00<00:00, 34.92it/s]
accu

{'best_epoch': 5, 'peak_cpu_memory_MB': 848.4, 'training_duration': '00:02:02', 'training_start_epoch': 0, 'training_epochs': 14, 'epoch': 14, 'training_accuracy': 0.9055313859540087, 'training_loss': 0.22668769312839882, 'training_cpu_memory_MB': 848.4, 'validation_accuracy': 0.2845953002610966, 'validation_loss': 3.446045736471812, 'best_validation_accuracy': 0.2402088772845953, 'best_validation_loss': 2.730215867360433}





## Spanish Data

In [10]:
# Split the Spanish subset and train/validate a classifier on it.
# `lang` and `DS` are module-level names that other cells reassign as well.
lang = "spanish"
DS = spanishData
trainLang(DS,lang)

3235it [00:00, 3610.27it/s]
805it [00:00, 3072.66it/s]
100%|██████████| 4040/4040 [00:00<00:00, 13888.46it/s]
  0%|          | 0/102 [00:00<?, ?it/s]

LstmClassifier(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (encoder): PytorchSeq2VecWrapper(
    (_module): LSTM(128, 128, batch_first=True)
  )
  (hidden2tag): Linear(in_features=128, out_features=42, bias=True)
  (loss_function): CrossEntropyLoss()
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 1e-05
)


accuracy: 0.0457, loss: 3.6906 ||: 100%|██████████| 102/102 [00:16<00:00,  6.07it/s]
accuracy: 0.0522, loss: 3.6524 ||: 100%|██████████| 26/26 [00:00<00:00, 35.35it/s]
accuracy: 0.0668, loss: 3.5939 ||: 100%|██████████| 102/102 [00:15<00:00,  7.23it/s]
accuracy: 0.0733, loss: 3.5352 ||: 100%|██████████| 26/26 [00:00<00:00, 41.17it/s]
accuracy: 0.1286, loss: 3.2442 ||: 100%|██████████| 102/102 [00:14<00:00,  6.99it/s]
accuracy: 0.1342, loss: 3.3428 ||: 100%|██████████| 26/26 [00:00<00:00, 36.15it/s]
accuracy: 0.2340, loss: 2.8023 ||: 100%|██████████| 102/102 [00:15<00:00,  6.16it/s]
accuracy: 0.1764, loss: 3.2250 ||: 100%|██████████| 26/26 [00:00<00:00, 39.56it/s]
accuracy: 0.3431, loss: 2.3890 ||: 100%|██████████| 102/102 [00:14<00:00,  6.37it/s]
accuracy: 0.2012, loss: 3.3317 ||: 100%|██████████| 26/26 [00:00<00:00, 40.85it/s]
accuracy: 0.4906, loss: 1.8463 ||: 100%|██████████| 102/102 [00:14<00:00,  7.07it/s]
accuracy: 0.2099, loss: 3.5232 ||: 100%|██████████| 26/26 [00:00<00:00, 41.

{'best_epoch': 3, 'peak_cpu_memory_MB': 857.904, 'training_duration': '00:03:25', 'training_start_epoch': 0, 'training_epochs': 12, 'epoch': 12, 'training_accuracy': 0.9221020092735703, 'training_loss': 0.23847985302335492, 'training_cpu_memory_MB': 857.904, 'validation_accuracy': 0.2236024844720497, 'validation_loss': 5.1005976016704855, 'best_validation_accuracy': 0.1763975155279503, 'best_validation_loss': 3.22501749258775}





## French data

In [11]:
# Split the French subset and train/validate a classifier on it.
# `lang` and `DS` are module-level names that other cells reassign as well.
lang = "french"
DS = frenchData
trainLang(DS,lang)

3107it [00:00, 3646.04it/s]
766it [00:00, 3557.22it/s]
100%|██████████| 3873/3873 [00:00<00:00, 20387.45it/s]
  0%|          | 0/98 [00:00<?, ?it/s]

LstmClassifier(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (encoder): PytorchSeq2VecWrapper(
    (_module): LSTM(128, 128, batch_first=True)
  )
  (hidden2tag): Linear(in_features=128, out_features=41, bias=True)
  (loss_function): CrossEntropyLoss()
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 1e-05
)


accuracy: 0.0447, loss: 3.6517 ||: 100%|██████████| 98/98 [00:14<00:00,  6.41it/s]
accuracy: 0.0496, loss: 3.6294 ||: 100%|██████████| 24/24 [00:00<00:00, 37.29it/s]
accuracy: 0.0846, loss: 3.5051 ||: 100%|██████████| 98/98 [00:14<00:00,  7.25it/s]
accuracy: 0.0940, loss: 3.4574 ||: 100%|██████████| 24/24 [00:00<00:00, 37.05it/s]
accuracy: 0.1561, loss: 3.1613 ||: 100%|██████████| 98/98 [00:14<00:00,  6.31it/s]
accuracy: 0.1527, loss: 3.2518 ||: 100%|██████████| 24/24 [00:00<00:00, 38.60it/s]
accuracy: 0.2617, loss: 2.7120 ||: 100%|██████████| 98/98 [00:14<00:00,  6.76it/s]
accuracy: 0.1710, loss: 3.1620 ||: 100%|██████████| 24/24 [00:00<00:00, 37.26it/s]
accuracy: 0.3746, loss: 2.2211 ||: 100%|██████████| 98/98 [00:14<00:00,  6.57it/s]
accuracy: 0.1958, loss: 3.1418 ||: 100%|██████████| 24/24 [00:00<00:00, 41.30it/s]
accuracy: 0.4892, loss: 1.7936 ||: 100%|██████████| 98/98 [00:14<00:00,  6.07it/s]
accuracy: 0.2167, loss: 3.2966 ||: 100%|██████████| 24/24 [00:00<00:00, 39.20it/s]
accu

{'best_epoch': 4, 'peak_cpu_memory_MB': 880.408, 'training_duration': '00:03:30', 'training_start_epoch': 0, 'training_epochs': 13, 'epoch': 13, 'training_accuracy': 0.878982941744448, 'training_loss': 0.29328808142822616, 'training_cpu_memory_MB': 880.408, 'validation_accuracy': 0.20757180156657964, 'validation_loss': 4.639977211753528, 'best_validation_accuracy': 0.195822454308094, 'best_validation_loss': 3.1418435921271644}





## Phase II: Classification for all languages using one model.

#### Dataset Concatenation for all the languages

In [12]:
# Stack the five per-language frames in the same order as before.
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# frame (original row index kept, columns unchanged) on old and new versions.
AllLangs = pd.concat(
    [englishData, estonianData, arabicData, frenchData, spanishData])

## All languages are concatenated and the dataset is split 80/20: 80% for training, and 20% drawn from all languages as the validation set

In [13]:
# Train a single classifier on the concatenation of all languages; the random
# train/validation split is made over the combined frame.
lang = "allLanguages"
DS = AllLangs
trainLang(DS,lang)

12059it [00:02, 4253.97it/s]
3076it [00:00, 5113.82it/s] 
100%|██████████| 15135/15135 [00:00<00:00, 18087.20it/s]
  0%|          | 0/377 [00:00<?, ?it/s]

LstmClassifier(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (encoder): PytorchSeq2VecWrapper(
    (_module): LSTM(128, 128, batch_first=True)
  )
  (hidden2tag): Linear(in_features=128, out_features=45, bias=True)
  (loss_function): CrossEntropyLoss()
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 1e-05
)


accuracy: 0.0595, loss: 3.6332 ||: 100%|██████████| 377/377 [01:09<00:00,  5.68it/s]
accuracy: 0.0809, loss: 3.4967 ||: 100%|██████████| 97/97 [00:02<00:00, 37.69it/s]
accuracy: 0.1140, loss: 3.2527 ||: 100%|██████████| 377/377 [01:13<00:00,  4.53it/s]
accuracy: 0.1359, loss: 3.2834 ||: 100%|██████████| 97/97 [00:02<00:00, 37.70it/s]
accuracy: 0.2103, loss: 2.7668 ||: 100%|██████████| 377/377 [01:11<00:00,  5.22it/s]
accuracy: 0.1869, loss: 3.2044 ||: 100%|██████████| 97/97 [00:02<00:00, 42.40it/s]
accuracy: 0.3435, loss: 2.2126 ||: 100%|██████████| 377/377 [01:22<00:00,  4.94it/s]
accuracy: 0.2178, loss: 3.2420 ||: 100%|██████████| 97/97 [00:02<00:00, 37.92it/s]
accuracy: 0.5004, loss: 1.6371 ||: 100%|██████████| 377/377 [01:10<00:00,  4.95it/s]
accuracy: 0.2263, loss: 3.4114 ||: 100%|██████████| 97/97 [00:02<00:00, 38.70it/s]
accuracy: 0.6354, loss: 1.1587 ||: 100%|██████████| 377/377 [01:10<00:00,  5.08it/s]
accuracy: 0.2341, loss: 3.7145 ||: 100%|██████████| 97/97 [00:02<00:00, 39.

{'best_epoch': 2, 'peak_cpu_memory_MB': 1233.208, 'training_duration': '00:15:04', 'training_start_epoch': 0, 'training_epochs': 11, 'epoch': 11, 'training_accuracy': 0.8825773281366615, 'training_loss': 0.285989127242913, 'training_cpu_memory_MB': 1233.208, 'validation_accuracy': 0.23114434330299088, 'validation_loss': 4.743071145618085, 'best_validation_accuracy': 0.18693107932379713, 'best_validation_loss': 3.204392076767597}





## Validation using a subset of the English dataset, which was excluded before splitting the whole dataframe into train and dev

In [28]:
def trainAllLang(DS,lang,testPath):
    """Train on all of ``DS`` and validate on an existing CSV file.

    Unlike ``trainLang``, no random split is made: every row of ``DS`` is
    preprocessed and written to ``<lang>train.csv``, and the validation set is
    read from ``testPath``.

    Parameters
    ----------
    DS : pandas.DataFrame
        Frame with ``label`` and ``text`` columns. It is NOT modified; the
        cleaned text is kept in a local Series instead of being written back
        into the caller's frame.
    lang : str
        Prefix for the training CSV that is written and then read back.
    testPath : str
        Path of a header-less ``label,text`` CSV used as the validation set.

    Returns
    -------
    tuple
        ``(model, reader, metrics)``. Previously nothing was returned; callers
        that ignore the return value are unaffected.
    """
    clean_text = preprocessing(DS["text"])
    # NOTE(review): the slice [1:100] drops the first word of every text. For
    # rows loaded from a CSV that prepare() wrote, that file already had its
    # first word removed, so a second word is lost here. Kept as-is to preserve
    # existing results; confirm the intent.
    df = DS[["label"]].assign(
        text=clean_text.str.split().str[1:100].str.join(" "))

    df.to_csv(str(lang)+"train.csv",index=False,header=False)

    reader = MultilingualDatasetReader()
    train_dataset = reader.read(str(lang) + 'train.csv')
    dev_dataset = reader.read(testPath)

    # Vocabulary is built over train + validation instances, with a
    # min_count of 3 for the 'tokens' namespace.
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                      min_count={'tokens': 3})
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)
    print(model)
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    print(optimizer)
    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=dev_dataset,
                      patience=10,
                      num_epochs=50)
    metrics = trainer.train()
    print(metrics)
    return model, reader, metrics
    


In [14]:
# English training rows come from a CSV on disk.
# NOTE(review): this file is read with the default header handling, while the
# CSVs written by prepare() have no header row; confirm that
# englishtrainForAll.csv really has a `label,text` header line.
englishTrain = pd.read_csv("englishtrainForAll.csv")
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order and gives the same result on old and new versions.
AllLangs = pd.concat(
    [englishTrain, estonianData, arabicData, frenchData, spanishData])

# Validate on the English test split that prepare() wrote earlier.
testPath = 'englishtest.csv'
trainAllLang(AllLangs,"AllLangs2",testPath)

14215it [00:03, 4047.59it/s]
925it [00:00, 10890.96it/s]
100%|██████████| 15140/15140 [00:00<00:00, 19460.59it/s]
  0%|          | 0/445 [00:00<?, ?it/s]

LstmClassifier(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (encoder): PytorchSeq2VecWrapper(
    (_module): LSTM(128, 128, batch_first=True)
  )
  (hidden2tag): Linear(in_features=128, out_features=45, bias=True)
  (loss_function): CrossEntropyLoss()
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 1e-05
)


accuracy: 0.0607, loss: 3.6050 ||: 100%|██████████| 445/445 [01:22<00:00,  5.27it/s]
accuracy: 0.0314, loss: 3.8596 ||: 100%|██████████| 29/29 [00:00<00:00, 38.11it/s]
accuracy: 0.1213, loss: 3.2738 ||: 100%|██████████| 445/445 [01:21<00:00,  5.35it/s]
accuracy: 0.1514, loss: 3.1513 ||: 100%|██████████| 29/29 [00:00<00:00, 42.64it/s]
accuracy: 0.2103, loss: 2.8005 ||: 100%|██████████| 445/445 [01:25<00:00,  6.05it/s]
accuracy: 0.2400, loss: 2.7640 ||: 100%|██████████| 29/29 [00:00<00:00, 42.72it/s]
accuracy: 0.3037, loss: 2.3737 ||: 100%|██████████| 445/445 [01:37<00:00,  5.06it/s]
accuracy: 0.3297, loss: 2.4199 ||: 100%|██████████| 29/29 [00:00<00:00, 30.74it/s]
accuracy: 0.4096, loss: 1.9470 ||: 100%|██████████| 445/445 [01:30<00:00,  5.56it/s]
accuracy: 0.4573, loss: 2.1025 ||: 100%|██████████| 29/29 [00:00<00:00, 40.76it/s]
accuracy: 0.5197, loss: 1.5586 ||: 100%|██████████| 445/445 [01:32<00:00,  4.49it/s]
accuracy: 0.5341, loss: 1.8408 ||: 100%|██████████| 29/29 [00:00<00:00, 36.

{'best_epoch': 29, 'peak_cpu_memory_MB': 1461.52, 'training_duration': '01:03:51', 'training_start_epoch': 0, 'training_epochs': 38, 'epoch': 38, 'training_accuracy': 0.8846289131199437, 'training_loss': 0.22364689901973425, 'training_cpu_memory_MB': 1461.52, 'validation_accuracy': 0.7654054054054054, 'validation_loss': 1.1931363056445945, 'best_validation_accuracy': 0.7610810810810811, 'best_validation_loss': 1.1701805684073219}





## Phase III: Round Trip

#### Read the dataset that has the Arabic text translated to English

In [16]:
# Round-trip dataset; the columns used below are `label` and `NewEnglish`.
RT = pd.read_csv("mydatasetWitharabicEnglishText.csv")
RT.head()

Unnamed: 0,label,text,Arabic,NewEnglish
0,advertising,asks sec to mull halfyear corporate filings vs...,يسأل ثانية للنظر في طلبات الشركات نصف السنة مق...,Asked again to look at half-year corporate req...
1,advertising,st up on trade hopes sp equals longest bull ru...,التباطؤ في الآمال التجارية: sp تساوي أطول فترة...,Deceleration in business hopes: sp equals the ...
2,advertising,shares hit oneyear low on turkey china worries...,الأسهم الصينية تتراجع إلى أدنى مستوياتها مع ال...,Chinese stocks fall to their lowest levels wit...
3,advertising,stocks weaken as turkey worries weigh dollar s...,الأسهم الأسيوية تضعف مع قلق تركيا من ارتفاع ال...,Asian stocks weaken as Turkey worries about do...
4,advertising,asian shares edge up after wall st gains but c...,الأسهم الأسيوية ترتفع بعد المكاسب التي حققتها ...,"Asian stocks rise after Wall Street gains, but..."


#### Read the first part from the CSV to have the same training and test split as the previous classifier, for fair comparison

In [27]:
# englishtrain.csv is written by prepare() without a header row, hence
# header=None and the explicit column names.
# NOTE(review): the file holds whichever split the most recent
# prepare(..., "english") call produced; confirm it is the intended one.
Part1 = pd.read_csv("englishtrain.csv",header=None)
Part1.columns = ["label","text"]
Part1.head()

Unnamed: 0,label,text
0,advertising,asks sec to mull halfyear corporate filings vs...
1,advertising,stocks weaken as turkey worries weigh dollar s...
2,advertising,asian shares edge up after wall st gains but c...
3,advertising,bid for tesla no formal offer no firm deals wi...
4,advertising,st rallies on solid earnings uschina trade tal...


#### Read the second part of the dataset from the translated dataset

In [21]:
# Build a two-column (label, text) frame whose text is the `NewEnglish` column
# of RT.
# NOTE(review): every row of RT is used; if RT also contains rows whose
# original text ended up in englishtest.csv, their translations leak into the
# training set. Confirm.
Part2 = RT["label"]
Part2 = pd.DataFrame(Part2)
Part2["text"] = RT["NewEnglish"]
Part2.head()

Unnamed: 0,label,text
0,advertising,Asked again to look at half-year corporate req...
1,advertising,Deceleration in business hopes: sp equals the ...
2,advertising,Chinese stocks fall to their lowest levels wit...
3,advertising,Asian stocks weaken as Turkey worries about do...
4,advertising,"Asian stocks rise after Wall Street gains, but..."


#### Concatenate the two parts to obtain the original training part of the English dataset along with the Arabic dataset that has been translated.

In [29]:
# Stack the English training rows and the translated rows.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# on old and new versions.
RT_Translated = pd.concat([Part1, Part2])

#### The path to the test dataset is provided to the model, in order to have the same validation dataset as the previous model to have a fair comparison. So the validation dataset is totally separated from the training dataset

In [30]:
# Train on the combined frame and validate on the English test split on disk;
# this writes RT_Translatedtrain.csv.
testPath = 'englishtest.csv'
trainAllLang(RT_Translated,"RT_Translated",testPath)

4593it [00:01, 3790.21it/s]
925it [00:00, 5359.65it/s]
100%|██████████| 5518/5518 [00:00<00:00, 13031.99it/s]
  0%|          | 0/144 [00:00<?, ?it/s]

LstmClassifier(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (encoder): PytorchSeq2VecWrapper(
    (_module): LSTM(128, 128, batch_first=True)
  )
  (hidden2tag): Linear(in_features=128, out_features=45, bias=True)
  (loss_function): CrossEntropyLoss()
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 1e-05
)


accuracy: 0.0583, loss: 3.7051 ||: 100%|██████████| 144/144 [00:32<00:00,  4.77it/s]
accuracy: 0.0692, loss: 3.7097 ||: 100%|██████████| 29/29 [00:01<00:00, 20.89it/s]
accuracy: 0.1034, loss: 3.3974 ||: 100%|██████████| 144/144 [00:30<00:00,  5.27it/s]
accuracy: 0.0778, loss: 3.6678 ||: 100%|██████████| 29/29 [00:01<00:00, 23.35it/s]
accuracy: 0.1380, loss: 3.0907 ||: 100%|██████████| 144/144 [00:30<00:00,  4.45it/s]
accuracy: 0.1243, loss: 3.4553 ||: 100%|██████████| 29/29 [00:01<00:00, 22.91it/s]
accuracy: 0.2197, loss: 2.7675 ||: 100%|██████████| 144/144 [00:30<00:00,  4.46it/s]
accuracy: 0.1286, loss: 3.3195 ||: 100%|██████████| 29/29 [00:01<00:00, 19.64it/s]
accuracy: 0.3055, loss: 2.4183 ||: 100%|██████████| 144/144 [00:30<00:00,  4.79it/s]
accuracy: 0.1665, loss: 3.2709 ||: 100%|██████████| 29/29 [00:01<00:00, 22.87it/s]
accuracy: 0.4156, loss: 2.0200 ||: 100%|██████████| 144/144 [00:29<00:00,  4.76it/s]
accuracy: 0.2151, loss: 3.2400 ||: 100%|██████████| 29/29 [00:01<00:00, 23.

{'best_epoch': 5, 'peak_cpu_memory_MB': 1461.52, 'training_duration': '00:08:37', 'training_start_epoch': 0, 'training_epochs': 14, 'epoch': 14, 'training_accuracy': 0.8689309819290224, 'training_loss': 0.38841870703941417, 'training_cpu_memory_MB': 1461.52, 'validation_accuracy': 0.24864864864864866, 'validation_loss': 4.254193289526578, 'best_validation_accuracy': 0.21513513513513513, 'best_validation_loss': 3.240029030832751}



