In [None]:
import pandas as pd

from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn import TransformerDecoder, TransformerDecoderLayer
import torch.nn.functional as F

import torch
import torch.nn as nn
import torch.optim as optim


import numpy as np


import math
import random

import os
import re
from tqdm import tqdm


from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset, Sampler



import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

import time

In [None]:
# Notebook-wide configuration.
model_name_ar = 'moha/arabert_c19'  # checkpoint id used for the Arabic tokenizer and classifier
model_name_en = 'bert-base-uncased'  # checkpoint id used for the English tokenizer and classifier

batch_size = 32  # mini-batch size passed to the samplers and DataLoaders below
n_epochs = 3  # fine-tuning epochs per cross-validation fold

seed = 99  # read by set_seed() and by the K-fold splitters; important for reproducing the results

In [None]:
# One tokenizer per language model; both are asked to lower-case their input.
tokenizer_ar = AutoTokenizer.from_pretrained(model_name_ar, do_lower_case=True)
tokenizer_en = AutoTokenizer.from_pretrained(model_name_en, do_lower_case=True)

In [None]:
def set_seed(seed_value=None):
    """Seed the Python, NumPy and PyTorch random generators.

    Args:
        seed_value: Seed to apply. When omitted, the notebook-level ``seed``
            constant is used, so existing ``set_seed()`` calls behave as before.
    """
    if seed_value is None:
        seed_value = seed

    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    # PYTHONHASHSEED is read at interpreter start-up, so this only influences
    # processes launched after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
# Seed all RNGs once up front, before any data shuffling or model creation.
set_seed()

In [None]:
# Load the transliteration pairs from the first sheet of the workbook.
xls = pd.ExcelFile("data/transliteration/dataset.xlsx")
dataset = pd.read_excel(xls, "Sheet1")

# Rows flagged `from_source` are kept aside (full columns) to build the
# known-word dictionary in the next cell.
known = dataset[dataset.from_source == True]
dataset = dataset[["arabizi", "arabic", "from_source"]]

# Column names used by the rest of the transliteration pipeline.
dataset.columns = ["Arabize", "Arabic", "from_source"]

# TODO: drop duplicate Arabic entries (left as an unfinished stub).

In [None]:
# Map each known Arabizi word to its Arabic spelling so that split() can
# substitute it directly instead of running it through the model.
# This saves computation time and improves transliteration accuracy.
known = known[["arabizi", "arabic"]].set_index("arabizi", drop=True).arabic.to_dict()
known_idx = list(known.keys())  # list of the known Arabizi words (dictionary keys)

In [None]:
# Longest source / target lengths, used by tokenize() to pad every sequence.
# NOTE: in_max is measured before preprocess() runs on the Arabize column.
in_max = dataset.apply(lambda x: len(str(x.Arabize)), axis=1).max()
out_max = dataset.apply(lambda x: len(x.Arabic), axis=1).max() + 2  #Take into account eos and sos

# Reserved ids shared by the vocabularies, the collators and the loss.
pad_token = 0
eos_token = 2
sos_token = 1

device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Single-character normalisation applied to Arabizi input: look-alike symbols
# are mapped to plain ASCII letters and newlines are dropped.
_ARABIZI_CHAR_MAP = str.maketrans({
    "$": "s",
    "å": "a",
    "é": "e",
    "ê": "e",
    "ÿ": "y",
    "ą": "a",
    "ī": "i",
    "\n": None,
    "′": "'",
})


def preprocess(a):
    """Return a copy of a dataset row with its ``Arabize`` field normalised.

    Args:
        a: Row (pandas Series) with ``Arabize`` and ``Arabic`` entries.

    Returns:
        A copy of ``a`` whose ``Arabize`` value is converted to ``str`` and
        passed through ``_ARABIZI_CHAR_MAP``. ``Arabic`` is left untouched and
        the input row is not modified.
    """
    x = a.copy()
    x.Arabize = str(x.Arabize).translate(_ARABIZI_CHAR_MAP)
    return x

In [None]:
# Normalise the Arabizi column row by row (Arabic is passed through unchanged).
dataset[["Arabize","Arabic"]] = dataset[["Arabize","Arabic"]].apply(preprocess, axis=1)

In [None]:
# Source vocabulary: every lower-cased character seen in the Arabizi column
# (plus the space used as joiner), numbered from 1 so that 0 stays free for padding.
in_tokens = set(" ".join(dataset.Arabize.values.tolist()).lower())
in_token_to_int = {token: (i+1) for i,token in enumerate(sorted(in_tokens))}

# Register the padding entry as token -> id, like every other entry. The
# previous `in_token_to_int[0] = "<pad>"` stored it the wrong way round.
# The dictionary size (and therefore the embedding size) is unchanged.
in_token_to_int["<pad>"] = pad_token

# Target vocabulary: characters of the Arabic column, numbered from 3 because
# ids 0-2 are reserved for <pad>, <sos> and <eos>.
out_tokens = set(" ".join(dataset.Arabic.values.tolist()))
out_token_to_int = {token: (i+3) for i,token in enumerate(sorted(out_tokens))}



out_token_to_int["<pad>"] = pad_token

out_token_to_int["<sos>"] = sos_token
out_token_to_int["<eos>"] = eos_token

In [None]:
def tokenize(a):
    """Turn one (Arabize, Arabic) row into fixed-length lists of token ids.

    The source is lower-cased and mapped character by character; the target is
    wrapped in ``sos_token`` / ``eos_token``. Both are right-padded with
    ``pad_token`` up to ``in_max`` / ``out_max``. A copy of the row is returned.
    """
    row = a.copy()

    source_ids = [in_token_to_int[ch] for ch in row.Arabize.lower()]
    row.Arabize = source_ids

    target_ids = [sos_token]
    target_ids.extend(out_token_to_int[ch] for ch in row.Arabic)
    target_ids.append(eos_token)

    # A non-positive repeat count yields an empty list, so long rows are left unpadded.
    row.Arabize = source_ids + (in_max - len(source_ids)) * [pad_token]
    row.Arabic = target_ids + (out_max - len(target_ids)) * [pad_token]

    return row

In [None]:
# Replace both text columns by padded lists of token ids.
dataset[["Arabize","Arabic"]] = dataset[["Arabize","Arabic"]].apply(tokenize, axis=1)

# Hold out 10% of the pairs for validation. The split is seeded explicitly so
# that it does not depend on how much of the global NumPy random stream earlier
# cells happened to consume.
validation = dataset.sample(frac=0.1, random_state=seed)
train = dataset.drop(validation.index)

X_train = train.Arabize
y_train = train.Arabic

X_valid = validation.Arabize
y_valid = validation.Arabic

In [None]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding with a learnable scale, followed by dropout.

    Args:
        d_model: Embedding width.
        dropout: Dropout probability applied to the summed output.
        max_len: Number of positions precomputed in the ``pe`` buffer.
    """

    def __init__(self, d_model, dropout=0.1, max_len=9000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.scale = nn.Parameter(torch.ones(1))

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        inv_freq = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)  # even channels
        table[:, 1::2] = torch.cos(angles)  # odd channels

        # Stored as (max_len, 1, d_model) so it broadcasts over the batch axis.
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        """Add the scaled encoding for the first ``x.size(0)`` positions to ``x``."""
        seq_len = x.size(0)
        encoded = x + self.scale * self.pe[:seq_len, :]
        return self.dropout(encoded)

In [None]:

class TransformerModel(nn.Module):
    """Character-level encoder-decoder Transformer.

    Args:
        intoken: Size of the source vocabulary.
        outtoken: Size of the target vocabulary.
        hidden: Embedding / model width; the feed-forward width is ``4 * hidden``.
        enc_layers: Number of encoder layers.
        dec_layers: Number of decoder layers.
        dropout: Dropout used in the positional encodings and Transformer layers.
        nheads: Number of attention heads.
    """

    def __init__(self, intoken, outtoken ,hidden, enc_layers=1, dec_layers=1, dropout=0.15, nheads=4):
        super(TransformerModel, self).__init__()

        ff_model = hidden*4

        self.encoder = nn.Embedding(intoken, hidden)
        self.pos_encoder = PositionalEncoding(hidden, dropout)

        self.decoder = nn.Embedding(outtoken, hidden)
        self.pos_decoder = PositionalEncoding(hidden, dropout)

        encoder_layer = TransformerEncoderLayer(d_model=hidden, nhead = nheads, dim_feedforward = ff_model, dropout=dropout, activation='relu')
        self.transformer_encoder = TransformerEncoder(encoder_layer, enc_layers)

        decoder_layer = TransformerDecoderLayer(hidden, nheads, ff_model, dropout, activation='relu')
        self.transformer_decoder = TransformerDecoder(decoder_layer, dec_layers)

        self.fc_out = nn.Linear(hidden, outtoken)

        self.src_mask = None
        self.trg_mask = None  # causal mask cached between forward() calls
        self.memory_mask = None

    def generate_square_subsequent_mask(self, sz, sz1=None):
        """Return a ``(sz, sz1 or sz)`` float mask: ``-inf`` above the diagonal, 0 elsewhere."""
        cols = sz if sz1 is None else sz1
        mask = torch.triu(torch.ones(sz, cols), 1)
        return mask.masked_fill(mask==1, float('-inf'))

    def make_len_mask_enc(self, inp):
        """Padding mask for a ``(seq_len, batch)`` source; True where ``inp`` is ``pad_token``."""
        return (inp == pad_token).transpose(0, 1)   #(batch_size, input_seq_len)

    def make_len_mask_dec(self, inp):
        """Padding mask for a ``(seq_len, batch)`` target; True where ``inp`` is ``pad_token``."""
        return (inp == pad_token).transpose(0, 1) #(batch_size, output_seq_len)

    def forward(self, src, trg): #SRC: (seq_len, batch_size)
        """Return target-vocabulary logits of shape ``(trg_len, batch, outtoken)``.

        Args:
            src: Source token ids, ``(src_len, batch)``.
            trg: Target token ids fed to the decoder, ``(trg_len, batch)``.
        """
        # Rebuild the cached causal mask when the target length or the device changes.
        if (self.trg_mask is None
                or self.trg_mask.size(0) != len(trg)
                or self.trg_mask.device != trg.device):
            self.trg_mask = self.generate_square_subsequent_mask(len(trg)).to(trg.device)

        # Padding masks must be built from the raw ids, before embedding.
        src_pad_mask = self.make_len_mask_enc(src)
        trg_pad_mask = self.make_len_mask_dec(trg)

        # Encoder input: token embedding + positional encoding -> (seq_len, batch_size, d_model)
        src = self.encoder(src)
        src = self.pos_encoder(src)

        # Decoder input: token embedding + positional encoding -> (seq_len, batch_size, d_model)
        trg = self.decoder(trg)
        trg = self.pos_decoder(trg)

        memory = self.transformer_encoder(src, None, src_pad_mask)
        output = self.transformer_decoder(tgt = trg, memory = memory, tgt_mask = self.trg_mask, memory_mask = None,
                                          tgt_key_padding_mask = trg_pad_mask, memory_key_padding_mask = src_pad_mask)

        output = self.fc_out(output)

        return output

In [None]:
# Number of entries in the source vocabulary (used as the input embedding size below).
len(in_token_to_int)

In [None]:
# Number of entries in the target vocabulary (used as the output layer size below).
len(out_token_to_int)

In [None]:
# Re-seed so that weight initialisation does not depend on earlier RNG use.
set_seed()
# hidden=128 must match the model_size given to NoamOpt further down.
model = TransformerModel(len(in_token_to_int), len(out_token_to_int), 128).to(device)

In [None]:
class NoamOpt:
    """Optimizer wrapper applying the warm-up / inverse-square-root schedule.

    Args:
        model_size: Model width used to scale the learning rate.
        factor: Global multiplier of the schedule.
        warmup: Number of steps over which the rate grows linearly.
        optimizer: Wrapped optimizer exposing ``param_groups`` and ``step()``.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        """Advance the step counter, set the new rate on every group, then step."""
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step = None):
        """Return the learning rate for ``step`` (defaults to the current step).

        ``factor * model_size**-0.5 * min(step**-0.5, step * warmup**-1.5)``.
        Step 0 (or below) returns 0.0, the value the warm-up branch gives there,
        instead of raising ``ZeroDivisionError`` on ``0 ** -0.5``.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))

In [None]:
class Arab2ArabizDS(Dataset):
    """Dataset of tokenised (source, target) transliteration pairs.

    Args:
        data: pandas Series of source token-id lists.
        label: pandas Series of target token-id lists.

    Each item is ``(source_ids, target_ids, len(source_ids), len(target_ids))``.
    """

    def __init__(self, data, label):
        self.data = data.values.tolist()
        self.labels = label.values.tolist()

        # Lengths are taken from the Series elements, one per row.
        self.lengths_source = list(map(len, data))
        self.lengths_label = list(map(len, label))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source, target = self.data[idx], self.labels[idx]
        return (source, target, self.lengths_source[idx], self.lengths_label[idx])

In [None]:
def data_collator_Arab2Arabiz(data):
    """Collate ``Arab2ArabizDS`` items into two padded ``long`` tensors.

    Args:
        data: Sequence of ``(source_ids, target_ids, source_len, target_len)`` items.

    Returns:
        ``(sources, targets)`` of shapes ``(batch, max source_len)`` and
        ``(batch, max target_len)``, right-padded with ``pad_token``.
    """
    word, label, length_source, length_label = zip(*data)

    out_word = torch.full((len(word), max(length_source)), dtype=torch.long, fill_value=pad_token)
    label_word = torch.full((len(word), max(length_label)), dtype=torch.long, fill_value=pad_token)

    for i, (source_ids, target_ids) in enumerate(zip(word, label)):
        # Build integer tensors directly; torch.Tensor(...) would go through float32.
        out_word[i, :len(source_ids)] = torch.tensor(source_ids, dtype=torch.long)
        label_word[i, :len(target_ids)] = torch.tensor(target_ids, dtype=torch.long)

    return (out_word, label_word)

In [None]:
class KSampler(Sampler):
    """Sampler that groups items of similar length into the same mini-batch.

    Indices are shuffled, cut into chunks of ``100 * batch_size``, sorted by
    length inside each chunk and split into buckets of ``batch_size``. The
    buckets are shuffled and yielded as one flat index stream.

    Args:
        data_source: Iterable of items whose element ``[1]`` is a sequence
            (its length is used) or already a numeric length.
        batch_size: Bucket size; should equal the DataLoader's ``batch_size``.

    NOTE(review): the stream is flat, so the DataLoader re-chunks it. A short
    bucket (end of a chunk) that lands mid-stream shifts the later batches off
    the bucket boundaries.
    """

    def __init__(self, data_source, batch_size):
        # Keep lengths only. Storing the sequences themselves made sorted()
        # compare their contents lexicographically instead of their sizes.
        self.lens = [len(x[1]) if hasattr(x[1], "__len__") else x[1] for x in data_source]
        self.batch_size = batch_size

    def __iter__(self):
        arr = list(zip(self.lens, range(len(self.lens))))

        random.shuffle(arr)
        n = self.batch_size*100

        buckets = []
        for i in range(0, len(arr), n):
            chunk = sorted(arr[i:i+n], key=lambda pair: pair[0])
            for j in range(0, len(chunk), self.batch_size):
                buckets.append([idx for _, idx in chunk[j:j+self.batch_size]])

        random.shuffle(buckets)
        return iter([item for bucket in buckets for item in bucket])  #Flatten nested list

    def __len__(self):
        return len(self.lens)


In [None]:
def seed_worker(worker_id):
    """Seed NumPy and ``random`` from the current PyTorch seed.

    Meant to be passed as ``worker_init_fn`` to a DataLoader.

    Args:
        worker_id: Worker index supplied by the DataLoader (unused).
    """
    worker_seed = torch.initial_seed() % 2**32
    # NumPy is imported as `np` in this notebook; `numpy.random` raised NameError.
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [None]:
batch_size = 32  # same value as in the configuration cell; restated here

# Training loader: KSampler orders the indices, the collator pads each batch.
train_data = Arab2ArabizDS(X_train, y_train)
train_sampler = KSampler(train_data, batch_size)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, worker_init_fn=seed_worker, collate_fn=data_collator_Arab2Arabiz)

# Validation loader, built the same way on the held-out pairs.
valid_data = Arab2ArabizDS(X_valid, y_valid)
valid_sampler = KSampler(valid_data, batch_size)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size,worker_init_fn=seed_worker, collate_fn=data_collator_Arab2Arabiz)

In [None]:
# Padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_token)
# NoamOpt(model_size=128, factor=1, warmup=4000); it overwrites Adam's lr on
# every step, so lr=0 is only a placeholder. 128 is the hidden size of `model`.
optimizer = NoamOpt(128, 1, 4000 ,optim.Adam(model.parameters(), lr=0))

In [None]:
def run_epoch(iterator):
    """Train the global ``model`` for one pass over ``iterator``.

    Args:
        iterator: DataLoader yielding ``(src, trg)`` batches shaped ``(batch, seq_len)``.

    Returns:
        Mean training loss per batch.
    """
    # Make sure dropout is active even if the model was left in eval mode
    # (e.g. after reloading the best checkpoint and re-running this cell).
    model.train()

    total_loss = 0

    for src, trg in iterator:

        # The model expects (seq_len, batch).
        src = src.T.to(device)
        trg = trg.T.to(device)

        optimizer.optimizer.zero_grad()

        # Teacher forcing: feed trg[:-1], predict trg[1:].
        output = model(src, trg[:-1, :])
        output = output.reshape(-1, output.shape[2])

        loss = criterion(output, trg[1:].reshape(-1))
        total_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

    return total_loss / len(iterator)

In [None]:
def run_validation(iterator):
    """Compute the mean teacher-forced loss of the global ``model`` on ``iterator``.

    Runs in eval mode (dropout off) and without gradient tracking, then puts
    the model back into the mode it was in before the call.

    Args:
        iterator: DataLoader yielding ``(src, trg)`` batches shaped ``(batch, seq_len)``.

    Returns:
        Mean validation loss per batch.
    """
    was_training = model.training
    model.eval()

    total_loss = 0

    try:
        with torch.no_grad():
            for src, trg in iterator:

                # The model expects (seq_len, batch).
                src = src.T.to(device)
                trg = trg.T.to(device)

                output = model(src, trg[:-1, :])
                output = output.reshape(-1, output.shape[2])

                loss = criterion(output, trg[1:].reshape(-1))
                total_loss += loss.item()
    finally:
        # Restore the previous mode so a following training epoch is unaffected.
        model.train(was_training)

    return total_loss / len(iterator)

In [None]:
set_seed()


# Start from +inf so the first epoch always writes a checkpoint; with the old
# sentinel of 99 a run whose loss never dropped below 99 would save nothing and
# the torch.load("convert_best") in the next cell would fail.
min_loss = float("inf")
# 100 epochs; keep the checkpoint with the lowest validation loss.
for i in range(100):

    loss = run_epoch(train_dataloader)
    loss_val = run_validation(valid_dataloader)

    if loss_val < min_loss:
        min_loss = loss_val
        torch.save(model, "convert_best")  # pickles the whole module, not just the weights

    print("EPOCH %d -- %f -- Val Loss: %f" % (i, loss, loss_val))

In [None]:
# Reload the best checkpoint (a fully pickled module) and switch it to eval mode.
# NOTE(review): unpickling needs the TransformerModel class defined in this kernel;
# recent PyTorch releases may also require weights_only=False here -- confirm.
model = torch.load("convert_best").eval()

In [None]:
# Lowest validation loss reached during training.
min_loss

In [None]:
# Inverse target vocabulary (id -> character / special token), used when decoding.
out_int_to_token = {out_token_to_int[t]:t for t in out_token_to_int}

In [None]:
def arabizi_2_arabic(inp):
    """Greedily transliterate one Arabizi string with the global ``model``.

    Args:
        inp: Arabizi text. It is lower-cased and every character must be a key
            of ``in_token_to_int`` (``KeyError`` otherwise).

    Returns:
        The decoded Arabic string, without the start/end markers. Decoding
        stops at ``eos_token`` or after 50 generated characters.
    """
    if not inp:
        return ""

    source_ids = [in_token_to_int[ch] for ch in inp.lower()]
    source = torch.tensor(source_ids, dtype=torch.long, device=device).unsqueeze(-1)  # (src_len, 1)

    preds = [sos_token]

    with torch.no_grad():
        # The source does not change while decoding, so encode it once.
        memory = model.transformer_encoder(model.pos_encoder(model.encoder(source)))

        while len(preds) <= 50:
            target = torch.tensor(preds, dtype=torch.long, device=device).unsqueeze(-1)  # (trg_len, 1)

            # The target goes through pos_decoder, as in TransformerModel.forward;
            # pos_encoder has its own learned scale and is for the source only.
            trg = model.pos_decoder(model.decoder(target))
            # Same causal mask as during training.
            trg_mask = model.generate_square_subsequent_mask(len(preds)).to(device)

            output = model.transformer_decoder(tgt = trg, memory = memory, tgt_mask = trg_mask)
            output = model.fc_out(output)
            new_char = output.argmax(-1)[-1, 0].item()

            if new_char == eos_token:
                break
            preds.append(new_char)

    # preds holds <sos> followed by generated characters only (<eos> is never
    # appended), so nothing is lost when the length cap ends the loop.
    return "".join([out_int_to_token[i] for i in preds[1:]])

In [None]:
# Sentiment training set; keep the text and label columns and rename them.
# NOTE: `train` replaces the transliteration split DataFrame of the same name.
train = pd.read_csv("../input/zindidd/Train.csv")[["textt", "labell"]]
train.columns = ["texts", "data_labels"]

data = train  # alias of the same DataFrame, not a copy

In [None]:
def preprocess(text):    #Might use the same setting if they work to other languages (english and french)
    """Normalise a raw text before transliteration.

    NOTE: this redefines the row-based ``preprocess`` used earlier for the
    transliteration dataset.

    Steps, in order: map a few accented/special characters to ASCII, delete
    every "1", lower-case, drop ``@name`` mentions and decode ``&amp;``, delete
    every character outside ``[A-Za-z0-9 ,!?.]``, collapse whitespace, shorten
    repeated characters and remove stand-alone numbers.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lower-cased string.
    """
    text = text.replace('ß',"b")
    text = text.replace('à',"a")
    text = text.replace('á',"a")
    text = text.replace('ç',"c")
    text = text.replace('è',"e")
    text = text.replace('é',"e")
    text = text.replace('$',"s")
    text = text.replace("1","")

    text = text.lower()

    # These two steps must run BEFORE the character filter: the filter deletes
    # '@', '&' and ';', which previously left them with nothing to match.
    # Remove '@name' (a mention followed by whitespace)
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Delete everything except letters, digits, spaces and , ! ? .
    text = re.sub(r'[^A-Za-z0-9 ,!?.]', '', text)

    # Collapse whitespace runs and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()

    text = re.sub(r'([h][h][h][h])\1+', r'\1', text)  # long runs of 'h' -> 'hhhh'
    text = re.sub(r'([a-g-i-z])\1+', r'\1', text)  #Remove repeating characters
    text = re.sub(r' [0-9]+ ', " ", text)  # stand-alone numbers inside the text
    text = re.sub(r'^[0-9]+ ', "", text)  # leading stand-alone number

    return text

In [None]:
#Keep numbers block
def split(text):
    """Tokenise ``text`` and set aside the tokens that bypass the model.

    Tokens are runs of word characters / apostrophes, or one of ``? ! . ,``.
    Known words (keys of ``known``) and punctuation are removed from the token
    list and returned separately with their original positions.

    Args:
        text: Text to split.

    Returns:
        ``(splits, to_be_added, idx_to_be_added)``: tokens left for the model;
        the replacement text of each set-aside token (the ``known`` value for a
        known word, the token itself for punctuation); and the index each
        set-aside token had in the full token list.
    """
    tokens = re.findall(r"[\w']+|[?!.,]", text)

    punctuation = {"?", "!", ".", ","}

    splits = []
    to_be_added = []
    idx_to_be_added = []

    for i, token in enumerate(tokens):
        # Dictionary / set membership instead of scanning a list per token.
        if token in known:
            to_be_added.append(known[token])
            idx_to_be_added.append(i)
        elif token in punctuation:
            to_be_added.append(token)
            idx_to_be_added.append(i)
        else:
            splits.append(token)

    return splits, to_be_added, idx_to_be_added

In [None]:
problematic = []  # texts whose decoding hit the 50-step cap


def convert_phrase_2(text):
    """Transliterate a whole phrase, decoding all its words as one batch.

    "0" and "6" are deleted from ``text``. It is then lower-cased and split
    with ``split()``: known words and punctuation are put back verbatim
    ("?" and "," become their Arabic forms), every other word is decoded
    greedily by the global ``model``.

    Args:
        text: Phrase to convert. Every character of a decoded word must be a
            key of ``in_token_to_int`` (``KeyError`` otherwise).

    Returns:
        The converted phrase, tokens joined by single spaces.

    Side effects:
        Appends ``text`` to ``problematic`` when decoding is cut at 50 steps.
    """
    text = text.replace("0","")
    text = text.replace("6","")

    phrase, to_be_added, idx_to_be_added = split(text.lower())

    result = []

    # Nothing to decode when every token was a known word or punctuation
    # (max() over an empty phrase used to raise ValueError here).
    if phrase:
        max_len_phrase = max(len(word) for word in phrase)

        input_sentence = [
            [in_token_to_int[ch] for ch in word] + [pad_token]*(max_len_phrase-len(word))
            for word in phrase
        ]
        input_sentence = torch.tensor(input_sentence, dtype=torch.long).T.to(device)  # (src_len, n_words)

        preds = [[sos_token] * len(phrase)]  # one row per decoding step
        end_word = np.zeros(len(phrase), dtype=bool)
        src_pad_mask = model.make_len_mask_enc(input_sentence)

        with torch.no_grad():
            # The source is fixed while decoding, so encode it once.
            src = model.pos_encoder(model.encoder(input_sentence))
            memory = model.transformer_encoder(src, None ,src_pad_mask)

            while not end_word.all():
                output_sentence = torch.tensor(preds, dtype=torch.long).to(device)  # (trg_len, n_words)

                # The target goes through pos_decoder, as in TransformerModel.forward;
                # pos_encoder has its own learned scale and is for the source only.
                trg = model.pos_decoder(model.decoder(output_sentence))
                # Same causal mask as during training.
                trg_mask = model.generate_square_subsequent_mask(len(preds)).to(device)

                output = model.transformer_decoder(tgt = trg, memory = memory, tgt_mask = trg_mask,
                                                   memory_key_padding_mask = src_pad_mask)
                output = model.fc_out(output)

                output = output.argmax(-1)[-1].cpu().numpy()
                preds.append(output.tolist())

                # A word stays finished once it has produced <eos>.
                end_word = (output == eos_token) | end_word

                if len(preds) > 50:
                    problematic.append(text)
                    break

        # (steps, n_words) -> (n_words, steps); cut each word at its first <eos>.
        for word in np.array(preds).T:
            tmp = []
            for i in word[1:]:
                if out_int_to_token[i] == "<eos>":
                    break
                tmp.append(out_int_to_token[i])
            result.append("".join(tmp))

    #Re-add removed punctuation
    # Indices are ascending positions in the full token list, so inserting in
    # order puts every set-aside token back in its original place.
    for item, idx in zip(to_be_added, idx_to_be_added):

        if item == "?":
            item = "؟"
        elif item == ",":
            item = "،"

        result.insert(idx, item)

    result = " ".join(result)

    return result

In [None]:
# Clean the training texts with the text-based preprocess() defined above.
train.texts = train.texts.apply(preprocess)


In [None]:
results = []
step_size = 100  # number of texts converted per model call

texts = train.texts.values.tolist()

# Texts are joined with the sentinel word "lkrb3" and converted as one phrase;
# the output is then cut at the sentinel's own transliteration. That
# transliteration does not change between chunks, so compute it once.
separator = " " + convert_phrase_2("lkrb3") + " "

for i in tqdm(range(0, len(texts), step_size)):

    chunk = texts[i:i+step_size]

    out = convert_phrase_2(" lkrb3 ".join(chunk))
    splitted_sentences = [ex.strip() for ex in out.split(separator)]

    # A count mismatch means the sentinel could not be recovered; stop, because
    # outputs could no longer be matched to their inputs.
    if len(splitted_sentences) != len(chunk):
        print("DANGER")
        break

    results.extend(splitted_sentences)

In [None]:
# Attach the transliterations and persist them; the Arabic classifier cell reads this file back.
# NOTE: the assignment needs one result per row, which fails if the loop above stopped early.
train["converted"] = results.copy()
train.to_csv("train_data.csv")

In [None]:
# Test set; its text column is cleaned in place with the text-based preprocess().
test = pd.read_csv("../input/zindidd/Test.csv")
test.textt = test.textt.apply(preprocess)

In [None]:
results = []
step_size = 50  # number of texts converted per model call

texts = test.textt.values.tolist()

# Texts are joined with the sentinel word "lkrb3" and converted as one phrase;
# the output is then cut at the sentinel's own transliteration. That
# transliteration does not change between chunks, so compute it once.
separator = " " + convert_phrase_2("lkrb3") + " "

for i in tqdm(range(0, len(texts), step_size)):

    chunk = texts[i:i+step_size]

    out = convert_phrase_2(" lkrb3 ".join(chunk))
    splitted_sentences = [ex.strip() for ex in out.split(separator)]

    # A count mismatch means the sentinel could not be recovered; stop, because
    # outputs could no longer be matched to their inputs.
    if len(splitted_sentences) != len(chunk):
        print("DANGER")
        break

    results.extend(splitted_sentences)

In [None]:
# Attach the transliterations and persist them; the inference cell reads this file back.
# NOTE: the assignment needs one result per row, which fails if the loop above stopped early.
test["converted"] = results
test.to_csv("test_data.csv")

In [None]:
def preprocessing_for_bert(data, tokenizer, preprocess_text, max_len=256):
    """Encode sentences into token-id lists plus matching all-ones masks.

    Args:
        data: Iterable of raw sentences.
        tokenizer: Object with an ``encode(text)`` method returning a list of ids.
        preprocess_text: Callable applied to each sentence before encoding.
        max_len: Maximum encoded length. Longer encodings are cut to
            ``max_len - 1`` ids and closed with the final id of
            ``tokenizer.encode("ab")``.

    Returns:
        ``(input_ids, attention_masks)``, two lists of lists (no padding).
    """
    # Final id the tokenizer emits for a short probe string; reused to close
    # truncated sequences.
    closing_id = tokenizer.encode("ab")[-1]

    def encode_one(sentence):
        token_ids = tokenizer.encode(preprocess_text(sentence))
        if len(token_ids) > max_len:
            token_ids = token_ids[:max_len-1] + [closing_id]
        return token_ids

    input_ids = [encode_one(sentence) for sentence in data]
    attention_masks = [[1]*len(token_ids) for token_ids in input_ids]

    return input_ids, attention_masks

In [None]:
class BertDataset(Dataset):
    """Dataset of encoded sentences for the BERT classifiers.

    Args:
        data: List of token-id lists.
        masks: List of attention-mask lists, parallel to ``data``.
        label: Optional labels, parallel to ``data``. ``None`` for inference.

    Each item is ``(token_ids, mask, label_or_None, len(token_ids))``.
    """

    def __init__(self, data, masks, label=None):

        self.data = data
        self.masks = masks
        self.labels = label

        self.lengths = [len(i) for i in data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Identity test: `!= None` is evaluated element-wise on array-like labels.
        label = self.labels[idx] if self.labels is not None else None
        return (self.data[idx], self.masks[idx], label, self.lengths[idx])

In [None]:
def data_collator(data):
    """Collate ``BertDataset`` items into padded ``long`` tensors.

    Token ids are right-padded with the global ``pad`` id, masks with 0, up to
    the longest sentence of the batch.

    Args:
        data: Sequence of ``(token_ids, mask, label_or_None, length)`` items.

    Returns:
        ``(ids, masks, labels)`` when the first item carries a label,
        otherwise ``(ids, masks)``.
    """
    sentence, mask, label, length = zip(*data)

    tensor_dim = max(length)

    out_sentence = torch.full((len(sentence), tensor_dim), dtype=torch.long, fill_value=pad)
    out_mask = torch.zeros(len(sentence), tensor_dim, dtype=torch.long)

    for i in range(len(sentence)):
        # Build integer tensors directly; torch.Tensor(...) would go through float32.
        out_sentence[i][:len(sentence[i])] = torch.tensor(sentence[i], dtype=torch.long)
        out_mask[i][:len(mask[i])] = torch.tensor(mask[i], dtype=torch.long)

    # Identity test instead of `!= None`, which depends on the label type's __ne__.
    if label[0] is not None:
        return (out_sentence, out_mask, torch.Tensor(label).long())
    else:
        return (out_sentence, out_mask)

In [None]:
class KSampler(Sampler):
    """Sampler that groups items of similar length into the same mini-batch.

    NOTE: this repeats the ``KSampler`` defined earlier in the notebook and
    replaces it from this cell on.

    Indices are shuffled, cut into chunks of ``100 * batch_size``, sorted by
    length inside each chunk and split into buckets of ``batch_size``. The
    buckets are shuffled and yielded as one flat index stream.

    Args:
        data_source: Iterable of items whose element ``[1]`` is a sequence
            (its length is used) or already a numeric length.
        batch_size: Bucket size; should equal the DataLoader's ``batch_size``.

    NOTE(review): the stream is flat, so the DataLoader re-chunks it. A short
    bucket (end of a chunk) that lands mid-stream shifts the later batches off
    the bucket boundaries.
    """

    def __init__(self, data_source, batch_size):
        # Keep lengths only. Storing the sequences themselves made sorted()
        # compare their contents lexicographically instead of their sizes.
        self.lens = [len(x[1]) if hasattr(x[1], "__len__") else x[1] for x in data_source]
        self.batch_size = batch_size

    def __iter__(self):
        arr = list(zip(self.lens, range(len(self.lens))))

        random.shuffle(arr)
        n = self.batch_size*100

        buckets = []
        for i in range(0, len(arr), n):
            chunk = sorted(arr[i:i+n], key=lambda pair: pair[0])
            for j in range(0, len(chunk), self.batch_size):
                buckets.append([idx for _, idx in chunk[j:j+self.batch_size]])

        random.shuffle(buckets)
        return iter([item for bucket in buckets for item in bucket])  #Flatten nested list

    def __len__(self):
        return len(self.lens)


In [None]:
# BERT encoder followed by a small feed-forward classification head.
class BertClassifier(nn.Module):
    """Pretrained encoder with a two-layer classification head (3 classes).

    Args:
        model_name: Checkpoint id passed to ``AutoModel.from_pretrained``.
        dropout: Accepted for call compatibility; not used by any layer here.
        freeze_bert: When true, the encoder weights are excluded from training.
    """

    def __init__(self, model_name, dropout, freeze_bert=False):
        super().__init__()

        encoder_dim = 768
        head_dim = 200
        n_classes = 3

        self.bert = AutoModel.from_pretrained(model_name)

        head_layers = [
            nn.Linear(encoder_dim, head_dim),
            nn.ReLU(),
            nn.Linear(head_dim, n_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

        if freeze_bert:
            for weight in self.bert.parameters():
                weight.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape ``(batch, 3)``."""
        encoded = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # First output = per-token hidden states; position 0 is fed to the head.
        first_token_state = encoded[0][:, 0, :]

        return self.classifier(first_token_state)

In [None]:
def initialize_model(model_name, epochs=4, dropout=0.1):
    """Create a classifier with its optimizer and learning-rate scheduler.

    Args:
        model_name: Checkpoint id for the encoder.
        epochs: Number of epochs, used to size the linear decay.
        dropout: Forwarded to ``BertClassifier``.

    Returns:
        ``(model, optimizer, scheduler)``.

    NOTE: the schedule length is derived from the global ``train_dataloader``,
    which therefore has to be defined before this is called.
    """
    classifier = BertClassifier(model_name, dropout=dropout, freeze_bert=False)
    classifier.to(device)

    adamw = AdamW(classifier.parameters(), lr=5e-5, eps=1e-8)

    # One scheduler step per batch, over all epochs.
    n_training_steps = len(train_dataloader) * epochs
    linear_schedule = get_linear_schedule_with_warmup(
        adamw,
        num_warmup_steps=0,
        num_training_steps=n_training_steps,
    )

    return classifier, adamw, linear_schedule

In [None]:
# Loss shared by train() and evaluate().
loss_fn = nn.CrossEntropyLoss()

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False, fold=0, prefix=""):
    """Fine-tune ``model`` and checkpoint it whenever validation accuracy improves.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask)``.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        val_dataloader: Validation batches; used only when ``evaluation`` is true.
        epochs: Number of passes over ``train_dataloader``.
        evaluation: When true, evaluate at the end of every epoch and, from the
            third epoch on, also every 200 steps.
        fold: Fold index appended to the checkpoint file name.
        prefix: Checkpoint name prefix; files are named ``<prefix>_best_<fold>``.

    NOTE: relies on the globals ``optimizer`` and ``scheduler`` (they are not
    parameters) and reads/updates the global ``max_acc``.
    NOTE: this function takes over the name ``train`` previously bound to a DataFrame.
    """
    
    global max_acc

    print("Start training...\n")
    for epoch_i in range(epochs):

        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()

        # total_loss covers the epoch; batch_loss/batch_counts cover the current logging window.
        total_loss, batch_loss, batch_counts = 0, 0, 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts +=1

            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            logits = model(b_input_ids, b_attn_mask)

            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()

            # Clip the gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # Log every 20 steps and on the last step of the epoch.
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):

                time_elapsed = time.time() - t0_batch

                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
                
            # Mid-epoch validation every 200 steps, skipped during the first two epochs.
            if step%200 == 0 and step != 0 and epoch_i != 0 and epoch_i != 1:
                
                print("-"*70)

                if evaluation == True:

                    val_loss, val_accuracy = evaluate(model, val_dataloader)
                    
                    if val_accuracy > max_acc:
                        max_acc = val_accuracy
                        torch.save(model, prefix + "_best_"+str(fold))
                        print("new max")
                        

                    print(val_accuracy)
                    
                    print("-"*70)
                print("\n")
                
                # evaluate() switched the model to eval mode; go back to training mode.
                model.train()

        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)

        if evaluation == True:
            
            # End-of-epoch validation; keep the checkpoint if it is the best so far.
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            
            if val_accuracy > max_acc:
                max_acc = val_accuracy
                torch.save(model, prefix+"_best_"+str(fold))
                print("new max")

            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")


def evaluate(model, val_dataloader):
    """Return ``(mean loss, mean accuracy in %)`` of ``model`` on ``val_dataloader``.

    Both values are plain averages of the per-batch figures. The model is left
    in eval mode.
    """
    model.eval()

    batch_losses, batch_accuracies = [], []

    for batch in val_dataloader:
        b_input_ids, b_attn_mask, b_labels = (t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        batch_losses.append(loss_fn(logits, b_labels).item())

        predicted = logits.argmax(dim=1).flatten()
        batch_accuracies.append((predicted == b_labels).cpu().numpy().mean() * 100)

    return np.mean(batch_losses), np.mean(batch_accuracies)

In [None]:
def get_indices(arr, idxs):
    """Return ``[arr[i] for i in idxs]``: pick several positions from an indexable."""
    return [arr[position] for position in idxs]

In [None]:
#Tried these different preprocessing functions and tested their effect on the results
#Found out that text_preprocessing_2 gives the best results for the English model
def text_preprocessing_1(text):
    """Lower-case ``text``, collapse each whitespace run to one space and trim the ends."""
    return re.sub(r'\s+', ' ', text.lower()).strip()


def text_preprocessing_2(text):
    """Lower-case and whitespace-normalise ``text``, then collapse repeated character pairs.

    A two-character group drawn from the class ``[a-g-i-z]`` that is
    immediately repeated one or more times is reduced to a single occurrence.
    """
    normalised = re.sub(r'\s+', ' ', text.lower()).strip()
    return re.sub(r'([a-g-i-z][a-g-i-z])\1+', r'\1', normalised)


def text_preprocessing_3(text):
    """Aggressive text normalisation (same steps as the transliteration ``preprocess``).

    Steps, in order: map a few accented/special characters to ASCII, delete
    every "1", lower-case, drop ``@name`` mentions and decode ``&amp;``, delete
    every character outside ``[A-Za-z0-9 ,!?.]``, collapse whitespace, shorten
    repeated characters and remove stand-alone numbers.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lower-cased string.
    """
    text = text.replace('ß',"b")
    text = text.replace('à',"a")
    text = text.replace('á',"a")
    text = text.replace('ç',"c")
    text = text.replace('è',"e")
    text = text.replace('é',"e")
    text = text.replace('$',"s")
    text = text.replace("1","")

    text = text.lower()

    # These two steps must run BEFORE the character filter: the filter deletes
    # '@', '&' and ';', which previously left them with nothing to match.
    # Remove '@name' (a mention followed by whitespace)
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Delete everything except letters, digits, spaces and , ! ? .
    text = re.sub(r'[^A-Za-z0-9 ,!?.]', '', text)

    # Collapse whitespace runs and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()

    text = re.sub(r'([h][h][h][h])\1+', r'\1', text)  # long runs of 'h' -> 'hhhh'
    text = re.sub(r'([a-g-i-z])\1+', r'\1', text)  #Remove repeating characters
    text = re.sub(r' [0-9]+ ', " ", text)  # stand-alone numbers inside the text
    text = re.sub(r'^[0-9]+ ', "", text)  # leading stand-alone number

    return text

In [None]:
# English classifier input: the raw texts, skipping the first 1000 rows.
# NOTE(review): the reason for dropping rows 0-999 is not visible here -- confirm.
data = pd.read_csv("../input/zindidd/Train.csv")[["textt", "labell"]].iloc[1000:]
data.columns = ["texts", "data_labels"]

# Remap labels to class ids. Order matters: 0 -> 2 must happen before -1 -> 0.
data.data_labels = data.data_labels.replace(0,2)  #Neutral 2, Positive 1, Negative 0
data.data_labels = data.data_labels.replace(-1,0)



X = data.texts.values
y = data.data_labels.values

# Encode once up front; data_collator pads with the global `pad` id set here.
preprocessed_data, masks = preprocessing_for_bert(X, tokenizer_en, text_preprocessing_2, max_len=256)
pad = tokenizer_en.pad_token_id

In [None]:
# shuffle / random_state are keyword-only in current scikit-learn releases, so
# they are passed by name (the positional form raises TypeError there).
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
fold = 0

bests = []  # best validation accuracy of each fold

for train_ids, val_ids in kfold.split(preprocessed_data):

    print("\n\tFOLD %d \n" % (fold))
    max_acc = -99  # global read and updated by train()

    X_train = get_indices(preprocessed_data, train_ids)
    y_train = get_indices(y, train_ids)
    train_masks = get_indices(masks, train_ids)

    X_val = get_indices(preprocessed_data, val_ids)
    y_val = get_indices(y, val_ids)
    val_masks = get_indices(masks, val_ids)


    X_val, y_val, val_masks = list(zip(*sorted(zip(X_val, y_val, val_masks), key=lambda x: len(x[0]))))  #Order the validation data for faster validation
    X_val, y_val, val_masks = list(X_val), list(y_val), list(val_masks)


    # Convert other data types to torch.Tensor
    y_train = torch.tensor(y_train)
    y_val = torch.tensor(y_val)

    # Create the DataLoader for our training set
    # (`train_dataloader` must keep this name: initialize_model() reads it as a global)
    train_data = BertDataset(X_train, train_masks, y_train)
    train_sampler = KSampler(train_data, batch_size)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, collate_fn=data_collator)

    # Create the DataLoader for our validation set
    val_data = BertDataset(X_val, val_masks, y_val)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size, collate_fn=data_collator)


    set_seed()    # Set seed for reproducibility
    # `optimizer` and `scheduler` must keep these names: train() uses them as globals.
    bert_classifier, optimizer, scheduler = initialize_model(model_name=model_name_en, epochs=n_epochs, dropout=0.05)
    train(bert_classifier, train_dataloader, val_dataloader, epochs=n_epochs, evaluation=True, fold=fold, prefix="en")

    fold += 1
    bests.append(max_acc)


In [None]:
# Best validation accuracy per fold for the English model.
bests

In [None]:
# Arabic classifier input: the transliterated texts saved earlier, skipping the first 1000 rows.
# NOTE(review): a transliteration saved as an empty string may come back from
# read_csv as NaN, which the tokenizer cannot encode -- confirm none occur.
data = pd.read_csv("train_data.csv")[["converted", "data_labels"]].iloc[1000:]
data.columns = ["texts", "data_labels"]

# Remap labels to class ids. Order matters: 0 -> 2 must happen before -1 -> 0.
data.data_labels = data.data_labels.replace(0,2)  #Neutral 2, Positive 1, Negative 0
data.data_labels = data.data_labels.replace(-1,0)



X = data.texts.values
y = data.data_labels.values

# No extra text preprocessing (identity function); `pad` now holds the Arabic tokenizer's pad id.
preprocessed_data, masks = preprocessing_for_bert(X, tokenizer_ar, lambda x: x, max_len=256)
pad = tokenizer_ar.pad_token_id

In [None]:
# shuffle / random_state are keyword-only in current scikit-learn releases, so
# they are passed by name (the positional form raises TypeError there).
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
fold = 0

bests = []  # best validation accuracy of each fold

for train_ids, val_ids in kfold.split(preprocessed_data):

    print("\n\tFOLD %d \n" % (fold))
    max_acc = -99  # global read and updated by train()

    X_train = get_indices(preprocessed_data, train_ids)
    y_train = get_indices(y, train_ids)
    train_masks = get_indices(masks, train_ids)

    X_val = get_indices(preprocessed_data, val_ids)
    y_val = get_indices(y, val_ids)
    val_masks = get_indices(masks, val_ids)


    X_val, y_val, val_masks = list(zip(*sorted(zip(X_val, y_val, val_masks), key=lambda x: len(x[0]))))  #Order the validation data for faster validation
    X_val, y_val, val_masks = list(X_val), list(y_val), list(val_masks)


    # Convert other data types to torch.Tensor
    y_train = torch.tensor(y_train)
    y_val = torch.tensor(y_val)

    # Create the DataLoader for our training set
    # (`train_dataloader` must keep this name: initialize_model() reads it as a global)
    train_data = BertDataset(X_train, train_masks, y_train)
    train_sampler = KSampler(train_data, batch_size)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, collate_fn=data_collator)

    # Create the DataLoader for our validation set
    val_data = BertDataset(X_val, val_masks, y_val)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size, collate_fn=data_collator)


    set_seed()    # Set seed for reproducibility
    # `optimizer` and `scheduler` must keep these names: train() uses them as globals.
    bert_classifier, optimizer, scheduler = initialize_model(model_name=model_name_ar, epochs=n_epochs, dropout=0)
    train(bert_classifier, train_dataloader, val_dataloader, epochs=n_epochs, evaluation=True, fold=fold, prefix="ar")

    fold += 1
    bests.append(max_acc)

    

In [None]:
# Best validation accuracy per fold for the Arabic model.
bests

In [None]:
def bert_single_predict(model, test_dataloader):
    """Return the class probabilities predicted by ``model`` for every item.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask)``.
        test_dataloader: Yields batches whose first two elements are the ids
            and the attention mask.

    Returns:
        NumPy array of softmax probabilities, one row per item.
    """
    model.eval()

    collected = []
    for batch in tqdm(test_dataloader):
        on_device = tuple(t.to(device) for t in batch)
        b_input_ids, b_attn_mask = on_device[:2]

        with torch.no_grad():
            collected.append(model(b_input_ids, b_attn_mask))

    stacked = torch.cat(collected, dim=0)
    return F.softmax(stacked, dim=1).cpu().numpy()

In [None]:
def bert_ensemble_predict(sentences, models, tokenizer, preprocess, truncate=True, max_len=256):
    """Run every model of an ensemble over the same encoded sentences.

    Args:
        sentences: Raw input texts.
        models: Iterable of classifiers.
        tokenizer: Tokenizer used to encode the texts.
        preprocess: Callable applied to each text before encoding.
        truncate: Accepted for call compatibility; not used here.
        max_len: Maximum encoded length.

    Returns:
        List with one probability array per model, in the order of ``models``.
    """
    token_ids, attention = preprocessing_for_bert(sentences, tokenizer, preprocess, max_len=max_len)

    encoded = BertDataset(token_ids, attention)
    loader = DataLoader(
        encoded,
        sampler=SequentialSampler(encoded),  # keep the input order
        batch_size=128,
        collate_fn=data_collator,
    )

    return [bert_single_predict(member, loader) for member in models]

In [None]:
def predict_lang(lang_prefix, directory, preprocess_fn, dataset, model_name, n=1, truncate=True, max_len=256):
    """Load the ``n`` fold checkpoints of one language and sum their probabilities.

    Checkpoints are read from ``<directory>/<lang_prefix>best_<i>`` for
    ``i`` in ``range(n)``.

    Args:
        lang_prefix: File-name prefix of the checkpoints.
        directory: Folder holding the checkpoints.
        preprocess_fn: Callable applied to each text before encoding.
        dataset: Texts to classify.
        model_name: Checkpoint id used to load the tokenizer.
        n: Number of checkpoints in the ensemble.
        truncate: Forwarded to ``bert_ensemble_predict``.
        max_len: Maximum encoded length.

    Returns:
        Element-wise sum of the ``n`` probability arrays (not averaged).

    Side effects:
        Sets the global ``pad`` read by ``data_collator``.
    """
    print("Loading the models ....")

    global pad
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
    pad = tokenizer.pad_token_id

    lang_models = [
        torch.load(f"{directory}/{lang_prefix}best_{i}", map_location=device)
        for i in range(n)
    ]

    print("Inference ....")

    out = bert_ensemble_predict(dataset, lang_models, tokenizer, preprocess_fn, truncate=truncate, max_len=max_len)

    total = out[0]
    for member_probs in out[1:n]:
        total = member_probs + total

    return total

In [None]:
# Sort the test texts by character length for faster inference.
df = pd.read_csv("../input/zindidd/Test.csv")
df_converted = pd.read_csv("test_data.csv")

df["lens"] = df.textt.apply(len)
df = df.sort_values(by="lens").set_index("IDD", drop=True)
# Reorder the transliterated rows to follow the same IDD order as `df`.
df_converted = df_converted.set_index("IDD", drop=True).loc[df.index]


# Convert to lists (`test` is re-bound here from a DataFrame to a list of raw texts).
test = df.textt.tolist()
test_converted = df_converted[["converted"]].converted.tolist()

In [None]:
# Summed probabilities of the 10 Arabic fold models on the transliterated texts.
# NOTE: max_len=512 here, while the training cells encoded with max_len=256.
output_ar = predict_lang("ar_", "./", lambda x:x, test_converted, model_name_ar, n=10, truncate=True, max_len=512)

In [None]:
# Summed probabilities of the 5 English fold models on the raw texts.
# NOTE: max_len=512 here, while the training cells encoded with max_len=256.
output_en = predict_lang("en_", "./", text_preprocessing_2, test, model_name_en, n=5, truncate=True, max_len=512)

In [None]:
# Average each ensemble (10 Arabic / 5 English models), weight the Arabic
# average by 1.30, add both and take the most likely class.
df["preds"] = ((output_ar/10)*1.30+(output_en/5)).argmax(1)

# Map class ids back to the original labels. Order matters: 0 -> -1 must
# happen before 2 -> 0.
df.preds = df.preds.replace(0,-1)
df.preds = df.preds.replace(2,0)

the_output = df.reset_index()[["IDD", "preds"]]
the_output.columns = ["ID", "label"]

the_output.to_csv("lessvalid_convvalid150.csv", index=False)