In [None]:
!pip3 install git+https://github.com/huggingface/transformers
!pip install sentencepiece
!pip3 install jiwer jiwer
!pip3 install python-socketio
!pip3 install "python-socketio[client]"
!pip3 install mlflow

In [None]:
import numpy as np 
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [None]:
# Show the GPUs visible to this runtime before any model is loaded.
!nvidia-smi

# Load Data

In [None]:
# Prefix that MedicalDataset prepends (plain string concatenation) to every image
# file name; empty means the ids in the CSV are used as paths as-is.
data_path = r""
# CSV file that is read into the `id` / `text` columns.
data_csv = "spemain.csv"

In [None]:
# Because `names` is supplied, the first row of the file is parsed as data;
# `.iloc[1:]` then drops it (presumably a header row — TODO confirm against the CSV).
df = pd.read_csv(data_csv, names=['id','text']).iloc[1:]

In [None]:
# Keep only rows whose text has more than 4 whitespace-separated tokens.
df = df[df['text'].str.split().str.len().gt(4)]

In [None]:
# Keep only rows whose text has fewer than 250 whitespace-separated tokens.
df = df[df['text'].str.split().str.len().lt(250)]

In [None]:
# Display the frame as it stands after the length filters.
df

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=0)
# Renumber both halves from 0 so that row labels coincide with row positions.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Mean number of whitespace-separated words per training transcription.
sum(len(str(text).split()) for text in df_train['text']) / len(df_train)

In [None]:
# Longest training transcription, measured in whitespace-separated words.
max([len(str(x).split()) for x in df_train['text']])

In [None]:
# Mean number of whitespace-separated words per held-out transcription.
sum([len(str(x).split()) for x in df_test['text']])/len(df_test)

In [None]:
# Display the held-out split.
df_test

## Image Augmentation

In [None]:
import argparse
from PIL import Image, ImageOps, ImageFilter, ImageEnhance, ImageDraw
import random
import numpy as np
import os
import cv2

# NOTE(review): this parser is never given arguments nor parsed in the cells
# visible here — it looks like a leftover and can probably be removed.
parser = argparse.ArgumentParser()

def Blur(img):
    """Return the result of filtering ``img`` with PIL's fixed ``ImageFilter.BLUR`` kernel."""
    blur_kernel = ImageFilter.BLUR
    blurred = img.filter(blur_kernel)
    return blurred
 
def GaussianBlur(img):
    """Gaussian-blur ``img`` with a radius drawn uniformly from the integers 1..10."""
    blur_radius = random.randint(1, 10)
    kernel = ImageFilter.GaussianBlur(radius=blur_radius)
    return img.filter(kernel)

def BoxBlur(img):
    """Box-blur ``img`` with a radius drawn uniformly from the integers 1..10."""
    blur_radius = random.randint(1, 10)
    kernel = ImageFilter.BoxBlur(radius=blur_radius)
    return img.filter(kernel)

def Contrast(img):
    """Rescale the contrast of ``img`` by a random integer factor in 0..6 (inclusive)."""
    contrast = ImageEnhance.Contrast(img)
    factor = random.randint(0, 6)
    return contrast.enhance(factor)

def pixelate(img):
    """Resize ``img`` to 256x256, then back to its original size with nearest-neighbour sampling."""
    original_size = img.size
    shrunk = img.resize((256, 256))
    return shrunk.resize(original_size, Image.NEAREST)

def rotate(img):
    """Rotate ``img`` by a random integer angle between 1 and 45 (inclusive).

    The angle is passed straight to ``img.rotate`` and its result is returned.
    """
    angle = random.randint(1, 45)
    rotated = img.rotate(angle)
    return rotated

def prespective(img):
    """Shear ``img`` horizontally with a fixed factor of -0.5.

    The output canvas keeps the original height and is widened by
    ``round(0.5 * width)`` pixels; resampling is bicubic.
    """
    shear = -0.5
    src_width, src_height = img.size
    x_offset = abs(shear) * src_width
    out_size = (src_width + int(round(x_offset)), src_height)
    # The horizontal offset is only applied for a positive shear factor.
    x_origin = -x_offset if shear > 0 else 0
    coefficients = (1, shear, x_origin, 0, 1, 0)
    return img.transform(out_size, Image.AFFINE, coefficients, Image.BICUBIC)

def translate(img):
    """Run ``img`` through an affine transform with identity coefficients.

    The coefficient tuple is (1, 0, 0, 0, 1, 0) and the output size equals the
    input size, so no shift is actually applied.
    """
    identity_coefficients = (1, 0, 0, 0, 1, 0)
    return img.transform(img.size, Image.AFFINE, identity_coefficients)


def noisy(noise_typ,image):
    """Return a noisy version of ``image`` using the requested noise model.

    Parameters
    ----------
    noise_typ : str
        One of ``"gauss"``, ``"s&p"``, ``"poisson"`` or ``"speckle"``.
    image : PIL image or array-like
        Image to corrupt. It is read through ``np.asarray`` and never modified.

    Returns
    -------
    PIL.Image.Image or numpy.ndarray
        ``"gauss"`` returns an 8-bit PIL image; the other modes return a numpy
        array (``"s&p"`` in the input dtype, ``"poisson"``/``"speckle"`` as floats).

    Raises
    ------
    ValueError
        If ``noise_typ`` is not one of the supported names (previously the
        function silently returned ``None``).
    """
    pixels = np.asarray(image)
    if noise_typ == "gauss":
        mean = 0
        var = 0.1
        sigma = var**0.5
        # Draw the noise directly in the array's own shape instead of deriving
        # it from PIL's (width, height) `.size` and reshaping afterwards.
        gauss = np.random.normal(mean, sigma, pixels.shape)
        noisy_pixels = pixels + gauss
        # Image.fromarray cannot build an image from float64 data, so round,
        # clip to the 8-bit range and cast back first.
        # NOTE(review): sigma ~0.32 is tiny on a 0-255 scale; the constants look
        # written for [0, 1] float images — confirm the intended strength.
        return Image.fromarray(np.clip(np.rint(noisy_pixels), 0, 255).astype(np.uint8))
    elif noise_typ == "s&p":
        s_vs_p = 0.5
        amount = 0.004
        out = pixels.copy()
        if out.size == 0:
            return out
        height, width = out.shape[:2]
        # Count pixels, not the `.size` tuple (float * tuple raised TypeError).
        n_pixels = height * width
        # "Salt" is the brightest representable value: 255 for uint8 data,
        # 1 for float images.
        salt_value = np.iinfo(out.dtype).max if np.issubdtype(out.dtype, np.integer) else 1

        # Salt mode — randint's upper bound is exclusive, so pass the full extent.
        num_salt = int(np.ceil(amount * n_pixels * s_vs_p))
        salt_rows = np.random.randint(0, height, num_salt)
        salt_cols = np.random.randint(0, width, num_salt)
        out[salt_rows, salt_cols] = salt_value

        # Pepper mode
        num_pepper = int(np.ceil(amount * n_pixels * (1. - s_vs_p)))
        pepper_rows = np.random.randint(0, height, num_pepper)
        pepper_cols = np.random.randint(0, width, num_pepper)
        out[pepper_rows, pepper_cols] = 0
        return out
    elif noise_typ == "poisson":
        vals = len(np.unique(pixels))
        vals = 2 ** np.ceil(np.log2(vals))
        return np.random.poisson(pixels * vals) / float(vals)
    elif noise_typ == "speckle":
        # Noise must have the array's shape; building it from PIL's
        # (width, height) only broadcast correctly for square images.
        gauss = np.random.randn(*pixels.shape)
        return pixels + pixels * gauss
    raise ValueError(f"Unknown noise type: {noise_typ!r}")

def Vgrid(img):
    """Draw evenly spaced black horizontal lines (constant y) across ``img``.

    Line thickness and the gap between lines are each drawn with
    ``np.random.randint(1, 3)``. The lines are drawn onto ``img`` itself,
    which is then returned.
    """
    width, height = img.size
    max_width = 3
    mag = -1
    if 0 <= mag <= max_width:
        line_width = 1
        image_stripe = 3 - mag
    else:
        line_width = np.random.randint(1, max_width)
        image_stripe = np.random.randint(1, max_width)

    period = line_width + image_stripe
    n_lines = height // period + 1
    pen = ImageDraw.Draw(img)
    for line_no in range(1, n_lines):
        # Same position as image_stripe*i + line_width*(i-1).
        y = period * line_no - line_width
        pen.line([(0, y), (width, y)], width=line_width, fill='black')
    return img

def Hgrid(img):
    """Draw evenly spaced black vertical lines (constant x) across ``img``.

    Line thickness and the gap between lines are each drawn with
    ``np.random.randint(1, 3)``. The lines are drawn onto ``img`` itself,
    which is then returned.
    """
    width, height = img.size
    max_width = 3
    mag = -1
    if 0 <= mag <= max_width:
        line_width = 1
        image_stripe = 3 - mag
    else:
        line_width = np.random.randint(1, max_width)
        image_stripe = np.random.randint(1, max_width)

    period = line_width + image_stripe
    n_lines = width // period + 1
    pen = ImageDraw.Draw(img)
    for line_no in range(1, n_lines):
        # Same position as image_stripe*i + line_width*(i-1).
        x = period * line_no - line_width
        pen.line([(x, 0), (x, height)], width=line_width, fill='black')
    return img

## Model Training

In [None]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class MedicalDataset(Dataset):
    """Image/transcription pairs prepared for a vision encoder-decoder model.

    Parameters
    ----------
    root_dir : str
        Prefix concatenated (plain ``+``) in front of every file name from ``df['id']``.
    df : pandas.DataFrame
        Frame with an ``id`` column (image file name) and a ``text`` column.
    processor : callable
        Called as ``processor(image, return_tensors="pt")`` and expected to
        expose ``.tokenizer``; used for both pixel values and label ids.
    max_target_length : int, default 128
        Labels are padded/truncated to this many tokens.
    augment : bool, default True
        When True (the historical behaviour) roughly 8 in 99 samples get one
        random image augmentation. Pass ``augment=False`` for evaluation data
        so that validation images are left untouched.
    """

    def __init__(self, root_dir, df, processor, max_target_length=128, augment=True):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length
        self.augment = augment

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def _row(self, position):
        """Return ``(file_name, text)`` for the row at integer ``position``."""
        # .iloc is positional, so this also works when the index was not reset.
        file_name = str(self.df['id'].iloc[position]).strip()
        text = str(self.df['text'].iloc[position])
        return file_name, text

    def _load_image(self, file_name):
        """Open ``root_dir + file_name`` and return it as an RGB image."""
        # The context manager closes the file handle; convert() returns an
        # independent, fully loaded copy that stays valid afterwards.
        with Image.open(self.root_dir + file_name) as handle:
            return handle.convert("RGB")

    def _augment(self, image):
        """Apply at most one randomly chosen augmentation to ``image``."""
        # Draws 2..9 select an augmentation, draws 10..100 leave the image as is.
        doaug = random.randint(2, 100)
        augmentations = {
            2: Blur,
            3: GaussianBlur,
            4: BoxBlur,
            5: pixelate,
            6: rotate,
            7: prespective,
            8: Vgrid,
            9: Hgrid,
        }
        transform = augmentations.get(doaug)
        return image if transform is None else transform(image)

    def __getitem__(self, idx):
        """Return ``{"pixel_values": tensor, "labels": tensor}`` for sample ``idx``.

        If the image file is missing, or the processor fails on it, the first
        row of the frame is used instead so that training can keep going.
        """
        # get file name + text
        file_name, text = self._row(idx)
        if not os.path.exists(self.root_dir + file_name):
            print(f"Missing image file {self.root_dir + file_name!r}; falling back to row 0")
            file_name, text = self._row(0)
        # prepare image (i.e. resize + normalize)
        image = self._load_image(file_name)
        if self.augment:
            image = self._augment(image)
        try:
            pixel_values = self.processor(image, return_tensors="pt").pixel_values
        except Exception:
            # Processing failed (e.g. an augmentation produced an unusable
            # image): fall back to the un-augmented first row.
            file_name, text = self._row(0)
            image = self._load_image(file_name)
            pixel_values = self.processor(image, return_tensors="pt").pixel_values
        # add labels (input_ids) by encoding the text
        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          max_length=self.max_target_length,
                                          truncation=True).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        pad_token_id = self.processor.tokenizer.pad_token_id
        labels = [label if label != pad_token_id else -100 for label in labels]

        # Drop only the leading batch dimension added by the processor.
        return {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}

In [None]:
import warnings
from contextlib import contextmanager
from transformers import MBartTokenizer, ViTImageProcessor, XLMRobertaTokenizer
from transformers import ProcessorMixin


class CustomOCRProcessor(ProcessorMixin):
    """Bundle an image processor and a tokenizer behind a single callable.

    Calling the processor with images runs the image processor, with ``text``
    runs the tokenizer, and with both returns the image features with the
    tokenized ``input_ids`` attached under ``"labels"``.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        """Store the two components.

        ``feature_extractor=`` is accepted as a deprecated alias for
        ``image_processor``.

        Raises
        ------
        ValueError
            If no image processor or no tokenizer is supplied.
        """
        # Must be bound even when the deprecated kwarg is absent; otherwise the
        # fallback below raised UnboundLocalError instead of the ValueError.
        feature_extractor = None
        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
        self._in_target_context_manager = False

    def __call__(self, *args, **kwargs):
        """Process ``images`` and/or ``text``.

        The first positional argument, if any, is taken as ``images``. The
        remaining keyword arguments are forwarded to whichever component runs.

        Raises
        ------
        ValueError
            If neither ``images`` nor ``text`` is given.
        """
        # For backward compatibility
        if self._in_target_context_manager:
            return self.current_processor(*args, **kwargs)

        images = kwargs.pop("images", None)
        text = kwargs.pop("text", None)
        if len(args) > 0:
            images = args[0]
            args = args[1:]

        if images is None and text is None:
            raise ValueError("You need to specify either an `images` or `text` input to process.")

        if images is not None:
            inputs = self.image_processor(images, *args, **kwargs)
        if text is not None:
            encodings = self.tokenizer(text, **kwargs)

        if text is None:
            return inputs
        elif images is None:
            return encodings
        else:
            inputs["labels"] = encodings["input_ids"]
            return inputs

    def batch_decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``decode``."""
        return self.tokenizer.decode(*args, **kwargs)

In [None]:
from transformers import TrOCRProcessor

# Image preprocessing settings are loaded from a Swin checkpoint and the
# tokenizer from the mBART-50 checkpoint; both are wrapped in one processor.
image_processor = ViTImageProcessor.from_pretrained(
    'microsoft/swin-base-patch4-window12-384-in22k'
)
tokenizer = MBartTokenizer.from_pretrained(
    'facebook/mbart-large-50'
)
processor = CustomOCRProcessor(image_processor,tokenizer)
# Labels are padded/truncated to 55 tokens, although the length filter above
# keeps texts of up to 249 words — longer transcriptions are cut.
train_dataset = MedicalDataset(root_dir=data_path,
                           df=df_train,
                           processor=processor,max_target_length=55)
# NOTE(review): the evaluation set is built exactly like the training set, so
# MedicalDataset's random augmentations also hit validation images.
eval_dataset = MedicalDataset(root_dir=data_path,
                           df=df_test,
                           processor=processor,max_target_length=55)

In [None]:
# Sanity check: MedicalDataset.__len__ returns the row count of the frame it wraps.
print("Number of training examples:", len(train_dataset))
print("Number of validation examples:", len(eval_dataset))

In [None]:
from torch.utils.data import DataLoader

# Batches of 4 for both splits; only the training loader shuffles and pins memory.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, pin_memory=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=4)

## Training

In [None]:
from transformers import VisionEncoderDecoderModel
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
import torch.nn as nn

### Parameters

In [None]:
# Load the encoder-decoder weights from the `musadac/vilanocr-single-urdu` checkpoint.
model = VisionEncoderDecoderModel.from_pretrained("musadac/vilanocr-single-urdu")

In [None]:
# Wire the tokenizer's special-token ids into the model config.
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size
model.config.eos_token_id = processor.tokenizer.sep_token_id
# Generation defaults; 55 matches the max_target_length used for the datasets.
model.config.max_length = 55 
model.config.early_stopping = False
model.config.no_repeat_ngram_size = 4
# NOTE(review): length_penalty is presumably only relevant for beam search,
# which num_beams = 1 disables — confirm whether both settings are intended.
model.config.length_penalty = 2.0
model.config.num_beams = 1

In [None]:
# Move the model to the device selected earlier.
model.to(device)
print("model loaded to ",device)

In [None]:
# Character-error-rate metric used by compute_cer / compute_metrics.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the `evaluate` package — confirm against the installed version.
from datasets import load_metric
cer_metric = load_metric("cer")

In [None]:
import socketio
import asyncio
sio = socketio.Client()
# Local history of every progress sample passed to info_emit.
array_data_epochs = []
def info_emit(name, epochs = 0, training_loss=0, valid_loss=0, total_epochs=0, ):
    """Record one progress sample and push it, best effort, to the socket.io server.

    Parameters
    ----------
    name : str
        Label of the run.
    epochs : int
        Current epoch.
    training_loss, valid_loss : float
        Metric values to report.
    total_epochs : int
        Planned number of epochs.

    The sample is always appended to ``array_data_epochs``, even when the
    server is unreachable (previously it was only kept after a successful
    emit). Network failures never raise; they are surfaced as warnings.
    """
    import warnings

    # Connect before building the payload so that `sio.sid` is populated, and
    # only when not already connected (a second connect() would raise).
    # NOTE(review): the endpoint is a hard-coded plain-HTTP address — consider
    # moving it to configuration.
    if not sio.connected:
        try:
            sio.connect('http://3.133.24.230:3033')
        except Exception as exc:
            warnings.warn(f"info_emit: could not connect to the progress server: {exc}")

    data = {
        'id': sio.sid,
        'name':name,
        'epochs':epochs,
        'training_loss':training_loss,
        'valid_loss':valid_loss,
        'total_epochs':total_epochs
    }
    array_data_epochs.append(data)

    if sio.connected:
        try:
            sio.emit('send_message', data)
        except Exception as exc:
            warnings.warn(f"info_emit: could not send progress update: {exc}")

In [None]:
def compute_cer(pred_ids, label_ids):
    """Return the character error rate between decoded predictions and labels.

    Parameters
    ----------
    pred_ids : tensor or array of token ids
        Predicted token ids, one row per sample.
    label_ids : tensor or array of token ids
        Reference token ids in which ignored positions are marked with -100.

    Returns
    -------
    The value returned by ``cer_metric.compute`` for the decoded strings.
    """
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    # Work on a copy so the caller's labels keep their -100 ignore markers
    # (torch tensors provide .clone(), numpy arrays .copy()).
    label_ids = label_ids.clone() if hasattr(label_ids, "clone") else label_ids.copy()
    # -100 is not a real token id; map it to PAD so decoding skips it.
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    return cer_metric.compute(predictions=pred_str, references=label_str)

In [None]:
def compute_metrics(pred):
    """Return ``{"cer": value}`` for a prediction object.

    ``pred`` must expose ``predictions`` (predicted token ids) and
    ``label_ids`` (reference ids in which ignored positions are -100).
    """
    pred_ids = pred.predictions
    # Copy before editing so `pred.label_ids` keeps its -100 ignore markers
    # (torch tensors provide .clone(), numpy arrays .copy()).
    labels_ids = pred.label_ids
    labels_ids = labels_ids.clone() if hasattr(labels_ids, "clone") else labels_ids.copy()
    # -100 is not a real token id; map it to PAD so decoding skips it.
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}
     

In [None]:
# Number of passes over the training data made by the training loop.
epochs = 100

In [None]:
# torch's AdamW replaces the deprecated `transformers.AdamW`; the name `AdamW`
# stays bound for any later cell that uses it.
from torch.optim import AdamW
from tqdm.notebook import tqdm

# eps and weight_decay are spelled out with the old transformers defaults
# (1e-6 and 0.0) so the optimisation behaves as before; torch's own defaults differ.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# Size the one-cycle schedule with the same `epochs` the training loop uses
# (it was hard-coded to 20): stepping a OneCycleLR past its total number of
# steps raises an error.
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=5e-5,
                                                steps_per_epoch=len(train_dataloader), epochs=epochs)

In [None]:
# Run label; it is the `name` argument of the (currently commented-out) info_emit calls.
name_of = "Handwritten Urdu"

In [None]:
# Directory for the latest checkpoint; the best one goes to save + "Best".
save = "./Urdufullmultiaug"
# An extra checkpoint is written every `save_after_steps` optimizer steps.
save_after_steps = 20000

val_loss_m = 0          # latest validation CER (name kept for compatibility)
val_loss_min = 999999   # best (lowest) validation CER seen so far
stepsnow = 0            # optimizer steps taken across all epochs
for epoch in range(0,epochs):  # loop over the dataset multiple times
    # ---- train ----
    model.train()
    train_loss = 0.0
    counts = 0
    train_cer = 0.0
    for batch in tqdm(train_dataloader):
        # Move the batch to the device the model was moved to; a hard-coded
        # .cuda() fails on CPU-only runtimes even though `device` falls back to CPU.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
#         scheduler.step()   # learning-rate schedule is currently disabled
        train_loss += loss.sum().item()
        counts += 1
        # Token-level argmax of the training forward pass (labels were passed in),
        # not free-running generation.
        predictions = torch.argmax(outputs.logits, dim=-1)
        cer = compute_cer(pred_ids=predictions, label_ids=batch["labels"])
        train_cer += cer
#         info_emit(name_of,epoch,train_cer/counts,val_loss_m,epochs ) # train_cer
        stepsnow += 1
        if stepsnow % save_after_steps == 0:
            # NOTE(review): `save` starts with "./", so this path contains
            # "StepSys./..." — probably unintended, left unchanged.
            model.save_pretrained("./StepsSave/StepSys"+str(save)+"-"+str(stepsnow)+'-'+str(train_cer/counts))
    # Report both quantities; the old message labelled the CER as "Loss".
    print(f"Train loss after epoch {epoch}:", train_loss/len(train_dataloader))
    print(f"Train CER after epoch {epoch}:", train_cer/len(train_dataloader))
#     info_emit(name_of,epoch,train_cer/len(train_dataloader),val_loss_m,epochs )
#     mlflow.log_metric("train_cer",train_cer/len(train_dataloader))

    # ---- evaluate ----
    model.eval()
    valid_cer = 0.0
    counts = 0
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            # Same forward pass with labels as in training, not model.generate().
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)
            cer = compute_cer(pred_ids=predictions, label_ids=batch["labels"])
            valid_cer += cer
            counts += 1

    val_loss_m = valid_cer / len(eval_dataloader)
    print("Validation CER:", val_loss_m)
#     info_emit(name_of,epoch,train_cer/len(train_dataloader),val_loss_m,epochs )
#     mlflow.log_metric("val_cer",val_loss_m)

    # Keep a separate copy of the best model by validation CER.
    if val_loss_min > val_loss_m:
        val_loss_min = val_loss_m
        print('Best',val_loss_min)
        model.save_pretrained(save+"Best")
#         mlflow.log_artifact(save+"Best")
    model.save_pretrained(save)
    # Dump whatever info_emit has recorded so far (rewritten every epoch).
    result = pd.json_normalize(array_data_epochs)
    result.to_csv('history_epochs_urdu_full.csv', index=False)

