# Installations


In [56]:
!pip install datasets
!pip install langid



# Imports

In [57]:
from datasets import load_dataset
from datasets import load_from_disk
import langid

In [58]:
from google.colab import drive # Link your drive if you are a colab user
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os.path as path
if path.exists("/content"):
  !sudo add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
  !sudo apt-get update -qq 2>&1 > /dev/null
  !sudo apt -y install -qq google-drive-ocamlfuse 2>&1 > /dev/null
  !google-drive-ocamlfuse

  !sudo apt-get install -qq w3m # to act as web browser
  !xdg-settings set default-web-browser w3m.desktop # to set default browser
  %cd /content
  !mkdir drive
  %cd drive
  !mkdir MyDrive
  %cd ..
  %cd ..
  !google-drive-ocamlfuse -o nonempty /content/drive/MyDrive/



debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
/usr/bin/xdg-open: 882: www-browser: not found
/usr/bin/xdg-open: 882: links2: not found
/usr/bin/xdg-open: 882: elinks: not found
/usr/bin/xdg-open: 882: links: not found
/usr/bin/xdg-open: 882: lynx: not found
/usr/bin/xdg-open: 882: w3m: not found
xdg-open: no method available for opening 'https://accounts.google.com/o/oauth2/auth?client_id=564921029129.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fgd-ocaml-auth.appspot.com%2Foauth2callback&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&

In [59]:
import torch
import torch.nn as nn
from torch.nn import functional as F

batchSize = 16
blockSize = 32
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# ------------

torch.manual_seed(1337)

with open('/content/drive/MyDrive/train.txt', 'r', encoding='utf-8') as f:
    trainData = f.read()

with open('/content/drive/MyDrive/valid.txt', 'r', encoding='utf-8') as f:
    validData = f.read()

with open('/content/drive/MyDrive/test.txt', 'r', encoding='utf-8') as f:
    testData = f.read()

chars = sorted(list(set(trainData)))
vocabSize = len(chars)
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encodeText = lambda s: [stoi[c] for c in s]
decodeText = lambda l: ''.join([itos[i] for i in l])

# Train and test splits
trainData = torch.tensor(encodeText(trainData), dtype=torch.long)
valData = torch.tensor(encodeText(validData), dtype=torch.long)
testData = torch.tensor(encodeText(testData), dtype=torch.long)

In [60]:
def getBatch(tvt):
    if(tvt == 'train'):
      data = trainData
    elif(tvt == 'test'):
      data = testData
    else:
      data = valData
    ix = torch.randint(len(data)-blockSize,(batchSize,))
    x = torch.stack([data[i:i+blockSize] for i in ix])
    y = torch.stack([data[i+1:i+blockSize+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimateLoss():
    model.eval()
    out = {}
    for tvt in ['train', 'val', 'test']:
        losses = torch.zeros(evaluationIterations)
        for k in range(evaluationIterations):
            X, Y = getBatch(tvt)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[tvt] = losses.mean()
    model.train()
    return out

In [61]:
class Head(nn.Module):
    def __init__(self, headSize):
        super().__init__()
        self.key = nn.Linear(numberOfEmbeddings, headSize, bias=False)
        self.query = nn.Linear(numberOfEmbeddings, headSize, bias=False)
        self.value = nn.Linear(numberOfEmbeddings, headSize, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(blockSize, blockSize)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)

        wei = q @ k.transpose(-2,-1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, headSize):
        super().__init__()
        self.heads = nn.ModuleList([Head(headSize) for _ in range(num_heads)])
        self.proj = nn.Linear(numberOfEmbeddings, numberOfEmbeddings)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    def __init__(self, numberOfEmbeddings):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(numberOfEmbeddings, 4 * numberOfEmbeddings),
            nn.ReLU(),
            nn.Linear(4 * numberOfEmbeddings, numberOfEmbeddings),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, numberOfEmbeddings, numberOfHeads):
        super().__init__()
        headSize = numberOfEmbeddings // numberOfHeads
        self.sa = MultiHeadAttention(numberOfHeads, headSize)
        self.ffwd = FeedFoward(numberOfEmbeddings)
        self.ln1 = nn.LayerNorm(numberOfEmbeddings)
        self.ln2 = nn.LayerNorm(numberOfEmbeddings)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class NgramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocabSize, numberOfEmbeddings)
        self.position_embedding_table = nn.Embedding(blockSize, numberOfEmbeddings)
        self.blocks = nn.Sequential(*[Block(numberOfEmbeddings, numberOfHeads=numberOfHeads) for _ in range(numberOfLayers)])
        self.ln_f = nn.LayerNorm(numberOfEmbeddings)
        self.lm_head = nn.Linear(numberOfEmbeddings, vocabSize)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, maxNewTokens):

        for _ in range(maxNewTokens):
            idx_cond = idx[:, -blockSize:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [62]:
def save_model(model, optimizer, scheduler, metric, epoch, path):
    torch.save(
        {'model_state_dict'         : model.state_dict(),
         'optimizer_state_dict'     : optimizer.state_dict(),
         'scheduler_state_dict'     : None,
         metric[0]                  : metric[1],
         'epoch'                    : epoch},
         path
    )

def load_model(path, model, metric= 'valid_acc', optimizer= None, scheduler= None):

    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])

    if optimizer != None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if scheduler != None:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    epoch   = checkpoint['epoch']
    metric  = checkpoint[metric]

    return model, optimizer, scheduler, epoch, metric

In [63]:
#Training
lowest_val_loss=1.33
epochs = 1000000
evalInterval = 100
learning_rate = 1e-3
evaluationIterations = 200
numberOfEmbeddings = 64
numberOfHeads = 4
numberOfLayers = 4
dropout = 0.1
epoch_model_path = '/content/drive/MyDrive/Running_Baseline_Checkpoint'
best_epoch_model_path = '/content/drive/MyDrive/Best_Baseline_Checkpoint'
model = NgramLanguageModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
model, optimizer, scheduler, epoch, lowest_val_loss = load_model(epoch_model_path, model, 'low_val_loss', optimizer)
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

for e in range(epochs):

    if(e % evalInterval == 0 or e == epochs - 1):
        losses = estimateLoss()
        print(f"Step {e}: Train Loss: {losses['train']:.4f}, Val Loss: {losses['val']:.4f}")
        save_model(model, optimizer, None, ['low_val_loss', losses['val']], e, epoch_model_path)
        if(losses['val']<lowest_val_loss):
          lowest_val_loss=losses['val']
          print('Saving Model')
          save_model(model, optimizer, None, ['low_val_loss', losses['val']], e, best_epoch_model_path)

    xb, yb = getBatch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


0.21863 M parameters
Step 0: Train Loss: 1.3335, Val Loss: 1.3360


KeyboardInterrupt: ignored

In [64]:
learning_rate = 1e-3
epoch_model_path = '/content/drive/MyDrive/Running_Baseline_Checkpoint'
best_epoch_model_path = '/content/drive/MyDrive/Best_Baseline_Checkpoint'
model = NgramLanguageModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
model, optimizer, scheduler, epoch, lowest_val_loss = load_model(best_epoch_model_path, model, 'low_val_loss', optimizer)
m = model.to(device)

In [65]:
from nltk.translate.bleu_score import sentence_bleu
import json
from evaluate import load
import numpy as np

bertScore = load("bertscore")

with open('/content/drive/MyDrive/test_en_1.json', 'r') as json_file:
    data = [json.loads(line) for line in json_file]
formatted_rows = [f"Patient: {row['input']}\nDoctor: {row['output']}\n" for row in data]
formatted_text = ''.join(formatted_rows)

a = formatted_text.split('\n')[::2]
b = formatted_text.split('\n')[1::2]

newText = list(zip(a,b))

score=0.0
predictions=[]
references=[]
bleuscores=[]
for i in range(len(newText)):
  data = torch.tensor(encodeText(newText[i][0]), dtype=torch.long)
  context = data.reshape(-1, 1).to(device)
  output=decodeText(m.generate(context, maxNewTokens=len(newText[i][1]))[0].tolist())
  predictions.append(output)
  references.append(newText[i][1])
  bscore=sentence_bleu(newText[i][1].split(),output.split())
  bleuscores.append(bscore)
bleuscoresnp=np.array(bleuscores)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


KeyboardInterrupt: ignored

In [66]:
losses = estimateLoss()
print("Test Cross Entropy Loss: ",losses['test'])
bleuscoresnp=np.array(bleuscores)
bscore=bertScore.compute(predictions=predictions, references=references, lang="en")
print("Bert Precision: ", np.average(bscore['precision']))
print("Bert Recall: ", np.average(bscore['recall']))
print("Bert F1: ", np.average(bscore['f1']))
print("Bleu Score: ", np.average(bleuscoresnp))
#Bleu Score not valid and a good evaluation metric in our case since it exactly matches each word of both the prediction and references outputs
#Bert Score is a good indicator since it takes into consideration a pretrained bert along with cosine similarity to evaluate the outputs.

Test Cross Entropy Loss:  tensor(1.3355)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Bert Precision:  0.7783421344227262
Bert Recall:  0.8045449636199258
Bert F1:  0.7911773883935177
Bleu Score:  7.1977910126621e-157


In [None]:
data = torch.tensor(encodeText("So last week I started itching real bad, especially in my legs. I started noticing some bruising (and there were a lot of bruises, BIG bruises) where I scratched. I didnt think I was scratching so hard and Ive never bruised like this before whenever Ive had skin problems where I needed to scratch. Now, I found a lump in my upper thigh and its the size of a quarter. I can only notice it when I touch it and it feels like Im pressing against another bruise. However nothing has appeared on the skin which is leading me to believe its beneath the skin. Any ideas on what it could be?"), dtype=torch.long)
context = data.reshape(-1, 1).to(device)
output=decodeText(m.generate(context, maxNewTokens=1000)[0].tolist())