In [1]:
!pip install -q transformers

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaModel
from sklearn.model_selection import train_test_split


In [4]:
def OHE_split(split = False,reset_index = False):

    data = pd.read_csv("/content/drive/MyDrive/BERT/mtsamples.csv")
    data.drop(['Unnamed: 0'],axis=1,inplace=True)


    counts = data['medical_specialty'].value_counts()
    others = [k for k,v in counts.items() if v<100]
    for each_spec in others:
        data.loc[data['medical_specialty']==each_spec,'medical_specialty']=' others'

    counts = data['medical_specialty'].value_counts()
    # print(counts)

    num_classes = len(data['medical_specialty'].unique())
    class_dict = dict(zip(data['medical_specialty'].unique(),list(range(num_classes))))
    data['medical_specialty'] = data['medical_specialty'].apply(lambda x:class_dict[x])
    data.dropna(inplace=True)
    data['transcription'] = data['keywords']+data['transcription']
    X = data[['transcription']]
    y = data[["medical_specialty"]]
    # df.head()
    # df = pd.DataFrame(X,y)
    df = pd.concat([X,y],axis=1)
    df = df.rename(columns={"medical_specialty":"list"})



    if split:
        train_val, test = train_test_split(df, test_size=0.1, random_state=42)
        train, val = train_test_split(train_val, test_size=0.1111, random_state=42)

        if reset_index:
            train = train.reset_index(drop=True)
            test = test.reset_index(drop=True)
            val = val.reset_index(drop=True)
            return train, test, val    # ***** returns tuple ***********

        else:
            return train, test, val    # ***** returns tuple ***********

    else:
        return df

In [5]:
df = OHE_split(split=False)

In [6]:
df

Unnamed: 0,transcription,list
0,"allergy / immunology, allergic rhinitis, aller...",0
1,"bariatrics, laparoscopic gastric bypass, weigh...",0
2,"bariatrics, laparoscopic gastric bypass, heart...",0
3,"cardiovascular / pulmonary, 2-d m-mode, dopple...",1
4,"cardiovascular / pulmonary, 2-d, doppler, echo...",1
...,...,...
4984,"bariatrics, laparoscopic gastric bypass, gastr...",0
4985,"bariatrics, jenny craig, medifast, nutrisystem...",0
4989,"bariatrics, elective surgical weight loss, sur...",0
4993,"allergy / immunology, chronic glossitis, xeros...",0


#### 65.103% of Data is less than 512 words

In [7]:
sum(df['transcription'].str.split().str.len() < 512)/len(df) *100

69.85633658286301

In [8]:
train,test,val= OHE_split(split=True,reset_index=True)
print(train.shape)
print(test.shape)
print(val.shape)

(3118, 2)
(390, 2)
(390, 2)


In [9]:
val

Unnamed: 0,transcription,list
0,"obstetrics / gynecology, low transverse cervic...",9
1,"pain management, acromioclavicular joint injec...",0
2,"pediatrics - neonatal, flexible bronchoscopy, ...",0
3,"urology, urethra, dmso, bladder, chronic inter...",3
4,"pediatrics - neonatal, open chest, stage 1 nor...",0
...,...,...
385,"radiology, lateral plantar cutaneous, plantar ...",7
386,"surgery, distal radius, c-arm, depo-medrol, fr...",5
387,"discharge summary, pneumonia, hypoxemia, hypot...",11
388,"surgery, standard judkins, french pigtail, sel...",5


In [10]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
TEST_BATCH_SIZE = 4
EPOCHS = 4
LEARNING_RATE = 1e-05
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [11]:
tokenizer("hi")

{'input_ids': [0, 3592, 2], 'attention_mask': [1, 1, 1]}

In [12]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = self.data['transcription']
        # print(self.comment_text[1332])
        self.targets = self.data['list']
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer(
            comment_text,
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
            # return_token_type_ids=True

        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        # token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            # 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [13]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }


val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }


In [14]:
def load_data(train,test,val):

    training_set = CustomDataset(train, tokenizer, MAX_LEN)
    testing_set = CustomDataset(test, tokenizer, MAX_LEN)
    val_set = CustomDataset(val, tokenizer, MAX_LEN)

    training_loader = DataLoader(training_set, **train_params)
    testing_loader = DataLoader(testing_set, **test_params)
    val_loader = DataLoader(val_set, **test_params)

    return training_loader,testing_loader,val_loader


In [15]:
train_loader,test_loader,val_loader = load_data(train,test,val)

In [16]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [17]:
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = model = RobertaModel.from_pretrained("roberta-base")
        # self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 15)

    def forward(self, ids, mask):
        _, output_1= self.l1(ids, attention_mask = mask, return_dict=False)
        # output_2 = self.l2(output_1)
        output = self.l3(output_1)
        return output

model = RobertaClass()
model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [18]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [19]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

In [20]:
def train(epoch):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        # token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)

        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if _%100==0:
            print(f'Epoch: {epoch},Batch: {_} Loss:  {loss.item()}')



In [21]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0,Batch: 0 Loss:  2.716672420501709
Epoch: 0,Batch: 100 Loss:  2.2072830200195312
Epoch: 0,Batch: 200 Loss:  0.6414092183113098
Epoch: 0,Batch: 300 Loss:  0.24426600337028503
Epoch: 1,Batch: 0 Loss:  0.45002481341362
Epoch: 1,Batch: 100 Loss:  0.05275079607963562
Epoch: 1,Batch: 200 Loss:  0.047769706696271896
Epoch: 1,Batch: 300 Loss:  0.03035772033035755
Epoch: 2,Batch: 0 Loss:  0.0320909358561039
Epoch: 2,Batch: 100 Loss:  0.01888131909072399
Epoch: 2,Batch: 200 Loss:  0.044653210788965225
Epoch: 2,Batch: 300 Loss:  0.01940220408141613
Epoch: 3,Batch: 0 Loss:  0.015579639934003353
Epoch: 3,Batch: 100 Loss:  0.015502944588661194
Epoch: 3,Batch: 200 Loss:  0.01821301504969597
Epoch: 3,Batch: 300 Loss:  0.03704221919178963


In [22]:
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(val_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            # token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [23]:
for epoch in range(EPOCHS):

    outputs, targets = validation()
    out_array = np.array(outputs)
    tar_array = np.array(targets)

    max_tar_indices = np.zeros_like(out_array)
    max_tar_indices[np.arange(tar_array.shape[0]),tar_array] = 1
    max_tar_indices = max_tar_indices.tolist()

    max_value_indices = np.zeros_like(out_array)
    max_indices = np.argmax(out_array, axis=1)
    max_value_indices[np.arange(out_array.shape[0]), max_indices] = 1
    max_value_indices = max_value_indices.tolist()


    acuracy = sum(max_indices == tar_array)/len(tar_array)
    print("accuracy",acuracy)

    f1_score = metrics.f1_score(max_tar_indices,max_value_indices,average="micro",)

    print("f1_Score",f1_score)

accuracy 0.9846153846153847
f1_Score 0.9846153846153847
accuracy 0.9846153846153847
f1_Score 0.9846153846153847
accuracy 0.9846153846153847
f1_Score 0.9846153846153847
accuracy 0.9846153846153847
f1_Score 0.9846153846153847
