In [None]:
!pip install opendatasets
import copy
from collections import Counter

import opendatasets as od
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
from transformers import AdamW, BertModel, BertTokenizer

# Seed PyTorch's global RNG so repeated runs start from the same random state.
torch.manual_seed(42)
# Use the GPU when PyTorch can see one; otherwise everything stays on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
# Pull the Kaggle dataset into the working directory; opendatasets asks for
# Kaggle credentials interactively when it needs them.
dataset_url = 'https://www.kaggle.com/datasets/jessicali9530/kuc-hackathon-winter-2018'
od.download(dataset_url)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: piyushhinduja
Your Kaggle Key: ··········
Downloading kuc-hackathon-winter-2018.zip to ./kuc-hackathon-winter-2018


100%|██████████| 40.7M/40.7M [00:00<00:00, 74.3MB/s]





In [None]:
# Hugging Face checkpoint name handed to both the tokenizer and the encoder.
BERT_MODEL = 'prajjwal1/bert-mini'
# Step size for the Adam optimizer.
LEARNING_RATE = 1e-4
# Number of full passes over the training split.
EPOCHS = 10
# Not referenced anywhere in the visible part of the notebook.
EMBED_SIZE = 300

# Folder that od.download() creates under the current working directory.
# Reading relative to it (rather than a hard-coded '/content/...' prefix) keeps
# this cell working wherever the download cell was run, not only on Colab.
DATA_DIR = './kuc-hackathon-winter-2018'
# Number of leading reviews to keep so the run stays short; None keeps them all.
N_SAMPLES = 500

kaggle_train = pd.read_csv(f'{DATA_DIR}/drugsComTrain_raw.csv')
kaggle_test = pd.read_csv(f'{DATA_DIR}/drugsComTest_raw.csv')

# Pool both Kaggle files; the notebook makes its own train/test/val split below.
# Inputs are the free-text reviews, targets are the `condition` strings.
main_x = list(pd.concat([kaggle_train['review'], kaggle_test['review']], axis=0, ignore_index=True))
main_y = list(pd.concat([kaggle_train['condition'], kaggle_test['condition']], axis=0, ignore_index=True))

main_x = main_x[:N_SAMPLES]
main_y = main_y[:N_SAMPLES]

# Label vocabulary: distinct conditions in first-seen order, plus lookups in
# both directions (class index <-> condition).
# NOTE(review): rows with a missing `condition` are not filtered out here, so a
# missing value would become a class of its own -- confirm that is intended.
vocab = list(Counter(main_y).keys())
i_to_x = dict(enumerate(vocab))
x_to_i = {label: idx for idx, label in enumerate(vocab)}

# 79% train; the held-out 21% is then cut in half into test and validation.
x_train, x_test, y_train, y_test = train_test_split(main_x, main_y, test_size=.21, random_state=0)
x_test, x_val,  y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=0)

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

# Longest token sequence kept per review (longer reviews are truncated).
MAX_SEQ_LEN = 512
# Mini-batch size shared by all three loaders.
BATCH_SIZE = 64


def _build_split(texts, labels, shuffle):
  """Tokenize one split and wrap it in a TensorDataset / DataLoader.

  Args:
    texts: list of review strings.
    labels: list of condition labels; each must be a key of `x_to_i`.
    shuffle: whether the DataLoader reshuffles the data on every pass.

  Returns:
    (encodings, label_ids, dataset, loader) where `encodings` is the tokenizer
    output moved to `device`, `label_ids` is an int64 tensor of class indices
    on `device`, and `dataset` yields
    (input_ids, attention_mask, token_type_ids, label) tuples.

  Raises:
    KeyError: if a label is not present in `x_to_i`.
  """
  encodings = tokenizer(texts, truncation=True, max_length=MAX_SEQ_LEN, padding=True, return_tensors='pt').to(device)
  # CrossEntropyLoss expects integer class indices, so force int64 for every split.
  label_ids = torch.tensor([x_to_i[label] for label in labels], dtype=torch.long).to(device)
  dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], encodings['token_type_ids'], label_ids)
  loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle)
  return encodings, label_ids, dataset, loader


# Only the training data is shuffled. Shuffling evaluation data adds nothing
# and would draw from the global RNG each time a loader is iterated.
x_train, y_train, train_dataset, train_dataloader = _build_split(x_train, y_train, shuffle=True)
x_test, y_test, test_dataset, test_dataloader = _build_split(x_test, y_test, shuffle=False)
x_val, y_val, val_dataset, val_dataloader = _build_split(x_val, y_val, shuffle=False)

In [None]:
from sklearn.metrics import accuracy_score

class DrugClassifier(nn.Module):
  """Pretrained BERT encoder with a two-layer classification head.

  The head maps BERT's pooled output through Linear -> ReLU -> Linear and
  returns unnormalised class scores (no softmax), which is the input that
  nn.CrossEntropyLoss expects.
  """

  def __init__(self, num_classes=None):
    """Load the `BERT_MODEL` checkpoint and create the head.

    Args:
      num_classes: size of the output layer. Defaults to `len(vocab)`, the
        number of distinct labels collected earlier in the notebook.
    """
    super().__init__()
    if num_classes is None:
      num_classes = len(vocab)
    self.bert = BertModel.from_pretrained(BERT_MODEL)
    self.lin1 = nn.Linear(self.bert.config.hidden_size, 512)
    # Without a non-linearity the two Linear layers collapse into a single
    # affine map, so the 512-unit hidden layer would add no capacity.
    self.act = nn.ReLU()
    self.lin2 = nn.Linear(512, num_classes)

  def forward(self, input_ids, attention_mask, token_type_ids=None):
    """Return class scores for a batch of tokenized reviews.

    Args:
      input_ids: token id tensor produced by the tokenizer.
      attention_mask: mask tensor produced by the tokenizer.
      token_type_ids: optional segment ids; when None, BERT is called without
        them, as the original implementation did.

    Returns:
      Tensor of raw scores with one column per class.
    """
    # With return_dict=False the encoder returns a tuple; only its second
    # element (the pooled output) feeds the head.
    _, pooled_output = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        return_dict=False,
    )
    hidden = self.act(self.lin1(pooled_output))
    return self.lin2(hidden)

# Instantiate the network, then place its parameters on the selected device.
model = DrugClassifier()
model = model.to(device)

# Multi-class cross-entropy on the model's raw scores, optimised with Adam.
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Record of the best validation epoch so far; -1 / {} mean "nothing stored yet".
best_model = dict(accuracy=-1, epoch=-1, model={}, optimizer={})
# Fine-tune for EPOCHS passes over the training split. After each pass the
# model is scored on the validation split, and the epoch with the highest
# validation accuracy is remembered in `best_model`.
# NOTE: accuracy / F1 / loss are means of per-batch values, so a short final
# batch carries the same weight as a full one.
for epoch in range(EPOCHS):
  print('Epoch: ', epoch+1)
  losses = []
  accuracies = []
  f1_scores = []

  model.train()  # set once per pass instead of once per batch
  for input_ids, attention_mask, token_type_ids, labels in tqdm(train_dataloader):
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    out = model(input_ids=input_ids, attention_mask=attention_mask)
    loss = loss_func(out, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Predicted class = index of the highest score; keep it as an integer so
    # it has the same type as the labels it is compared against.
    y_true = labels.tolist()
    y_pred = out.argmax(dim=1).tolist()
    # scikit-learn metrics take (y_true, y_pred); the order matters for the
    # weighted F1 because the class weights come from y_true. float() accepts
    # both NumPy scalars and plain Python floats.
    accuracies.append(float(accuracy_score(y_true, y_pred)))
    f1_scores.append(float(f1_score(y_true, y_pred, average='weighted')))
    losses.append(loss.item())

  print('Train Loss: ', sum(losses)/len(losses))
  print('Train Accuracy: ', sum(accuracies)/len(accuracies))
  print('Train F1 score: ', sum(f1_scores)/len(f1_scores))

  val_accuracies = []
  val_losses = []
  val_f1 = []

  model.eval()
  with torch.no_grad():
    for input_ids, attention_mask, token_type_ids, labels in tqdm(val_dataloader):
      input_ids = input_ids.to(device)
      attention_mask = attention_mask.to(device)
      labels = labels.to(device)

      logits = model(input_ids=input_ids, attention_mask=attention_mask)
      loss = loss_func(logits, labels)

      y_true = labels.tolist()
      y_pred = logits.argmax(dim=1).tolist()
      val_accuracies.append(float(accuracy_score(y_true, y_pred)))
      val_f1.append(float(f1_score(y_true, y_pred, average='weighted')))
      val_losses.append(loss.item())

  val_accuracy = sum(val_accuracies)/len(val_accuracies)
  print('Dev Loss: ', sum(val_losses)/len(val_losses))
  print('Dev Accuracy: ', val_accuracy)
  print('Dev F1 score: ', sum(val_f1)/len(val_f1))

  if best_model['accuracy'] < val_accuracy:
    best_model['accuracy'] = val_accuracy
    best_model['epoch'] = epoch+1
    # state_dict() hands back references to the live tensors. Without a deep
    # copy, later epochs would overwrite the "best" weights in place.
    best_model['model'] = copy.deepcopy(model.state_dict())
    best_model['optimizer'] = copy.deepcopy(optimizer.state_dict())

# Write the remembered best-epoch record (score, epoch number, model and
# optimizer state) to disk as a single checkpoint file.
checkpoint_fields = ('accuracy', 'epoch', 'model', 'optimizer')
torch.save({field: best_model[field] for field in checkpoint_fields}, './best_model6')

In [None]:
# Restore the saved best checkpoint and score it on the held-out test split.
model_path = './best_model6'
# map_location lets a checkpoint written on a GPU be loaded on a CPU-only
# machine (and vice versa).
checkpoint = torch.load(model_path, map_location=device)
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])

test_accuracies = []
test_f1_scores = []
test_losses = []

model.eval()
with torch.no_grad():
  # Iterate the *test* loader: the validation loader was already used to pick
  # the checkpoint, so scoring on it again would not be a held-out estimate.
  for input_ids, attention_mask, token_type_ids, labels in tqdm(test_dataloader):
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    logits = model(input_ids=input_ids, attention_mask=attention_mask)
    loss = loss_func(logits, labels)

    y_true = labels.tolist()
    y_pred = logits.argmax(dim=1).tolist()
    # scikit-learn metrics take (y_true, y_pred); float() accepts both NumPy
    # scalars and plain Python floats.
    test_accuracies.append(float(accuracy_score(y_true, y_pred)))
    test_f1_scores.append(float(f1_score(y_true, y_pred, average='weighted')))
    test_losses.append(loss.item())

# Means of the per-batch values collected above.
print('Test Loss: ', sum(test_losses)/len(test_losses))
print('Test F1 score: ', sum(test_f1_scores)/len(test_f1_scores))
print('Test Accuracy: ', sum(test_accuracies)/len(test_accuracies))