In [None]:
!pip install opendatasets
import opendatasets as od
import pandas as pd
from tqdm import tqdm
from torchtext.vocab import GloVe
import torch.nn as nn
import torch
from torch.utils.data import TensorDataset, DataLoader
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

torch.manual_seed(42)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
od.download('https://www.kaggle.com/datasets/jessicali9530/kuc-hackathon-winter-2018')

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: piyushhinduja
Your Kaggle Key: ··········
Downloading kuc-hackathon-winter-2018.zip to ./kuc-hackathon-winter-2018


100%|██████████| 40.7M/40.7M [00:02<00:00, 14.3MB/s]





In [None]:
EMBED_SIZE = 300
# BERT_MODEL = 'prajjwal1/bert-mini'
LEARNING_RATE = 0.01
EPOCHS = 10
glove = GloVe(name='6B', dim=EMBED_SIZE)

kaggle_train = pd.read_csv('/content/kuc-hackathon-winter-2018/drugsComTrain_raw.csv')
kaggle_test = pd.read_csv('/content/kuc-hackathon-winter-2018/drugsComTest_raw.csv')

main_x = list(pd.concat([kaggle_train['review'], kaggle_test['review']], axis=0, ignore_index=True))
main_y = list(pd.concat([kaggle_train['condition'], kaggle_test['condition']], axis=0))

main_x = main_x[:500]
main_y = main_y[:500]

x_train, x_test, y_train, y_test = train_test_split(main_x, main_y, test_size=.21, random_state=0)
x_test, x_val,  y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=0)

vocab = list(Counter(main_y).keys())
i_to_x = {i:vocab[i] for i in range(len(vocab))}
x_to_i = {vocab[i]:i for i in range(len(vocab))}

max_label = x_to_i[vocab[0]]

def create_data(x, y):
  k = 0
  while True:
    if k == len(x):
      break
    if len(x[k].split(' ')) > 500:
      x.pop(k)
      y.pop(k)
      k += 1
      continue
    k += 1
  last_indices = []
  tokens = []
  for i in range(len(x)):
    tokens.append(x[i].split(' '))
    last_indices.append(len(tokens[i])-1)
    pad_len = 500 - len(tokens[i]) % 500
    tokens[i].extend(['[PAD]']*pad_len)

  embeddings = []
  for i in range(len(tokens)):
    sent_embed = []
    for j in range(500):
      sent_embed.append(glove[tokens[i][j].lower()].tolist())
    embeddings.append(sent_embed)

  x_tensor = torch.tensor(embeddings).to(device)
  print(x_tensor.shape)
  y_tensor = torch.tensor([x_to_i[j] for j in y], dtype=torch.long).to(device)
  print(y_tensor.shape)
  last_indices = torch.tensor(last_indices).to(device)
  print(last_indices.shape)
  dataset = TensorDataset(x_tensor, y_tensor, last_indices)
  dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

  return dataloader


train_dataloader = create_data(x_train, y_train)
test_dataloader = create_data(x_test, y_test)
val_dataloader = create_data(x_val, y_val)

torch.Size([395, 500, 300])
torch.Size([395])
torch.Size([395])
torch.Size([52, 500, 300])
torch.Size([52])
torch.Size([52])
torch.Size([53, 500, 300])
torch.Size([53])
torch.Size([53])


In [None]:
LEARNING_RATE = 0.1
VOCAB_SIZE = len(vocab)

class LSTM(nn.Module):
  def __init__(self):
    super().__init__()
    self.lstm = nn.LSTM(input_size=300, hidden_size=300, num_layers=1, batch_first=True)
    self.lin1 = nn.Linear(300, VOCAB_SIZE)

  def forward(self, x, indices):
    out, (h, c) = self.lstm(x)
    out1 = torch.reshape(out, (out.shape[0]*out.shape[1], out.shape[2]))
    out1 = torch.index_select(out1, -2, indices)
    l1 = self.lin1(out1)
    return l1

model = LSTM().to(device)
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
best_model = {'accuracy':-1, 'epoch':-1, 'model':{}, 'optimizer':{}}
for epoch in range(EPOCHS):
  print('Epoch:', epoch+1)
  losses = []
  f1_scores = []
  accuracies = []
  max_label_accuracies = []
  for x, y, indices in tqdm(train_dataloader):
    model.train()
    pred = model(x, indices)
    loss = loss_func(pred, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    pred = torch.max(pred, dim=1, keepdim=True)[1]
    pred = pred.view(pred.shape[0]).to(torch.float32).to(device)
    f1 = f1_score(pred.tolist(), y.tolist(), average='weighted')
    f1_scores.append(f1.item())
    losses.append(loss.item())

    # Accuracy with original predictions
    acc = accuracy_score(pred.tolist(), y.tolist())
    accuracies.append(acc.item())

    # Max label accuracy
    max_label_list = [max_label]*len(y.tolist())
    acc1 = accuracy_score(max_label_list, y.tolist())
    max_label_accuracies.append(acc1.item())


  print('Train Loss: ', sum(losses)/len(losses))
  print('Train F1 score: ', sum(f1_scores)/len(f1_scores))
  print('Train accuracy: ', sum(accuracies)/len(accuracies))
  print('Train Max label accuracy: ', sum(max_label_accuracies)/len(max_label_accuracies))

  val_accuracies = []
  val_f1_scores = []
  val_max_label_acc = []
  val_losses = []
  with torch.no_grad():
    for x, y, indices in tqdm(val_dataloader):
      model.eval()
      pred = model(x, indices)
      loss = loss_func(pred, y)
      pred = torch.max(pred, dim=1, keepdim=True)[1]
      pred = pred.view(pred.shape[0]).to(torch.float32)
      f1 = f1_score(pred.tolist(), y.tolist(), average='weighted')
      val_f1_scores.append(f1.item())
      val_losses.append(loss.item())

      # Accuracy with original predictions
      acc = accuracy_score(pred.tolist(), y.tolist())
      val_accuracies.append(acc.item())

      # Max label accuracy
      max_label_list = [max_label]*len(y.tolist())
      acc1 = accuracy_score(max_label_list, y.tolist())
      val_max_label_acc.append(acc1.item())

    print('Dev Loss: ', sum(val_losses)/len(val_losses))
    print('Dev F1 score: ', sum(val_f1_scores)/len(val_f1_scores))
    print('Dev Accuracy: ', sum(val_accuracies)/len(val_accuracies))
    print('Dev Max label accuracy: ', sum(val_max_label_acc)/len(val_max_label_acc))

  if best_model['accuracy'] < sum(val_accuracies)/len(val_accuracies):
    best_model['accuracy'] = sum(val_accuracies)/len(val_accuracies)
    best_model['epoch'] = epoch+1
    best_model['model'] = model.state_dict()
    best_model['optimizer'] = optimizer.state_dict()

torch.save({
    'accuracy':best_model['accuracy'],
    'epoch':best_model['epoch'],
    'model':best_model['model'],
    'optimizer':best_model['optimizer']
}, './best_model6')


Epoch: 1


100%|██████████| 7/7 [00:00<00:00, 14.30it/s]


Train Loss:  9.101677894592285
Train F1 score:  0.07330733589850792
Train accuracy:  0.05539772727272728
Train Max label accuracy:  0.002232142857142857


100%|██████████| 1/1 [00:00<00:00, 41.91it/s]


Dev Loss:  12.199882507324219
Dev F1 score:  0.25157232704402516
Dev Accuracy:  0.1509433962264151
Dev Max label accuracy:  0.0
Epoch: 2


100%|██████████| 7/7 [00:00<00:00, 19.29it/s]


Train Loss:  11.72543716430664
Train F1 score:  0.053431839144309934
Train accuracy:  0.03125
Train Max label accuracy:  0.002232142857142857


100%|██████████| 1/1 [00:00<00:00, 41.00it/s]


Dev Loss:  14.241783142089844
Dev F1 score:  0.0
Dev Accuracy:  0.0
Dev Max label accuracy:  0.0
Epoch: 3


100%|██████████| 7/7 [00:00<00:00, 19.32it/s]


Train Loss:  10.420822416033063
Train F1 score:  0.1068918535568916
Train accuracy:  0.07589285714285714
Train Max label accuracy:  0.002232142857142857


100%|██████████| 1/1 [00:00<00:00, 46.57it/s]


Dev Loss:  16.42892837524414
Dev F1 score:  0.03689727463312369
Dev Accuracy:  0.018867924528301886
Dev Max label accuracy:  0.0
Epoch: 4


100%|██████████| 7/7 [00:00<00:00, 19.27it/s]


Train Loss:  9.143024103982109
Train F1 score:  0.14068067873175494
Train accuracy:  0.09334415584415585
Train Max label accuracy:  0.002232142857142857


100%|██████████| 1/1 [00:00<00:00, 45.88it/s]


Dev Loss:  12.390040397644043
Dev F1 score:  0.036163522012578615
Dev Accuracy:  0.018867924528301886
Dev Max label accuracy:  0.0
Epoch: 5


100%|██████████| 7/7 [00:00<00:00, 19.07it/s]


Train Loss:  7.7620954513549805
Train F1 score:  0.07970572082800463
Train accuracy:  0.049107142857142856
Train Max label accuracy:  0.002232142857142857


100%|██████████| 1/1 [00:00<00:00, 45.87it/s]


Dev Loss:  11.466385841369629
Dev F1 score:  0.24528301886792453
Dev Accuracy:  0.1509433962264151
Dev Max label accuracy:  0.0
Epoch: 6


100%|██████████| 7/7 [00:00<00:00, 19.07it/s]


Train Loss:  7.045139040265765
Train F1 score:  0.1254818463246133
Train accuracy:  0.078125
Train Max label accuracy:  0.002232142857142857


100%|██████████| 1/1 [00:00<00:00, 41.57it/s]


Dev Loss:  10.266172409057617
Dev F1 score:  0.24407868325973503
Dev Accuracy:  0.1509433962264151
Dev Max label accuracy:  0.0
Epoch: 7


100%|██████████| 7/7 [00:00<00:00, 18.75it/s]


Train Loss:  6.32019989831107
Train F1 score:  0.27041803800715974
Train accuracy:  0.17694805194805197
Train Max label accuracy:  0.002232142857142857


100%|██████████| 1/1 [00:00<00:00, 47.29it/s]


Dev Loss:  8.796109199523926
Dev F1 score:  0.06659267480577136
Dev Accuracy:  0.05660377358490566
Dev Max label accuracy:  0.0
Epoch: 8


100%|██████████| 7/7 [00:00<00:00, 19.40it/s]


Train Loss:  5.974339144570487
Train F1 score:  0.2128441543295802
Train accuracy:  0.13717532467532467
Train Max label accuracy:  0.002232142857142857


100%|██████████| 1/1 [00:00<00:00, 38.43it/s]


Dev Loss:  8.93282413482666
Dev F1 score:  0.28692257644762525
Dev Accuracy:  0.16981132075471697
Dev Max label accuracy:  0.0
Epoch: 9


100%|██████████| 7/7 [00:00<00:00, 18.60it/s]


Train Loss:  5.887031214577811
Train F1 score:  0.14656746712511298
Train accuracy:  0.09375
Train Max label accuracy:  0.002232142857142857


100%|██████████| 1/1 [00:00<00:00, 44.66it/s]


Dev Loss:  8.209282875061035
Dev F1 score:  0.012578616352201257
Dev Accuracy:  0.018867924528301886
Dev Max label accuracy:  0.0
Epoch: 10


100%|██████████| 7/7 [00:00<00:00, 19.05it/s]


Train Loss:  5.57134062903268
Train F1 score:  0.1622804938779002
Train accuracy:  0.10267857142857142
Train Max label accuracy:  0.002232142857142857


100%|██████████| 1/1 [00:00<00:00, 47.56it/s]


Dev Loss:  7.546858787536621
Dev F1 score:  0.2372431715438131
Dev Accuracy:  0.1509433962264151
Dev Max label accuracy:  0.0


In [None]:
model_path = './best_model6'
checkpoint = torch.load(model_path)
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])

test_accuracies = []
test_f1_scores = []
test_losses = []
with torch.no_grad():
  for x, y, indices in tqdm(test_dataloader):
      model.eval()
      pred = model(x, indices)
      loss = loss_func(pred, y)
      pred = torch.max(pred, dim=1, keepdim=True)[1]
      pred = pred.view(pred.shape[0]).to(torch.float32)
      f1 = f1_score(pred.tolist(), y.tolist(), average='weighted')
      val_f1_scores.append(f1.item())
      val_losses.append(loss.item())
      acc = accuracy_score(pred.tolist(), y.tolist())
      val_accuracies.append(acc.item())

  print('Test Loss: ', sum(val_losses)/len(val_losses))
  print('Test F1 score: ', sum(val_f1_scores)/len(val_f1_scores))
  print('Test Accuracy: ', sum(val_accuracies)/len(val_accuracies))