In [None]:
import sys

!{sys.executable} -m pip install torch numpy transformers pandas

In [1]:
import pandas as pd

train_data = pd.read_csv("data/train.csv")

train_columns = ['qa_id', 'question_title', 'question_body', 'question_user_name',
       'question_user_page', 'answer', 'answer_user_name', 'answer_user_page',
       'url', 'category', 'host']
X = train_data[train_columns]
y = train_data.drop(train_columns, axis=1)


In [2]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [3]:
from transformers import AutoTokenizer, AutoModel
import numpy as np

MODEL = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
roberta = AutoModel.from_pretrained(MODEL).to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
from torch.utils.data import TensorDataset, DataLoader

MAX_LENGTH_T = 30
MAX_LENGTH_Q = 128 # 128
MAX_LENGTH_A = 128 # 128
MAX_SEQUENCE = 160 # 290

BATCH_SIZE = 40 # 40

category_dict = {
  'LIFE_ARTS': 0,
  'STACKOVERFLOW': 1,
  'TECHNOLOGY': 2,
  'SCIENCE': 3,
  'CULTURE': 4
}

def get_merged_text_plus_categories(X):
  T = X["question_title"].tolist()
  Q = X["question_body"].tolist()
  A = X["answer"].tolist()
  C = X['category'].tolist()
  results = []

  for t,q,a in zip(T,Q,A):
    merged = t.split()[:MAX_LENGTH_T] + ["[SEP]"] + q.split()[:MAX_LENGTH_Q] + ["[SEP]"] + a.split()[:MAX_LENGTH_A]
    results.append(" ".join(i for i in merged))

  X_tmp = [(i,j) for i,j in zip(results, [category_dict[x] for x in C])]
  X_train, X_valid, y_train, y_valid = train_test_split(X_tmp, y, test_size=0.2, random_state=42)

  X_train_text = tokenizer([i[0] for i in X_train], truncation=True, padding='max_length', max_length=MAX_SEQUENCE, return_tensors='pt')
  X_valid_text = tokenizer([i[0] for i in X_valid], truncation=True, padding='max_length', max_length=MAX_SEQUENCE, return_tensors='pt')

  X_train_category = torch.tensor([i[1] for i in X_train]).type(torch.float)
  X_valid_category = torch.tensor([i[1] for i in X_valid]).type(torch.float)

  y_train = torch.tensor(y_train.values).type(torch.float)
  y_valid = torch.tensor(y_valid.values).type(torch.float)

  ids = X_train_text['input_ids']
  attention_mask = X_train_text['attention_mask']

  dataset_train = TensorDataset(ids, attention_mask, X_train_category, y_train)
  dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE)

  ids = X_valid_text['input_ids']
  attention_mask = X_valid_text['attention_mask']

  dataset_valid = TensorDataset(ids, attention_mask, X_valid_category, y_valid)
  dataloader_valid = DataLoader(dataset_valid, batch_size=BATCH_SIZE)

  return dataloader_train, dataloader_valid

def get_merged_T_Q_plus_A_plus_categories(X):
  T = X["question_title"].tolist()
  Q = X["question_body"].tolist()
  A = X["answer"].tolist()
  C = X['category'].tolist()
  results_T_Q = []
  results_A = []

  for t,q in zip(T,Q):
    merged = t.split()[:MAX_LENGTH_T] + ["[SEP]"] + q.split()[:MAX_LENGTH_Q]
    results_T_Q.append(" ".join(i for i in merged))
  
  for i in A:
    a = i.split()[:MAX_LENGTH_A]
    results_A.append(" ".join(j for j in a))

  X_tmp = [(i,j,k) for i,j,k in zip(results_T_Q, results_A, [category_dict[x] for x in C])]
  X_train, X_valid, y_train, y_valid = train_test_split(X_tmp, y, test_size=0.2, random_state=42)

  X_train_T_Q = tokenizer([i[0] for i in X_train], truncation=True, padding='max_length', max_length=MAX_SEQUENCE, return_tensors='pt')
  X_valid_T_Q = tokenizer([i[0] for i in X_valid], truncation=True, padding='max_length', max_length=MAX_SEQUENCE, return_tensors='pt')

  X_train_A = tokenizer([i[1] for i in X_train], truncation=True, padding='max_length', max_length=MAX_LENGTH_A, return_tensors='pt')
  X_valid_A = tokenizer([i[1] for i in X_valid], truncation=True, padding='max_length', max_length=MAX_LENGTH_A, return_tensors='pt')

  X_train_category = torch.tensor([i[2] for i in X_train]).type(torch.float)
  X_valid_category = torch.tensor([i[2] for i in X_valid]).type(torch.float)

  y_train = torch.tensor(y_train.values).type(torch.float)
  y_valid = torch.tensor(y_valid.values).type(torch.float)

  ids_T_Q = X_train_T_Q['input_ids']
  attention_mask_T_Q = X_train_T_Q['attention_mask']
  ids_A = X_train_A['input_ids']
  attention_mask_A = X_train_A['attention_mask']

  dataset_train = TensorDataset(ids_T_Q, attention_mask_T_Q, ids_A, attention_mask_A, X_train_category, y_train)
  dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE)

  ids_T_Q = X_valid_T_Q['input_ids']
  attention_mask_T_Q = X_valid_T_Q['attention_mask']
  ids_A = X_valid_A['input_ids']
  attention_mask_A = X_valid_A['attention_mask']

  dataset_valid = TensorDataset(ids_T_Q, attention_mask_T_Q, ids_A, attention_mask_A, X_valid_category, y_valid)
  dataloader_valid = DataLoader(dataset_valid, batch_size=BATCH_SIZE)

  return dataloader_train, dataloader_valid

In [5]:
from sklearn.model_selection import train_test_split

# dataloader_train, dataloader_valid = get_merged_text_plus_categories(X)

dataloader_train, dataloader_valid = get_merged_T_Q_plus_A_plus_categories(X)

In [6]:
import torch
from torch import nn

class Model_Double_BERT(nn.Module):
    def __init__(self, bert, hidden_size=1024, output_size=30):
        super(Model_Double_BERT, self).__init__()
        self.bert = bert
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(1536, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.low_dropout = nn.Dropout(0.2)
        self.high_dropout = nn.Dropout(0.5)

    def forward(self, input_ids_Q_T, mask_Q_T, input_ids_A, mask_A, category):
        x1 = self.bert(input_ids_Q_T, attention_mask=mask_Q_T)
        x1 = x1.pooler_output
        x2 = self.bert(input_ids_A, attention_mask=mask_A)
        x2 = x2.pooler_output
        x = torch.cat((x1, x2), dim=1)
        x = self.low_dropout(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.high_dropout(x)
        x = self.fc2(x)
        return x

In [7]:
import torch
from torch import nn

class Model(nn.Module):
    def __init__(self, bert, hidden_size=512, output_size=30):
        super(Model, self).__init__()
        self.bert = bert
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.low_dropout = nn.Dropout(0.2)
        self.high_dropout = nn.Dropout(0.5)

    def forward(self, input_ids, mask, category):
        x = self.bert(input_ids, attention_mask=mask)
        x = x.pooler_output
        # x = torch.cat((x, category.unsqueeze(1)), dim=1)
        x = self.low_dropout(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.high_dropout(x)
        x = self.fc2(x)
        return x

In [8]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.train()
    train_loss = 0
    for step, batch in enumerate(dataloader):
        batch = [i.to(device) for i in batch]
        sent_id_T_Q, mask_T_Q, sent_id_A, mask_A, category, y = batch

        pred = model(sent_id_T_Q, mask_T_Q, sent_id_A, mask_A, category)
        loss = loss_fn(pred, y)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step % 50 == 0:
            loss, current = loss.item(), step * len(sent_id_T_Q)
            print(f"loss: {loss :>7f}  [{current:>5d}/{size:>5d}]")
    train_loss /= num_batches
    print(f"Avg train loss: {train_loss:>8f} \n")


In [9]:
from scipy import stats
import numpy as np

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    preds = []
    labels = []
    sigmoid = torch.nn.Sigmoid()
    with torch.no_grad():
        for batch in dataloader:
            batch = [i.to(device) for i in batch]
            sent_id_T_Q, mask_T_Q, sent_id_A, mask_A, category, y = batch
            pred = model(sent_id_T_Q, mask_T_Q, sent_id_A, mask_A, category)
            test_loss += loss_fn(pred, y).item()
            preds.append(pred)
            labels.append(y)
        test_loss /= num_batches
        print(f"Avg test loss: {test_loss:>8f} \n")
        preds, labels = sigmoid(torch.cat(preds)), torch.cat(labels)
        spearman_coef = np.mean([stats.spearmanr(preds[:,i].cpu(),labels[:,i].cpu()).correlation for i in range(preds.shape[1])])
        print(f"Avg spearman coef: {spearman_coef:>8f} \n")

In [10]:
model = Model_Double_BERT(roberta).to(device)

In [11]:
loss_fn = nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [12]:
epochs = 30
for t in range(epochs):
    train(dataloader_train, model, loss_fn, optimizer)
    test(dataloader_valid, model, loss_fn)
print("Done!")

loss: 0.694176  [    0/ 4863]
loss: 0.509614  [ 2000/ 4863]
loss: 0.434722  [ 4000/ 4863]
Avg train loss: 0.516839 

Avg test loss: 0.424644 

Avg spearman coef: 0.149993 

loss: 0.423653  [    0/ 4863]
loss: 0.433708  [ 2000/ 4863]
loss: 0.411865  [ 4000/ 4863]
Avg train loss: 0.423154 

Avg test loss: 0.402801 

Avg spearman coef: 0.230306 

loss: 0.402474  [    0/ 4863]
loss: 0.417902  [ 2000/ 4863]
loss: 0.395803  [ 4000/ 4863]
Avg train loss: 0.407484 

Avg test loss: 0.394826 

Avg spearman coef: 0.280476 

loss: 0.391383  [    0/ 4863]
loss: 0.409117  [ 2000/ 4863]
loss: 0.386159  [ 4000/ 4863]
Avg train loss: 0.398100 

Avg test loss: 0.388806 

Avg spearman coef: 0.303915 

loss: 0.377464  [    0/ 4863]
loss: 0.396554  [ 2000/ 4863]
loss: 0.376496  [ 4000/ 4863]
Avg train loss: 0.389856 

Avg test loss: 0.382841 

Avg spearman coef: 0.322186 

loss: 0.370707  [    0/ 4863]
loss: 0.396235  [ 2000/ 4863]
loss: 0.371710  [ 4000/ 4863]
Avg train loss: 0.382070 

Avg test loss: 0.3

In [None]:
torch.save(model, "model.pth")