In [1]:
import os
import numpy as np

import pandas as pd
import tensorflow as tf

from transformers import BertTokenizer, RobertaTokenizer
from Trainer import Trainer
from preprocessing import dataPreprocessor
from parameters import *

from pathlib import Path

import horovod.tensorflow as hvd

In [2]:
        # Initialise Horovod, then pin this process to the single GPU selected
        # by its Horovod local rank.
        hvd.init()
        gpus = tf.config.list_physical_devices('GPU')
        if gpus:
            # Enable memory growth on every physical GPU before narrowing the
            # set of visible devices.
            for gpu in gpus:
                print(gpu)
                tf.config.experimental.set_memory_growth(gpu, True)
            print("gpus ", gpus)
            local_rank = hvd.local_rank()
            print("local rank ", local_rank)
            # NOTE(review): indexing by local rank assumes there are at least as
            # many GPUs on the node as local processes - confirm for the launch setup.
            tf.config.set_visible_devices(gpus[local_rank], 'GPU')
            print(tf.config.get_visible_devices())

In [3]:
# Directory holding the competition CSVs plus the extra StackExchange dump.
data_dir = "google-quest-challenge/"

# Read the four CSV files in a fixed order and bind each to its own frame.
# NOTE(review): the preview of stack_df shows an 'Unnamed: 0' column, which
# suggests the file was written with its index; consider index_col=0 once the
# downstream consumers of stack_df are confirmed not to rely on that column.
train_df, test_df, submit_df, stack_df = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in (
        "train.csv",
        "test.csv",
        "sample_submission.csv",
        "stackexchange_data.csv",
    )
)
stack_df.head()

Unnamed: 0,id,host,question_username,question_score,question_views,question_favs,answers_count,answers_max_score,answers_mean_score,question_title,question_body,answer_username,answer,answer_score,is_answer_accepted
0,1,3dprinting.stackexchange.com,Adam Davis,3.76781,3.912607,0.0,3.18806,3.749319,3.431946,How to obtain high resolution prints in a shor...,When I've printed an object I've had to choose...,hroncok,You could experiment with slicing. For example...,3.213366,2.07692
1,1,3dprinting.stackexchange.com,Adam Davis,3.76781,3.912607,0.0,3.18806,3.749319,3.431946,How to obtain high resolution prints in a shor...,When I've printed an object I've had to choose...,plaintoothpaste,For FDM technologies in general with a single ...,1.606683,0.0
2,4,3dprinting.stackexchange.com,Adam Davis,4.541637,3.921554,2.779972,4.670423,3.320479,3.431946,Are there any metals that exhibit a large glas...,Plastic is used in 3D FDM/FFF printing partly ...,TextGeek,"I""m no expert on this, but the article at http...",2.620387,0.0
3,4,3dprinting.stackexchange.com,Adam Davis,4.541637,3.921554,2.779972,4.670423,3.320479,3.431946,Are there any metals that exhibit a large glas...,Plastic is used in 3D FDM/FFF printing partly ...,Ryan Carlyle,A few things are required for effective extrus...,2.845828,2.07692
4,2,3dprinting.stackexchange.com,kenorb,5.291027,5.526324,3.507934,5.199501,5.195138,4.521993,Is 3D printing safe for your health?,"I would like to buy a 3D printer, but I'm conc...",Tom van der Zanden,There is very little information about safety ...,4.452511,2.07692


In [4]:
# Split the training frame into its input columns and its label columns.
# train_columns / target_columns are not defined in this notebook; presumably
# they come from the star import of `parameters` - verify there.
train_X_df, train_targets_df = (
    train_df[column_group] for column_group in (train_columns, target_columns)
)

In [5]:
# The three text fields are pulled, in the same order, from both the labelled
# training frame and the StackExchange frame.
_text_fields = ('question_title', 'question_body', 'answer')

q_title, q_body, answer = (train_X_df[field].values for field in _text_fields)

# Only the first 30 rows of labels are kept; the preprocessing cell further
# down slices its inputs with the same 30.
targets = train_targets_df.to_numpy()[:30]

stack_q_title, stack_q_body, stack_answer = (
    stack_df[field].values for field in _text_fields
)

In [6]:
# RoBERTa is the active tokenizer. The BERT alternative that was tried here is
# BertTokenizer.from_pretrained('bert-base-uncased').
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Configure the shared preprocessor through attributes set on dataPreprocessor
# itself, so every later preprocessBatch call sees them.
dataPreprocessor.tokenizer = tokenizer
dataPreprocessor.model = "Roberta"
# presumably turns off the preprocessor's own logging - TODO confirm in preprocessing.py
dataPreprocessor.logger = False

In [7]:
# Take the row count from `targets` instead of repeating a hard-coded 30, so the
# model inputs and the labels cannot drift to different lengths if the subset
# size is changed where `targets` is built.
n_labeled_rows = len(targets)

# Arguments are passed as (body, title, answer), exactly as before.
# NOTE(review): the four max_seq_lengths values are forwarded unchanged; their
# per-field meaning is defined by dataPreprocessor.preprocessBatch - confirm there.
preprocessedInput = dataPreprocessor.preprocessBatch(
    q_body[:n_labeled_rows],
    q_title[:n_labeled_rows],
    answer[:n_labeled_rows],
    max_seq_lengths=(26, 260, 210, 500),
)

Token indices sequence length is longer than the specified maximum sequence length for this model (881 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (898 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2900 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (848 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (976 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

In [8]:
# Preprocess the first 40 StackExchange rows with the same sequence-length
# budget used for the labelled data. The later pseudo-labelling call slices
# stack_df with the same 40.
_pseudo_window = slice(None, 40)
preprocessedStack = dataPreprocessor.preprocessBatch(
    stack_q_body[_pseudo_window],
    stack_q_title[_pseudo_window],
    stack_answer[_pseudo_window],
    max_seq_lengths=(26, 260, 210, 500),
)

Token indices sequence length is longer than the specified maximum sequence length for this model (800 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (646 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (770 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (958 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (613 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

In [9]:
model_name = "RoBERTaForQALabeling"

# Store the tokenizer files in a per-model sub-directory of save_dir.
# save_dir is not defined in this notebook; presumably it comes from the star
# import of `parameters` - verify there.
tokenizer_subdir = f"{model_name}_tokenizer_data"
checkpoint_dir = os.path.join(save_dir, tokenizer_subdir)

# Create the directory (and any missing parents); an existing one is fine.
Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
print("saving tokenizer in ", checkpoint_dir)
tokenizer.save_pretrained(checkpoint_dir)

saving tokenizer in  ./checkpoints/RoBERTaForQALabeling_tokenizer_data


('./checkpoints/RoBERTaForQALabeling_tokenizer_data/vocab.json',
 './checkpoints/RoBERTaForQALabeling_tokenizer_data/merges.txt',
 './checkpoints/RoBERTaForQALabeling_tokenizer_data/special_tokens_map.json',
 './checkpoints/RoBERTaForQALabeling_tokenizer_data/added_tokens.json')

In [10]:
# Run training on the labelled subset; the logged output shows the trainer
# iterating over folds and saving a checkpoint per fold.
# NOTE(review): how preprocessedPseudo is used during training is decided
# inside Trainer.train - confirm there.
train_kwargs = {
    "model_name": model_name,
    "preprocessedInput": preprocessedInput,
    "targets": targets,
    "preprocessedPseudo": preprocessedStack,
}
Trainer.train(**train_kwargs)

Fold0/2 
loaded model  RoBERTaForQALabeling


  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


epoch 0 train loss 0.17030954360961914 test loss 0.17202387750148773 test metric 0.007710277389597416 train metric 0.09657178126892761
model for RoBERTaForQALabeling saved under ./checkpoints/RoBERTaForQALabeling_fold_0... 
epoch 1 train loss 0.16902130842208862 test loss 0.15670667588710785 test metric 0.017847665503889113 train metric 0.005440003542950807
model for RoBERTaForQALabeling saved under ./checkpoints/RoBERTaForQALabeling_fold_0... 
Fold1/2 
loaded model  RoBERTaForQALabeling


  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


epoch 0 train loss 0.18538109958171844 test loss 0.1830405741930008 test metric 0.012425756392580475 train metric 0.012675132175222304
model for RoBERTaForQALabeling saved under ./checkpoints/RoBERTaForQALabeling_fold_1... 
epoch 1 train loss 0.17696423828601837 test loss 0.1664683073759079 test metric 0.0030140600185576526 train metric 0.010507294480287152


In [None]:
# Pseudo-label the StackExchange rows. The frame is cut to its first 40 rows,
# the same count used when preprocessedStack was built.
pseudo_rows = stack_df[:40]
Trainer.pseudo_predict(
    model_name=model_name,
    preprocessedPseudo=preprocessedStack,
    pseudo_df=pseudo_rows,
)

Fold0/2 
best checkpoint for fold 0 restored from ./checkpoints/RoBERTaForQALabeling_fold_0 ...
creating pseudo-labels...
