In [1]:
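# Uncomment to install the notebook's dependencies into the kernel's environment:
# %pip install gdown pandas scikit-learn transformers nltk tqdm torch pytorch-lightning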

### Download Data if needed

In [8]:
import os
# Create the local data directory; exist_ok makes this a no-op when it is
# already there, so no separate existence check is needed.
os.makedirs('./data', exist_ok=True)

# Download and unpack the archive only when the extracted training CSV is missing,
# so re-running this cell does not fetch the data again.
if not os.path.exists('./data/yahoo_answers_csv/train.csv'):
    import gdown
    url = 'https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU'
    output = './data/yahoo_answers_csv.tar.gz'
    gdown.download(url, output, quiet=False)
    gdown.extractall(output, './data')

### Load Dataframe

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

# classes.txt is read as one class name per line.
with open('data/yahoo_answers_csv/classes.txt') as f:
    classes = f.read()

# splitlines() plus a blank-line filter keeps every class whether or not the
# file ends with a newline; slicing off the last split element would drop the
# final class when the trailing newline is missing.
class_names = [line for line in classes.splitlines() if line.strip()]

# rev_label_map: integer id -> class name; label_map: class name -> integer id.
rev_label_map = dict(enumerate(class_names))
label_map = {name: idx for idx, name in rev_label_map.items()}

def dataframe_process(df):
    """Collapse the question/answer columns into one text column and shift labels.

    Missing values are replaced with empty strings, then 'Question',
    'Question Desc' and 'Answers' are joined into a single 'Text' column of the
    form 'Q. <question> <description> A. <answer>'. The three source columns are
    dropped and 'Label' is decreased by one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'Label', 'Question', 'Question Desc', 'Answers'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding 'Label' (shifted by -1) and 'Text'.
    """
    filled = df.fillna('')
    text = 'Q. ' + filled['Question'] + ' ' + filled['Question Desc'] + ' A. ' + filled['Answers']
    return (
        filled
        .drop(columns=['Question', 'Question Desc', 'Answers'])
        .assign(Text=text, Label=filled['Label'] - 1)
    )

# Column names for the header-less train/test CSV files.
COLUMN_NAMES = ['Label', 'Question', 'Question Desc', 'Answers']
# Only this many rows are read from the start of train.csv.
TRAIN_ROWS = 600000
# Fixed seed so the train/validation split is the same on every run.
SPLIT_SEED = 42

df = pd.read_csv("./data/yahoo_answers_csv/train.csv", header=None, names=COLUMN_NAMES, nrows=TRAIN_ROWS)
df = dataframe_process(df)

test = pd.read_csv("./data/yahoo_answers_csv/test.csv", header=None, names=COLUMN_NAMES)
test = dataframe_process(test)
X_test, y_test = list(test['Text']), list(test['Label'])

# Hold out 20% of the loaded training rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    list(df['Text']),
    list(df['Label']),
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# The frames are no longer needed once the plain lists exist; drop them to free memory.
del test
del df

### Modelling

In [10]:
from utils import device, train, evaluate, save_checkpoint, HANDataset
import pytorch_lightning as pl
from plmodel import HierarchicalAttentionNetwork, Preprocessor
from transformers import AutoTokenizer, AutoModel
from nltk.tokenize import PunktSentenceTokenizer
from tqdm import tqdm
import torch

In [11]:
pretrained_embedding_model = 'distilroberta-base'

# Keep only the input-embedding layer of the pretrained model.
embedding_layer = AutoModel.from_pretrained(pretrained_embedding_model).get_input_embeddings()

# The preprocessor is built from an NLTK Punkt tokenizer and the fast tokenizer
# that belongs to the same pretrained checkpoint.
punkt_tokenizer = PunktSentenceTokenizer()
hf_tokenizer = AutoTokenizer.from_pretrained(pretrained_embedding_model, use_fast=True)
pre = Preprocessor(punkt_tokenizer, hf_tokenizer)


def _encode_all(documents):
    """Run `pre.encode_document` over every document, with a tqdm progress bar."""
    return [pre.encode_document(doc) for doc in tqdm(documents)]


# Encode the three splits in the same order as before: train, validation, test.
train_features = _encode_all(X_train)
valid_features = _encode_all(X_valid)
test_features = _encode_all(X_test)

# Pair each split's encoded features with its labels.
train_dataset = HANDataset(train_features, y_train)
valid_dataset = HANDataset(valid_features, y_valid)
test_dataset = HANDataset(test_features, y_test)

100%|██████████| 480000/480000 [07:45<00:00, 1032.22it/s]
100%|██████████| 120000/120000 [01:57<00:00, 1020.06it/s]
100%|██████████| 60000/60000 [01:02<00:00, 963.59it/s] 


### Train

In [12]:
# Training hyperparameters.
# NOTE(review): none of these four values is passed to the model or the trainer
# in the cells visible here — confirm whether they are consumed elsewhere.
batch_size = 64
lr = 1e-3
epochs = 10
model_filename = "model.pth.tar"

# Build the hierarchical attention network on top of the pretrained embedding
# layer; the three datasets are handed to the model itself.
model = HierarchicalAttentionNetwork(
    n_classes=len(rev_label_map),  # one output per entry in the label map
    embedding_layer=embedding_layer,
    embedding_size=768,
    fine_tune_embeddings=False,
    word_rnn_size=50,
    sentence_rnn_size=50,
    word_rnn_layers=1,
    sentence_rnn_layers=1,
    word_att_size=100,  # size of the word-level attention layer (also the size of the word context vector)
    sentence_att_size=100,  # size of the sentence-level attention layer (also the size of the sentence context vector)
    dropout=0.3,
    train_dataset=train_dataset,
    valid_dataset=valid_dataset,
    test_dataset=test_dataset,
)

In [None]:
# Configure the Lightning trainer and fit the model.
# NOTE(review): the hyperparameter cell defines `epochs = 10`, but training is
# capped at 3 epochs here — confirm which value is intended.
trainer = pl.Trainer(
    gpus=1,
    max_epochs=3,
    progress_bar_refresh_rate=20,
)
trainer.fit(model)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type              | Params
---------------------------------------------------------
0 | sentence_attention | SentenceAttention | 38.9 M
1 | fc                 | Linear            | 1.0 K 
2 | dropout            | Dropout           | 0     
---------------------------------------------------------
313 K     Trainable params
38.6 M    Non-trainable params
38.9 M    Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

In [18]:
# Run the trainer's test loop; no model or dataloader is passed explicitly.
# NOTE(review): presumably evaluates on the test_dataset given to the model — confirm in plmodel.
trainer.test()



1

In [15]:
# Debugging aid: list every attribute name of the trainer object.
# NOTE(review): exploratory leftover with a very long output — consider removing it before sharing.
dir(trainer)

['_Trainer__test_given_model',
 '_Trainer__test_using_best_weights',
 '__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_cache_logged_metrics',
 '_default_root_dir',
 '_device_type',
 '_distrib_type',
 '_enable_pl_optimizer',
 '_flatten_dl_only',
 '_get_distributed_sampler',
 '_is_data_prepared',
 '_progress_bar_callback',
 '_reset_eval_dataloader',
 '_reset_result_and_set_hook_fx_name',
 '_state',
 '_weights_save_path',
 '_worker_check',
 'accelerator_backend',
 'accelerator_connector',
 'accumulate_grad_batches',
 'accumulation_scheduler',
 'add_argparse_args',
 'a