### Download Data if needed

In [1]:
import os
# Create the local data directory. exist_ok=True makes this idempotent, so the
# cell can be re-run safely without a separate (racy) existence check.
os.makedirs('./data', exist_ok=True)

# Only fetch and unpack the archive when the training CSV is not already on disk.
if not os.path.exists('./data/yahoo_answers_csv/train.csv'):
    # gdown is imported lazily: it is only needed when a download is required.
    import gdown
    url = 'https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU'
    output = './data/yahoo_answers_csv.tar.gz'
    gdown.download(url, output, quiet=False)
    # Unpack the archive into ./data (the check above expects it to yield
    # ./data/yahoo_answers_csv/train.csv).
    gdown.extractall(output, './data')

### Load Dataframe

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the class names; the file is treated as one class name per line, and the
# line position becomes the zero-based label id.
with open('data/yahoo_answers_csv/classes.txt') as f:
    classes = f.read()

# splitlines() plus blank-line filtering works whether or not the file ends with
# a newline. (Blindly dropping the last element of split("\n") would lose the
# final class when the trailing newline is missing.)
rev_label_map = dict(enumerate(line for line in classes.splitlines() if line.strip()))
# Inverse lookup: class name -> label id.
label_map = {name: idx for idx, name in rev_label_map.items()}

def dataframe_process(df):
    """Collapse the question/answer columns into one 'Text' column.

    Missing values are replaced with empty strings, the three text columns
    ('Question', 'Question Desc', 'Answers') are concatenated into 'Text' and
    then dropped, and 'Label' is shifted down by one.

    Args:
        df: DataFrame with columns 'Label', 'Question', 'Question Desc'
            and 'Answers'.

    Returns:
        A new DataFrame with the columns 'Label' and 'Text'; the input frame
        is left unmodified.
    """
    cleaned = df.fillna('')
    text = 'Q. ' + cleaned['Question'] + ' ' + cleaned['Question Desc'] + ' A. ' + cleaned['Answers']
    return (
        cleaned
        .assign(Text=text, Label=cleaned['Label'] - 1)
        .drop(columns=['Question', 'Question Desc', 'Answers'])
    )

# The CSV files carry no header row, so the column names are supplied here.
COLUMN_NAMES = ['Label', 'Question', 'Question Desc', 'Answers']
# Fixed seed so the train/validation split is identical on every run.
SPLIT_SEED = 42

df = pd.read_csv("./data/yahoo_answers_csv/train.csv", header=None, names=COLUMN_NAMES)
df = dataframe_process(df)

test = pd.read_csv("./data/yahoo_answers_csv/test.csv", header=None, names=COLUMN_NAMES)
test = dataframe_process(test)
X_test, y_test = list(test['Text']), list(test['Label'])

# Hold out 10% of the training file for validation. Without random_state the
# split would be reshuffled differently on every execution, making validation
# metrics incomparable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    list(df['Text']),
    list(df['Label']),
    test_size=0.1,
    random_state=SPLIT_SEED,
)

### Modelling

In [3]:
from utils import HANDataset
import pytorch_lightning as pl
from model import HierarchicalAttentionNetwork, Preprocessor
from transformers import AutoTokenizer, AutoModel
from nltk.tokenize import PunktSentenceTokenizer
from tqdm import tqdm
import torch

In [4]:
pretrained_embedding_model = 'distilroberta-base'

# Only the input-embedding module of the pretrained model is kept; the rest of
# the loaded model is not referenced again.
embedding_layer = AutoModel.from_pretrained(pretrained_embedding_model).get_input_embeddings()

# Preprocessor is project-local (model.py); it is given a sentence tokenizer and
# the fast tokenizer that matches the embedding model.
pre = Preprocessor(
    PunktSentenceTokenizer(),
    AutoTokenizer.from_pretrained(pretrained_embedding_model, use_fast=True),
)


def _encode_all(documents):
    """Encode every document with `pre`, showing a tqdm progress bar."""
    return [pre.encode_document(doc) for doc in tqdm(documents)]


# Encode all three splits first, then wrap each with its labels.
train_features = _encode_all(X_train)
val_features = _encode_all(X_val)
test_features = _encode_all(X_test)

train_dataset = HANDataset(train_features, y_train)
val_dataset = HANDataset(val_features, y_val)
test_dataset = HANDataset(test_features, y_test)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=480.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=331070498.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…

  0%|          | 0/1260000 [00:00<?, ?it/s]




100%|██████████| 1260000/1260000 [46:01<00:00, 456.24it/s] 
100%|██████████| 140000/140000 [05:08<00:00, 453.84it/s]
100%|██████████| 60000/60000 [02:12<00:00, 451.68it/s]


### Train

In [5]:
model_filename = "model.pth.tar"

# Build the hierarchical attention classifier on top of the pretrained
# embedding layer. The datasets are handed to the model itself.
model = HierarchicalAttentionNetwork(n_classes = len(rev_label_map),
                                    embedding_layer = embedding_layer,
                                    # Read the width from the layer instead of hard-coding 768, so the
                                    # value stays correct if the pretrained embedding model is changed.
                                    embedding_size = embedding_layer.embedding_dim,
                                    fine_tune_embeddings = False,
                                    word_rnn_size = 50,
                                    sentence_rnn_size = 50,
                                    word_rnn_layers = 1,
                                    sentence_rnn_layers = 1,
                                    word_att_size = 100, # size of the word-level attention layer (also the size of the word context vector)
                                    sentence_att_size = 100, # size of the sentence-level attention layer (also the size of the sentence context vector)
                                    dropout = 0.3,
                                    train_dataset = train_dataset,
                                    valid_dataset = val_dataset,
                                    test_dataset = test_dataset,
                                    batch_size = 256,
                                    lr = 1e-3)



In [6]:
# Use one GPU when CUDA is available and fall back to CPU otherwise, so the
# cell does not fail on a machine without a GPU.
n_gpus = 1 if torch.cuda.is_available() else 0
trainer = pl.Trainer(gpus=n_gpus, max_epochs=10, progress_bar_refresh_rate=100)
trainer.fit(model)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type              | Params
---------------------------------------------------------
0 | sentence_attention | SentenceAttention | 38.9 M
1 | fc                 | Linear            | 1.0 K 
2 | dropout            | Dropout           | 0     
---------------------------------------------------------
313 K     Trainable params
38.6 M    Non-trainable params
38.9 M    Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




1

In [7]:
# Evaluate on the test split; the returned metrics are the cell's output.
# NOTE(review): called without arguments, so the Lightning version in use decides
# which weights are evaluated (final vs. best checkpoint) — confirm.
trainer.test()



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': tensor(0.7409, device='cuda:0'),
 'test_loss': tensor(0.7983, device='cuda:0')}
--------------------------------------------------------------------------------


[{'test_loss': 0.7982543110847473, 'test_acc': 0.7408833503723145}]

In [23]:
# Key of the embedding weights inside the model's state dict.
EMBEDDING_WEIGHT_KEY = 'sentence_attention.word_attention.embeddings.weight'

state_dict = model.state_dict()
# Drop the embedding matrix before saving. The model was built with
# fine_tune_embeddings=False, so presumably these weights can be restored from
# the pretrained checkpoint — verify when loading. pop() without a default
# still raises KeyError if the key is missing.
state_dict.pop(EMBEDDING_WEIGHT_KEY)
torch.save(state_dict, 'model.pth')