In [1]:
# ! pip install gdown pandas scikit-learn transformers nltk tqdm

### Download Data if needed

In [2]:
import os

# Create the local data directory; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs('./data', exist_ok=True)

# Only fetch and unpack the archive when the extracted training CSV is missing.
if not os.path.exists('./data/yahoo_answers_csv/train.csv'):
    # Imported lazily so gdown is only required when a download is needed.
    import gdown
    url = 'https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU'
    output = './data/yahoo_answers_csv.tar.gz'
    gdown.download(url, output, quiet=False)
    gdown.extractall(output, './data')

### Load Dataframe

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# classes.txt lists one class name per line.
with open('data/yahoo_answers_csv/classes.txt', encoding='utf-8') as f:
    classes = f.read()

# splitlines() plus blank-line filtering keeps the final class even when the
# file has no trailing newline (the former `split("\n")[:-1]` dropped it) and
# does not leave a stray '\r' on files with Windows line endings.
class_names = [line.strip() for line in classes.splitlines() if line.strip()]

# rev_label_map: 0-based line index -> class name; label_map: class name -> index.
rev_label_map = dict(enumerate(class_names))
label_map = {name: idx for idx, name in rev_label_map.items()}

def dataframe_process(df):
    """Collapse the question/answer columns into one ``Text`` column.

    Missing values are replaced with empty strings, the three text columns are
    joined as ``'Q. <Question> <Question Desc> A. <Answers>'`` and then dropped,
    and ``Label`` is shifted down by one.

    Args:
        df: DataFrame with columns ``Label``, ``Question``, ``Question Desc``
            and ``Answers``.

    Returns:
        A new DataFrame with the columns ``Label`` and ``Text``; the input
        frame is left untouched.
    """
    filled = df.fillna('')

    question_part = 'Q. ' + filled['Question'] + ' ' + filled['Question Desc']
    answer_part = ' A. ' + filled['Answers']

    without_raw_text = filled.drop(columns=['Question', 'Question Desc', 'Answers'])
    return without_raw_text.assign(
        Label=filled['Label'] - 1,
        Text=question_part + answer_part,
    )

# The CSV files have no header row; these are the column names assigned on load.
CSV_COLUMNS = ['Label', 'Question', 'Question Desc', 'Answers']
# Fixed seed so the train/validation split is identical on every run.
SPLIT_SEED = 42

# Only the first 60000 training rows and 10000 test rows are read.
df = pd.read_csv("./data/yahoo_answers_csv/train.csv", header=None, names=CSV_COLUMNS, nrows=60000)
df = dataframe_process(df)

test = pd.read_csv("./data/yahoo_answers_csv/test.csv", header=None, names=CSV_COLUMNS, nrows=10000)
test = dataframe_process(test)
X_test, y_test = list(test['Text']), list(test['Label'])

# Hold out 20% of the loaded training rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    list(df['Text']),
    list(df['Label']),
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# The frames are no longer needed once the plain lists exist; free the memory.
del test
del df

### Modelling

In [4]:
from utils import device, train, evaluate, save_checkpoint, HANDataset
from plmodel import HierarchicalAttentionNetwork, Preprocessor
from transformers import AutoTokenizer, AutoModel
from nltk.tokenize import PunktSentenceTokenizer
from tqdm import tqdm
import torch

In [5]:
pretrained_embedding_model = 'distilroberta-base'

# The Preprocessor receives an NLTK Punkt sentence tokenizer and the fast
# Hugging Face tokenizer of the pretrained checkpoint.
pre = Preprocessor(
    PunktSentenceTokenizer(),
    AutoTokenizer.from_pretrained(pretrained_embedding_model, use_fast=True),
)


def _encode_corpus(documents):
    """Run ``pre.encode_document`` over every document, with a tqdm progress bar."""
    encoded = []
    for document in tqdm(documents):
        encoded.append(pre.encode_document(document))
    return encoded


# Encode the three splits in the same order as before: train, valid, test.
train_features = _encode_corpus(X_train)
valid_features = _encode_corpus(X_valid)
test_features = _encode_corpus(X_test)

# Pair each split's encoded documents with its labels.
train_dataset = HANDataset(train_features, y_train)
valid_dataset = HANDataset(valid_features, y_valid)
test_dataset = HANDataset(test_features, y_test)

100%|██████████| 48000/48000 [01:00<00:00, 794.46it/s]
100%|██████████| 12000/12000 [00:14<00:00, 809.38it/s]


### Train

In [6]:
# Training hyper-parameters.
batch_size = 64
lr = 1e-3
epochs = 10
model_filename = "model.pth.tar"
seed = 0

# Seed torch before the model is built and the loaders are created so that
# weight initialisation and batch shuffling are repeatable across runs.
torch.manual_seed(seed)

# Only the input-embedding layer of the pretrained transformer is reused.
embedding_layer = AutoModel.from_pretrained(pretrained_embedding_model).get_input_embeddings()

model = HierarchicalAttentionNetwork(n_classes = len(rev_label_map),
                                    embedding_layer = embedding_layer,
                                    embedding_size = 768,  # presumably the embedding width of the checkpoint above; confirm if the checkpoint changes
                                    fine_tune_embeddings = False,
                                    word_rnn_size = 50,
                                    sentence_rnn_size = 50,
                                    word_rnn_layers = 1,
                                    sentence_rnn_layers = 1,
                                    word_att_size = 100, # size of the word-level attention layer (also the size of the word context vector)
                                    sentence_att_size = 100, # size of the sentence-level attention layer (also the size of the sentence context vector)
                                    dropout = 0.3,
                                    # The datasets are also handed to the model constructor; how the
                                    # model uses them is not visible from this notebook.
                                    train_dataset = train_dataset,
                                    valid_dataset = valid_dataset,
                                    test_dataset = test_dataset)

# Optimise only parameters that require gradients.
optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
criterion = torch.nn.CrossEntropyLoss()
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

model.to(device)

for epoch in range(epochs):
    train(train_loader, model, criterion, optimizer, epoch)
    # NOTE: evaluate() is called with the validation loader, so the
    # "TEST ACCURACY" lines in the output are validation accuracy.
    evaluate(valid_loader, model)
    # Checkpointing is currently disabled:
#     save_checkpoint(epoch, model, optimizer, pretrained_embedding_model, rev_label_map, model_filename)



Epoch: 0	Time 35.616	Loss 1.3120	Accuracy 0.556

 * TEST ACCURACY - 64.1 per cent

Epoch: 1	Time 35.424	Loss 1.0539	Accuracy 0.643

 * TEST ACCURACY - 65.9 per cent

Epoch: 2	Time 35.244	Loss 1.0092	Accuracy 0.655

 * TEST ACCURACY - 66.3 per cent

Epoch: 3	Time 35.484	Loss 0.9717	Accuracy 0.667

 * TEST ACCURACY - 66.4 per cent

Epoch: 4	Time 35.200	Loss 0.9491	Accuracy 0.676

 * TEST ACCURACY - 66.8 per cent

Epoch: 5	Time 35.176	Loss 0.9244	Accuracy 0.681

 * TEST ACCURACY - 66.8 per cent

Epoch: 6	Time 35.532	Loss 0.9040	Accuracy 0.688

 * TEST ACCURACY - 66.8 per cent

Epoch: 7	Time 35.977	Loss 0.8807	Accuracy 0.697

 * TEST ACCURACY - 66.9 per cent

Epoch: 8	Time 35.152	Loss 0.8623	Accuracy 0.703

 * TEST ACCURACY - 67.2 per cent

Epoch: 9	Time 35.161	Loss 0.8479	Accuracy 0.706

 * TEST ACCURACY - 66.7 per cent

