In [23]:
import torch
import re
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
import csv
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import random
import time
import os
from collections import Counter
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader
from keras.preprocessing.sequence import pad_sequences
# coding: utf-8
import torch
from transformers import AutoTokenizer, AutoModel
from transformers import BertModel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [24]:
class TitleCategorization:
    """Pipeline that trains a news-title classifier and writes test predictions.

    Typical call order (as used by the driver cell): construct, then
    ``shuffleData`` -> ``prepareData`` -> ``labelToNum`` ->
    ``splitTrainValidation`` -> ``train`` -> ``getTestResult``.

    The methods rely on module-level names defined elsewhere in the notebook
    (``bert_tokenizer``, ``label_pipeline``, ``text_pipeline``,
    ``collate_batch``, ``collate_batch_test``, ``TextClassificationModel``,
    ``vocab``, ``device`` and the hyperparameter constants); those are looked
    up at call time.
    """

    # Category names; the position of a name is its numeric class id.
    CATEGORIES = ('sport', 'entertainment', 'politics', 'tech', 'business')

    def __init__(self, data_dir='news_data'):
        """Load ``train.csv`` and ``test.csv`` from ``data_dir``."""
        self.readFile(data_dir)

    @staticmethod
    def _read_rows(path):
        """Return the non-empty data rows of a CSV file, skipping its header."""
        # newline='' is what the csv module expects so that quoted fields
        # containing line breaks are parsed correctly.
        with open(path, encoding='utf-8', newline='') as file:
            rows = csv.reader(file)
            next(rows, None)  # remove header (tolerates an empty file)
            # Blank lines come back as [] and would break later indexing.
            return [row for row in rows if row]

    def readFile(self, data_dir='news_data'):
        """Read the raw CSV rows into ``self.train_data`` / ``self.test_data``.

        Training rows have their first column dropped (presumably an id --
        confirm against the data), leaving ``[label, title, ...]``.
        Test rows are kept whole; the title is read from column 1 later.
        """
        train_rows = self._read_rows(os.path.join(data_dir, 'train.csv'))
        self.train_data = [row[1:] for row in train_rows]
        self.test_data = self._read_rows(os.path.join(data_dir, 'test.csv'))

    def labelToNum(self):
        """Replace category names in ``self.train_label`` by their class ids.

        Labels that are not one of ``CATEGORIES`` are left unchanged.
        """
        label_ids = {name: class_id for class_id, name in enumerate(self.CATEGORIES)}
        # Slice assignment keeps the same list object, like the in-place original.
        self.train_label[:] = [label_ids.get(label, label) for label in self.train_label]

    def shuffleData(self):
        """Shuffle the raw training rows in place."""
        random.shuffle(self.train_data)

    @staticmethod
    def _tokenize_title(title):
        """Lower-case a title and tokenize it wrapped in [CLS] ... [SEP]."""
        return bert_tokenizer.tokenize('[CLS]' + title.lower() + '[SEP]')

    def prepareData(self):
        """Split raw rows into labels and tokenized titles.

        Sets ``self.train_label`` (raw label strings), ``self.train_title``
        and ``self.test_title`` (lists of token lists). No stop-word
        filtering is applied.
        """
        self.train_label = [row[0] for row in self.train_data]
        self.train_title = [self._tokenize_title(row[1]) for row in self.train_data]
        self.test_title = [self._tokenize_title(row[1]) for row in self.test_data]

    def splitTrainValidation(self, rate):
        """Move the last ``rate`` fraction of the training set into validation.

        Args:
            rate: fraction of examples used for validation. The resulting
                count is clamped to ``[0, len(train)]``.
        """
        total = len(self.train_title)
        validation_num = min(max(int(total * rate), 0), total)
        # Split by an explicit index: slicing with -0 would put *everything*
        # into the validation set when validation_num is 0.
        split = total - validation_num
        self.valid_label = self.train_label[split:]
        self.valid_title = self.train_title[split:]
        self.train_label = self.train_label[:split]
        self.train_title = self.train_title[:split]

    @staticmethod
    def _flatten_examples(titles, labels):
        """Encode examples as (labels, concatenated token ids, start offsets)."""
        label_ids = []
        encoded = []
        lengths = [0]
        for title, label in zip(titles, labels):
            label_ids.append(label_pipeline(label))
            ids = torch.tensor(text_pipeline(title), dtype=torch.int64)
            lengths.append(ids.size(0))
            encoded.append(ids)
        # The cumulative sum of the preceding lengths is each title's start index.
        offsets = torch.tensor(lengths[:-1], dtype=torch.int64).cumsum(dim=0)
        label_tensor = torch.tensor(label_ids, dtype=torch.int64)
        # torch.cat rejects an empty list, so handle the no-example case explicitly.
        text_tensor = torch.cat(encoded) if encoded else torch.empty(0, dtype=torch.int64)
        return label_tensor, text_tensor, offsets

    def toTensor(self):
        """Build flat (EmbeddingBag-style) tensors for the train and validation sets.

        Sets ``self.{train,valid}_label_list``, ``self.{train,valid}_text_list``
        (1-D concatenation of all token ids) and ``self.{train,valid}_offset``
        (start index of every title inside the concatenation).
        """
        (self.train_label_list,
         self.train_text_list,
         self.train_offset) = self._flatten_examples(self.train_title, self.train_label)
        (self.valid_label_list,
         self.valid_text_list,
         self.valid_offset) = self._flatten_examples(self.valid_title, self.valid_label)

    def evaluate(self, dataloader):
        """Return the accuracy of ``self.model`` over ``dataloader``.

        Args:
            dataloader: iterable of ``(label, tokens, masks)`` batches.

        Returns:
            Fraction of correctly classified examples, or ``0.0`` when the
            loader yields no examples.
        """
        self.model.eval()
        total_acc, total_count = 0, 0

        with torch.no_grad():
            for label, tokens, masks in dataloader:
                predicted_label = self.model(tokens, masks)
                total_acc += (predicted_label.argmax(1) == label).sum().item()
                total_count += label.size(0)
        if total_count == 0:
            return 0.0
        return total_acc / total_count

    def run(self, dataloader, epoch=0, log_interval=500):
        """Train ``self.model`` for one pass over ``dataloader``.

        Args:
            dataloader: sized iterable of ``(label, tokens, masks)`` batches.
            epoch: epoch number, used only in the progress log line.
            log_interval: print running accuracy every this many batches;
                a value <= 0 disables logging.
        """
        self.model.train()
        total_acc, total_count = 0, 0

        for idx, (label, tokens, masks) in enumerate(dataloader):
            self.optimizer.zero_grad()
            predicted_label = self.model(tokens, masks)
            loss = self.criterion(predicted_label, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.1)
            self.optimizer.step()
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            if log_interval > 0 and idx % log_interval == 0 and idx > 0:
                print('| epoch {:3d} | {:5d}/{:5d} batches '
                      '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                                  total_acc/total_count))
                # Accuracy is reported per logging window, so reset the counters.
                total_acc, total_count = 0, 0

    def getTestResult(self, output_path='309555025_submission_transformer.csv'):
        """Predict a category for every test title and write a submission CSV.

        Args:
            output_path: destination file. It gets an ``Id,Category`` header
                followed by one line per test title, where ``Id`` is the
                title's position in ``self.test_title``.
        """
        # dataloader
        test_dataset = self.test_title.copy()
        test_dataloader = DataLoader(test_dataset, batch_size=1,
                                      shuffle=False, collate_fn=collate_batch_test)
        self.model.eval()

        predictions = []
        with torch.no_grad():
            for tokens, masks in test_dataloader:
                predicted_label = self.model(tokens, masks)
                predictions.append(predicted_label.argmax(1).item())
        # write result to file, turning class ids back into category names
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write('Id,Category\n')
            for row_id, class_id in enumerate(predictions):
                file.write(str(row_id) + ',' + self.CATEGORIES[class_id] + '\n')

    def train(self):
        """Create the model and train it for ``EPOCHS`` epochs.

        After every epoch the validation accuracy is printed. The learning
        rate scheduler is stepped only when that accuracy falls below the
        best value seen so far; otherwise the best value is updated.
        """
        num_class = len(self.CATEGORIES)
        vocab_size = len(vocab)
        self.model = TextClassificationModel(vocab_size, num_class).to(device)

        # training setting
        self.criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=LR)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1, gamma=0.1)
        total_accu = None

        # dataloader
        train_dataset = list(zip(self.train_title, self.train_label))
        train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                      shuffle=True, collate_fn=collate_batch)
        valid_dataset = list(zip(self.valid_title, self.valid_label))
        valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE,
                                      shuffle=False, collate_fn=collate_batch)
        # training
        for epoch in range(1, EPOCHS + 1):
            epoch_start_time = time.time()
            # The epoch is passed explicitly; run() has no other way to see it.
            self.run(train_dataloader, epoch)
            accu_val = self.evaluate(valid_dataloader)
            if total_accu is not None and total_accu > accu_val:
                self.scheduler.step()
            else:
                total_accu = accu_val
            print('-' * 59)
            print('| end of epoch {:3d} | time: {:5.2f}s | '
                  'valid accuracy {:8.3f} '.format(epoch,
                                                   time.time() - epoch_start_time,
                                                   accu_val))
            print('-' * 59)

In [25]:
class TextClassificationModel(nn.Module):
    """BERT encoder followed by an LSTM and a linear classification layer.

    Only the hidden state of the first token of each sequence is taken from
    BERT; it is fed to the LSTM as a length-1 sequence, and the LSTM output is
    projected to ``num_class`` logits.

    Args:
        vocab_size: accepted for interface compatibility; not used, because
            the pretrained BERT model brings its own embedding table.
        num_class: number of output classes.
    """

    def __init__(self, vocab_size, num_class):
        super(TextClassificationModel, self).__init__()
        # return_dict=False makes the encoder return a plain tuple;
        # output_attentions=True appends the attention weights to that tuple.
        self.embedding = BertModel.from_pretrained('bert-base-uncased',
                                                   return_dict=False,
                                                   output_attentions=True)
        self.num_layers = NUM_LAYERS
        self.hidden_size = HIDDEN_SIZE
        self.lstm = nn.LSTM(
            input_size=EMBED_DIM,
            hidden_size=HIDDEN_SIZE,
            num_layers=NUM_LAYERS,
            batch_first=True,  # x -> (batch_size, seq, input_size)
            dropout=DROP_OUT
        )
        self.fc = nn.Linear(HIDDEN_SIZE, num_class)
        # Re-initialise the classifier uniformly in [-1, 1) without going
        # through the discouraged ``.data`` attribute.
        with torch.no_grad():
            self.fc.weight.uniform_(-1, 1)
            self.fc.bias.uniform_(-1, 1)

    def forward(self, tokens, masks):
        """Return class logits of shape (batch, num_class).

        Args:
            tokens: token-id tensor, indexed as (batch, seq) by this method.
            masks: attention mask passed to BERT as ``attention_mask``.
        """
        # Bert: element 0 of the returned tuple is the per-token hidden state.
        # Indexing (instead of unpacking exactly three values) keeps this
        # working if the encoder returns additional outputs.
        out = self.embedding(tokens, attention_mask=masks)[0]
        # Keep only the first token, as a length-1 sequence: (batch, 1, dim).
        out = out[:, 0:1, :]
        # lstm: allocate the initial states next to the input, so the model
        # does not depend on the module-level ``device`` global.
        h0 = torch.zeros(self.num_layers, out.size(0), self.hidden_size,
                         device=out.device, dtype=out.dtype)
        c0 = torch.zeros(self.num_layers, out.size(0), self.hidden_size,
                         device=out.device, dtype=out.dtype)
        out, _ = self.lstm(out, (h0, c0))
        out = out[:, -1, :]
        out = self.fc(out)
        return out

In [26]:
def _encode_batch(token_lists, max_len):
    """Convert token lists to padded id and attention-mask tensors.

    Returns ``(ids, masks)``, both of shape (batch, max_len): ``ids`` is
    int64, ``masks`` is float with 1.0 where the id is greater than 0 and 0.0
    on the zero-valued padding positions.
    """
    id_lists = [bert_tokenizer.convert_tokens_to_ids(tokens) for tokens in token_lists]
    # truncating='post' cuts over-long titles at the END. The default ('pre')
    # would drop the leading [CLS] token, which is exactly the position the
    # model reads its sentence representation from.
    padded = pad_sequences(id_lists, maxlen=max_len, padding='post',
                           truncating='post', dtype='int')
    ids = torch.tensor(padded, dtype=torch.int64)
    masks = (ids > 0).to(torch.float)
    return ids, masks


def collate_batch(batch, max_len=10):
    """Collate ``(tokens, label)`` examples into training tensors.

    Args:
        batch: list of ``(token_list, label)`` pairs.
        max_len: fixed sequence length every title is padded/truncated to.

    Returns:
        ``(labels, token_ids, masks)`` moved to the module-level ``device``.
    """
    label_list = torch.tensor([label_pipeline(_label) for _text, _label in batch],
                              dtype=torch.int64)
    text_list, masks_list = _encode_batch([_text for _text, _label in batch], max_len)
    return label_list.to(device), text_list.to(device), masks_list.to(device)


def collate_batch_test(batch, max_len=10):
    """Collate unlabeled token lists into test tensors.

    Args:
        batch: list of token lists.
        max_len: fixed sequence length every title is padded/truncated to.

    Returns:
        ``(token_ids, masks)`` moved to the module-level ``device``.
    """
    text_list, masks_list = _encode_batch(list(batch), max_len)
    return text_list.to(device), masks_list.to(device)

In [28]:
from bertviz import head_view
# Reproducibility: the run shuffles the data and the training batches, so
# seed every random number generator it uses before anything else happens.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Hyperparameters
EPOCHS = 10 # epoch
LR = 2.5  # learning rate
BATCH_SIZE = 32 # batch size for training
NUM_LAYERS = 1
EMBED_DIM = 768
HIDDEN_SIZE = 128
DROP_OUT = 0.0
RATE = 0.3 # rate of validation set size

# Tokenizer and Bert Model
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
vocab = bert_tokenizer.vocab


# text, label pipeline
def text_pipeline(x):
    """Map an iterable of tokens to their vocabulary ids."""
    return bert_tokenizer.convert_tokens_to_ids([token for token in x])


def label_pipeline(x):
    """Convert a label to an integer class id."""
    return int(x)


# Placeholder kept from the attention-visualisation experiments.
attention = 0

# Run the pipeline: load -> shuffle -> tokenize -> encode labels -> split
# -> train -> write test predictions.
tc = TitleCategorization()
tc.shuffleData()
tc.prepareData()
tc.labelToNum()
tc.splitTrainValidation(RATE)
tc.train()
tc.getTestResult()

-----------------------------------------------------------
| end of epoch   1 | time:  5.08s | valid accuracy    0.764 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time:  5.12s | valid accuracy    0.854 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time:  4.95s | valid accuracy    0.826 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time:  5.03s | valid accuracy    0.893 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time:  5.01s | valid accuracy    0.895 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   6 | time:  4.94s |