In [19]:
import torch
import os
import re
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
import csv
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import random
import time
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader
# Compute device shared by the model and the collate functions:
# CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
def getPretrainedModel(path='glove.6B.300d.txt', target=None):
    """Load whitespace-separated word vectors from a text file into a dict.

    Each non-blank line is split on whitespace: the first field is the word
    and every remaining field is parsed as a float component of its vector.

    Args:
        path: Text file to read (UTF-8). Defaults to ``glove.6B.300d.txt``.
        target: Dict to fill. When ``None`` the module-level ``word_dict`` is
            filled in place, so ``word_dict`` must already exist in that case.

    Returns:
        The dict that was filled (``target`` or the module-level ``word_dict``).

    Raises:
        OSError: if ``path`` cannot be opened.
        ValueError: if a vector component is not a valid float.
    """
    table = word_dict if target is None else target
    # Stream the file line by line instead of materialising it with readlines();
    # the context manager guarantees the handle is closed even on a parse error.
    with open(path, 'r', encoding="utf-8") as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline): nothing to index.
                continue
            table[fields[0]] = [float(n) for n in fields[1:]]
    return table
# Build the pre-trained embedding lookup.
# word_dict maps a word to its vector (a list of floats). It has to be created
# before the call below, because getPretrainedModel() fills this module-level
# dict in place when called without arguments.
word_dict = {}
getPretrainedModel()

In [15]:
class TitleCategorization:
    """Load, preprocess, train on and predict categories for news titles.

    Construction reads ``news_data/train.csv`` and ``news_data/test.csv``.
    The caller then drives the remaining steps, in this order:
    ``shuffleData`` -> ``prepareData`` -> ``labelToNum`` ->
    ``splitTrainValidation`` -> ``train`` -> ``getTestResult``.

    Several methods look up module-level names at call time, so these must be
    defined before the method is invoked:

    * ``prepareData``: ``stopwords``, ``RegexpTokenizer``
    * ``toTensor``: ``label_pipeline``, ``text_pipeline``
    * ``train``: ``vocab``, ``TextClassificationModel``, ``device``, ``LR``,
      ``BATCH_SIZE``, ``EPOCHS``, ``collate_batch``
    * ``getTestResult``: ``collate_batch_test``
    """

    # Category names; the position of a name is its numeric class id.
    LABELS = ("sport", "entertainment", "politics", "tech", "business")
    # Tokens are runs of lowercase ASCII letters and hyphens.
    TOKEN_PATTERN = r'[a-z\-]+'

    def __init__(self):
        self.readFile()

    @staticmethod
    def _readRows(datafile):
        """Return every row of the CSV file ``datafile`` except the header row."""
        # newline='' is what the csv module asks for when reading files; the
        # context manager closes the handle even if parsing raises.
        with open(datafile, encoding='utf-8', newline='') as file:
            rows = csv.reader(file)
            next(rows, None)  # drop the header (tolerates an empty file)
            return list(rows)

    def readFile(self):
        """Read the train and test CSV files from the ``news_data`` directory.

        Sets:
            self.train_data: train rows with their first column dropped.
            self.test_data: test rows, unmodified.
        """
        train_rows = self._readRows(os.path.join('news_data', 'train.csv'))
        self.train_data = [row[1:] for row in train_rows]
        self.test_data = self._readRows(os.path.join('news_data', 'test.csv'))

    def labelToNum(self):
        """Replace each known category name in ``self.train_label`` by its id.

        The id is the index of the name in ``LABELS``. Entries that are not a
        known category name (including already-converted ints) are left as is.
        The list is updated in place.
        """
        label_ids = {name: idx for idx, name in enumerate(self.LABELS)}
        self.train_label[:] = [label_ids.get(label, label) for label in self.train_label]

    def shuffleData(self):
        """Shuffle ``self.train_data`` in place using the ``random`` module."""
        random.shuffle(self.train_data)

    @staticmethod
    def _tokenize(text, tokenizer, stop_words):
        """Lowercase ``text``, tokenize it and drop tokens found in ``stop_words``."""
        return [w for w in tokenizer.tokenize(text.lower()) if w not in stop_words]

    def prepareData(self):
        """Split rows into labels/titles and tokenize every title.

        Train rows provide the label at index 0 and the title at index 1;
        test rows provide the title at index 1.

        Sets:
            self.train_label: raw labels of the training rows.
            self.train_title: token lists of the training titles.
            self.test_title: token lists of the test titles.
        """
        self.train_label = [row[0] for row in self.train_data]
        raw_train_titles = [row[1] for row in self.train_data]
        raw_test_titles = [row[1] for row in self.test_data]

        stop_words = set(stopwords.words('english'))
        # One tokenizer is enough; it does not depend on the title.
        tokenizer = RegexpTokenizer(self.TOKEN_PATTERN)
        self.train_title = [self._tokenize(t, tokenizer, stop_words) for t in raw_train_titles]
        self.test_title = [self._tokenize(t, tokenizer, stop_words) for t in raw_test_titles]

    def splitTrainValidation(self, rate):
        """Move the last ``int(len(train) * rate)`` samples into a validation set.

        Args:
            rate: Fraction of the training samples to hold out. The resulting
                count is clamped to ``[0, len(train)]``.

        Sets:
            self.valid_label / self.valid_title: the held-out tail.
            self.train_label / self.train_title: the remaining head.
        """
        total = len(self.train_title)
        validation_num = min(max(int(total * rate), 0), total)
        # Slice from an explicit positive index: with a negative-index slice a
        # count of 0 would turn `[:-0]` into an empty training set.
        split_at = total - validation_num
        self.valid_label = self.train_label[split_at:]
        self.valid_title = self.train_title[split_at:]
        self.train_label = self.train_label[:split_at]
        self.train_title = self.train_title[:split_at]

    @staticmethod
    def _packTitles(titles, labels):
        """Encode titles/labels into ``(label_tensor, text_tensor, offsets)``.

        ``text_tensor`` is the concatenation of all encoded titles and
        ``offsets[i]`` is the start position of title ``i`` inside it.
        """
        label_ids, chunks, lengths = [], [], [0]
        for title, label in zip(titles, labels):
            label_ids.append(label_pipeline(label))
            encoded = torch.tensor(text_pipeline(title), dtype=torch.int64)
            chunks.append(encoded)
            lengths.append(encoded.size(0))
        label_tensor = torch.tensor(label_ids, dtype=torch.int64)
        # Dropping the last length and taking the running sum yields start offsets.
        offsets = torch.tensor(lengths[:-1], dtype=torch.int64).cumsum(dim=0)
        # torch.cat rejects an empty list, so handle "no titles" explicitly.
        text_tensor = torch.cat(chunks) if chunks else torch.empty(0, dtype=torch.int64)
        return label_tensor, text_tensor, offsets

    def toTensor(self):
        """Encode the whole train and validation sets as flat tensors.

        Sets ``self.train_label_list``, ``self.train_text_list``,
        ``self.train_offset`` and the matching ``self.valid_*`` attributes.
        """
        (self.train_label_list,
         self.train_text_list,
         self.train_offset) = self._packTitles(self.train_title, self.train_label)
        (self.valid_label_list,
         self.valid_text_list,
         self.valid_offset) = self._packTitles(self.valid_title, self.valid_label)

    def evaluate(self, dataloader):
        """Return the accuracy of ``self.model`` over ``dataloader``.

        Args:
            dataloader: Iterable yielding ``(label, text, offsets)`` batches.

        Returns:
            Fraction of correctly classified samples, or ``0.0`` when the
            loader yields no samples.
        """
        self.model.eval()
        correct, seen = 0, 0

        with torch.no_grad():
            for label, text, offsets in dataloader:
                predicted_label = self.model(text, offsets)
                correct += (predicted_label.argmax(1) == label).sum().item()
                seen += label.size(0)
        return correct / seen if seen else 0.0

    def run(self, dataloader, epoch=0):
        """Train ``self.model`` for one pass over ``dataloader``.

        Args:
            dataloader: Iterable yielding ``(label, text, offsets)`` batches.
            epoch: Epoch number, used only in the progress message.
        """
        self.model.train()
        total_acc, total_count = 0, 0
        log_interval = 500

        for idx, (label, text, offsets) in enumerate(dataloader):
            self.optimizer.zero_grad()
            predicted_label = self.model(text, offsets)
            loss = self.criterion(predicted_label, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.1)
            self.optimizer.step()
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            if idx % log_interval == 0 and idx > 0:
                print('| epoch {:3d} | {:5d}/{:5d} batches '
                      '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                                  total_acc/total_count))
                # Accuracy is reported per logging window, so reset the counters.
                total_acc, total_count = 0, 0

    def getTestResult(self, output_path='309555025_submission_RNN.csv'):
        """Predict a category for every test title and write a submission CSV.

        Args:
            output_path: Destination file. It receives an ``Id,Category``
                header followed by one ``<row index>,<category name>`` line
                per test title.
        """
        test_dataloader = DataLoader(list(self.test_title), batch_size=1,
                                     shuffle=False, collate_fn=collate_batch_test)
        self.model.eval()

        result = []
        with torch.no_grad():
            for text, offsets in test_dataloader:
                predicted_label = self.model(text, offsets)
                # Map the predicted class id straight back to its category name.
                result.append(self.LABELS[predicted_label.argmax(1).item()])

        with open(output_path, 'w') as file:
            file.write('Id,Category\n')
            for i, category in enumerate(result):
                file.write(str(i) + ',' + category + '\n')

    def train(self):
        """Build the model, then train it for ``EPOCHS`` epochs.

        After every epoch the validation accuracy is printed. The learning
        rate scheduler is stepped whenever that accuracy falls below the best
        value seen so far; otherwise the best value is updated.
        """
        num_class = len(self.LABELS)
        vocab_size = len(vocab)
        self.model = TextClassificationModel(vocab_size, num_class).to(device)

        # training setting
        self.criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=LR)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1.0, gamma=0.1)
        total_accu = None

        # dataloader
        train_dataset = list(zip(self.train_title, self.train_label))
        train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                      shuffle=True, collate_fn=collate_batch)
        valid_dataset = list(zip(self.valid_title, self.valid_label))
        valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE,
                                      shuffle=False, collate_fn=collate_batch)
        # training
        for epoch in range(1, EPOCHS + 1):
            epoch_start_time = time.time()
            self.run(train_dataloader, epoch)
            accu_val = self.evaluate(valid_dataloader)
            if total_accu is not None and total_accu > accu_val:
                self.scheduler.step()
            else:
                total_accu = accu_val
            print('-' * 59)
            print('| end of epoch {:3d} | time: {:5.2f}s | '
                  'valid accuracy {:8.3f} '.format(epoch,
                                                   time.time() - epoch_start_time,
                                                   accu_val))
            print('-' * 59)

In [16]:
def create_emb_layer(non_trainable=False):
    """Create an ``nn.EmbeddingBag`` whose weights are copied from ``weight_matrix``.

    The module-level 2-D tensor ``weight_matrix`` supplies both the layer's
    dimensions and its initial weights.

    Args:
        non_trainable: When truthy, the embedding weights are excluded from
            gradient computation.

    Returns:
        A tuple ``(layer, number_of_rows, row_width)`` where the two integers
        are the dimensions of ``weight_matrix``.
    """
    vocab_rows, vector_dim = weight_matrix.size()
    embedding_bag = nn.EmbeddingBag(vocab_rows, vector_dim, sparse=True)
    # Overwrite the randomly initialised weights with the supplied matrix.
    embedding_bag.load_state_dict({'weight': weight_matrix})
    embedding_bag.weight.requires_grad = not non_trainable
    return embedding_bag, vocab_rows, vector_dim
class TextClassificationModel(nn.Module):
    """Title classifier: frozen EmbeddingBag -> LSTM -> linear layer.

    The embedding layer comes from ``create_emb_layer(True)``; the LSTM and
    linear layer sizes come from the module-level ``NUM_LAYERS``,
    ``HIDDEN_SIZE`` and ``DROP_OUT`` constants.

    Args:
        vocab_size: Not used by the model (the embedding layer takes its size
            from ``create_emb_layer``); kept so existing callers keep working.
        num_class: Number of output classes.
    """

    def __init__(self, vocab_size, num_class):
        super().__init__()
        self.embedding, _, embedding_dim = create_emb_layer(True)
        # Remember the real embedding width so forward() does not have to rely
        # on a separately maintained global constant.
        self.embedding_dim = embedding_dim
        self.num_layers = NUM_LAYERS
        self.hidden_size = HIDDEN_SIZE
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,  # input is (batch_size, seq, input_size)
            dropout=DROP_OUT
        )
        self.fc = nn.Linear(self.hidden_size, num_class)
        self.fc.weight.data.uniform_(-1, 1)
        self.fc.bias.data.uniform_(-1, 1)

    def forward(self, text, offsets):
        """Return class scores of shape ``(batch, num_class)``.

        Args:
            text: 1-D int64 tensor holding the token ids of every title in the
                batch, concatenated.
            offsets: 1-D tensor with the start index of each title in ``text``.
        """
        # EmbeddingBag reduces every title to a single vector: (batch, embedding_dim).
        pooled = self.embedding(text, offsets)
        batch_size = pooled.size(0)
        # Allocate the initial states next to the input (same device and dtype)
        # instead of depending on a module-level device.
        h0 = pooled.new_zeros(self.num_layers, batch_size, self.hidden_size)
        c0 = pooled.new_zeros(self.num_layers, batch_size, self.hidden_size)
        # Present each pooled vector to the LSTM as a length-1 sequence.
        sequence = pooled.unsqueeze(1)
        out, _ = self.lstm(sequence, (h0, c0))
        return self.fc(out[:, -1, :])

In [17]:
def collate_batch(batch):
    """Collate ``(tokens, label)`` samples into tensors for an EmbeddingBag model.

    Uses the module-level ``text_pipeline`` / ``label_pipeline`` to encode
    each sample and moves the results to the module-level ``device``.

    Args:
        batch: Iterable of ``(tokens, label)`` pairs.

    Returns:
        ``(labels, text, offsets)``: an int64 label tensor, the concatenated
        int64 token ids of all samples, and the start index of each sample
        inside ``text``.
    """
    encoded_labels = []
    encoded_texts = []
    lengths = []
    for sample_text, sample_label in batch:
        encoded_labels.append(label_pipeline(sample_label))
        token_ids = torch.tensor(text_pipeline(sample_text), dtype=torch.int64)
        encoded_texts.append(token_ids)
        lengths.append(token_ids.size(0))

    labels = torch.tensor(encoded_labels, dtype=torch.int64)
    # Start offsets are the running sum of the lengths, shifted right by one.
    starts = torch.tensor(([0] + lengths)[:-1]).cumsum(dim=0)
    flat_text = torch.cat(encoded_texts)
    return labels.to(device), flat_text.to(device), starts.to(device)
def collate_batch_test(batch):
    """Collate unlabeled token lists into tensors for an EmbeddingBag model.

    Uses the module-level ``text_pipeline`` to encode each sample and moves
    the results to the module-level ``device``.

    Args:
        batch: Iterable of token lists.

    Returns:
        ``(text, offsets)``: the concatenated int64 token ids of all samples
        and the start index of each sample inside ``text``.
    """
    encoded_texts = []
    lengths = []
    for sample_text in batch:
        token_ids = torch.tensor(text_pipeline(sample_text), dtype=torch.int64)
        encoded_texts.append(token_ids)
        lengths.append(token_ids.size(0))

    # Start offsets are the running sum of the lengths, shifted right by one.
    starts = torch.tensor(([0] + lengths)[:-1]).cumsum(dim=0)
    flat_text = torch.cat(encoded_texts)
    return flat_text.to(device), starts.to(device)

In [20]:
# Hyperparameters
SEED = 0 # seed for the python / numpy / torch random number generators
EPOCHS = 10 # epoch
LR = 2.5  # learning rate
BATCH_SIZE = 16 # batch size for training
NUM_LAYERS = 1
HIDDEN_SIZE = 256
DROP_OUT = 0.0
VEC_LENGTH = 300
RATE = 0.3 # rate of validation set size

# Seed every generator this cell relies on so a re-run gives the same split,
# the same random vectors for unknown words and the same model initialisation:
# random.shuffle (shuffleData), np.random.normal (below), torch (weights, DataLoader).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load and preprocess the data
tc = TitleCategorization()
tc.shuffleData()
tc.prepareData()
tc.labelToNum()
tc.splitTrainValidation(RATE)

# Build the vocabulary from every token seen in the train, validation and test titles
counter = Counter()
for titles in (tc.train_title, tc.valid_title, tc.test_title):
    for line in titles:
        counter.update(line)
# NOTE(review): Vocab(counter, min_freq=...) looks like the older torchtext
# constructor form -- confirm against the installed torchtext version.
vocab = Vocab(counter, min_freq=1)

# Build the weight matrix: one row per vocabulary entry
weight_matrix = np.zeros((len(vocab), VEC_LENGTH))
words_found = 0
for i, word in enumerate(vocab.itos): # itos: match from index to corresponding string
    if word in word_dict:
        weight_matrix[i] = word_dict[word]
        words_found += 1
    else:
        # No pre-trained vector for this word: draw a random one instead.
        weight_matrix[i] = np.random.normal(scale=0.6, size=(VEC_LENGTH, ))
print(words_found,'/',len(vocab))
weight_matrix = torch.from_numpy(weight_matrix)

# text, label pipeline
def text_pipeline(x):
    """Map a list of tokens to their vocabulary indices."""
    return [vocab[token] for token in x]

def label_pipeline(x):
    """Convert a label to int."""
    return int(x)

# Train, then write the predictions for the test titles
tc.train()
tc.getTestResult()


3586 / 3642
-----------------------------------------------------------
| end of epoch   1 | time:  0.33s | valid accuracy    0.854 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time:  0.26s | valid accuracy    0.876 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time:  0.26s | valid accuracy    0.882 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time:  0.27s | valid accuracy    0.876 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time:  0.27s | valid accuracy    0.878 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   6 | ti