## Import Libs

In [1]:
%matplotlib inline
import time
import pandas
import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pythainlp
import csv
from torch.utils.data import Dataset, DataLoader
from IPython.display import display
from pythainlp.tokenize import word_tokenize
from collections import defaultdict
from torchtext.vocab import build_vocab_from_iterator
from pythainlp.tokenize import word_tokenize
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
import re
from sklearn.model_selection import train_test_split
import torchmetrics

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
torch.manual_seed(0)


import random
random.seed(0)

import numpy as np
np.random.seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


In [3]:
datapath = "/home/nopparuj/CP3-S2/NLP/takehome/"
df_occupation = pd.read_csv(datapath+"occupation_mapper.csv")
df_test = pd.read_csv(datapath+"test_for_submission.csv", index_col="Id")
df_train = pd.read_csv(datapath+"train.csv")
df_train = df_train.drop(
    ["occupation_group", "occupation", "occupation_group_index"], axis=1)


In [4]:
# Data cleaning
df_train.job_title = df_train.job_title.apply(lambda x : x.lower().strip())
df_test.job_title = df_test.job_title.apply(lambda x : x.lower().strip())

def clean_html(x):
    r = re.compile('<.*?>') 
    return re.sub(r, ' ', x)


df_train.job_title = df_train.job_title.apply(clean_html)
df_test.job_title = df_test.job_title.apply(clean_html)

In [5]:
# tokenized
train_title = []
train_data = []
test_data = []
for i in df_train.iterrows():
    row = i[1]
    title = row["job_title"]
    title = word_tokenize(title, engine="newmm")
    train_title.append((title))
    train_data.append([row["occupation_index"],title])

vocab = build_vocab_from_iterator(train_title, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

encoding_data = [[label,vocab(tokens)] for label,tokens in train_data]
for i in df_test.iterrows():
    row = i[1]
    title = row["job_title"]
    title = word_tokenize(title)
    test_data.append([0,vocab(title)])

In [6]:
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim)
        self.dropout = nn.Dropout(p=0.5)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        dropout = self.dropout(embedded)
        return self.fc(dropout)

In [7]:
tacc = torchmetrics.Accuracy(task="multiclass", num_classes=len(df_occupation))
tf1 = torchmetrics.F1Score(task="multiclass", num_classes=len(df_occupation))

In [8]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(_label)
         processed_text = torch.tensor(_text, dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [9]:
batch_size = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_class = len(df_occupation)
emsize = 300
model = TextClassificationModel(len(vocab), emsize, num_class).to(device)

In [10]:
train_data, val_data = train_test_split(encoding_data, test_size=0.01, shuffle=True)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True,collate_fn=collate_batch)
val_dataloader = DataLoader(val_data, batch_size=1, shuffle=False,collate_fn=collate_batch)
test_dataloader = DataLoader(test_data, batch_size=1, shuffle=False,collate_fn=collate_batch)

In [11]:
EPOCHS = 30
LR = 0.001
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    model.train()
    cnt, loss_sum = 0, 0

    for idx, (label, text, offsets) in enumerate(train_dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label).mean()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        loss_sum += loss.item()
        optimizer.step()
        cnt += 1
    loss_train = loss_sum / cnt
    
    model.eval()
    preds = []
    true = []

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(val_dataloader):
            predicted_label = model(text, offsets)
            preds.append(predicted_label.argmax(dim=1).cpu().tolist())
            true.append(label.cpu().tolist())

    preds = torch.Tensor(preds).flatten()
    true = torch.Tensor(true).flatten()
    acc_val =  tacc(preds, true)
    f1_val = tf1(preds, true)

    print('-' * 59)
    print(f"Epoch {epoch} | train loss: {loss_train:.04f} | acc: {acc_val:.04f} | f1: {f1_val:.04f} ")
    print('-' * 59)

-----------------------------------------------------------
Epoch 1 | train loss: 6.7488 | acc: 0.0375 | f1: 0.0375 
-----------------------------------------------------------
-----------------------------------------------------------
Epoch 2 | train loss: 6.0116 | acc: 0.0625 | f1: 0.0625 
-----------------------------------------------------------
-----------------------------------------------------------
Epoch 3 | train loss: 5.3536 | acc: 0.1250 | f1: 0.1250 
-----------------------------------------------------------
-----------------------------------------------------------
Epoch 4 | train loss: 4.7763 | acc: 0.1250 | f1: 0.1250 
-----------------------------------------------------------
-----------------------------------------------------------
Epoch 5 | train loss: 4.2611 | acc: 0.1625 | f1: 0.1625 
-----------------------------------------------------------
-----------------------------------------------------------
Epoch 6 | train loss: 3.8006 | acc: 0.2250 | f1: 0.2250

In [12]:
model.eval()

import csv

with torch.no_grad(), open('predictions.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Id', 'Predicted'])
    for idx, (label, text, offsets) in enumerate(test_dataloader):
        predicted_label = model(text, offsets)
        pre = predicted_label.argmax(dim=1).item()
        writer.writerow([idx, pre]) 
