## Import Libs

In [15]:
%matplotlib inline
import time
import pandas
import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pythainlp
import csv
from torch.utils.data import Dataset, DataLoader
from IPython.display import display
from pythainlp.tokenize import word_tokenize
from collections import defaultdict
from torchtext.vocab import build_vocab_from_iterator
from pythainlp.tokenize import word_tokenize
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
import re
from sklearn.model_selection import train_test_split
import torchmetrics

In [16]:
import torch
torch.manual_seed(0)


import random
random.seed(0)

import numpy as np
np.random.seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


In [17]:
datapath = "/home/nopparuj/CP3-S2/NLP/takehome/"
df_occupation = pd.read_csv(datapath+"occupation_mapper.csv")
df_test = pd.read_csv(datapath+"test_for_submission.csv", index_col="Id")
df_train = pd.read_csv(datapath+"train.csv")
df_train = df_train.drop(
    ["occupation_group", "occupation", "occupation_group_index"], axis=1)


In [18]:
# Data cleaning
df_train.job_title = df_train.job_title.apply(lambda x : x.lower().strip())
df_test.job_title = df_test.job_title.apply(lambda x : x.lower().strip())

def clean_html(x):
    r = re.compile('<.*?>') 
    return re.sub(r, ' ', x)

def clean_special_chars(text):
    text = re.sub(r'[^\w\s\u0E00-\u0E7F]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text


df_train.job_title = df_train.job_title.apply(clean_html)
df_test.job_title = df_test.job_title.apply(clean_html)
df_train.job_title = df_train.job_title.apply(clean_special_chars)
df_test.job_title = df_test.job_title.apply(clean_special_chars)

In [20]:
# tokenized
train_title = []
train_data = []
test_data = []
for i in df_train.iterrows():
    row = i[1]
    title = row["job_title"]
    title = word_tokenize(title, engine="newmm")
    train_title.append((title))
    train_data.append([row["occupation_index"],title])

vocab = build_vocab_from_iterator(train_title, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

encoding_data = [[label,vocab(tokens)] for label,tokens in train_data]
for i in df_test.iterrows():
    row = i[1]
    title = row["job_title"]
    title = word_tokenize(title)
    test_data.append([0,vocab(title)])

In [21]:
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim)
        self.dropout = nn.Dropout(p=0.5)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        dropout = self.dropout(embedded)
        return self.fc(dropout)

In [22]:
tacc = torchmetrics.Accuracy(task="multiclass", num_classes=len(df_occupation))
tf1 = torchmetrics.F1Score(task="multiclass", num_classes=len(df_occupation))

In [23]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(_label)
         processed_text = torch.tensor(_text, dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [24]:
batch_size = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_class = len(df_occupation)
emsize = 300
model = TextClassificationModel(len(vocab), emsize, num_class).to(device)

In [25]:
train_data, val_data = train_test_split(encoding_data, test_size=0.01, shuffle=True)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True,collate_fn=collate_batch)
val_dataloader = DataLoader(val_data, batch_size=1, shuffle=False,collate_fn=collate_batch)
test_dataloader = DataLoader(test_data, batch_size=1, shuffle=False,collate_fn=collate_batch)

In [26]:
EPOCHS = 30
LR = 0.001
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    model.train()
    cnt, loss_sum = 0, 0

    for idx, (label, text, offsets) in enumerate(train_dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label).mean()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        loss_sum += loss.item()
        optimizer.step()
        cnt += 1
    loss_train = loss_sum / cnt
    
    model.eval()
    preds = []
    true = []

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(val_dataloader):
            predicted_label = model(text, offsets)
            preds.append(predicted_label.argmax(dim=1).cpu().tolist())
            true.append(label.cpu().tolist())

    preds = torch.Tensor(preds).flatten()
    true = torch.Tensor(true).flatten()
    acc_val =  tacc(preds, true)
    f1_val = tf1(preds, true)

    print(f"Epoch {epoch} | train loss: {loss_train:.04f} | acc: {acc_val:.04f} | f1: {f1_val:.04f} ")

Epoch 1 | train loss: 6.6980 | acc: 0.0125 | f1: 0.0125 
Epoch 2 | train loss: 5.9389 | acc: 0.0875 | f1: 0.0875 
Epoch 3 | train loss: 5.2884 | acc: 0.1375 | f1: 0.1375 
Epoch 4 | train loss: 4.7045 | acc: 0.1500 | f1: 0.1500 
Epoch 5 | train loss: 4.1757 | acc: 0.1875 | f1: 0.1875 
Epoch 6 | train loss: 3.7183 | acc: 0.2500 | f1: 0.2500 
Epoch 7 | train loss: 3.3197 | acc: 0.3125 | f1: 0.3125 
Epoch 8 | train loss: 2.9660 | acc: 0.3750 | f1: 0.3750 
Epoch 9 | train loss: 2.6746 | acc: 0.4000 | f1: 0.4000 
Epoch 10 | train loss: 2.3968 | acc: 0.4500 | f1: 0.4500 
Epoch 11 | train loss: 2.1614 | acc: 0.5125 | f1: 0.5125 
Epoch 12 | train loss: 1.9468 | acc: 0.5375 | f1: 0.5375 
Epoch 13 | train loss: 1.7546 | acc: 0.5375 | f1: 0.5375 
Epoch 14 | train loss: 1.6005 | acc: 0.5500 | f1: 0.5500 
Epoch 15 | train loss: 1.4443 | acc: 0.5875 | f1: 0.5875 
Epoch 16 | train loss: 1.3116 | acc: 0.6250 | f1: 0.6250 
Epoch 17 | train loss: 1.2030 | acc: 0.6250 | f1: 0.6250 
Epoch 18 | train loss: 

In [27]:
model.eval()

import csv

with torch.no_grad(), open('predictions.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Id', 'Predicted'])  # Write header row
    for idx, (label, text, offsets) in enumerate(test_dataloader):
        predicted_label = model(text, offsets)
        pre = predicted_label.argmax(dim=1).item()
        writer.writerow([idx, pre])  # Write prediction row
