In [1]:
import torch, torchdata, torchtext
import torch.nn as nn
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.__version__, torchdata.__version__, torchtext.__version__

('2.2.2+cu121', '0.7.1', '0.17.2+cpu')

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
SEED = 1234 #change three times
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## 1. load dataset

In [5]:
df = pd.read_json('../data/train-qar.jsonl', lines=True, nrows=300000)
df = df[['category', 'questionText']]
df.head(5)

Unnamed: 0,category,questionText
0,Toys_and_Games,"Many have stated similar to the following: ""Pa..."
1,Health_and_Personal_Care,Will these work with the Phillips sonicare han...
2,Cell_Phones_and_Accessories,What kind of sim card it use?
3,Home_and_Kitchen,does anyone know if this dinnerware set does n...
4,Musical_Instruments,I'm thinking of getting in to modular synthesi...


In [6]:
# Product category
df['category'].unique()

array(['Toys_and_Games', 'Health_and_Personal_Care',
       'Cell_Phones_and_Accessories', 'Home_and_Kitchen',
       'Musical_Instruments', 'Baby', 'Sports_and_Outdoors',
       'Patio_Lawn_and_Garden', 'Video_Games', 'Pet_Supplies',
       'Tools_and_Home_Improvement', 'Beauty', 'Electronics',
       'Grocery_and_Gourmet_Food', 'Automotive', 'Office_Products',
       'Clothing_Shoes_and_Jewelry'], dtype=object)

In [7]:
df['category'].value_counts()[:10]

category
Electronics                    69163
Home_and_Kitchen               43733
Sports_and_Outdoors            28873
Tools_and_Home_Improvement     25503
Health_and_Personal_Care       19230
Automotive                     18688
Cell_Phones_and_Accessories    17052
Patio_Lawn_and_Garden          14845
Toys_and_Games                 12599
Office_Products                10436
Name: count, dtype: int64

In [8]:
# convert the text to numeric class
# class_mapping = {
#     'Toys_and_Games': 0,
#     'Health_and_Personal_Care': 1,
#     'Cell_Phones_and_Accessories': 2,
#     'Home_and_Kitchen': 3,
#     'Musical_Instruments': 4,
#     'Baby': 5,
#     'Sports_and_Outdoors': 6,
#     'Patio_Lawn_and_Garden': 7,
#     'Video_Games': 8,
#     'Pet_Supplies': 9,
#     'Tools_and_Home_Improvement': 10,
#     'Beauty': 11,
#     'Electronics': 12,
#     'Grocery_and_Gourmet_Food': 13,
#     'Automotive': 14,
#     'Office_Products': 15,
#     'Clothing_Shoes_and_Jewelry': 16
# }

class_mapping = {
    'Electronics': 0,
    'Home_and_Kitchen': 1,
    'Sports_and_Outdoors': 2,
    'Tools_and_Home_Improvement': 3,
    'Health_and_Personal_Care': 4,
    'Automotive': 5,
    'Cell_Phones_and_Accessories': 6,
    'Patio_Lawn_and_Garden': 7,
    'Toys_and_Games': 8,
    'Office_Products': 9
}

# Map class names to numerical labels
df['category'] = df['category'].map(class_mapping)

In [9]:
# sample 100 data in each classes
df_sample = df.groupby('category', group_keys=False).apply(lambda x: x.sample(2000))

In [10]:
# convert to lower case
df_sample['questionText']  =  df_sample['questionText'].apply(lambda x: x.lower() if isinstance(x, str) else x)

In [11]:
def data_cleaning(data):
    regex_s = re.sub("\\(.+?\\)|[\r\n|\n\r]|!", "", data)
    fin = " ".join(regex_s.split())
    return fin

In [12]:
df_sample['questionText'] = df_sample['questionText'].apply(data_cleaning)

In [13]:
df_sample = df_sample.astype({'category':int})

In [14]:
train_df, val_df = train_test_split(df_sample, test_size=0.1,stratify=df_sample['category'], random_state=SEED)

In [15]:
train_df, test_df = train_test_split(train_df, test_size=0.1, stratify=train_df['category'],random_state=SEED)

In [16]:
train_df['category'].value_counts()

category
4    1620
6    1620
1    1620
7    1620
5    1620
9    1620
2    1620
8    1620
3    1620
0    1620
Name: count, dtype: int64

In [17]:
train_df

Unnamed: 0,category,questionText
16094,4,is the drops better or the pills?
247198,6,can this be used for the samsung galaxy note 2...
15813,1,do i have to remove seeds before putting fruit...
118360,7,how many posts are needed per section?
194796,5,- is the item durable?
...,...,...
63893,8,please explain the different gauge sizes.
225007,0,does it make sense to purchase this kit if i a...
132286,0,does this fir with the original windows surfac...
170970,4,is this asprin tablet coated?


## 2.Preprocessing

### Tokenizing

In [18]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
tokens    = tokenizer("What is the best product?")
tokens

['What', 'is', 'the', 'best', 'product', '?']

### Text to integers (numeral)

In [19]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data):
    for data_sample in data:
        yield tokenizer(data_sample) 
        
vocab = build_vocab_from_iterator(yield_tokens(train_df['questionText']), specials = ['<unk>', '<pad>', '<bos>', '<eos>'])
vocab.set_default_index(vocab["<unk>"])

In [20]:
vocab(['here', 'it', 'is'])

[404, 9, 7]

In [21]:
mapping = vocab.get_itos()
mapping[0]

'<unk>'

In [22]:
len(vocab)

15344

## 3. Data loader

### FastText Embedding

In [23]:
from torchtext.vocab import FastText
fast_vectors = FastText(language='simple')

In [24]:
fast_embedding = fast_vectors.get_vecs_by_tokens(vocab.get_itos()).to(device)

In [25]:
fast_embedding.shape

torch.Size([15344, 300])

In [26]:
class DataWrap(Dataset):

    def __init__(self, dataframe):
        self.dataframe = dataframe
    
    def __len__(self):
        return len(self.dataframe)
        
    def __getitem__(self, idx):
        return self.dataframe.iloc[idx]

In [27]:
train_df.iloc[0]

category                                        4
questionText    is the drops better or the pills?
Name: 16094, dtype: object

In [28]:
train = DataWrap(train_df)
valid = DataWrap(val_df)
test = DataWrap(test_df)

In [29]:
text_pipeline  = lambda x: vocab(tokenizer(x)) #{hello world this is yt} => {'hello', 'world', 'this', 'is', 'yt'} => {4, 88, 11, 22, 6}

In [30]:
text_pipeline("what is the best product?")

[23, 7, 5, 285, 58, 4]

In [31]:
#collate_fn to let each batch has same size
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

pad_idx = vocab['<pad>'] #get index of pad in vocab list
# padding for every sentencce in batch to have same length 

def collate_batch(batch):
    label_list, text_list, length_list = [], [], []
    for (_label, _text) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64) # convert to integer before appending
        text_list.append(processed_text)
        length_list.append(processed_text.size(0))
    return torch.tensor(label_list, dtype=torch.int64), pad_sequence(text_list, padding_value=pad_idx, batch_first=True), torch.tensor(length_list, dtype=torch.int64)

In [32]:
batch_size = 64

train_loader = DataLoader(train, batch_size=batch_size, shuffle=True,  collate_fn=collate_batch) #num_workers to train faster
val_loader   = DataLoader(valid, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_loader  = DataLoader(test,  batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [33]:
for label, text, length in val_loader:
    break

In [34]:
label.shape #(batch_size, )

torch.Size([64])

In [35]:
text.shape #(batch_size, seq len)

torch.Size([64, 46])

In [36]:
length.shape #(batch_size, )

torch.Size([64])

## 4. Model (biLSTM)

In [37]:
class LSTM(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, num_layers, 
                 bidirectional, dropout, output_dim):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.lstm      = nn.LSTM(
                            emb_dim,
                            hid_dim,
                            num_layers=num_layers,
                            bidirectional=bidirectional,
                            dropout = dropout,
                            batch_first = True
                        )
        self.fc        = nn.Linear(hid_dim * 2, output_dim) # time 2 b/c bidirectional, output_dim = 4 since there are 4 class
    
    def forward(self, text, text_length):
        #text = [batch_size, seq len]
        embedded = self.embedding(text)
        #text = [batch_size, seq len, emb_dim]
        
        #pack sequence to ignore any padding
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_length.to('cpu'), 
                                                            enforce_sorted=False, batch_first=True)
        
        packed_output, (hn, cn) = self.lstm(packed_embedded)
        #output is basically all the hidden states;  hn is only last hidden state; cn is last cell state
        
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        #output = [batch_size, seq len, hidden_dim * num directions]
        #hn     = [num_layers * num_directions, batch_size,  hid_dim]  #3 layers birectional - hn1f, hn1b, hn2f, hn2b, hn3f, hn3b
        #cn     = [num_layers * num_directions, batch_size,  hid_dim]
        
        hn      = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim = 1) # hn3f, hn3b
        #hn     = [batch_size, hidden_dim * num_directions]
        
        return self.fc(hn)

## 5. Train

In [38]:
def initialize_weight(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight) # normal distribution
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.orthogonal_(param)

In [39]:
def accuracy(preds, y):
    predicted  = torch.max(preds.data, 1)[1] #.data for getting value in tensor
    batch_corr = (predicted == y).sum()
    acc        = batch_corr / len(y)
    return acc

In [40]:
def train(model, loader, optimizer, criterion, loader_length):
    epoch_loss = 0
    epoch_acc  = 0
    model.train()
    
    for i, (label, text, text_length) in enumerate(loader):
        label = label.to(device)
        text  = text.to(device)
        
        predictions = model(text, text_length).squeeze(1)
        
        loss = criterion(predictions, label)
        acc  = accuracy(predictions, label)
        
        #backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc  += acc.item()
        
    return epoch_loss / loader_length, epoch_acc / loader_length

In [41]:
def evaluate(model, loader, criterion, loader_length):
    epoch_loss = 0
    epoch_acc  = 0
    model.eval()
    
    with torch.no_grad():
        for i, (label, text, text_length) in enumerate(loader):
            label = label.to(device)
            text  = text.to(device)
            
            predictions = model(text, text_length).squeeze(1)
            
            loss = criterion(predictions, label)
            acc  = accuracy(predictions, label)
            
            epoch_loss += loss.item()
            epoch_acc  += acc.item()
        
    return epoch_loss / loader_length, epoch_acc / loader_length

### Actual training

In [42]:
train_loader_length = len(list(iter(train_loader)))
val_loader_length   = len(list(iter(val_loader)))
test_loader_length  = len(list(iter(test_loader)))

In [43]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [44]:
#experiment tracking
import mlflow
from mlflow.models import infer_signature
import os

# This the dockerized method.
# We build two docker containers, one for python/jupyter and another for mlflow.
# The url `mlflow` is resolved into another container within the same composer.
mlflow.set_tracking_uri("http://mlflow:5000")
# In the dockerized way, the user who runs this code will be `root`.
# The MLflow will also log the run user_id as `root`.
# To change that, we need to set this environ["LOGNAME"] to your name.
os.environ["LOGNAME"] = "noppawee"
#mlflow.create_experiment(name="noppawee-ML-project")  #create if you haven't create
mlflow.set_experiment(experiment_name="biLSTM2000_10class")



* 'schema_extra' has been renamed to 'json_schema_extra'
2024/04/26 17:25:31 INFO mlflow.tracking.fluent: Experiment with name 'biLSTM2000_10class' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/296003431626535286', creation_time=1714127131354, experiment_id='296003431626535286', last_update_time=1714127131354, lifecycle_stage='active', name='biLSTM2000_10class', tags={}>

In [45]:
import time
import torch.optim as optim

num_epochs = [5]
hid_dims = [64,128,256]
num_layers = [2,4,6]

for num_epoch in num_epochs:
    for hid_dim in hid_dims:
        for num_layer in num_layers:
            input_dim = len(vocab)
            emb_dim   = 300 #fasttext
            output_dim = 10 #10 types of product question
            bidirectional = True
            dropout    = 0.5  # dropout between layers

            params={"model":"biLSTM", "num_epochs":num_epoch, "input_dim":input_dim, "hid_dim":hid_dim, "emb_dim":emb_dim, "output_dim":output_dim, "num_layers":num_layer, "dropout":0.5}
            mlflow.start_run(run_name=f"biLSTM2000-{params['num_epochs']}-epochs-{params['hid_dim']}-hidden dim-{params['num_layers']}-num layers-10class")
            mlflow.log_params(params)

            print("="*5, f"biLSTM with {params['num_epochs']}-epochs-{params['hid_dim']}-hidden dim-{params['num_layers']}-num layers","="*5)

            model = LSTM(input_dim, emb_dim, hid_dim, num_layer, bidirectional, dropout, output_dim)
            model.apply(initialize_weight)
            model.embedding.weight.data = fast_embedding

            lr = 1e-3
            optimizer = optim.Adam(model.parameters(), lr=lr)
            criterion = nn.CrossEntropyLoss()

            train_losses, train_accs, val_losses, val_accs = [],[],[],[]
            best_valid_loss = float('inf')

            for epoch in range(num_epoch):
                start_time = time.time()
                
                train_loss, train_acc = train(model, train_loader, optimizer, criterion, train_loader_length)
                valid_loss, valid_acc = evaluate(model, val_loader, criterion, val_loader_length)
                
                #for plotting
                train_losses.append(train_loss)
                train_accs.append(train_acc)
                val_losses.append(valid_loss)
                val_accs.append(valid_acc)
                
                end_time = time.time()
                
                epoch_mins, epoch_secs = epoch_time(start_time, end_time)
                mlflow.log_metric(key="train_loss", value=train_loss, step=epoch)
                mlflow.log_metric(key="train_acc", value=train_acc, step=epoch)
                mlflow.log_metric(key="val_loss", value=valid_loss, step=epoch)
                mlflow.log_metric(key="val_acc", value=valid_acc, step=epoch)
                
            
                #early stopping
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    mlflow.pytorch.log_model(model, "model")
                
                print(f'Epoch: {epoch+1} | Time: {epoch_mins}m {epoch_secs}s')
                print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
                print(f'\tVal.  Loss: {valid_loss:.3f} | Val Acc: {valid_acc*100:.2f}%')
            mlflow.log_metric(key="min_val_loss", value=min(val_losses), step=epoch)    
            mlflow.end_run()
        

===== biLSTM with 5-epochs-64-hidden dim-2-num layers =====
Epoch: 1 | Time: 0m 27s
	Train Loss: 1.953 | Train Acc: 29.91%
	Val.  Loss: 1.678 | Val Acc: 42.53%




Epoch: 2 | Time: 0m 24s
	Train Loss: 1.469 | Train Acc: 50.55%
	Val.  Loss: 1.542 | Val Acc: 47.71%
Epoch: 3 | Time: 0m 24s
	Train Loss: 1.117 | Train Acc: 63.98%
	Val.  Loss: 1.524 | Val Acc: 50.39%
Epoch: 4 | Time: 0m 25s
	Train Loss: 0.843 | Train Acc: 73.24%
	Val.  Loss: 1.662 | Val Acc: 48.39%
Epoch: 5 | Time: 0m 25s
	Train Loss: 0.641 | Train Acc: 79.75%
	Val.  Loss: 1.823 | Val Acc: 48.97%
===== biLSTM with 5-epochs-64-hidden dim-4-num layers =====
Epoch: 1 | Time: 0m 46s
	Train Loss: 1.533 | Train Acc: 45.99%
	Val.  Loss: 1.842 | Val Acc: 45.02%
Epoch: 2 | Time: 0m 43s
	Train Loss: 0.867 | Train Acc: 73.16%
	Val.  Loss: 1.853 | Val Acc: 46.09%
Epoch: 3 | Time: 0m 43s
	Train Loss: 0.674 | Train Acc: 79.34%
	Val.  Loss: 1.921 | Val Acc: 46.78%
Epoch: 4 | Time: 0m 45s
	Train Loss: 0.566 | Train Acc: 82.74%
	Val.  Loss: 2.134 | Val Acc: 47.31%
Epoch: 5 | Time: 0m 45s
	Train Loss: 0.497 | Train Acc: 84.59%
	Val.  Loss: 2.175 | Val Acc: 46.88%
===== biLSTM with 5-epochs-64-hidden dim

: 