In [1]:
import torch, torchdata, torchtext
import torch.nn as nn
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.__version__, torchdata.__version__, torchtext.__version__

('2.2.2+cu121', '0.7.1', '0.17.2+cpu')

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
SEED = 1234 #change three times
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## 1. load dataset

In [5]:
df = pd.read_json('../data/train-qar.jsonl', lines=True, nrows=300000)
df = df[['category', 'questionText']]
df.head(5)

Unnamed: 0,category,questionText
0,Toys_and_Games,"Many have stated similar to the following: ""Pa..."
1,Health_and_Personal_Care,Will these work with the Phillips sonicare han...
2,Cell_Phones_and_Accessories,What kind of sim card it use?
3,Home_and_Kitchen,does anyone know if this dinnerware set does n...
4,Musical_Instruments,I'm thinking of getting in to modular synthesi...


In [6]:
# Product category
df['category'].unique()

array(['Toys_and_Games', 'Health_and_Personal_Care',
       'Cell_Phones_and_Accessories', 'Home_and_Kitchen',
       'Musical_Instruments', 'Baby', 'Sports_and_Outdoors',
       'Patio_Lawn_and_Garden', 'Video_Games', 'Pet_Supplies',
       'Tools_and_Home_Improvement', 'Beauty', 'Electronics',
       'Grocery_and_Gourmet_Food', 'Automotive', 'Office_Products',
       'Clothing_Shoes_and_Jewelry'], dtype=object)

In [7]:
df['category'].value_counts()[:10]

category
Electronics                    69163
Home_and_Kitchen               43733
Sports_and_Outdoors            28873
Tools_and_Home_Improvement     25503
Health_and_Personal_Care       19230
Automotive                     18688
Cell_Phones_and_Accessories    17052
Patio_Lawn_and_Garden          14845
Toys_and_Games                 12599
Office_Products                10436
Name: count, dtype: int64

In [8]:
# convert the text to numeric class
# class_mapping = {
#     'Toys_and_Games': 0,
#     'Health_and_Personal_Care': 1,
#     'Cell_Phones_and_Accessories': 2,
#     'Home_and_Kitchen': 3,
#     'Musical_Instruments': 4,
#     'Baby': 5,
#     'Sports_and_Outdoors': 6,
#     'Patio_Lawn_and_Garden': 7,
#     'Video_Games': 8,
#     'Pet_Supplies': 9,
#     'Tools_and_Home_Improvement': 10,
#     'Beauty': 11,
#     'Electronics': 12,
#     'Grocery_and_Gourmet_Food': 13,
#     'Automotive': 14,
#     'Office_Products': 15,
#     'Clothing_Shoes_and_Jewelry': 16
# }

class_mapping = {
    'Electronics': 0,
    'Home_and_Kitchen': 1,
    'Sports_and_Outdoors': 2,
    'Tools_and_Home_Improvement': 3,
    'Health_and_Personal_Care': 4,
    'Automotive': 5,
    'Cell_Phones_and_Accessories': 6,
    'Patio_Lawn_and_Garden': 7,
    'Toys_and_Games': 8,
    'Office_Products': 9
}

# Map class names to numerical labels
df['category'] = df['category'].map(class_mapping)

In [9]:
# sample 100 data in each classes
df_sample = df.groupby('category', group_keys=False).apply(lambda x: x.sample(2000))

In [10]:
# convert to lower case
df_sample['questionText']  =  df_sample['questionText'].apply(lambda x: x.lower() if isinstance(x, str) else x)

In [11]:
def data_cleaning(data):
    regex_s = re.sub("\\(.+?\\)|[\r\n|\n\r]|!", "", data)
    fin = " ".join(regex_s.split())
    return fin

In [12]:
df_sample['questionText'] = df_sample['questionText'].apply(data_cleaning)

In [13]:
df_sample = df_sample.astype({'category':int})

In [14]:
train_df, val_df = train_test_split(df_sample, test_size=0.1,stratify=df_sample['category'], random_state=SEED)

In [15]:
train_df, test_df = train_test_split(train_df, test_size=0.1, stratify=train_df['category'],random_state=SEED)

In [16]:
train_df['category'].value_counts()

category
4    1620
6    1620
1    1620
7    1620
5    1620
9    1620
2    1620
8    1620
3    1620
0    1620
Name: count, dtype: int64

In [17]:
train_df

Unnamed: 0,category,questionText
99368,4,how long does the hair need to be in order to ...
233237,6,why my straight talk galaxy s 2 serial number ...
66367,1,can this be used to sharpen santoku knives?
131690,7,can anyone tell me roughly how much and/or how...
8295,5,is this for the front or back
...,...,...
181175,8,can i install this on an 1/10 e revo and a rev...
70942,0,does this device also charges
139016,0,modem only has 1 ethernet output. i have an ac...
195660,4,how much does it weigh


## 2.Preprocessing

### Tokenizing

In [18]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
tokens    = tokenizer("What is the best product?")
tokens

['What', 'is', 'the', 'best', 'product', '?']

### Text to integers (numeral)

In [19]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data):
    for data_sample in data:
        yield tokenizer(data_sample) 
        
vocab = build_vocab_from_iterator(yield_tokens(train_df['questionText']), specials = ['<unk>', '<pad>', '<bos>', '<eos>'])
vocab.set_default_index(vocab["<unk>"])

In [20]:
vocab(['here', 'it', 'is'])

[349, 8, 7]

In [21]:
mapping = vocab.get_itos()
mapping[0]

'<unk>'

In [22]:
len(vocab)

15373

## 3. Data loader

### FastText Embedding

In [23]:
from torchtext.vocab import FastText
fast_vectors = FastText(language='simple')

In [24]:
fast_embedding = fast_vectors.get_vecs_by_tokens(vocab.get_itos()).to(device)

In [25]:
fast_embedding.shape

torch.Size([15373, 300])

In [26]:
class DataWrap(Dataset):

    def __init__(self, dataframe):
        self.dataframe = dataframe
    
    def __len__(self):
        return len(self.dataframe)
        
    def __getitem__(self, idx):
        return self.dataframe.iloc[idx]

In [27]:
train_df.iloc[0]

category                                                        4
questionText    how long does the hair need to be in order to ...
Name: 99368, dtype: object

In [28]:
train = DataWrap(train_df)
valid = DataWrap(val_df)
test = DataWrap(test_df)

In [29]:
text_pipeline  = lambda x: vocab(tokenizer(x)) #{hello world this is yt} => {'hello', 'world', 'this', 'is', 'yt'} => {4, 88, 11, 22, 6}

In [30]:
text_pipeline("I am currently teaching LSTM")

[0, 75, 1390, 0, 0]

In [31]:
from torch.utils.data   import DataLoader
from torch.nn.utils.rnn import pad_sequence

pad_idx = vocab['<pad>'] 

def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
    #criterion expects float labels
    return torch.tensor(label_list, dtype=torch.int64), pad_sequence(text_list, padding_value=pad_idx, batch_first=True)

In [32]:
batch_size = 64

train_loader = DataLoader(train, batch_size=batch_size, shuffle=True,  collate_fn=collate_batch) #num_workers to train faster
val_loader   = DataLoader(valid, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_loader  = DataLoader(test,  batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [33]:
for label, text in val_loader:
    break

In [34]:
label.shape #(batch_size, )

torch.Size([64])

In [35]:
text.shape #(batch_size, seq len)

torch.Size([64, 57])

## 4. Model (CNN)

In [36]:
class CNN(nn.Module): #more elegant version
    def __init__(self, input_dim, emb_dim, output_dim, dropout, n_filters, filter_sizes):
        
        super().__init__()
                
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, emb_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        embedded = self.embedding(text)
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

## 5. Train

In [37]:
#explicitly initialize weights for better learning
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, (nn.Conv2d, nn.Conv2d)):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.kaiming_normal_(param) 

In [38]:
def accuracy(preds, y):
    
    predicted = torch.max(preds.data, 1)[1]
    batch_corr = (predicted == y).sum()
    acc = batch_corr / len(y)
    
    return acc

In [39]:
def train(model, loader, optimizer, criterion, loader_length):
    epoch_loss = 0
    epoch_acc = 0
    model.train() #useful for batchnorm and dropout
    
    for i, (label, text) in enumerate(loader): 
        label = label.to(device) #(batch_size, )
        text = text.to(device) #(batch_size, seq len)
                
        #predict
        predictions = model(text).squeeze(1) #output by the fc is (batch_size, 1), thus need to remove this 1
        
        #calculate loss
        loss = criterion(predictions, label)
        acc  = accuracy(predictions, label)
        
        #backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
                        
    return epoch_loss / loader_length, epoch_acc / loader_length

In [40]:
def evaluate(model, loader, criterion, loader_length):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for i, (label, text) in enumerate(loader): 
            label = label.to(device) #(batch_size, )
            text  = text.to(device)  #(seq len, batch_size)

            predictions = model(text).squeeze(1) 
            
            loss = criterion(predictions, label)
            acc  = accuracy(predictions, label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / loader_length, epoch_acc / loader_length

### Actual training

In [41]:
train_loader_length = len(list(iter(train_loader)))
val_loader_length   = len(list(iter(val_loader)))
test_loader_length  = len(list(iter(test_loader)))

In [42]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [43]:
#experiment tracking
import mlflow
from mlflow.models import infer_signature
import os

# This the dockerized method.
# We build two docker containers, one for python/jupyter and another for mlflow.
# The url `mlflow` is resolved into another container within the same composer.
mlflow.set_tracking_uri("http://mlflow:5000")
# In the dockerized way, the user who runs this code will be `root`.
# The MLflow will also log the run user_id as `root`.
# To change that, we need to set this environ["LOGNAME"] to your name.
os.environ["LOGNAME"] = "noppawee"
#mlflow.create_experiment(name="noppawee-ML-project")  #create if you haven't create
mlflow.set_experiment(experiment_name="CNN2000-10class")



* 'schema_extra' has been renamed to 'json_schema_extra'
2024/04/26 16:35:14 INFO mlflow.tracking.fluent: Experiment with name 'CNN2000-10class' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/231020780919091752', creation_time=1714124114718, experiment_id='231020780919091752', last_update_time=1714124114718, lifecycle_stage='active', name='CNN2000-10class', tags={}>

In [44]:
import time
import torch.optim as optim
import torch.nn.functional as F

num_epochs = [10,20]
n_filters = [50,100,150]


for num_epoch in num_epochs:
    for n_filter in n_filters:
        
        input_dim  = len(vocab)
        emb_dim    = 300
        output_dim = 10 #17 classes

        #for cnn
        dropout = 0.5
        filter_sizes = [3, 4, 5]

        params={"model":"CNN", "num_epochs":num_epoch, "n_filters":n_filter, "filter_sizes":filter_sizes, "input_dim":input_dim, "emb_dim":emb_dim, "output_dim":output_dim, "dropout":0.5}
        mlflow.start_run(run_name=f"CNN2000-{params['num_epochs']}-epochs-{params['n_filters']}-n_filters")
        mlflow.log_params(params)

        print("="*5, f"CNN with {params['num_epochs']}-epochs-{params['n_filters']}-n_filters","="*5)

        model = CNN(input_dim, emb_dim, output_dim, dropout, n_filter, filter_sizes).to(device)
        model.apply(initialize_weights)
        model.embedding.weight.data = fast_embedding

        lr=0.05
        #training hyperparameters
        optimizer = optim.SGD(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()

        train_losses, train_accs, val_losses, val_accs = [],[],[],[]
        best_valid_loss = float('inf')

        for epoch in range(num_epoch):
            start_time = time.time()
                
            train_loss, train_acc = train(model, train_loader, optimizer, criterion, train_loader_length)
            valid_loss, valid_acc = evaluate(model, val_loader, criterion, val_loader_length)
                
            #for plotting
            train_losses.append(train_loss)
            train_accs.append(train_acc)
            val_losses.append(valid_loss)
            val_accs.append(valid_acc)
                
            end_time = time.time()
                
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            mlflow.log_metric(key="train_loss", value=train_loss, step=epoch)
            mlflow.log_metric(key="train_acc", value=train_acc, step=epoch)
            mlflow.log_metric(key="val_loss", value=valid_loss, step=epoch)
            mlflow.log_metric(key="val_acc", value=valid_acc, step=epoch)
                
            
            #early stopping
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                mlflow.pytorch.log_model(model, "model")
                
            print(f'Epoch: {epoch+1} | Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
            print(f'\tVal.  Loss: {valid_loss:.3f} | Val Acc: {valid_acc*100:.2f}%')
        mlflow.log_metric(key="min_val_loss", value=min(val_losses), step=epoch)    
        mlflow.end_run()
        

===== CNN with 10-epochs-50-n_filters =====
Epoch: 1 | Time: 0m 22s
	Train Loss: 2.236 | Train Acc: 17.89%
	Val.  Loss: 2.050 | Val Acc: 29.98%




Epoch: 2 | Time: 0m 19s
	Train Loss: 1.998 | Train Acc: 29.90%
	Val.  Loss: 1.904 | Val Acc: 37.84%
Epoch: 3 | Time: 0m 20s
	Train Loss: 1.885 | Train Acc: 34.61%
	Val.  Loss: 1.828 | Val Acc: 40.62%
Epoch: 4 | Time: 0m 19s
	Train Loss: 1.798 | Train Acc: 38.59%
	Val.  Loss: 1.777 | Val Acc: 41.65%
Epoch: 5 | Time: 0m 21s
	Train Loss: 1.728 | Train Acc: 41.70%
	Val.  Loss: 1.736 | Val Acc: 43.36%
Epoch: 6 | Time: 0m 21s
	Train Loss: 1.669 | Train Acc: 43.63%
	Val.  Loss: 1.714 | Val Acc: 43.26%
Epoch: 7 | Time: 0m 20s
	Train Loss: 1.616 | Train Acc: 46.09%
	Val.  Loss: 1.692 | Val Acc: 45.02%
Epoch: 8 | Time: 0m 20s
	Train Loss: 1.567 | Train Acc: 47.70%
	Val.  Loss: 1.669 | Val Acc: 44.34%
Epoch: 9 | Time: 0m 21s
	Train Loss: 1.523 | Train Acc: 49.32%
	Val.  Loss: 1.664 | Val Acc: 45.21%
Epoch: 10 | Time: 0m 20s
	Train Loss: 1.483 | Train Acc: 50.78%
	Val.  Loss: 1.643 | Val Acc: 44.78%
===== CNN with 10-epochs-100-n_filters =====
Epoch: 1 | Time: 0m 27s
	Train Loss: 2.250 | Train Acc

: 