# Part 5 -- Model v1 Evaluation -- Base LLM model on the "Industry" feature

In [1]:
import evaluate
import json
import numpy as np
import os
import pandas as pd
import re
import torch
import torch.nn.functional as F

from datasets import Dataset, DatasetDict
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, EarlyStoppingCallback, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Load exported Full and Train datasets to Pandas DataFrames

In [3]:
df = pd.read_csv(os.path.join('data', 'dataset.csv'))
df_train = pd.read_csv(os.path.join('data', 'train.csv'))

In [4]:
model_name = 'ft1'
text_cols = ['industry']
label_col = 'category'

In [5]:
df = df[['id']+text_cols+[label_col]]
df_train = df_train[['id']+text_cols+[label_col]].dropna(subset=text_cols).reset_index(drop=True)

In [6]:
num_labels = df[label_col].dropna().nunique()

## Extract Evaluation data as the set of samples which were not used in Training

In [7]:
df_eval = df.loc[~df['id'].isin(df_train['id'].tolist()), ['id']+text_cols+[label_col]].dropna(subset=[label_col]).reset_index(drop=True)

## Load stored label encoder

In [8]:
le = None
le_inv = None

with open(os.path.join('data', 'labels.json'), 'r') as j:
    json_contents = json.loads(j.read())

    le = {k: int(v) for k, v in json_contents.items()}
    le_inv = {int(v): k for k, v in json_contents.items()}

## Data transformation based on Label (category) and Text (input) feature(s)

In [9]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

def preprocess_text(sen):
    # Removing HTML tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
    # Remove redundant whitespaces at the beginning and at the end
    sentence = sentence.strip()

    return sentence


for col in text_cols:
    texts = []
    for sen in list(df_train[col].astype(str)):
        texts.append(preprocess_text(sen))
    df_train.loc[:, col] = pd.Series((sen for sen in texts))

for col in text_cols:
    texts = []
    for sen in list(df_eval[col].astype(str)):
        texts.append(preprocess_text(sen))
    df_eval.loc[:, col] = pd.Series((sen for sen in texts))

In [10]:
df_train['text'] = df_train[text_cols].apply(lambda x: ' [SEP] '.join(x.values.astype(str)), axis=1)
df_eval['text'] = df_eval[text_cols].apply(lambda x: ' [SEP] '.join(x.values.astype(str)), axis=1)

In [11]:
df_train = df_train.dropna(subset=['text']).reset_index(drop=True)
df_eval = df_eval.dropna(subset=['text']).reset_index(drop=True)

In [12]:
df_train.loc[:, 'label'] = df_train[label_col].apply(lambda x: le[x])
df_eval.loc[:, 'label'] = df_eval[label_col].apply(lambda x: le[x])

## Tokenize text data

In [13]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [14]:
ds_train = Dataset.from_pandas(df_train[['label', 'text']])
ds_eval = Dataset.from_pandas(df_eval[['label', 'text']])

ds = DatasetDict({
    'train': ds_train,
    'eval': ds_eval,
})

In [15]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

ds_tokenized = ds.map(tokenize_function)

Map: 100%|█████████████████████████████████████████████████████| 26494/26494 [00:06<00:00, 3911.58 examples/s]
Map: 100%|█████████████████████████████████████████████████████| 46954/46954 [00:12<00:00, 3844.71 examples/s]


## Configure model training parameters

In [16]:
configuration = AutoConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
configuration.hidden_dropout_prob = 0.2
configuration.attention_probs_dropout_prob = 0.2

# Load fine-tuned Bert model
model = AutoModelForSequenceClassification.from_pretrained(f'output/{model_name}', config=configuration)

# Move Bert model to the proper device
model.to(device)

# Switch Bert model to evaluation mode
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.2, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [17]:
training_args = TrainingArguments(
    push_to_hub=False,
    output_dir="output",
    num_train_epochs=3,
    per_device_train_batch_size=13,
    per_device_eval_batch_size=13*int(5*len(df_eval)/len(df_train)),

    save_strategy='steps',
    eval_strategy="steps",
    save_steps=200,
    eval_steps=200,
    logging_steps=200,
    
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

In [18]:
accuracy_metric = evaluate.load('accuracy')
precision_metric = evaluate.load('precision')
recall_metric = evaluate.load('recall')
f1_metric = evaluate.load('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    return {
        'Accuracy': accuracy_metric.compute(predictions=predictions, references=labels)['accuracy'],
        'F1': f1_metric.compute(predictions=predictions, references=labels, average='weighted')['f1'],
        'Precision': precision_metric.compute(predictions=predictions, references=labels, average='weighted')['precision'],
        'Recall': recall_metric.compute(predictions=predictions, references=labels, average='weighted')['recall'],
    }

In [19]:
early_stop = EarlyStoppingCallback(5, 1e-4)

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_tokenized['train'],
    eval_dataset=ds_tokenized['eval'],
    compute_metrics=compute_metrics,
    callbacks=[early_stop],
)

## Model evaluation on labeled samples which are not part of the training dataset

In [21]:
trainer.evaluate()

{'eval_loss': 0.0004986003623344004,
 'eval_model_preparation_time': 0.0023,
 'eval_Accuracy': 0.9999361076798569,
 'eval_F1': 0.9999361072531957,
 'eval_Precision': 0.999936152611165,
 'eval_Recall': 0.9999361076798569,
 'eval_runtime': 1185.0644,
 'eval_samples_per_second': 39.621,
 'eval_steps_per_second': 0.381}

## Manual category prediction of 10 random unlabeled samples

### Extract Test data as the set of unlabeled samples

In [22]:
df_test = df.loc[df[label_col].isna(), ['id']+text_cols].dropna().reset_index(drop=True)

### Data transformation based on Label (category) and Text (input) feature(s)

In [23]:
for col in text_cols:
    texts = []
    for sen in list(df_test[col].astype(str)):
        texts.append(preprocess_text(sen))
    df_test.loc[:, col] = pd.Series((sen for sen in texts))

df_test['text'] = df_test[text_cols].apply(lambda x: ' [SEP] '.join(x.values.astype(str)), axis=1)
df_test = df_test.dropna(subset=['text']).reset_index(drop=True)
df_test['label'] = None

### Perform predictions

In [24]:
for i in df_test.sample(n=10).index:
    sample = df_test.iloc[i]

    # Tokenize text features
    tokenized = tokenizer(sample['text'], padding='max_length', truncation=True, return_tensors="pt").to(device)

    with torch.no_grad():
        # Predict label
        outputs=model(**tokenized)
        probabilities = F.softmax(outputs.logits, dim=-1)
    
        # Store prediction
        df_test.loc[i, 'pred'] = le_inv[torch.argmax(probabilities, dim=-1).item()]

    print(df_test.loc[i, text_cols+['pred']])
    print()
    print()

industry           legal services
pred        Professional Services
Name: 782405, dtype: object


industry          human resources
pred        Professional Services
Name: 3769074, dtype: object


industry                            retail
pred        Commercial Services & Supplies
Name: 4767413, dtype: object


industry         consumer services
pred        Consumer Discretionary
Name: 127665, dtype: object


industry    wine and spirits
pred        Consumer Staples
Name: 3427313, dtype: object


industry      food beverages
pred        Consumer Staples
Name: 4202031, dtype: object


industry    information technology and services
pred                     Information Technology
Name: 4314532, dtype: object


industry                  internet
pred        Information Technology
Name: 4998583, dtype: object


industry                      maritime
pred        Transportation & Logistics
Name: 2526855, dtype: object


industry    mechanical or industrial engineering
pred              Comm