In [1]:
# Link- https://towardsdatascience.com/multi-class-text-classification-with-deep-learning-using-bert-b59ca2f5c613
# Here I have classified Company Industry Domain(147) from its Description from Crunchbase data
import pandas as pd
import numpy as np
import nltk
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import cross_val_score
import re
import codecs
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

In [2]:
df = pd.read_csv('../data/50k_data_crunchbase.csv')
df=df[['company_description','company_industry']]
df.head()

Unnamed: 0,company_description,company_industry
0,"Founded in 2005 by Art Howe, CIU Networks INC ...",Information Technology and Services
1,Pulling your hair out over mindless call after...,Information Technology and Services
2,Creative Facility Design can recommend alterna...,Information Technology and Services
3,PVM Innvensys Pvt Ltd is an information techno...,Information Technology and Services
4,Eleview International Inc is an information te...,Information Technology and Services


In [3]:
def  clean_text(df, text_field, new_text_field_name):
    df[new_text_field_name] = df[text_field].str.lower()
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: re.sub(r"\d+", "", elem))
    return df

data_clean = clean_text(df, 'company_description', 'text_clean')
data_clean['company_description'] = data_clean['text_clean']
df = data_clean[['company_description','company_industry']]

In [4]:
possible_labels = df.company_industry.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'Information Technology and Services': 0,
 'E-Learning': 1,
 'Electrical/Electronic Manufacturing': 2,
 'Information Services': 3,
 'Food & Beverages': 4,
 'Internet': 5,
 'Education Management': 6,
 'Public Relations and Communications': 7,
 'Marketing and Advertising': 8,
 'Higher Education': 9,
 'Supermarkets': 10,
 'Professional Training & Coaching': 11,
 'Hospital & Health Care': 12,
 'Sporting Goods': 13,
 'Hospitality': 14,
 'Investment Banking': 15,
 'Staffing and Recruiting': 16,
 'Insurance': 17,
 'Research': 18,
 'Biotechnology': 19,
 'Computer Software': 20,
 'Public Policy': 21,
 'Leisure Travel & Tourism': 22,
 'Health Wellness and Fitness': 23,
 'Computer Games': 24,
 'Entertainment': 25,
 'Apparel & Fashion': 26,
 'Medical Devices': 27,
 'Consumer Services': 28,
 'Venture Capital & Private Equity': 29,
 'Online Media': 30,
 'Business Supplies and Equipment': 31,
 'Retail': 32,
 'Real Estate': 33,
 'Government Administration': 34,
 'Building Materials': 35,
 'Investment

In [5]:
df['label'] = df.company_industry.replace(label_dict)

In [6]:
df

Unnamed: 0,company_description,company_industry,label
0,founded in by art howe ciu networks inc is th...,Information Technology and Services,0
1,pulling your hair out over mindless call after...,Information Technology and Services,0
2,creative facility design can recommend alterna...,Information Technology and Services,0
3,pvm innvensys pvt ltd is an information techno...,Information Technology and Services,0
4,eleview international inc is an information te...,Information Technology and Services,0
...,...,...,...
50069,office of rep james p mcgovern is a representa...,Legislative Office,145
50070,the joint economic committee jec was created w...,Legislative Office,145
50071,the international court of justice icj is the ...,Judiciary,146
50072,middle district of florida web site provides i...,Judiciary,146


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df.label.values)

df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

df.groupby(['company_industry', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,company_description
company_industry,label,data_type,Unnamed: 3_level_1
Accounting,90,train,261
Accounting,90,val,46
Airlines/Aviation,74,train,122
Airlines/Aviation,74,val,22
Alternative Dispute Resolution,144,train,4
...,...,...,...
Wine and Spirits,62,val,12
Wireless,38,train,106
Wireless,38,val,19
Writing and Editing,122,train,56


In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)
                                          
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].company_description.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].company_description.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [9]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [11]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-4, 
                  eps=1e-8)
                  
epochs = 3

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [12]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
device

device(type='cuda')

In [14]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals
    
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'../data/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=2661.0, style=ProgressStyle(description_wid…


Epoch 1
Training loss: 2.3239291766360934
Validation loss: 1.8044585245720883
F1 Score (Weighted): 0.49523712577689033


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=2661.0, style=ProgressStyle(description_wid…


Epoch 2
Training loss: 1.5222494926321826
Validation loss: 1.6755151549235303
F1 Score (Weighted): 0.5276699886389506


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=2661.0, style=ProgressStyle(description_wid…


Epoch 3
Training loss: 1.0323462704212671
Validation loss: 1.6833917353698549
F1 Score (Weighted): 0.5409804527303835



In [34]:
torch.cuda.empty_cache()

In [15]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

model.load_state_dict(torch.load('../data/finetuned_BERT_epoch_3.model', map_location=torch.device('cpu')))

_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Class: Information Technology and Services
Accuracy: 524/915

Class: E-Learning
Accuracy: 44/64

Class: Electrical/Electronic Manufacturing
Accuracy: 47/94

Class: Information Services
Accuracy: 3/35

Class: Food & Beverages
Accuracy: 50/78

Class: Internet
Accuracy: 186/474

Class: Education Management
Accuracy: 40/87

Class: Public Relations and Communications
Accuracy: 26/34

Class: Marketing and Advertising
Accuracy: 305/432

Class: Higher Education
Accuracy: 51/75

Class: Supermarkets
Accuracy: 0/2

Class: Professional Training & Coaching
Accuracy: 11/33

Class: Hospital & Health Care
Accuracy: 77/134

Class: Sporting Goods
Accuracy: 12/15

Class: Hospitality
Accuracy: 26/49

Class: Investment Banking
Accuracy: 12/25

Class: Staffing and Recruiting
Accuracy: 51/70

Class: Insurance
Accuracy: 69/81

Class: Research
Accuracy: 18/51

Class: Biotechnology
Accuracy: 72/101

Class: Computer Software
Accuracy: 311/622

Class: Public Policy
Accuracy: 0/7

Class: Leisure Travel & Tourism
A