### Environment Setup

In [1]:
# Importing the required libraries.

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm import trange
import random
import warnings
warnings.filterwarnings("ignore")

### Dataset Generation

In [2]:
# Importing the DataFrame for the class "Electric Aircraft".

EA_patent_df = pd.read_csv('EA_patent_df.csv')
EA_patent_df

Unnamed: 0,label,text
0,Electric Aircraft,A vehicle comprising a first electrical generator
1,Electric Aircraft,a second electrical generator arranged the fir...
2,Electric Aircraft,a first electrical distribution system connect...
3,Electric Aircraft,electrical power within the vehicle
4,Electric Aircraft,the first electrical distribution system confi...
...,...,...
58,Electric Aircraft,induced EMF operating a starting mode
59,Electric Aircraft,a flight mode is set to fine pitch
60,Electric Aircraft,the drive mode are disconnected rotor pitch
61,Electric Aircraft,a speed is synchronised the output frequency


In [3]:
# Importing the DataFrame for the class "Fuel Cell".

FC_patent_df = pd.read_csv('FC_patent_df.csv')
FC_patent_df

Unnamed: 0,label,text
0,Fuel Cell,A solid oxide fuel cell stack comprising a plu...
1,Fuel Cell,solid oxide electrolyte members having a first...
2,Fuel Cell,a second oppositely facing surface on the firs...
3,Fuel Cell,a cathode electrode to form a fuel cell
4,Fuel Cell,each anode electrode partially defining an ano...
...,...,...
147,Fuel Cell,two second modules lie the same plane
148,Fuel Cell,a divider member is positioned the reactant
149,Fuel Cell,the outer surfaces spent a spent first reactan...
150,Fuel Cell,a spent second reactant collection to burn the...


In [4]:
# Importing the DataFrame for the class "Microgrid".

MG_patent_df = pd.read_csv('MG_patent_df.csv')
MG_patent_df

Unnamed: 0,label,text
0,Microgrid,An electrical generator network comprising a p...
1,Microgrid,electrical generators having an electrical gen...
2,Microgrid,at least one to avoid the non-detection zone
3,Microgrid,loss-of-mains relays being arranged the real p...
4,Microgrid,the electrical generator network to the at lea...
...,...,...
80,Microgrid,the propulsion system to provide flight contro...
81,Microgrid,he first group to jointly produce one or more ...
82,Microgrid,the one or more AC buses out synchronization
83,Microgrid,The propulsion system to refrain the second po...


In [5]:
# Importing the DataFrame for the class "Nuclear Reactor".

NR_patent_df = pd.read_csv('NR_patent_df.csv')
NR_patent_df

Unnamed: 0,label,text
0,Nuclear Reactor,A pressure-containing silo for one or more com...
1,Nuclear Reactor,a primary coolant circuit of a nuclear power p...
2,Nuclear Reactor,a nuclear reactor containing fuel assemblies
3,Nuclear Reactor,which are cooled pressurised coolant
4,Nuclear Reactor,the primary coolant circuit defining a release...
...,...,...
91,Nuclear Reactor,respect or a lower region
92,Nuclear Reactor,an integral self pressurized water cooled nucl...
93,Nuclear Reactor,the pressurizer being dimensioned the integral...
94,Nuclear Reactor,two abnormal positions has been displaced the ...


In [6]:
# Combined DataFrame having all the classes.

patent_df = pd.concat([EA_patent_df, FC_patent_df, MG_patent_df, NR_patent_df])
patent_df = patent_df.reset_index(drop=True)
patent_df

Unnamed: 0,label,text
0,Electric Aircraft,A vehicle comprising a first electrical generator
1,Electric Aircraft,a second electrical generator arranged the fir...
2,Electric Aircraft,a first electrical distribution system connect...
3,Electric Aircraft,electrical power within the vehicle
4,Electric Aircraft,the first electrical distribution system confi...
...,...,...
391,Nuclear Reactor,respect or a lower region
392,Nuclear Reactor,an integral self pressurized water cooled nucl...
393,Nuclear Reactor,the pressurizer being dimensioned the integral...
394,Nuclear Reactor,two abnormal positions has been displaced the ...


In [7]:
# Label Encoding for BERT Classification.

label_encoder = preprocessing.LabelEncoder()
patent_df['label']= label_encoder.fit_transform(patent_df['label'])
patent_df

Unnamed: 0,label,text
0,0,A vehicle comprising a first electrical generator
1,0,a second electrical generator arranged the fir...
2,0,a first electrical distribution system connect...
3,0,electrical power within the vehicle
4,0,the first electrical distribution system confi...
...,...,...
391,3,respect or a lower region
392,3,an integral self pressurized water cooled nucl...
393,3,the pressurizer being dimensioned the integral...
394,3,two abnormal positions has been displaced the ...


In [8]:
# Random shuffling of the rows in the DataFrame.

patent_df = patent_df.sample(frac=1).reset_index(drop=True)
patent_df

Unnamed: 0,label,text
0,0,a winding to adjust magnetic field strength
1,1,a formation within housing and defining
2,3,primary water coolant therebetween is dimensio...
3,3,the first and second silos are isolated the in...
4,1,the first voltage are also provided connectors
...,...,...
391,0,the aircraft extending the aircraft fuselage
392,1,the electrolyte to optimize the electrochemica...
393,1,one or more fluids located the downwards flowi...
394,1,steam reforming hydrocarbon fuel


### Preprocessing for BERT

In [None]:
# The code is sourced from: https://mccormickml.com/2019/07/22/BERT-fine-tuning/.

In [9]:
# Extracting the text and label values.

df = patent_df
text = df.text.values
labels = df.label.values

In [10]:
# Downloading the BertTokenizer and converting the rows to lower case.

tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
    )

In [11]:
# Preprocessing for BERT.

token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 32,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                        truncation = True
                   )

for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])

token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.LongTensor(labels)

In [12]:
# Setting the Validation Ratio.
val_ratio = 0.2

# Recommended batch size: 16, 32.
batch_size = 16

# Indices of the Train and Validation splits stratified by labels.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

# Train and Validation sets.
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

In [13]:
# Preparing DataLoader.

train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [14]:
# Defining the Performance Metrics.

def b_tp(preds, labels):
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
  preds = np.argmax(preds, axis = 1).flatten()
  labels = labels.flatten()
  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  fp = b_fp(preds, labels)
  fn = b_fn(preds, labels)
  b_accuracy = (tp + tn) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity

### Fine-tuning BERT for Patent Classification

In [None]:
# The code is sourced from: https://mccormickml.com/2019/07/22/BERT-fine-tuning/.

In [15]:
# Loading the BertForSequenceClassification model.

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 4,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning Rates (Adam): 5e-5, 3e-5, 2e-5.
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Recommended number of epochs: 2, 3, 4.
epochs = 2

for _ in trange(epochs, desc = 'Epoch'):
    
    # Training.
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        
        # Forward pass.
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        
        # Backward pass.
        train_output.loss.backward()
        optimizer.step()
        
        # Updating the tracking variables.
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # Validation.
    model.eval()

    # Tracking variables 
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            
          # Forward pass.
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculation of the Performance Metrics.
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        if b_precision != 'nan': val_precision.append(b_precision)
        if b_recall != 'nan': val_recall.append(b_recall)
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    print('\n\t  Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t  Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t  Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t  Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t  Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')

Epoch:  50%|██████████████████████████████████████▌                                      | 1/2 [01:27<01:27, 87.09s/it]


	  Train loss: 1.3129
	  Validation Accuracy: 0.3625
	  Validation Precision: 0.4374
	  Validation Recall: 1.0000
	  Validation Specificity: 0.0000



Epoch: 100%|█████████████████████████████████████████████████████████████████████████████| 2/2 [02:52<00:00, 86.30s/it]


	  Train loss: 0.9639
	  Validation Accuracy: 0.4250
	  Validation Precision: 0.5121
	  Validation Recall: 0.9444
	  Validation Specificity: 0.1086






### Predictions using fine-tuned BERT

In [17]:
# Closest sentence: A vehicle comprising a first electrical generator (Class: Electric Aircraft).
test_sent1 = 'A vehicle having an electrical generator'  

test_ids = []
test_attention_mask = []
encoding = preprocessing(test_sent1, tokenizer)
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)
with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))
if np.argmax(output.logits.cpu().numpy()).flatten().item() == 0:
    prediction = 'Electric Aircraft'
elif np.argmax(output.logits.cpu().numpy()).flatten().item() == 1:
    prediction = 'Fuel Cell'
elif np.argmax(output.logits.cpu().numpy()).flatten().item() == 2:
    prediction = 'Microgrid'
elif np.argmax(output.logits.cpu().numpy()).flatten().item() == 3:
    prediction = 'Nuclear Reactor'

print('Input Sentence: ', test_sent1)
print('Predicted Class: ', prediction)

Input Sentence:  A vehicle having an electrical generator
Predicted Class:  Electric Aircraft


In [18]:
# Closest sentence: a reactor vessel forming a ring (Class: Nuclear Reactor).
test_sent2 = 'A vessel forming a ring' 

test_ids = []
test_attention_mask = []
encoding = preprocessing(test_sent2, tokenizer)
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)
with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))
if np.argmax(output.logits.cpu().numpy()).flatten().item() == 0:
    prediction = 'Electric Aircraft'
elif np.argmax(output.logits.cpu().numpy()).flatten().item() == 1:
    prediction = 'Fuel Cell'
elif np.argmax(output.logits.cpu().numpy()).flatten().item() == 2:
    prediction = 'Microgrid'
elif np.argmax(output.logits.cpu().numpy()).flatten().item() == 3:
    prediction = 'Nuclear Reactor'

print('Input Sentence: ', test_sent2)
print('Predicted Class: ', prediction)

Input Sentence:  A vehicle having an electrical generator
Predicted Class:  Nuclear Reactor
