<a href="https://colab.research.google.com/github/okechukwuchude/Automating-Medical-Coding/blob/main/Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os
import json
import stanza
import csv
import pandas as pd

# Build an English clinical pipeline: the 'mimic' package for the default
# processors and the 'i2b2' model for NER.
stanza.download('en', package='mimic', processors={'ner': 'i2b2'}) # fetch the model files (files that already exist are reported, see log)
nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'}) # module-level pipeline used by process_text_with_stanza

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-03-28 22:28:25 INFO: Downloaded file to /home/chudeo/stanza_resources/resources.json
2024-03-28 22:28:25 INFO: Downloading these customized packages for language: en (English)...
| Processor       | Package        |
------------------------------------
| tokenize        | mimic          |
| pos             | mimic_charlm   |
| lemma           | mimic_nocharlm |
| depparse        | mimic_charlm   |
| ner             | i2b2           |
| backward_charlm | mimic          |
| pretrain        | mimic          |
| forward_charlm  | mimic          |

2024-03-28 22:28:25 INFO: File exists: /home/chudeo/stanza_resources/en/tokenize/mimic.pt
2024-03-28 22:28:26 INFO: File exists: /home/chudeo/stanza_resources/en/pos/mimic_charlm.pt
2024-03-28 22:28:26 INFO: File exists: /home/chudeo/stanza_resources/en/lemma/mimic_nocharlm.pt
2024-03-28 22:28:27 INFO: File exists: /home/chudeo/stanza_resources/en/depparse/mimic_charlm.pt
2024-03-28 22:28:28 INFO: File exists: /home/chudeo/stanza_resources/e

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-03-28 22:28:29 INFO: Downloaded file to /home/chudeo/stanza_resources/resources.json
2024-03-28 22:28:30 INFO: Loading these models for language: en (English):
| Processor | Package        |
------------------------------
| tokenize  | mimic          |
| pos       | mimic_charlm   |
| lemma     | mimic_nocharlm |
| depparse  | mimic_charlm   |
| ner       | i2b2           |

2024-03-28 22:28:30 INFO: Using device: cuda
2024-03-28 22:28:30 INFO: Loading: tokenize
2024-03-28 22:28:33 INFO: Loading: pos
2024-03-28 22:28:33 INFO: Loading: lemma
2024-03-28 22:28:33 INFO: Loading: depparse
2024-03-28 22:28:34 INFO: Loading: ner
2024-03-28 22:28:34 INFO: Done loading processors!


In [5]:
# Accumulator of per-note dicts; extract_info_from_json appends one entry per note.
# Re-running this cell empties it.
extracted_texts = []

# Running number of sentences seen so far; incremented by process_text_with_stanza
# and read by extract_info_from_json as the base for sentence IDs.
global_sentence_count = 0

In [6]:
def process_text_with_stanza(text):
    """Run ``text`` through the module-level Stanza pipeline ``nlp``.

    Side effect: the module-level ``global_sentence_count`` grows by the
    number of sentences found in ``text``.

    Returns:
        The document object produced by ``nlp(text)``.
    """
    global global_sentence_count
    document = nlp(text)
    sentence_total = len(document.sentences)
    global_sentence_count = global_sentence_count + sentence_total
    return document

In [7]:
def _copy_annotations(note):
    """Return a list with the annotation fields of one raw note dict (missing fields become None)."""
    fields = ('begin', 'end', 'code', 'code_system', 'description', 'type', 'covered_text')
    return [
        {field: annotation.get(field, None) for field in fields}
        for annotation in note.get('annotations', [])
    ]


def _label_sentence(sent, annotations):
    """Return (tokens, labels) for one Stanza sentence, tagging annotated words with 'B'/'I'."""
    tokens = [word.text for word in sent.words]
    labels = ['O'] * len(tokens)  # everything starts outside any annotation

    for annotation in annotations:
        begin = annotation['begin']
        end = annotation['end']

        # An annotation without character offsets cannot be located in the text;
        # skip it instead of comparing None with integer offsets.
        if begin is None or end is None:
            continue

        word_indices = find_word_indices(sent.words, begin, end)
        if word_indices:
            start_idx, end_idx = word_indices[0], word_indices[-1]
            labels[start_idx] = 'B'
            for idx in range(start_idx + 1, end_idx + 1):
                labels[idx] = 'I'

    return tokens, labels


def extract_info_from_json(json_file_path, hadm_id_set):
    """Read one annotated JSON file and label every sentence of its notes.

    Each note with non-empty text is run through ``process_text_with_stanza``;
    every sentence gets a list of tokens, a parallel list of 'O'/'B'/'I'
    labels derived from the note's annotations, and a sentence ID that
    continues from ``global_sentence_count`` as it was when this call started.
    Each sentence is also printed.

    Args:
        json_file_path: Path of the JSON file; its top level is read for the
            keys 'hadm_id' and 'notes'.
        hadm_id_set: Set that receives the file's 'hadm_id' (mutated in place).

    Returns:
        None. One dict per note is appended to the module-level
        ``extracted_texts`` list. If 'hadm_id' is absent a warning is printed
        and nothing is recorded.
    """
    # Sentence IDs for this file continue after all sentences counted so far.
    last_sentence_id = global_sentence_count

    # Decode explicitly as UTF-8 so character offsets do not depend on the
    # platform's default encoding, and close the file before the slow NLP work.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    hadm_id = data.get('hadm_id', None)
    if hadm_id is None:
        print(f"Warning: 'hadm_id' not found in {json_file_path}")
        return None

    hadm_id_set.add(hadm_id)

    for note in data.get('notes', []):
        note_info = {
            'hadm_id': hadm_id,
            'note_id': note.get('note_id', None),
            'category': note.get('category', None),
            'description': note.get('description', None),
            'annotations': _copy_annotations(note),
            'text': note.get('text', None),
        }

        # Notes without text are still recorded, just without sentence data.
        if note_info['text']:
            processed_text = process_text_with_stanza(note_info['text'])
            note_info['processed_text'] = processed_text

            sentence_info = []
            for sent in processed_text.sentences:
                tokens, labels = _label_sentence(sent, note_info['annotations'])

                # sent.index is the sentence's position within this note's document.
                sentence_id = last_sentence_id + sent.index

                print('Sentence ID:', sentence_id)
                print('Sentence:', ' '.join(tokens))
                print('Labels:', ' '.join(labels))
                print()

                sentence_info.append({
                    'sentence_id': sentence_id,
                    'words': tokens,
                    'labels': labels
                })

            note_info['sentence_info'] = sentence_info
            last_sentence_id += len(processed_text.sentences)

        extracted_texts.append(note_info)


In [8]:
def find_word_indices(words, begin, end):
    """Return the indices of the words that overlap the character span [begin, end).

    A word overlaps when it starts before ``end`` and ends after ``begin``, so
    partially covered words at either edge are included, and the span is still
    found when ``begin`` or ``end`` falls on whitespace between words.

    Args:
        words: Sequence of objects exposing ``start_char`` and ``end_char``.
        begin: Start character offset of the span, or None.
        end: End character offset of the span, or None.

    Returns:
        List of matching indices in ascending order; the first and last
        entries are the span's first and last word. Empty when nothing
        overlaps, when an offset is None, or when the span is empty.
    """
    # Missing or degenerate offsets cannot match anything (and None cannot be
    # ordered against integers).
    if begin is None or end is None or end <= begin:
        return []

    word_indices = []
    for idx, word in enumerate(words):
        start_char, end_char = word.start_char, word.end_char
        # Skip words that carry no character offsets.
        if start_char is None or end_char is None:
            continue
        if start_char < end and end_char > begin:
            word_indices.append(idx)

    return word_indices

In [9]:
def generate_labels(words, annotations, start_token_idx, end_token_idx):
    """Return an all-'O' label list with one entry per word.

    Every annotation's 'code' key is looked up (a missing key raises
    KeyError) but the value is not used: per-code 'B-<code>'/'I-<code>'
    tagging is currently disabled, so ``start_token_idx`` and
    ``end_token_idx`` have no effect.
    """
    outside_tags = ['O' for _ in range(len(words))]

    for entry in annotations:
        _code = entry['code']  # looked up only; no label is derived from it

    return outside_tags

In [10]:
def search_files(folder_path):
    """Process every ``*.json`` file found under ``folder_path`` (recursively).

    Each file is announced, handed to ``extract_info_from_json`` together with
    a shared set of admission IDs, and followed by a separator line. The number
    of distinct admission IDs collected is printed at the end.
    """
    seen_hadm_ids = set()

    # Lazily yield the full path of each JSON file in walk order.
    json_paths = (
        os.path.join(dirpath, fname)
        for dirpath, _subdirs, fnames in os.walk(folder_path)
        for fname in fnames
        if fname.endswith('.json')
    )

    for path in json_paths:
        print("Processing:", path)
        extract_info_from_json(path, seen_hadm_ids)
        print("=" * 50)

    print("Total unique hadm_id count:", len(seen_hadm_ids))




In [14]:
import gc, torch

# Run a garbage-collection pass first so unreferenced Python objects are dropped.
gc.collect()

# Then ask PyTorch to release GPU memory held in its cache.
torch.cuda.empty_cache()

In [15]:
# Root folder that is walked recursively for annotated JSON files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
main_folder_path = '/home/chudeo/project/with_text'

# Process every JSON file under the root; per-note results accumulate in `extracted_texts`.
search_files(main_folder_path)

Processing: /home/chudeo/project/with_text/gold/Profee/ICD-9/1.0/126586-ICD-9.json
Sentence ID: 16932
Sentence: Baseline artifact .
Labels: O O O

Sentence ID: 16933
Sentence: Probable sinus tachycardia with atrial premature beats .
Labels: O B I O O O O O

Sentence ID: 16934
Sentence: Low limb lead voltage .
Labels: O O O O O

Sentence ID: 16935
Sentence: Leftward axis .
Labels: O O O

Sentence ID: 16936
Sentence: Late R wave progression .
Labels: O O O O O

Sentence ID: 16937
Sentence: Consider anteroseptal myocardial infarction .
Labels: O O O O O

Sentence ID: 16938
Sentence: Other ST - T wave abnormalities .
Labels: O B I I I I O

Sentence ID: 16939
Sentence: Since the previous tracing of [ **2120-1-25 **] ST- T wave abnormalities may be more prominent and axis is less leftward .
Labels: O O O O O O O O O O O O O O O O O O O O O O

Sentence ID: 16940
Sentence: Atrial premature beats persist .
Labels: O O O O O

Sentence ID: 16941
Sentence: Sinus tachycardia with occasional aberran

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.36 GiB (GPU 0; 11.92 GiB total capacity; 2.91 GiB already allocated; 2.08 GiB free; 3.28 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [16]:
import pandas as pd

def create_dataframe(extracted_texts):
    """Flatten per-note sentence information into one row per token.

    Notes that have no 'sentence_info' key are skipped. The resulting frame
    has the columns hadm_id, note_id, sentence_id, word and label.
    """

    def _token_rows(note):
        # Yield one record per (word, label) pair of every sentence in the note.
        admission, note_key = note['hadm_id'], note['note_id']
        for sentence in note['sentence_info']:
            tokens = sentence['words']
            tags = sentence['labels']
            sentence_key = sentence['sentence_id']
            for token, tag in zip(tokens, tags):
                yield {
                    'hadm_id': admission,
                    'note_id': note_key,
                    'sentence_id': sentence_key,
                    'word': token,
                    'label': tag,
                }

    rows = []
    for note in extracted_texts:
        if 'sentence_info' in note:
            rows.extend(_token_rows(note))

    return pd.DataFrame(rows)

# One row per token, built from whatever is currently in `extracted_texts`.
# NOTE(review): depends on kernel state left by the extraction run — confirm it completed.
df = create_dataframe(extracted_texts)
print(df)

        hadm_id  note_id  sentence_id         word label
0        126586   245505            0     Baseline     O
1        126586   245505            0     artifact     O
2        126586   245505            0            .     O
3        126586   245505            1     Probable     O
4        126586   245505            1        sinus     B
...         ...      ...          ...          ...   ...
625505   168976   593737        33845         Code     O
625506   168976   593737        33845       status     O
625507   168976   593737        33845            :     O
625508   168976   593737        33845  Disposition     O
625509   168976   593737        33845            :     O

[625510 rows x 5 columns]


In [17]:
# Frequency of each label value across all token rows (shows the class balance).
label_counts = df['label'].value_counts()
print(label_counts)

label
O    607895
B      8820
I      8795
Name: count, dtype: int64


In [18]:
# Keep only the sentence ID, token and label columns.
token_df = df.drop(columns=['hadm_id', 'note_id'])

In [19]:
token_df

Unnamed: 0,sentence_id,word,label
0,0,Baseline,O
1,0,artifact,O
2,0,.,O
3,1,Probable,O
4,1,sinus,B
...,...,...,...
625505,33845,Code,O
625506,33845,status,O
625507,33845,:,O
625508,33845,Disposition,O


In [20]:
# Use the plural column names 'words' / 'labels' for the token frame.
token_df = token_df.rename(columns={'word': 'words', 'label': 'labels'})

In [21]:
token_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,Baseline,O
1,0,artifact,O
2,0,.,O
3,1,Probable,O
4,1,sinus,B


In [22]:
# Persist the token-level frame (sentence_id, words, labels) without the index column.
token_df.to_csv('33k_sentence.csv', index=False)

In [None]:
# Features: sentence ID and token; target: the label of that token.
x = token_df[['sentence_id', 'words']]
y = token_df['labels']

SPLITTING DATA

In [None]:
from sklearn.model_selection import train_test_split

# Each row of x / y is a single token, so splitting rows at random would scatter
# the tokens of one sentence across train and test. Split the sentence IDs
# instead and keep every sentence whole; the seed makes the split repeatable.
train_ids, test_ids = train_test_split(
    x['sentence_id'].unique(), test_size=0.3, random_state=42
)
in_train = x['sentence_id'].isin(train_ids)
x_train, x_test = x[in_train], x[~in_train]
y_train, y_test = y[in_train], y[~in_train]

In [None]:
# Reassemble features and labels into frames with the columns
# sentence_id / words / labels for training and evaluation.
train_data = pd.DataFrame({'sentence_id': x_train['sentence_id'], 'words': x_train['words'], 'labels': y_train})
test_data = pd.DataFrame({'sentence_id': x_test['sentence_id'], 'words': x_test['words'], 'labels': y_test})

In [None]:
train_data

MODEL TRAINING

BERT

In [None]:
pip install simpletransformers

In [None]:
from simpletransformers.ner import NERModel, NERArgs

In [None]:
# Distinct label values present in the data, in order of first appearance;
# passed to NERModel as its label set.
label = token_df['labels'].unique().tolist()
label

In [None]:
# Training configuration for the BERT baseline.
args = NERArgs()
args.num_train_epochs = 10
args.learning_rate = 1e-4
args.overwrite_output_dir = True  # allow re-running into an existing output directory
args.train_batch_size = 32
args.eval_batch_size = 32

In [None]:
# BERT (cased) token classifier over the labels above; CUDA is switched off here.
model = NERModel('bert', 'bert-base-cased', labels=label, args=args, use_cuda = False)

In [None]:
from sklearn.metrics import accuracy_score

# Train on train_data; test_data is supplied as eval_data and accuracy_score as
# an extra metric named 'acc'.
# NOTE(review): confirm the extra metric receives flat label arrays — sklearn's
# accuracy_score does not accept per-sentence nested lists.
model.train_model(train_data, eval_data = test_data, acc = accuracy_score)

In [None]:
# Evaluate on the held-out frame; returns the metrics, raw model outputs and predictions.
result, model_outputs, preds_list = model.eval_model(test_data)

In [None]:
result

In [None]:
# Smoke test: tag a single example sentence.
prediction, model_output = model.predict(['There is also intraventricular extension into the ipsilateral and contralateral lateral ventricles.'])

In [None]:
prediction

BioBERT

In [None]:
# Label set for the BioBERT runs (same values as the token frame's labels column).
bio_label = token_df['labels'].unique().tolist()
bio_label

In [None]:
# Training configuration for BioBERT (lower learning rate than the BERT run above).
args = NERArgs()
args.num_train_epochs = 10  # Number of training epochs
args.learning_rate = 2e-5   # Learning rate
args.overwrite_output_dir = True  # allow re-running into an existing output directory
args.train_batch_size = 32
args.eval_batch_size = 32

In [None]:
# Create the NER model from the BioBERT checkpoint (loaded with the 'bert'
# model type); CUDA is switched off here. This rebinds `model`.
model = NERModel('bert', 'dmis-lab/biobert-base-cased-v1.2', args=args, labels=bio_label, use_cuda=False)


In [None]:
# Train the BioBERT model on the training frame
model.train_model(train_data)

# Evaluate on the held-out frame; metrics are kept separately in bio_result
bio_result, model_outputs, preds_list = model.eval_model(test_data)

In [None]:
bio_result

In [None]:

# Smoke test on a de-identified clinical fragment.
predictions, raw_outputs = model.predict(["T[**Name2 (NI) **] was extubated on [**10-24**] without"])
print(predictions)


Hyperparameter Tuning

In [None]:
# Each row of x / y is a single token, so the split is made over sentence IDs:
# every sentence ends up whole in exactly one of train / validation / test.
sentence_ids = x['sentence_id'].unique()
train_ids, test_ids = train_test_split(sentence_ids, test_size=0.3, random_state=42)
train_ids, val_ids = train_test_split(train_ids, test_size=0.2, random_state=42)

in_train = x['sentence_id'].isin(train_ids)
in_val = x['sentence_id'].isin(val_ids)
in_test = x['sentence_id'].isin(test_ids)

x_train, y_train = x[in_train], y[in_train]
x_val, y_val = x[in_val], y[in_val]
x_test, y_test = x[in_test], y[in_test]

# Build the training, validation, and test data
train_data = pd.DataFrame({'sentence_id': x_train['sentence_id'], 'words': x_train['words'], 'labels': y_train})
val_data = pd.DataFrame({'sentence_id': x_val['sentence_id'], 'words': x_val['words'], 'labels': y_val})
test_data = pd.DataFrame({'sentence_id': x_test['sentence_id'], 'words': x_test['words'], 'labels': y_test})


In [None]:
from sklearn.metrics import accuracy_score
from seqeval.metrics import accuracy_score as seq_accuracy_score
from simpletransformers.ner import NERModel, NERArgs
import itertools
import logging


# Define hyperparameters to search over
param_grid = {
    'learning_rate': [1e-5, 2e-5, 3e-5],
    'num_train_epochs': [1, 2, 3],
    'train_batch_size': [16, 32, 64]
}

best_accuracy = 0
best_params = {}

# Number of evaluations without improvement after which training stops early
early_stop_patience = 3

# Configure logging. Handlers are attached only once so that re-running this
# cell does not duplicate every log line.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if not logger.handlers:
    formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
    for handler in (logging.FileHandler('training.log'), logging.StreamHandler()):
        handler.setFormatter(formatter)
        logger.addHandler(handler)

# Manual grid search. Configurations are compared on the VALIDATION split only;
# the test split stays untouched so it can give an unbiased final estimate.
for lr, epochs, batch_size in itertools.product(
    param_grid['learning_rate'],
    param_grid['num_train_epochs'],
    param_grid['train_batch_size'],
):
    logger.info(f"Training with lr={lr}, epochs={epochs}, batch_size={batch_size}")

    # Fresh arguments and a fresh model for every configuration
    args = NERArgs()
    args.learning_rate = lr
    args.num_train_epochs = epochs
    args.train_batch_size = batch_size
    args.overwrite_output_dir = True      # every configuration reuses the output directory
    args.evaluate_during_training = True  # required for the built-in early stopping
    args.use_early_stopping = True
    args.early_stopping_patience = early_stop_patience
    model = NERModel('bert', 'dmis-lab/biobert-base-cased-v1.2', args=args, labels=bio_label, use_cuda=False)

    # A single call trains for `num_train_epochs` epochs; validation data drives early stopping
    model.train_model(train_data, eval_data=val_data)

    # Token accuracy on the validation split. The seqeval metric accepts the
    # per-sentence label lists that eval_model hands to extra metrics.
    val_result, _, _ = model.eval_model(val_data, acc=seq_accuracy_score)
    accuracy = val_result['acc']
    logger.info(f"Accuracy: {accuracy:.4f}")

    # Update best parameters and accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = {'learning_rate': lr, 'num_train_epochs': epochs, 'train_batch_size': batch_size}

# Log best hyperparameters and (validation) accuracy
logger.info("Best Parameters: %s", str(best_params))
logger.info("Best Accuracy: %.4f", best_accuracy)


Regularization

In [None]:

# Same BioBERT setup as before, now with weight decay as regularization
args = NERArgs()
args.num_train_epochs = 5
args.learning_rate = 2e-5
args.overwrite_output_dir = True
args.train_batch_size = 32
args.eval_batch_size = 32
args.weight_decay = 0.01  # Example regularization parameter

# Initialize the model with regularization (rebinds `model`)
model = NERModel('bert', 'dmis-lab/biobert-base-cased-v1.2', args=args, labels=bio_label, use_cuda=False)

# Train the model
model.train_model(train_data)

# Evaluate the model on the test frame and show the metrics
result, model_outputs, preds_list = model.eval_model(test_data)
print(result)


In [None]:
result

Fine-tuning BERT for named-entity recognition

In [None]:
token_df

In [None]:

# Number the distinct labels in order of first appearance and keep both directions.
unique_labels = token_df.labels.unique()
label2id = {name: position for position, name in enumerate(unique_labels)}
id2label = dict(enumerate(unique_labels))
label2id

#### **Preparing the dataset and dataloader**

In [None]:
from transformers import BertTokenizer, BertForMaskedLM

In [None]:
# Settings for the plain PyTorch fine-tuning loop.
MAX_LEN = 128            # sequence length each example is padded / truncated to
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10       # max_norm passed to gradient clipping in train()
# Uncased BERT vocabulary, matching the 'bert-base-uncased' checkpoint loaded later.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
import torch
from torch.utils.data import Dataset

class dataset(Dataset):
    """Sentence-level token-classification dataset built from a token-per-row frame.

    The frame must have the columns 'sentence_id', 'words' and 'labels' with
    one token per row. Rows are regrouped into sentences; each item is one
    sentence encoded as WordPiece ids of length ``max_len`` with matching
    attention mask and label ids.

    Args:
        dataframe: Token-level frame as described above.
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``sep_token``, ``unk_token`` and ``pad_token_id``.
        max_len: Fixed length of every returned sequence (special tokens included).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Restore token order before grouping, since the frame may have been shuffled.
        # assumes the index reflects the original token order — TODO confirm
        ordered = dataframe.sort_index(kind='stable')
        self.sentences = [
            (group['words'].astype(str).tolist(), group['labels'].tolist())
            for _, group in ordered.groupby('sentence_id', sort=False)
        ]
        self.len = len(self.sentences)

    def __getitem__(self, index):
        words, word_labels = self.sentences[index]
        tok = self.tokenizer

        # [CLS] opens the sequence and is labelled 'O'.
        tokens, labels = [tok.cls_token], ['O']
        for word, label in zip(words, word_labels):
            # Split the word into WordPieces; fall back to [UNK] if nothing is produced.
            pieces = tok.tokenize(word) or [tok.unk_token]
            tokens.extend(pieces)
            # The first piece keeps the word's label. Continuation pieces of a 'B'
            # word become 'I' so one word does not open several entities.
            follow = 'I' if (label == 'B' and 'I' in label2id) else label
            labels.extend([label] + [follow] * (len(pieces) - 1))

        # Truncate so that the closing [SEP] always fits.
        tokens = tokens[:self.max_len - 1] + [tok.sep_token]
        labels = labels[:self.max_len - 1] + ['O']

        real_len = len(tokens)
        pad_len = self.max_len - real_len

        # Pad with the tokenizer's own pad id; the mask marks real tokens only.
        input_ids = tok.convert_tokens_to_ids(tokens) + [tok.pad_token_id] * pad_len
        attention_mask = [1] * real_len + [0] * pad_len
        # Padded positions get 'O'; they are excluded wherever the mask is applied.
        label_ids = [label2id[name] for name in labels] + [label2id['O']] * pad_len

        return {
            'ids': torch.tensor(input_ids, dtype=torch.long),
            'mask': torch.tensor(attention_mask, dtype=torch.long),
            'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        # Number of sentences, not number of token rows.
        return self.len


In [None]:
# from torch.nn.utils.rnn import pad_sequence

# def collate_fn(batch):
#     # Get input_ids, masks, and targets from the batch
#     ids = [sample['ids'] for sample in batch]
#     masks = [sample['mask'] for sample in batch]
#     targets = [sample['targets'] for sample in batch]

#     # Pad sequences to the length of the longest sequence in the batch
#     ids = pad_sequence(ids, batch_first=True, padding_value=tokenizer.pad_token_id)
#     masks = pad_sequence(masks, batch_first=True, padding_value=0)  # Padding token ID is assumed to be 0
#     targets = pad_sequence(targets, batch_first=True, padding_value=-100)  # Replace -100 with appropriate padding value for labels

#     # Ensure alignment between input_ids, masks, and targets
#     masks = masks.bool()  # Convert to boolean mask
#     targets = targets * masks.long()  # Apply mask to targets

#     return {
#         'ids': ids,
#         'mask': masks,
#         'targets': targets
#     }


In [None]:
def collate_fn(batch):
    """Stack a list of samples into batch tensors, padding to the longest 'ids'.

    Each sample is a dict of 1-D tensors under 'ids', 'mask' and 'targets'.
    'ids' and 'mask' are right-padded with 0 and 'targets' with -100, every
    field to the length of the longest 'ids' tensor in the batch.
    """
    columns = {key: [sample[key] for sample in batch] for key in ('ids', 'mask', 'targets')}

    # Target width is defined by the longest input-id sequence.
    width = max(len(seq) for seq in columns['ids'])

    def _pad_and_stack(key, fill):
        padded = [
            torch.nn.functional.pad(seq, (0, width - len(seq)), value=fill)
            for seq in columns[key]
        ]
        return torch.stack(padded, dim=0)

    return {
        'ids': _pad_and_stack('ids', 0),
        'mask': _pad_and_stack('mask', 0),
        'targets': _pad_and_stack('targets', -100),
    }


In [None]:
# token_df holds one token per row, so the split is made over sentence IDs:
# every sentence stays whole and lands in exactly one of the two frames.
train_ids, test_ids = train_test_split(
    token_df['sentence_id'].unique(), test_size=0.2, random_state=42
)
train_df = token_df[token_df['sentence_id'].isin(train_ids)]
test_df = token_df[token_df['sentence_id'].isin(test_ids)]

print("FULL Dataset: {}".format(token_df.shape))
print("TRAIN Dataset: {}".format(train_df.shape))
print("TEST Dataset: {}".format(test_df.shape))

# Create dataset instances
training_set = dataset(train_df, tokenizer, MAX_LEN)
testing_set = dataset(test_df, tokenizer, MAX_LEN)

In [None]:
training_set[0]

In [None]:
training_set[0]["ids"]

In [None]:
# Sanity check: show the first 30 token strings of example 0 next to their decoded labels
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"][:30]), training_set[0]["targets"][:30]):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))

Now, let's define the corresponding PyTorch dataloaders:

In [None]:
from torch.utils.data import Dataset, DataLoader

# DataLoader settings for training: shuffled, loaded in the main process.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# DataLoader settings for evaluation.
# NOTE(review): shuffling is also enabled here — confirm that is intended.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Batches are assembled by collate_fn, which pads every field to a common length.
training_loader = DataLoader(training_set, collate_fn=collate_fn, **train_params)
testing_loader = DataLoader(testing_set, collate_fn=collate_fn, **test_params)

#### **Defining the model**

In [None]:
from transformers import BertTokenizer, BertConfig, BertForTokenClassification


from torch import cuda
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

In [None]:
# Token-classification head on top of uncased BERT, sized to the label set and
# carrying both label mappings in its config. This rebinds `model`.
model = BertForTokenClassification.from_pretrained('bert-base-uncased', 
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
# Move the model to the selected device.
model.to(device)

#### **Training the model**


In [None]:
# Sanity check before training: push one example through the model as a
# batch of size 1 and look at the loss.
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]  # first output element is the loss because labels were passed
initial_loss

In [None]:
# Second output element: the logits of the sanity-check forward pass; inspect their shape.
tr_logits = outputs[1]
tr_logits.shape

In [None]:
# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    """Run one pass over ``training_loader`` and update the module-level ``model``.

    For every batch the loss is back-propagated, gradients are clipped to
    ``MAX_GRAD_NORM`` and ``optimizer`` takes a step. The running loss is
    printed every 100 steps; the mean loss and mean per-batch accuracy (over
    positions where the attention mask is 1) are printed at the end.

    Args:
        epoch: Index of the current epoch. Not used inside the function.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1)  # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)
        # only positions with attention-mask 1 are compared
        active_accuracy = mask.view(-1) == 1
        active_targets = torch.masked_select(flattened_targets, active_accuracy)
        active_predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass: clear old gradients, compute new ones, THEN clip them,
        # so the clipped gradients are the ones the optimizer step uses
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [None]:
# Run the configured number of training epochs, announcing each one (1-based).
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

#### **Evaluating the model**

In [None]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without computing gradients.

    The running loss is printed every 100 batches; the mean loss and the mean
    per-batch accuracy (over positions where the attention mask is 1) are
    printed at the end.

    Returns:
        (labels, predictions): two flat lists of label names, decoded through
        ``id2label``, covering every position whose attention mask is 1.
    """
    # put model in evaluation mode
    model.eval()

    running_loss, running_accuracy = 0, 0
    step_count = 0
    gold_ids, predicted_ids = [], []

    with torch.no_grad():
        for step, batch in enumerate(testing_loader):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            running_loss += outputs.loss.item()
            step_count += 1

            if step % 100 == 0:
                print(f"Validation loss per 100 evaluation steps: {running_loss / step_count}")

            # keep only the positions the attention mask marks as real
            keep = mask.view(-1) == 1
            batch_gold = torch.masked_select(targets.view(-1), keep)
            batch_pred = torch.masked_select(
                torch.argmax(outputs.logits.view(-1, model.num_labels), axis=1), keep
            )

            gold_ids.extend(batch_gold.tolist())
            predicted_ids.extend(batch_pred.tolist())

            running_accuracy += accuracy_score(batch_gold.cpu().numpy(), batch_pred.cpu().numpy())

    # decode label ids back to label names
    labels = [id2label[label_id] for label_id in gold_ids]
    predictions = [id2label[label_id] for label_id in predicted_ids]

    eval_loss = running_loss / step_count
    eval_accuracy = running_accuracy / step_count
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [None]:
# Evaluate the fine-tuned model; yields flat lists of gold and predicted label names.
labels, predictions = valid(model, testing_loader)

In [None]:
from seqeval.metrics import classification_report

# All evaluated tokens are passed to the report as one single sequence.
print(classification_report([labels], [predictions]))