https://www.analyticsvidhya.com/blog/2020/07/transfer-learning-for-nlp-fine-tuning-bert-for-text-classification/
### Model from:

https://github.com/allenai/scibert

In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import re
import seaborn as sns
from tqdm import tqdm
import nltk
import random
nltk.download('punkt')
from nltk.tokenize import word_tokenize,sent_tokenize
import pickle

train_example_names = [fn.split('.')[0] for fn in os.listdir('data/train')]
test_example_names = [fn.split('.')[0] for fn in os.listdir('data/test')]

metadata = pd.read_csv('data/train.csv')
docIdx = train_example_names.copy()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ozano\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Create dataframe for tokens and targets

In [2]:
import unidecode

match_puncs_re = r"([.,!?()\-;\[\]+\\\/@:<>#_{}&%'*=" + r'"' + r"|])"
match_puncs_re = re.compile(match_puncs_re)

def load_train_example_by_name(name):
    doc_path = os.path.join('data/train', name + '.json')
    with open(doc_path) as f:
        data = json.load(f)
    return data

def load_test_example_by_name(name):
    doc_path = os.path.join('data/test', name + '.json')
    with open(doc_path) as f:
        data = json.load(f)
    return data

def text_cleaning_for_bert(text):
    # Keeps puncs, pads them with whitespaces

    text = text.replace('^', ' ')
    text = unidecode.unidecode(text)

    # Remove websites
    text = ' '.join(['specialwebsite' if 'http' in t or 'www' in t else t for t in text.split(' ') ])

    text = match_puncs_re.sub(r' \1 ', text)

    # remove extra spaces
    text = re.sub("\s+"," ", text)

    return text

def text_cleaning_for_label(text):
    text = text.replace('^', ' ')
    text = unidecode.unidecode(text)

    text = re.sub('[^A-Za-z0-9]+', ' ', str(text)).strip() # remove unnecessary literals

    # Remove websites
    text = ' '.join(['specialwebsite' if 'http' in t or 'www' in t else t for t in text.split(' ') ])

    # remove extra spaces
    text = re.sub("\s+"," ", text)

    return text.lower()

In [3]:
import string

def text_cleaning(text):
    text = re.sub('[^A-Za-z0-9]+', ' ', str(text)).strip() # remove unnecessary literals

    # remove extra spaces
    text = re.sub("\s+"," ", text)

    return text.lower()

##### STEP 1: Make a list of the known labels provided to us

temp_1 = [text_cleaning(x) for x in metadata['dataset_label']]
temp_2 = [text_cleaning(x) for x in metadata['dataset_title']]
temp_3 = [text_cleaning(x) for x in metadata['cleaned_label']]

existing_labels = temp_1 + temp_2 + temp_3
existing_labels = [l.lower() for l in existing_labels]
existing_labels = list(set(existing_labels))
# Sort labels by length in descending order
existing_labels = sorted(existing_labels, key = len, reverse= True)

## Make sentences

In [None]:
pos_sentences = []
neg_sentences = []

def process_doc(doc_id):
    doc_json = load_train_example_by_name(doc_id)
    doc_text = ' '.join([sec['text'] for sec in doc_json])

    # Tokenize sentencewise
    sentences = sent_tokenize(doc_text)

    adni_count = 0
    for sentence in sentences:
        clean_sentence = text_cleaning(sentence)

        has_label = False
        label_is_adni = False
        for clean_label in existing_labels:
            if clean_label in clean_sentence:
                has_label = True

                if 'adni' in clean_label or 'alzheimer' in clean_label:
                    adni_count += 1
                    label_is_adni = True

                break

        if has_label and (adni_count <= 2 or not label_is_adni):
            pos_sentences.append(sentence)
        else:
            if random.uniform(0, 1) < 0.25:
                neg_sentences.append(sentence)

## Generate and Save Sentences

In [None]:
import pickle
assert len(docIdx) > 0

pos_sentences = []
neg_sentences = []

for doc_id in tqdm(docIdx):
    process_doc(doc_id)

with open(f'data/bert_ner_sentences/pos.pkl', 'wb') as f:
    pickle.dump(pos_sentences, f)

with open(f'data/bert_ner_sentences/neg.pkl', 'wb') as f:
    pickle.dump(neg_sentences, f)

print(f'pos size: {len(pos_sentences)}')
print(f'neg size: {len(neg_sentences)}')

## Load Sentences

In [None]:
import pickle

with open(f'data/bert_ner_sentences/pos.pkl', 'rb') as f:
    pos_sentences = pickle.load(f)

with open(f'data/bert_ner_sentences/neg.pkl', 'rb') as f:
    neg_sentences = pickle.load(f)

print(f'pos size: {len(pos_sentences)}')
print(f'neg size: {len(neg_sentences)}')

In [None]:
pos_sentences_processed = []
neg_sentences_processed = []
pos_labels = []
neg_labels = []

n_broken_sent = 0
n_pos_no_label = 0

def convert_tokens(text):
    if is_acronym(text):
        return 'ACRONYM'
    return text

def is_acronym(text):
    if len(text) < 3:
        return False
    if text.isupper():
        return True

def is_text_broken(tokens):
    # Some texts are like 'p a dsdv a d a ds f b', remove them
    if len(tokens) == 0:
        return True

    if len(tokens) < 50:
        return False

    one_char_token_ratio = len([l for l in tokens if len(l) == 1]) / len(tokens)
    return one_char_token_ratio > 0.15

def split_to_smaller_sent(tokens, s_size, overlap_size):
    # output sentences will be s_size + overlap_size long
    small_sents = []

    if len(tokens) <= s_size:
        return [tokens]

    n_parts = len(tokens) // s_size
    if len(tokens) % s_size != 0:
        n_parts += 1

    for i_part in range(n_parts):
        start_i = i_part * s_size
        if i_part > 0:
            start_i -= overlap_size

        end_i = min(len(tokens), (i_part + 1) * s_size)

        small_sents.append(tokens[start_i: end_i])

    return small_sents

def join_tuple_tokens(tuples):
    return ' '.join([t[1] for t in tuples])

def get_index(lst, el):
    idx = []
    for i, lst_el in enumerate(lst):
        if el in lst_el:
            idx.append(i)

    return idx

def process_pos_sentence(sentence):
    global n_broken_sent
    global last_doc_labels

    bert_sentence = text_cleaning_for_bert(sentence)
    label_sentence = text_cleaning_for_label(sentence)

    if is_text_broken(label_sentence.split(' ')): # Can't use bert cleaning for this, because all punc.s are padded with spaces
        n_broken_sent += 1
        return
    
    bert_tokens = bert_sentence.split(' ')
    ### STEP 1: Split into fixed sized sentences ###
    for small_sentence_tokens in split_to_smaller_sent(bert_tokens, s_size = 125, overlap_size = 25):

        small_bert_sentence = ' '.join(small_sentence_tokens)

        # Need to remove punc.s and uppercase letters to find labels
        small_label_sentence = text_cleaning_for_label(small_bert_sentence)

        has_label = False
        sent_labels = []
        ### STEP 2: Match labels ###
        # Check if contains labels
        for clean_label in existing_labels:
            if clean_label in small_label_sentence:
                has_label = True

                # Remove label from the text, to only match the largest label
                small_label_sentence = small_label_sentence.replace(clean_label, '')
                sent_labels.append(clean_label)

        small_sent_targets = ['O' for _ in range(len(small_sentence_tokens))]

        if has_label:
            # Tokenize labels for matching
            sent_label_tokens = [l.split(' ') for l in sent_labels]

            # Get index, token tuples for clean tokens. Indices are for raw tokens
            small_sent_tuples = [(i, token.lower()) for i, token in enumerate(small_sentence_tokens) if text_cleaning_for_label(token) != '']

            ### STEP 3: Set corresponding targets for each label ###
            # Target: (B, I, O), Label: adni
            for l in sent_labels:
                l_tokens = l.split(' ')
                small_sent_joined = [join_tuple_tokens(small_sent_tuples[i: i + len(l_tokens)]) for i in range(len(small_sent_tuples) - len(l_tokens) + 1)]

                label_start_idx = get_index(small_sent_joined, l) # list of indices
                for label_start_i in label_start_idx:
                    label_end_i = label_start_i + len(l_tokens) - 1

                    target_start_i = small_sent_tuples[label_start_i][0]
                    target_end_i = small_sent_tuples[label_end_i][0]

                    # Do not use the same tokens for multiple labels
                    #small_sent_tuples = small_sent_tuples[:label_start_i] + small_sent_tuples[label_end_i:]

                    try:
                        if small_sent_targets[target_start_i] == 'O': # If not was already labeled
                            small_sent_targets[target_start_i] = 'B'
                            if target_end_i - target_start_i > 0:
                                for i in range(target_start_i+1, target_end_i+1):
                                    small_sent_targets[i] = 'I'

                    except Exception as e:
                        print('DEBUG')
                        print(small_sentence_tokens)
                        print(len(small_sentence_tokens))
                        print(len(small_sent_targets))
                        print(target_start_i)
                        print(small_sent_joined)
                        print('DEBUG')
                        raise e
        
        ### STEP 4: Add sentence output to lists ###
        if has_label:
            pos_sentences_processed.append([convert_tokens(t) for t in small_sentence_tokens])
            pos_labels.append(small_sent_targets)
        """else:
            neg_sentences_processed.append(small_sentence_tokens)
            neg_labels.append(small_sent_targets)"""

def process_neg_sentence(sentence):
    global n_broken_sent
    
    bert_sentence = text_cleaning_for_bert(sentence)
    label_sentence = text_cleaning_for_label(sentence)

    if is_text_broken(label_sentence.split(' ')): # Can't use bert cleaning for this, because all punc.s are padded with spaces
        n_broken_sent += 1
        return

    bert_tokens = bert_sentence.split(' ')
    
    ### STEP 1: Split into fixed sized sentences ###
    for small_sentence_tokens in split_to_smaller_sent(bert_tokens, s_size = 125, overlap_size = 25):
        small_sent_targets = ['O' for _ in range(len(bert_tokens))]

        neg_sentences_processed.append([convert_tokens(t) for t in small_sentence_tokens])
        neg_labels.append(small_sent_targets)

#process_pos_sentence(pos_sentences[2472])

## Create NER Dataset and Save

In [None]:
assert len(pos_sentences) > 0

pos_sentences_processed = []
neg_sentences_processed = []
pos_labels = []
neg_labels = []

n_pos_no_label = 0
n_broken_sent = 0

for sent in tqdm(pos_sentences):
    process_pos_sentence(sent)

for sent in tqdm(neg_sentences):
    process_neg_sentence(sent)

import pickle

with open(f'data/bert_ner_data/pos.pkl', 'wb') as f:
    pickle.dump(pos_sentences_processed, f)

with open(f'data/bert_ner_data/neg.pkl', 'wb') as f:
    pickle.dump(neg_sentences_processed, f)

with open(f'data/bert_ner_data/pos_labels.pkl', 'wb') as f:
    pickle.dump(pos_labels, f)

with open(f'data/bert_ner_data/neg_labels.pkl', 'wb') as f:
    pickle.dump(neg_labels, f)


print('')
print(f'broken sentences: {n_broken_sent}')
print(f'n_pos_no_label: {n_pos_no_label}')
print(f'pos_proc size: {len(pos_sentences_processed)}')
print(f'neg_proc size: {len(neg_sentences_processed)}')

## Load NER Dataset

In [4]:
import pickle

with open(f'data/bert_ner_data/pos.pkl', 'rb') as f:
    pos_sentences_processed = pickle.load(f)

with open(f'data/bert_ner_data/neg.pkl', 'rb') as f:
    neg_sentences_processed = pickle.load(f)

with open(f'data/bert_ner_data/pos_labels.pkl', 'rb') as f:
    pos_labels = pickle.load(f)

with open(f'data/bert_ner_data/neg_labels.pkl', 'rb') as f:
    neg_labels = pickle.load(f)

print(f'pos size: {len(pos_sentences_processed)}')
print(f'neg size: {len(neg_sentences_processed)}')
print(f'pos label size: {len(pos_labels)}')
print(f'neg label size: {len(neg_labels)}')

pos size: 32016
neg size: 1032513
pos label size: 32016
neg label size: 1032513


## Augmentation

In [5]:
# Load dataset names
dataset_names = list(pd.read_csv('data/kaggle_22k_datasets.csv').title.values) +\
                list(pd.read_csv('data/kaggle_800_datasets.csv').title.values)

dataset_names = [n for n in dataset_names if len(n) > 15]
dataset_names = [text_cleaning_for_label(n) for n in dataset_names]
dataset_names = [n for n in dataset_names if len(n.split(' ')) < 8]

connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'from', 'the'}

def replace_target(x, lst):
    if x['label'].iloc[0] == 'O':
        # if not a dataset name, do not augment
        lst.append(x)
    else:
        random_name_tokens = random.choice(dataset_names).split(' ')
        random_name_tokens = [r[0].upper() + r[1:] if not r.lower() in connection_tokens else r for r in random_name_tokens]

        new_x = pd.DataFrame()
        # Replace tokens
        new_x['token'] = random_name_tokens
        new_x['label'] = 'I'
        new_x.loc[new_x.index == 0, 'label'] = 'B'
        lst.append(new_x)

def augment_sentence(tokens, labels, augment_chance = 0.9):
    if random.uniform(0,1) > augment_chance:
        # No augmentation
        return tokens, labels

    df_pieces = []
    sent_df = pd.DataFrame({'token': tokens, 'label': labels})
    sent_df['label_o'] = sent_df.label == 'O'

    gb = sent_df.groupby((sent_df['label_o'].shift() != sent_df['label_o']).cumsum())
    for name, group in gb:
        replace_target(group, df_pieces)

    sent_df = pd.concat(df_pieces, ignore_index = True, axis = 0)

    return list(sent_df.token.values), list(sent_df.label.values)

In [6]:
pos_sentences_processed_aug = []
pos_labels_aug = []

for _ in range(10):
    for s_tokens, s_labels in tqdm(zip(pos_sentences_processed, pos_labels), total = len(pos_labels)):
        aug_tokens, aug_labels = augment_sentence(s_tokens, s_labels)
        pos_sentences_processed_aug.append(aug_tokens)
        pos_labels_aug.append(aug_labels)

pos_sentences_processed = pos_sentences_processed_aug
pos_labels = pos_labels_aug

100%|███████████████████████████████████████████████████████████████████████████| 32016/32016 [02:45<00:00, 193.38it/s]
100%|███████████████████████████████████████████████████████████████████████████| 32016/32016 [02:24<00:00, 221.15it/s]
100%|███████████████████████████████████████████████████████████████████████████| 32016/32016 [02:29<00:00, 213.79it/s]
100%|███████████████████████████████████████████████████████████████████████████| 32016/32016 [02:22<00:00, 224.12it/s]
100%|███████████████████████████████████████████████████████████████████████████| 32016/32016 [02:18<00:00, 231.83it/s]
100%|███████████████████████████████████████████████████████████████████████████| 32016/32016 [02:04<00:00, 257.06it/s]
100%|███████████████████████████████████████████████████████████████████████████| 32016/32016 [02:05<00:00, 254.17it/s]
100%|███████████████████████████████████████████████████████████████████████████| 32016/32016 [02:07<00:00, 251.99it/s]
100%|███████████████████████████████████

In [7]:
pos_sentences_processed_aug[0]

['In',
 'fact',
 ',',
 'organizations',
 'are',
 'now',
 'identifying',
 'digital',
 'skills',
 'or',
 'computer',
 'literacy',
 'as',
 'one',
 'of',
 'their',
 'core',
 'values',
 'for',
 'employability',
 '(',
 'such',
 'as',
 'the',
 'US',
 'Department',
 'of',
 'Education',
 ',',
 'the',
 'US',
 'Department',
 'of',
 'commerce',
 ',',
 'the',
 'ACRONYM',
 '3d',
 'Flash',
 'Lidar',
 'Space',
 'Laser',
 'Phase',
 'I',
 'and',
 'the',
 'European',
 'Commission',
 ')',
 '.',
 '']

## Create Training Data

In [8]:
from sklearn.model_selection import train_test_split
import numpy as np

neg_size = 600000
neg_idx = np.random.permutation(len(neg_labels))
neg_sentences_processed = [neg_sentences_processed[i] for i in neg_idx[:neg_size]]
neg_labels = [neg_labels[i] for i in neg_idx[:neg_size]]

sentences = pos_sentences_processed + neg_sentences_processed
labels = pos_labels + neg_labels

del pos_sentences_processed
del neg_sentences_processed
del pos_labels
del neg_labels

"""print('Splitting data...')
train_sents, val_sents, train_labels, val_labels = train_test_split(sentences, labels, test_size=0.20, random_state=42)"""

"print('Splitting data...')\ntrain_sents, val_sents, train_labels, val_labels = train_test_split(sentences, labels, test_size=0.20, random_state=42)"

## Fine Tune Bert

In [9]:
import os
import math
import random
import csv
import sys
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report
import statistics as stats
from bert_sklearn import BertTokenClassifier 

In [10]:
model = BertTokenClassifier(bert_model='bert-base-cased',
                             num_mlp_hiddens= 500,
                             max_seq_length=150, 
                             epochs=1,
                             #gradient accumulation
                             gradient_accumulation_steps=4,
                             learning_rate=3e-5,
                             train_batch_size=8,#batch size for training
                             eval_batch_size=8, #batch size for evaluation
                             validation_fraction=0.15, 
                             #ignore the tokens with label ‘O’
                             ignore_label=['O'])

Building sklearn token classifier...


In [None]:
model.fit(sentences, labels)

  return np.array(X)
100%|████████████████████████████████████████████████████████████████████████| 213450/213450 [00:03<00:00, 63214.25B/s]


Loading bert-base-cased model...


100%|████████████████████████████████████████████████████████████████| 435779157/435779157 [03:08<00:00, 2309556.37B/s]
100%|█████████████████████████████████████████████████████████████████████████████| 433/433 [00:00<00:00, 108206.25B/s]


Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 782136, validation data size: 138024


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:1005.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
Training  :  51%|██████████████████████▎                     | 198471/391068 [8:09:34<7:01:26,  7.62it/s, loss=0.00272]

In [None]:
# save model to disk
savefile='data/sklearn_bert_ner_cased.bin'
model.save(savefile)

## Load model

In [None]:
from bert_sklearn import load_model
bert_model = load_model(r'data/sklearn_bert_ner_cased.bin')

In [None]:
val_preds = model.predict(val_sents)

In [None]:
ex_i = 101
pd.DataFrame({'token': val_sents[ex_i], 'pred':val_preds[ex_i]})

In [None]:
#ex_sent = val_sents[101]
ex_sent = neg_sentences_processed[10]
ex_pred = model.predict([ex_sent])

pd.DataFrame({'token': ex_sent, 'pred':ex_pred[0]})

In [None]:
pos_sentences_processed[10]