Based on:  
https://www.tensorflow.org/text/tutorials/classify_text_with_bert

In [68]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

In [2]:
os.chdir('..')
os.chdir('..')
os.chdir('..')

In [3]:
pwd

'D:\\coding\\springboard\\capstone\\springboard-capstone-project\\app'

In [4]:
tfhub_handle_encoder = r"https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
tfhub_handle_preprocess = r"https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

In [5]:
tfhub_handle_preprocess

'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [6]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [7]:
bert_model = hub.KerasLayer(tfhub_handle_encoder)

### Test 1

In [9]:
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_mask', 'input_type_ids', 'input_word_ids']
Shape      : (1, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [10]:
bert_results = bert_model(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.76262885  0.99280983 -0.1861186   0.3667385   0.15233707  0.65504473
  0.9681154  -0.948627    0.00216191 -0.9877732   0.06842708 -0.97630596]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[-0.28946298  0.3432125   0.3323147  ...  0.2130084   0.710207
  -0.0577119 ]
 [-0.28741992  0.3198101  -0.23018652 ...  0.5845503  -0.21329783
   0.7269202 ]
 [-0.66157013  0.68876815 -0.87432945 ...  0.10877208 -0.2617318
   0.47855297]
 ...
 [-0.22561139 -0.2892564  -0.07064369 ...  0.4756601   0.83277094
   0.40025324]
 [-0.2982423  -0.27473155 -0.05450507 ...  0.48849753  1.0955356
   0.18163389]
 [-0.44378242  0.00930706  0.07223725 ...  0.17290062  1.1833242
   0.0789794 ]]


### Test 2

In [8]:
nyt = pd.read_csv('data/nyt/nyt.csv', encoding="ISO-8859-1").reset_index()
nyt_label = pd.read_csv('data/nyt/nyt_labelled.csv')

In [9]:
nyt = nyt.merge(nyt_label[['web_url', 'label']], how='inner', left_on='web_url', right_on='web_url')

In [10]:
nyt = nyt[~nyt.label.isnull()]

In [11]:
nyt.head()

Unnamed: 0,index,web_url,pub_date,organizations,subjects,headline,snippet,label
20,20,https://www.nytimes.com/2001/07/29/business/le...,2001-07-29T05:00:00+0000,SOCIAL SECURITY,Music | Medicine and Health | Insurance | Chil...,Letters: The Battle Lines of the Mommy Wars,"Readers thoughts on ""Is My Mom Better Than You...",S
52,52,https://www.nytimes.com/2001/08/16/business/me...,2001-08-16T05:00:00+0000,Industry Standard,Computers and the Internet | Boards of Directo...,Industry Standard Becomes Latest Casualty in D...,"The Industry Standard, a magazine that for muc...",O
92,92,https://www.nytimes.com/2001/08/31/business/gr...,2001-08-31T05:00:00+0000,Main Street,Computers and the Internet | Interest Rates | ...,Greenspan Says Market Swings Pose Challenge to...,"Alan Greenspan, chairman of the Federal Reserv...",O
118,118,https://www.nytimes.com/2001/09/17/business/fe...,2001-09-17T05:00:00+0000,WORLD TRADE CENTER,Interest Rates | Banks and Banking | Stocks an...,Fed Cuts Rates and Says It Will Work to Stabil...,The Federal Reserve cut its benchmark interest...,O
171,171,https://www.nytimes.com/2001/10/07/business/ne...,2001-10-07T05:00:00+0000,WORLD TRADE CENTER,Automobile Insurance and Liability | Unemploym...,"New York City May Issue More Notes, Mayor Says","DIARY New York City May Issue More Notes, Mayo...",O


In [12]:
nyt['headline'] = nyt.headline.str.replace("â\x80\x99","'").str.replace("â\x80\x98", "'")
nyt['snippet'] = nyt.snippet.str.replace("â\x80\x99","'").str.replace("â\x80\x98", "'")
nyt['text'] = nyt.headline.fillna('') + '. ' + nyt.snippet.fillna('')

In [13]:
nyt.text[-5:].map(print)

James Bond, Meet Jeff Bezos: Amazon Makes $8.45 Billion Deal for MGM. Metro-Goldwyn-Mayer, while diminished, commanded a premium price, with Amazon seeking to bolster its crucial Prime membership offering.
Private Inequity: How a Powerful Industry Conquered the Tax System. The I.R.S. almost never audits private equity firms, even as whistle-blowers have filed claims alleging illegal tax avoidance.
MacKenzie Scott Gives Away Another $2.74 Billion Even as Her Wealth Grows. Ms. Scott made a new round of grants, to 286 organizations. Her net worth, which Forbes estimates at $60 billion, keeps rising, thanks to Amazon stock.
Mobile Home Owners Fear Evictions as Pandemic Protections End. Many who have struggled to keep up with mortgage payments are at the whims of a few financing firms that dominate lending in this market.
Marco Gobbetti, Burberry's chief executive, quits to join Ferragamo.. Shares fell 8 percent on news that Mr. Gobbetti, who joined in 2017 with a goal to take Burberry furt

79909    None
80053    None
80072    None
80125    None
80170    None
Name: text, dtype: object

In [14]:
text_test = nyt.text.iloc[:2]
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_mask', 'input_type_ids', 'input_word_ids']
Shape      : (2, 128)
Word Ids   : [  101  4144  1024  1996  2645  3210  1997  1996 20565  5233  1012  8141]
Input Mask : [1 1 1 1 1 1 1 1 1 1 1 1]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [15]:
bert_results = bert_model(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(2, 512)
Pooled Outputs Values:[ 0.9099364   0.7996026   0.16957957  0.14161697 -0.19116503  0.26365104
  0.99871004 -0.23314759  0.30156675 -0.9566957  -0.601556   -0.97719204]
Sequence Outputs Shape:(2, 128, 512)
Sequence Outputs Values:[[-0.01004024  0.1613256   1.2124585  ... -1.3291919   0.03985997
   0.93941563]
 [ 1.1092067  -0.34477034  0.35124606 ... -0.65430474 -0.16948977
   0.40493447]
 [ 0.55424756  0.4374459   0.3233995  ... -0.7873844  -0.70253783
   1.0780007 ]
 ...
 [-0.617119    1.1918814   0.4726373  ... -0.4906849   0.9460207
  -0.5603203 ]
 [-0.5424152  -0.28814012 -0.44744694 ... -1.0085776  -0.05442642
   0.67153376]
 [-0.68385816 -0.6620978   0.6406372  ... -0.27576295  0.40056187
   0.43350038]]


### Model

In [16]:
def build_classifier_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=False, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(4, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [17]:
classifier_model = build_classifier_model()

In [18]:
classifier_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
preprocessing (KerasLayer)      {'input_mask': (None 0           text[0][0]                       
__________________________________________________________________________________________________
BERT_encoder (KerasLayer)       {'default': (None, 5 28763649    preprocessing[0][0]              
                                                                 preprocessing[0][1]              
                                                                 preprocessing[0][2]              
______________________________________________________________________________________________

In [40]:
bert_raw_result = classifier_model(tf.constant(text_test))
print(tf.sigmoid(bert_raw_result))

tf.Tensor(
[[0.22821486 0.6530317  0.6710446  0.6154146 ]
 [0.27016398 0.47285527 0.61136645 0.8327852 ]], shape=(2, 4), dtype=float32)


In [44]:
#tf.keras.utils.plot_model(classifier_model)

In [41]:
#loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
#metrics = tf.metrics.BinaryAccuracy()

In [42]:
classifier_model.compile(optimizer='adam',
                         loss='categorical_crossentropy',
                         metrics=['accuracy'])

In [22]:
x = nyt.text
#y = nyt.label.map({'O': 0, 'E': 1, 'S': 2, 'G':3})

In [23]:
nyt['dummy'] = 1

In [24]:
y = nyt.pivot_table(index='index', columns='label', values='dummy').fillna(0).reset_index(drop=True)[['E', 'S', 'G', 'O']]

In [25]:
x_train_val, x_test, y_train_val, y_test = train_test_split(x, y, stratify=y, test_size=0.25)
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, stratify=y_train_val, test_size=0.25)

In [26]:
len(y_train), len(y_val), len(y_test)

(1538, 513, 684)

In [27]:
y_train.mean(), y_val.mean(), y_test.mean()

(label
 E    0.204811
 S    0.230169
 G    0.284135
 O    0.280884
 dtype: float64,
 label
 E    0.204678
 S    0.230019
 G    0.284600
 O    0.280702
 dtype: float64,
 label
 E    0.204678
 S    0.229532
 G    0.285088
 O    0.280702
 dtype: float64)

In [28]:
#y_train.value_counts()/len(y_train), y_val.value_counts()/len(y_val), y_test.value_counts()/len(y_test)

In [72]:
epochs = 50
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='min')
mcp_save = ModelCheckpoint('exploration/nyt/supervised_models/small_bert.hdf5', save_best_only=True, monitor='val_loss', mode='min')

In [75]:
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x=x_train,
                               y=y_train,
                               validation_data=(x_val, y_val),
                               epochs=epochs,
                               callbacks=[early_stopping, mcp_save])

Training model with https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Epoch 1/50








Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50

KeyboardInterrupt: 

#### Further training

In [71]:
model = load_model('exploration/nyt/supervised_models/small_bert.hdf5', custom_objects={'KerasLayer': hub.KerasLayer})

In [74]:
loss, accuracy = model.evaluate(x_train, y_train)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Loss: 0.991649866104126
Accuracy: 0.5201560258865356


In [72]:
loss, accuracy = model.evaluate(x_val, y_val)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Loss: 1.0630779266357422
Accuracy: 0.539961040019989


In [76]:
epochs = 50
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='min')
mcp_save = ModelCheckpoint('exploration/nyt/supervised_models/small_bert.hdf5', save_best_only=True, monitor='val_loss', mode='min')

In [77]:
history = model.fit(x=x_train,
                   y=y_train,
                   validation_data=(x_val, y_val),
                   epochs=epochs,
                   callbacks=[early_stopping, mcp_save])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50


In [116]:
loss, accuracy = classifier_model.evaluate(x_val, y_val)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Loss: 0.4187076985836029
Accuracy: 0.7883597612380981


In [122]:
loss, accuracy = model.evaluate(x_val, y_val)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Loss: 0.3870939612388611
Accuracy: 0.8148148059844971


In [36]:
def check_performance(y_true, y_pred, y_scores, name):
    print(f"Accuracy: {metrics.accuracy_score(y_true, y_pred)}")
    print(f"Precision: {metrics.precision_score(y_true, y_pred)}")
    print(f"Recall: {metrics.recall_score(y_true, y_pred)}")
    print(f"F1-score: {metrics.f1_score(y_true, y_pred)}")
    print('\nConfusion Matrix')
    print(metrics.confusion_matrix(y_true, y_pred))
    
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_scores, pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)

    fig = plt.figure(figsize=(10,5))
    fig.suptitle(name)
    
    plt.subplot(1, 2, 1)
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")

    plt.subplot(1, 2, 2)
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    avg_precision = metrics.average_precision_score(y_true, y_scores)
    plt.step(recall, precision, where='post', label=f"Average precision score {avg_precision:0.2f}")
    plt.ylabel('Precision')
    plt.xlabel('Recall')
    plt.title("Precision-Recall curve")
    plt.legend(loc="lower left")
    fig.tight_layout()
    plt.show()

In [37]:
model.predict(x_val)

array([[ 0.04159182,  0.3503313 ,  2.2663033 , -0.33867037],
       [ 3.7587411 , -0.09197435,  0.42919827,  0.1006193 ],
       [ 3.3101737 ,  1.0783124 , -0.35249534, -0.22931214],
       ...,
       [-0.34112847,  1.141657  ,  0.39522737,  1.3113966 ],
       [ 1.685962  ,  2.0352638 , -0.5769396 ,  0.84866303],
       [-0.43828553,  1.2553128 ,  2.9246657 , -1.8997823 ]],
      dtype=float32)

In [38]:
y_prob = model.predict(x_val)
y_classes = y_prob.argmax(axis=-1)

In [39]:
#y_classes = np.array([[1] if e >= 0 else [0] for e in y_prob])

In [40]:
y_prob[:10]

array([[ 0.04159182,  0.3503313 ,  2.2663033 , -0.33867037],
       [ 3.7587411 , -0.09197435,  0.42919827,  0.1006193 ],
       [ 3.3101737 ,  1.0783124 , -0.35249534, -0.22931214],
       [ 2.3690932 ,  0.04173407,  1.0165937 ,  1.097194  ],
       [ 1.8529938 , -0.38893613,  1.5128365 ,  0.49468505],
       [ 1.7769976 ,  1.0610814 ,  0.42591852,  0.8335908 ],
       [-0.7494328 ,  1.2668321 ,  0.3765151 ,  1.676476  ],
       [-0.24947894,  0.51995313,  1.0494008 ,  0.09289277],
       [ 0.80795497,  0.10416383,  0.5362246 ,  1.9652886 ],
       [ 1.1478353 ,  0.22655708,  0.90051293,  1.1504349 ]],
      dtype=float32)

In [41]:
y_classes[:10]

array([2, 0, 0, 0, 0, 0, 3, 2, 3, 3], dtype=int64)

In [56]:
y_val.apply(np.argmax, axis=1).iloc[:10]

1020    2
1100    0
2227    0
2658    0
877     0
498     0
2633    1
2224    3
990     3
304     3
dtype: int64

In [62]:
y_val.mean()

label
E    0.204678
S    0.230019
G    0.284600
O    0.280702
dtype: float64

In [61]:
pd.Series(y_classes).value_counts()/len(y_classes)

2    0.329435
3    0.243665
0    0.218324
1    0.208577
dtype: float64

In [146]:
from sklearn import metrics
from sklearn.metrics import precision_recall_curve

In [64]:
name = f'Small BERT'
y_pred = y_classes
y_true = y_val
y_scores = y_prob
check_performance(y_true, y_pred, y_scores, name)

NameError: name 'metrics' is not defined

In [None]:
y_prob = model.predict(x_test) 
y_classes = y_prob.argmax(axis=-1)

In [None]:
name = f'Small BERT'
y_pred = y_classes
y_true = y_test
y_scores = y_prob
check_performance(y_true, y_pred, y_scores, name)