# 0. Configuration

In [64]:
import tensorflow as tf
import pandas as pd
import numpy as np
import json

from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, GRU
from tensorflow.keras.losses import cosine_similarity
from tensorflow import keras
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel
from pprint import pprint

# 1. Data and Model Loading

## 1-1. Data Loading

In [2]:
data_fpath = './data/Headline_Trainingdata.json'

In [3]:
with open(data_fpath, 'r', encoding='utf-8') as file:
    data = json.load(file)

print('Number of total data: %d\n' % len(data))
print('Data examples:')
pprint(data[:2])

Number of total data: 1142

Data examples:
[{'company': 'Morrisons',
  'id': 2,
  'sentiment': 0.43,
  'title': 'Morrisons book second consecutive quarter of sales growth'},
 {'company': 'IMI',
  'id': 3,
  'sentiment': -0.344,
  'title': 'IMI posts drop in first-quarter organic revenue; warns on full '
           'year'}]


In [4]:
ids = []
companies = []
titles = []
sentiments = []
for i in range(len(data)):
    ids.append(data[i]['id'])
    companies.append(data[i]['company'])
    titles.append(data[i]['title'])
    sentiments.append(data[i]['sentiment'])

## 1-2. BERT Model Loading

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


## 1-3. Vectorization to Tensorflow

In [6]:
encoded_titles = tokenizer(titles, padding=True, return_tensors='tf')

In [7]:
%%time
X = model(encoded_titles).last_hidden_state

CPU times: total: 7min 31s
Wall time: 41.8 s


In [8]:
Y = tf.convert_to_tensor(sentiments)

## 1-4. Train-test Split

In [9]:
train_idx, test_idx = train_test_split(range(len(ids)), test_size=0.1, random_state=1)
X_train = tf.gather(X, train_idx)
Y_train = tf.gather(Y, train_idx)
X_test = tf.gather(X, test_idx)
Y_test = tf.gather(Y, test_idx)
print('Number of training samples: %d' % len(train_idx))
print('Number of test samples: %d' % len(test_idx))

Number of training samples: 1027
Number of test samples: 115


# 2. Train Sentiment Classifier

## 2-1. Build Classifier

### Configuration

In [62]:
lstm_hidden_size = 256
dense_size = 128

### Tensorflow Model

In [116]:
sentiment_model = keras.Sequential()
sentiment_model.add(keras.Input(shape=(X.shape[1], X.shape[2])))
sentiment_model.add(Bidirectional(GRU(lstm_hidden_size, return_sequences=True,
                                      name='biLSTM_layer_1')))
sentiment_model.add(Bidirectional(GRU(lstm_hidden_size), name='biLSTM_layer_2'))
sentiment_model.add(Dense(dense_size*2, activation='gelu', name='dense_layer_1'))
sentiment_model.add(Dropout(0.2, name='dropout_layer_1'))
sentiment_model.add(Dense(dense_size, activation='gelu', name='dense_layer_2'))
sentiment_model.add(Dropout(0.2, name='dropout_layer_2'))
sentiment_model.add(Dense(1, activation='tanh', name='output_layer'))
print(sentiment_model.summary())

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_8 (Bidirectio  (None, 29, 512)          1575936   
 nal)                                                            
                                                                 
 biLSTM_layer_2 (Bidirection  (None, 512)              1182720   
 al)                                                             
                                                                 
 dense_layer_1 (Dense)       (None, 256)               131328    
                                                                 
 dropout_layer_1 (Dropout)   (None, 256)               0         
                                                                 
 dense_layer_2 (Dense)       (None, 128)               32896     
                                                                 
 dropout_layer_2 (Dropout)   (None, 128)             

In [117]:
exp_name = '20220602_bert_sentiment'
batch_size = len(X_train)

In [118]:
# Include the epoch in the file name (uses `str.format`)
checkpoint_path = exp_name + '/cp-{epoch:04d}.ckpt'

# Create a callback that saves the model's weights every 10 epochs
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1,
                                                 save_freq=10)

# Save the weights using the `checkpoint_path` format
sentiment_model.save_weights(checkpoint_path.format(epoch=0))

In [119]:
# 모델 컴파일 (regression)
sentiment_model.compile(loss='cosine_similarity',
              optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
              metrics=['cosine_similarity'])

In [120]:
history = sentiment_model.fit(X_train, Y_train, epochs=100, batch_size=batch_size,
                              validation_split=1/9,
                              verbose=1, workers=15, callbacks=[cp_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 10: saving model to 20220602_bert_sentiment\cp-0010.ckpt
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


KeyboardInterrupt: 

In [104]:
train_result_fpath = exp_name + '.xlsx'
df_result = pd.DataFrame({'train_sim': history.history['cosine_similarity'],
                          'val_sim': history.history['val_cosine_similarity']})
df_result.to_excel(train_result_fpath)

In [121]:
Y_pred = sentiment_model.predict(X_test)
Y_pred = np.squeeze(Y_pred)
print('Cosine similarity on test data:%.4f' % -cosine_similarity(Y_pred, Y_test))

Cosine similarity on test data:-0.0127


In [122]:
Y_pred

array([-0.0307282 ,  0.02860223, -0.13742292, -0.10137612, -0.02497484,
        0.06693387, -0.04862431, -0.10901194,  0.13828076, -0.04018787,
        0.01311471,  0.06325718, -0.03880176, -0.05301975, -0.01960249,
        0.01865119,  0.00034039, -0.09637383, -0.03594363,  0.01330043,
       -0.02939804,  0.05623235, -0.01551616, -0.03550649, -0.0226656 ,
       -0.06662359, -0.13722637, -0.08618344,  0.04916712, -0.10312538,
       -0.08794115, -0.10029439,  0.06917384,  0.00213898,  0.02408555,
       -0.00347129, -0.10474423, -0.08512443,  0.03649005, -0.03842682,
       -0.01176735, -0.0976997 , -0.03193242, -0.08837546, -0.04835438,
        0.04428313, -0.03356117,  0.01020439, -0.08735496,  0.04369081,
        0.04899858,  0.00723328, -0.07913991,  0.07980208, -0.04276426,
        0.02854053,  0.013426  , -0.03813474,  0.00470224, -0.07979129,
       -0.01195675,  0.07811739, -0.10505171, -0.08434291, -0.07715809,
       -0.07640834, -0.09716013,  0.0534755 , -0.08502798,  0.01