In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tokens import FullTokenizer
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
def string_to_list(input_string: str) -> list:
    res = input_string.strip('][')
    res = res.replace('"', '')
    res = res.split(', ')
    
    return res

In [10]:
def preprocess(file, bert_layer, only_a_few=False):
    file = file[file['action'] != 'no']
    file = file[file['motivation'] != 'none']
    file = file[file['maslow'] != '[]']
    file = file[file['reiss'] != '[]']
    file = file[file['reiss'] != '["na"]']
    file["maslow"] = file["maslow"].apply(string_to_list)
    file.reset_index(drop=True, inplace=True)

    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = FullTokenizer(vocab_file, do_lower_case)

    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(file["maslow"])

    return data, labels

In [4]:
training_file_motivation = pd.read_csv(os.path.join("scs-baselines-master/data/dev/motivation", "allcharlinepairs_noids.csv"))
testing_file_motivation = pd.read_csv(os.path.join("scs-baselines-master/data/test/motivation", "allcharlinepairs_noids.csv"))

In [5]:
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)

2021-12-09 20:54:01.678613: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=128, only_a_few=False):
    all_tokens = []
    all_masks = []
    all_segments = []

    j = 0
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

        j += 1
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [7]:
def build_model(bert_layer, max_len=512):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(5, activation='sigmoid')(net)
    
    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(lr=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [11]:
train_input, train_labels = preprocess(training_file_motivation, bert_layer)
test_input, test_labels = preprocess(testing_file_motivation, bert_layer, only_a_few=True)

A few
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
broken


In [14]:
model = build_model(bert_layer, max_len=128)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 128)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 128, 768)]                'input_mask[0][0]',         

  super(Adam, self).__init__(name, **kwargs)


In [34]:
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

train_history = model.fit(
    train_input, train_labels, 
    validation_split=0.2,
    epochs=3,
    callbacks=[checkpoint, earlystopping],
    batch_size=32,
    verbose=1
)

Epoch 1/3

In [15]:
model.load_weights('model.h5')

In [18]:
test_output = model.predict(test_input)
print(test_output)
print(test_labels)

[[0.03188419 0.08617873 0.80413693 0.02892018 0.04888003]
 [0.03188419 0.08617873 0.80413693 0.02892018 0.04888003]
 [0.03188419 0.08617873 0.80413693 0.02892018 0.04888003]
 [0.28705162 0.05856194 0.02321398 0.14558955 0.48558298]
 [0.28705162 0.05856194 0.02321398 0.14558955 0.48558298]
 [0.54748553 0.1383801  0.01152629 0.23038761 0.07222055]
 [0.02476677 0.05374946 0.73146236 0.02509226 0.1649291 ]
 [0.38043195 0.31352013 0.04867551 0.24522965 0.01214272]
 [0.38043195 0.31352013 0.04867551 0.24522965 0.01214272]
 [0.6074848  0.14596707 0.00161824 0.24225476 0.00267522]
 [0.6074848  0.14596707 0.00161824 0.24225476 0.00267522]
 [0.3344316  0.03081152 0.01772548 0.2688434  0.34818804]
 [0.3344316  0.03081152 0.01772548 0.2688434  0.34818804]
 [0.17419957 0.01192915 0.02532709 0.06031655 0.7282276 ]
 [0.4241949  0.02954231 0.01432405 0.13679944 0.39513925]
 [0.4241949  0.02954231 0.01432405 0.13679944 0.39513925]
 [0.25527433 0.4360069  0.00837377 0.17605321 0.12429176]
 [0.25527433 0