-
Notifications
You must be signed in to change notification settings - Fork 0
/
ml_bert.py
117 lines (99 loc) · 5.7 KB
/
ml_bert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import numpy as np
import tensorflow as tf
import preprocess
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel, DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
def prepare_data(X, tokenizer, max_length=512):
# Tokenizing the text data
encodings = tokenizer(X.tolist(), truncation=True, padding="max_length", max_length=max_length, return_tensors="tf")
return encodings['input_ids'], encodings['attention_mask']
def build_model(bert_model, learning_rate=5e-5):
# Define input layers
input_ids = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="input_ids")
attention_masks = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="attention_mask")
# Lambda to convert Keras tensors to TensorFlow tensors and specify output shape
bert_output = tf.keras.layers.Lambda(
lambda x: bert_model(x[0], attention_mask=x[1])[0],
output_shape=(512, 768) # This needs to be set to the output dimensions of the BERT model
)([input_ids, attention_masks])
# Extract the [CLS] token's output for classification tasks
cls_token = tf.keras.layers.Lambda(lambda x: x[:, 0], output_shape=(768,))(bert_output)
# Additional layers on top
dense = tf.keras.layers.Dense(256, activation='relu')(cls_token)
dropout = tf.keras.layers.Dropout(0.3)(dense)
output = tf.keras.layers.Dense(4, activation='softmax')(dropout)
# Construct the model
model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output)
model.compile(optimizer=Adam(learning_rate=learning_rate),
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
return model
def train_model(input_ids, attention_masks, y, model, batch_size=8, epochs=10): # Lower batch size if you want to speed up training
# Ensure the data is in TensorFlow tensor format, if not convert it
if not isinstance(input_ids, tf.Tensor):
input_ids = tf.convert_to_tensor(input_ids.values, dtype=tf.int32)
if not isinstance(attention_masks, tf.Tensor):
attention_masks = tf.convert_to_tensor(attention_masks.values, dtype=tf.int32)
if not isinstance(y, tf.Tensor):
y = tf.convert_to_tensor(y.values, dtype=tf.int32)
# Convert tensors to numpy for sklearn compatibility
input_ids_np = input_ids.numpy()
attention_masks_np = attention_masks.numpy()
y_np = y.numpy()
# Splitting the data
X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test = train_test_split(
input_ids_np, attention_masks_np, y_np, test_size=0.2, random_state=69
)
# Early stopping to monitor the validation loss and stop training when it starts to increase
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Training the model
history = model.fit(
[X_train_ids, X_train_masks], y_train,
validation_split=0.1,
batch_size=batch_size,
epochs=epochs,
callbacks=[early_stopping],
verbose=1
)
print("\nModel Performance on Test Set:")
model.evaluate([X_test_ids, X_test_masks], y_test)
def main():
file_path = './uci-news-aggregator_very_small.csv'
data = preprocess.load_data(file_path)
data = preprocess.clean_missing_values(data)
data = preprocess.normalize_text(data)
data = preprocess.extract_url_features(data)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
input_ids, attention_masks = prepare_data(data['TITLE'], tokenizer)
y = data['CATEGORY'].astype('category').cat.codes
bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
model = build_model(bert_model)
train_model(input_ids, attention_masks, y, model)
if __name__ == '__main__':
main()
'''
Epoch 1/10
381/381 ━━━━━━━━━━━━━━━━━━━━ 692s 2s/step - accuracy: 0.4774 - loss: 1.2225 - val_accuracy: 0.7485 - val_loss: 0.7700
Epoch 2/10
381/381 ━━━━━━━━━━━━━━━━━━━━ 704s 2s/step - accuracy: 0.7425 - loss: 0.7381 - val_accuracy: 0.8136 - val_loss: 0.6102
Epoch 3/10
381/381 ━━━━━━━━━━━━━━━━━━━━ 686s 2s/step - accuracy: 0.7865 - loss: 0.6123 - val_accuracy: 0.8225 - val_loss: 0.5547
Epoch 4/10
381/381 ━━━━━━━━━━━━━━━━━━━━ 693s 2s/step - accuracy: 0.8179 - loss: 0.5518 - val_accuracy: 0.8314 - val_loss: 0.5294
Epoch 5/10
381/381 ━━━━━━━━━━━━━━━━━━━━ 689s 2s/step - accuracy: 0.8057 - loss: 0.5384 - val_accuracy: 0.8195 - val_loss: 0.5137
Epoch 6/10
381/381 ━━━━━━━━━━━━━━━━━━━━ 672s 2s/step - accuracy: 0.8263 - loss: 0.4906 - val_accuracy: 0.8284 - val_loss: 0.5016
Epoch 7/10
381/381 ━━━━━━━━━━━━━━━━━━━━ 720s 2s/step - accuracy: 0.8321 - loss: 0.4765 - val_accuracy: 0.8343 - val_loss: 0.4938
Epoch 8/10
381/381 ━━━━━━━━━━━━━━━━━━━━ 704s 2s/step - accuracy: 0.8420 - loss: 0.4589 - val_accuracy: 0.8314 - val_loss: 0.4914
Epoch 9/10
381/381 ━━━━━━━━━━━━━━━━━━━━ 705s 2s/step - accuracy: 0.8355 - loss: 0.4738 - val_accuracy: 0.8225 - val_loss: 0.4867
Epoch 10/10
381/381 ━━━━━━━━━━━━━━━━━━━━ 666s 2s/step - accuracy: 0.8412 - loss: 0.4535 - val_accuracy: 0.8402 - val_loss: 0.4864
Model Performance on Test Set:
27/27 ━━━━━━━━━━━━━━━━━━━━ 172s 6s/step - accuracy: 0.8154 - loss: 0.4894
'''