In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from transformers import TFBertModel, BertTokenizerFast, BertConfig
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
from bert import tokenization
tf.get_logger().setLevel('ERROR')

# Enumerate the GPUs TensorFlow can see; the list is empty on a CPU-only host,
# so nothing below may index into it before checking.
gpu = tf.config.experimental.list_physical_devices('GPU')
if gpu:
    try:
        # Enable memory growth on every detected GPU.
        for device in gpu:
            tf.config.experimental.set_memory_growth(device, True)
        # Restrict TensorFlow to the second GPU when there is one; otherwise
        # fall back to the only GPU present instead of raising IndexError.
        target_gpu = gpu[1] if len(gpu) > 1 else gpu[0]
        tf.config.experimental.set_visible_devices(target_gpu, 'GPU')
    except RuntimeError as e:
        # Visible devices must be set at program startup
        print(e)

In [2]:
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

In [3]:
# Read the train and test CSVs, then narrow each frame to the columns the
# following cells work with.
train, test = (pd.read_csv(path) for path in ('df_train_bert.csv', 'df_test_bert.csv'))
train = train.loc[:, ['text_clean', 'target']]
test = test.loc[:, ['text_clean']]
train.head()

Unnamed: 0,text_clean,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,13000 people receive wildfires evacuation orde...,1
4,just got sent this photo from ruby alaska as s...,1


In [4]:
# Preview the first rows of the test frame after it was cut down to `text_clean`.
test.head()

Unnamed: 0,text_clean
0,just happened a terrible car crash
1,heard about hastag earthquake is different cit...
2,there is a forest fire at spot pond geese are ...
3,apocalypse lighting hastag spokane hastag wild...
4,typhoon soudelor kills 28 in china and taiwan


In [5]:
# Hold out 20% of the labelled rows, stratified on `target` so both sides keep
# the same class balance. The fixed seed makes the split repeat exactly when
# the notebook is re-run on a fresh kernel.
# NOTE(review): `data_test` is not used by any later cell visible here —
# confirm whether it was meant for a final evaluation.
data, data_test = train_test_split(
    train,
    test_size=0.2,
    stratify=train['target'],
    random_state=42,
)

In [6]:
# Checkpoint identifier passed to the config, tokenizer and model loaders below.
model_name = 'bert-base-uncased'

In [7]:
# Longest training text, measured in whitespace-separated words.
word_counts = train['text_clean'].map(lambda text: len(text.split()))
word_counts.max()

31

In [8]:
# Sequence length used for the Keras inputs and for both tokenizer calls below.
# NOTE(review): this is a count of whitespace-separated words, while the
# tokenizer is later called with add_special_tokens=True and truncation=True
# at this same length — any text that tokenizes to more ids than this is cut.
# Confirm the limit is generous enough.
max_length = train.text_clean.apply(lambda x: len(x.split())).max()

In [9]:
# Load the checkpoint's configuration with the hidden-states output switched off.
config = BertConfig.from_pretrained(model_name, output_hidden_states=False)

In [10]:
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = BertTokenizerFast.from_pretrained(
    model_name,
    config=config,
)

In [11]:
# Pretrained TensorFlow BERT model, built from the checkpoint with the config above.
transformer_model = TFBertModel.from_pretrained(
    model_name,
    config=config,
)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [13]:
# Take the first layer of the pretrained wrapper (the BERT main layer) so it
# can be wired into a functional Keras graph.
bert = transformer_model.layers[0]

# Fixed-length integer inputs. The third input keeps its Keras name
# 'token_ids' because the fit and predict cells feed it under that key.
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
token_ids = Input(shape=(max_length,), name='token_ids', dtype='int32')
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask, 'token_ids': token_ids}

# The encoder's segment-id argument is named 'token_type_ids'; 'token_ids' is
# not one of its inputs, so remap the key when calling the BERT layer.
bert_inputs = {
    'input_ids': input_ids,
    'attention_mask': attention_mask,
    'token_type_ids': token_ids,
}
# Element 1 of the main layer's output is the pooled sequence representation.
bert_model = bert(bert_inputs)[1]

# Let Keras pick the dropout phase: active during fit(), disabled in predict().
# Hard-coding training=False turned this layer into a no-op during training.
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model)

# Classification head: one raw logit per distinct value of `target`.
issue = Dense(
    units=data.target.nunique(),
    kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
    name='issue',
)(pooled_output)
outputs = {'issue': issue}

# Assemble the full model and print its layer summary.
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')
model.summary()

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask (InputLayer)     [(None, 31)]         0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 31)]         0                                            
__________________________________________________________________________________________________
token_ids (InputLayer)          [(None, 31)]         0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          ((None, 31, 768), (N 109482240   attention_mask[0][0]             
                                                                 input_id

In [23]:
# Inspect the tokenized training inputs.
# NOTE(review): in the visible source `x` is first assigned by the tokenizer
# call in the In [14] cell further down, so on a fresh top-to-bottom run this
# cell would raise NameError — consider moving it below that cell.
x

{'input_ids': <tf.Tensor: shape=(5524, 31), dtype=int32, numpy=
array([[  101, 14390,  2131, ...,     0,     0,     0],
       [  101,  3050,  3349, ...,     0,     0,     0],
       [  101,  1054,  3501, ...,  5754, 14141,   102],
       ...,
       [  101,  2054,  1037, ...,     0,     0,     0],
       [  101,  2047,  2695, ...,  1057,   102,     0],
       [  101, 18691,  2135, ...,     0,     0,     0]])>, 'token_type_ids': <tf.Tensor: shape=(5524, 31), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(5524, 31), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}

In [14]:
# Optimizer settings for fine-tuning.
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# The 'issue' head is a Dense layer with no activation, so the loss is told
# to expect logits.
loss = {'issue': CategoricalCrossentropy(from_logits=True)}
metric = {'issue': CategoricalAccuracy('accuracy')}
model.compile(optimizer=optimizer, loss=loss, metrics=metric)

# One-hot encode the labels to match the categorical loss.
y_issue = to_categorical(data['target'])

# Tokenize the training texts (takes some time).
# padding='max_length' guarantees every row is exactly `max_length` ids wide,
# matching the fixed Input(shape=(max_length,)) of the model. padding=True
# only pads to the longest row in the batch, which can be shorter.
x = tokenizer(
    text=data['text_clean'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=True,
    return_attention_mask=True,
    verbose=True)

# Fit the model; the tokenizer's 'token_type_ids' are fed to the Keras input
# named 'token_ids'. 20% of the rows are used for validation.
history = model.fit(
    x={
        'input_ids': x['input_ids'],
        'attention_mask': x['attention_mask'],
        'token_ids': x['token_type_ids'],
    },
    y={'issue': y_issue},
    validation_split=0.2,
    batch_size=64,
    epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
# Reload the full test CSV; the earlier load cell had reduced `test` to the
# `text_clean` column only.
test = pd.read_csv('df_test_bert.csv')

In [24]:
# Preview the first rows of the reloaded test frame.
test.head()

Unnamed: 0,id,keyword,location,text,text_clean
0,0,,,Just happened a terrible car crash,just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",heard about hastag earthquake is different cit...
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting hastag spokane hastag wild...
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills 28 in china and taiwan


In [16]:
# Tokenize the test texts with the same settings as the training texts.
# padding='max_length' keeps every row exactly `max_length` ids wide, which
# is the fixed input width the model was built with. padding=True would only
# pad to the longest test row.
x_test = tokenizer(
    text=test['text_clean'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=True,
    return_attention_mask=True,
    verbose=True)

In [17]:
# Inspect the tokenizer output for the test set.
x_test

{'input_ids': <tf.Tensor: shape=(3263, 31), dtype=int32, numpy=
array([[  101,  2074,  3047, ...,     0,     0,     0],
       [  101,  2657,  2055, ...,     0,     0,     0],
       [  101,  2045,  2003, ...,     0,     0,     0],
       ...,
       [  101,  2665,  2240, ...,     0,     0,     0],
       [  101, 12669,  3314, ...,     0,     0,     0],
       [  101,  2038, 15900, ...,     0,     0,     0]])>, 'token_type_ids': <tf.Tensor: shape=(3263, 31), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(3263, 31), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}

In [25]:
# Map the tokenizer output onto the model's input names, then run inference
# on the test set.
test_inputs = {
    'input_ids': x_test['input_ids'],
    'attention_mask': x_test['attention_mask'],
    'token_ids': x_test['token_type_ids'],
}
pred = model.predict(test_inputs)

In [32]:
# Shape of the predicted class indices (argmax over the class axis).
pred['issue'].argmax(axis=1).shape

(3263,)

In [33]:
# Load sample_submission.csv; its `target` column is overwritten with the
# model's predictions in the next cells.
sample_submission = pd.read_csv('sample_submission.csv')

In [34]:
# Store the highest-scoring class index of each row as its prediction.
sample_submission['target'] = pred['issue'].argmax(axis=1)

In [35]:
# Make sure the predicted labels are stored as integers.
sample_submission = sample_submission.astype({'target': 'int'})

In [36]:
# Write the filled-in submission frame to CSV without the DataFrame index.
sample_submission.to_csv('predictions_disaster_tweet.csv', index=False)

In [37]:
# Count how many test rows were assigned to each predicted class.
sample_submission['target'].value_counts()

0    1976
1    1287
Name: target, dtype: int64

In [38]:
# Preview the first rows of the finished submission frame.
sample_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
