Using a BERT Transformer

In [None]:
import html
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.metrics import BinaryAccuracy, CategoricalAccuracy
from tensorflow.keras.utils import to_categorical


In [None]:
# Load the labelled tweet dataset from the Colab filesystem and preview its first rows.
train_df = pd.read_csv('/content/natural_disaster_tweets.csv')
print(train_df.head())
# test_df = pd.read_csv('/content/Test.csv', usecols=['id','text'])
# print(test_df.head())

   tweet_id                                         tweet_text disaster_type
0         1  RT @JHarden13: Praying for everyone in Oklahom...     hurricane
1         2  Is hurricane Sandy the beginning of whats to c...     hurricane
2         3  I booked a commercial for Chewy’s and had a fi...     hurricane
3         4  RT @CMMBTweets: New update from our team in #H...     hurricane
4         5  Giants pledge $1 million to Hurricane Harvey r...     hurricane


In [None]:
# Download the NLTK stop-word corpus so stopwords.words() can read it.
nltk.download('stopwords')

# Kept as a set because clean_text() does a membership test for every word.
stop_words = set(stopwords.words('english'))

#model = pickle.load(open('model.pkl', 'rb'))
#vectorizer = pickle.load(open('tfidf_vectorizer.pkl', 'rb'))

def clean_text(text):
    """Normalise a raw tweet into a lowercase, letters-only, stop-word-free string.

    Steps, in order: decode HTML entities, lowercase, strip URLs, strip
    @mentions and '#' signs, drop every character that is not an ASCII letter
    or whitespace, then remove English stop words.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example the
            NaN that pandas uses for a missing cell, or ``None``) is treated
            as an empty tweet.

    Returns:
        The cleaned tweet with words separated by single spaces; ``''`` when
        nothing survives cleaning or the input was not a string.
    """
    # Missing cells come out of pandas as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''
    # Decode entities first: otherwise "&lt;3" / "&amp;" survive the
    # letters-only filter below as the junk tokens "lt" / "amp".
    text = html.unescape(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Run every raw tweet through clean_text() and keep the result next to the original.
train_df['clean_text'] = train_df['tweet_text'].map(clean_text)
# test_df['clean_text'] = test_df['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Spot-check the first few cleaned tweets.
print(train_df['clean_text'].head())

0              rt praying everyone oklahoma city crazy
1    hurricane sandy beginning whats come mid decem...
2    booked commercial chewys fitting dcota one coo...
3    rt new update team haiti initial reports indic...
4    giants pledge million hurricane harvey relief ...
Name: clean_text, dtype: object


In [None]:
# Display the frame to confirm the new clean_text column sits alongside the raw tweet.
train_df

Unnamed: 0,tweet_id,tweet_text,disaster_type,clean_text
0,1,RT @JHarden13: Praying for everyone in Oklahom...,hurricane,rt praying everyone oklahoma city crazy
1,2,Is hurricane Sandy the beginning of whats to c...,hurricane,hurricane sandy beginning whats come mid decem...
2,3,I booked a commercial for Chewy’s and had a fi...,hurricane,booked commercial chewys fitting dcota one coo...
3,4,RT @CMMBTweets: New update from our team in #H...,hurricane,rt new update team haiti initial reports indic...
4,5,Giants pledge $1 million to Hurricane Harvey r...,hurricane,giants pledge million hurricane harvey relief ...
...,...,...,...,...
151986,151987,@janet_08 Obviously. yes Im back HAHAHA! :-hh,unrelated,obviously yes im back hahaha hh
151987,151988,@Corrievogue ugh he promised he would let me kno,unrelated,ugh promised would let kno
151988,151989,@tracybegins This may be very realistic given ...,unrelated,may realistic given dcs recent furniture shopp...
151989,151990,DECEMBER 13/14 - MILEY CYRUS HERE I COME &lt;3,unrelated,december miley cyrus come lt


In [None]:
# Binary target: 0 for tweets labelled 'unrelated', 1 for every other disaster_type.
train_df['is_disaster'] = (train_df['disaster_type'] != 'unrelated').astype('int64')

In [None]:
# Multi-class targets are built from the disaster tweets only (is_disaster == 1).
df_disaster = train_df.loc[train_df['is_disaster'].eq(1)].copy()

# Map each disaster_type string to an integer id, then one-hot encode the ids.
label_encoder = LabelEncoder()
label_encoder.fit(df_disaster['disaster_type'])
df_disaster['disaster_label'] = label_encoder.transform(df_disaster['disaster_type'])
y_multi = to_categorical(df_disaster['disaster_label'])

In [None]:
# Class balance across all disaster types, including 'unrelated'.
train_df['disaster_type'].value_counts()

Unnamed: 0_level_0,count
disaster_type,Unnamed: 1_level_1
unrelated,46473
earthquake,39044
hurricane,30000
flood,19025
wildfire,12995
cyclone,4454


In [None]:
from transformers import AutoTokenizer,TFBertModel

In [None]:
# Tokenizer for the same checkpoint name that build_model() loads.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-large-uncased")
# Fixed sequence length: tokenize() pads/truncates to it and build_model() uses it as the input width.
# NOTE(review): 99 equals the longest tweet counted in whitespace words, not in BERT tokens - confirm it is long enough.
MAX_LEN = 99

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
def tokenize(texts):
    """Encode a batch of strings with the notebook-level ``tokenizer``.

    Every sequence gets special tokens and is padded or truncated to exactly
    ``MAX_LEN`` positions. Token type ids are not requested; the attention
    mask is.

    Args:
        texts: The strings to encode, passed to the tokenizer as ``text``.

    Returns:
        Whatever the tokenizer returns for these options, with TensorFlow
        tensors requested via ``return_tensors='tf'``.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=MAX_LEN,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
    )
    return tokenizer(text=texts, **encode_options)

In [None]:
# Sanity check: encode one sample sentence with the tokenizer's default settings.
tokenizer("!how are you & have a nice day!")

{'input_ids': [101, 999, 2129, 2024, 2017, 1004, 2031, 1037, 3835, 2154, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Longest raw tweet, measured in whitespace-separated words (not BERT tokens).
longest_tweet_words = max(len(tweet.split()) for tweet in train_df['tweet_text'])
print("max len of tweets", longest_tweet_words)

max len of tweets 99


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda
from tensorflow.keras.optimizers import Adam

In [None]:
def build_model(num_classes=1, activation='sigmoid', loss_fn='binary_crossentropy',
                model_name="google-bert/bert-large-uncased"):
    """Build and compile a BERT encoder followed by a small dense classifier head.

    The model takes two integer inputs of width ``MAX_LEN`` named
    ``input_ids`` and ``attention_mask``, feeds them to a pretrained
    ``TFBertModel``, and passes the encoder's second output through
    Dropout -> Dense(128) -> Dropout -> Dense(32) -> Dense(num_classes).

    Args:
        num_classes: Units in the output layer. ``1`` selects
            ``BinaryAccuracy`` as the metric, anything else selects
            ``CategoricalAccuracy``.
        activation: Activation of the output layer.
        loss_fn: Loss passed to ``compile`` (a Keras loss name or instance).
        model_name: Hugging Face checkpoint loaded for the encoder. The
            default is the checkpoint that was previously hard-coded.

    Returns:
        The compiled Keras ``Model`` (Adam optimiser, learning rate 2e-5).
    """
    input_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
    input_mask = Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")

    bert_model = TFBertModel.from_pretrained(model_name)
    # Read the pooled-output width from the checkpoint config instead of
    # hard-coding 1024, so checkpoints of other sizes work as well.
    hidden_size = bert_model.config.hidden_size
    # Index 1 of the encoder output is used as the sentence representation
    # (documented by transformers as the pooled output for TFBertModel).
    # NOTE(review): the encoder is called inside a Lambda layer; confirm that
    # its weights are tracked/updated by Keras and that the saved model reloads.
    bert_output = Lambda(
        lambda x: bert_model(x)[1],
        output_shape=(hidden_size,),
    )([input_ids, input_mask])

    x = Dropout(0.1)(bert_output)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(32, activation='relu')(x)
    y = Dense(num_classes, activation=activation)(x)

    metrics = [BinaryAccuracy()] if num_classes == 1 else [CategoricalAccuracy()]

    model = Model(inputs=[input_ids, input_mask], outputs=y)
    model.compile(optimizer=Adam(2e-5), loss=loss_fn, metrics=metrics)
    return model

In [None]:
# Inputs and targets for the binary task: 0/1 labels plus the tokenised cleaned tweets.
y_bin = train_df['is_disaster'].to_numpy()
X_bin = tokenize(list(train_df['clean_text']))

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


In [None]:
from tensorflow.keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy, BinaryAccuracy

In [None]:
# Build the binary (disaster vs. unrelated) classifier.
binary_model = build_model(num_classes=1, activation='sigmoid', loss_fn=BinaryCrossentropy())

# Keras carves the validation split from the *last* rows, before any shuffling.
# The frame appears to be grouped by disaster_type (its tail is all 'unrelated'),
# so without this step the validation set is effectively a single class and
# val accuracy is meaningless. Shuffle once, with a fixed seed, before fitting.
# X_bin / y_bin themselves are left untouched for later cells.
shuffle_idx = np.random.default_rng(42).permutation(len(y_bin))

binary_model.fit(
    x={
        'input_ids': tf.gather(X_bin['input_ids'], shuffle_idx),
        'attention_mask': tf.gather(X_bin['attention_mask'], shuffle_idx),
    },
    y=y_bin[shuffle_idx],
    epochs=3,
    batch_size=16,
    validation_split=0.2
)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/3
[1m7600/7600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2975s[0m 387ms/step - binary_accuracy: 0.8673 - loss: 0.3674 - val_binary_accuracy: 0.1900 - val_loss: 1.3653
Epoch 2/3
[1m7600/7600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2945s[0m 386ms/step - binary_accuracy: 0.8866 - loss: 0.2788 - val_binary_accuracy: 0.3641 - val_loss: 1.1325
Epoch 3/3
[1m7600/7600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2931s[0m 386ms/step - binary_accuracy: 0.8947 - loss: 0.2516 - val_binary_accuracy: 0.5173 - val_loss: 0.8983


<keras.src.callbacks.history.History at 0x78f77142ffd0>

In [None]:
# Print the layer-by-layer structure and parameter counts of the binary classifier.
binary_model.summary()

In [None]:
# X_multi = tokenize(df_disaster['clean_text'].tolist())

# multi_model = build_model(num_classes=y_multi.shape[1], activation='softmax', loss_fn=CategoricalCrossentropy())
# multi_model.fit(
#     x={'input_ids': X_multi['input_ids'], 'attention_mask': X_multi['attention_mask']},
#     y=y_multi,
#     epochs=2,
#     batch_size=16,
#     validation_split=0.2
# )

In [None]:
# multi_model.summary()

In [None]:
# def classify_tweet(text):
#     cleaned = clean_text(text)
#     tokens = tokenize([cleaned])
#     bin_pred = binary_model.predict(tokens)[0][0]
#     if bin_pred > 0.5:
#         multi_pred = multi_model.predict(tokens)[0]
#         label = label_encoder.inverse_transform([np.argmax(multi_pred)])[0]
#         return "disaster", label
#     else:
#         return "non disaster", None

In [None]:
# For binary model
# NOTE: X_bin / y_bin are the same arrays that were passed to fit(), so this
# score includes the training rows and is not a held-out estimate.
binary_model.evaluate(
    x={'input_ids': X_bin['input_ids'], 'attention_mask': X_bin['attention_mask']},
    y=y_bin
)

# For multi-class model
# multi_model.evaluate(
#     x={'input_ids': X_multi['input_ids'], 'attention_mask': X_multi['attention_mask']},
#     y=y_multi
# )


In [None]:
# Save binary classifier
# The path is relative, so the file lands in the notebook's current working directory.
binary_model.save("binary_disaster_model.keras")

# Save multi-class classifier
#multi_model.save("multi_disaster_model")

# # Save the label encoder (for disaster type labels)
# import pickle
# with open("label_encoder.pkl", "wb") as f:
#     pickle.dump(label_encoder, f)


In [None]:
from google.colab import files
# Download the file that binary_model.save(...) actually wrote; the previous
# name 'my_model.keras' is never created in the working directory.
files.download('binary_disaster_model.keras')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Save to your Drive
# The trained classifier is bound to `binary_model`; no variable named `model`
# is defined by any active cell, so the previous call raised NameError.
binary_model.save('/content/drive/MyDrive/my_model.keras')

Building the Model Architecture

In [None]:
# import tensorflow as tf
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.initializers import TruncatedNormal
# from tensorflow.keras.losses import CategoricalCrossentropy,BinaryCrossentropy
# from tensorflow.keras.metrics import CategoricalAccuracy,BinaryAccuracy
# from tensorflow.keras.utils import to_categorical,plot_model
# from tensorflow.keras.layers import Input,Dense

In [None]:
# max_len = 34

# input_ids = Input(shape=(max_len,), dtype=tf.int32, name = "input_ids")
# input_mask = Input(shape=(max_len,), dtype=tf.int32, name = "attention_mask")

# bert_output = tf.keras.layers.Lambda(lambda x: bert(x)[1], output_shape=(1024,))([input_ids, input_mask])

# output = tf.keras.layers.Dropout(0.1)(bert_output)

# output = Dense(128,activation='relu')(output)

# output = tf.keras.layers.Dropout(0.1)(output)

# output = Dense(32,activation='relu')(output)

# y = Dense(1,activation= 'sigmoid')(output)

# model = tf.keras.Model(inputs = [input_ids , input_mask], outputs = y)

# bert_layer = model.layers[2]

# bert_layer.trainable = True  # Fine-tune full BERT




In [None]:
# model.summary()

In [None]:
# optimizer = Adam(
#     learning_rate = 2e-5,
#     epsilon = 1e-8,
#     clipnorm = 1.0)
# loss = BinaryCrossentropy(from_logits= False)

# metrics = [BinaryAccuracy()]
# model.compile(
#     optimizer = optimizer,
#     loss = loss,
#     metrics = metrics)

In [None]:
# plot_model(model,show_shapes=True)

In [None]:
# train_history = model.fit(
#     x = {'input_ids':x_train['input_ids'],'attention_mask':x_train['attention_mask']},
#     y = y_train,
#     epochs = 20,
#     batch_size = 10,
#     validation_split=0.2
# )

In [None]:
#test_df = test_df.drop('text', axis=1)


In [None]:
# x_test = tokenizer(
#     text = test_df.text.tolist(),
#     add_special_tokens = True,
#     max_length = 34,
#     truncation = True,
#     padding = True,
#     return_tensors = 'tf',
#     return_token_type_ids = False,
#     return_attention_mask = True,
#     verbose = True)

In [None]:
# loss, accuracy = model.evaluate(
#     x={'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']}
# )
# print(f"Test Loss: {loss}")
# print(f"Test Accuracy: {accuracy}")


In [None]:
# predictions = model.predict(
#     x={'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']}
# )


In [None]:
# predicated = model.predict({'input_ids':x_test['input_ids'],'attention_mask':x_test['attention_mask']})