<a href="https://colab.research.google.com/github/karengarm/NLP_Disaster_Tweets_Classification/blob/main/03_BERT_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification of Tweets During Crisis-Events using the BERT model

This notebook uses the BERT model to classify tweets as informative or non-informative. For a more detailed description of the problem, the data set, the preprocessing steps and a baseline approach, please see the "Automatic detection of crisis-related messages - NLP.ipynb" notebook.

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Work from the CS230 folder on Drive: the relative paths used later in the
# notebook ('train_data.csv', 'test_data.csv') resolve against this directory.
%cd /content/drive/My\ Drive/CS230

In [None]:
# Install the third-party packages that the import cell below relies on
# (emoji, langdetect, tensorflow_text).
!pip install emoji
!pip install langdetect
!pip install tensorflow_text

In [None]:
import pandas as pd
import numpy as np
import emoji
from google_trans_new_local import google_translator
from langdetect import detect
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_curve, auc
import tensorflow as tf
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import ParameterSampler


from numpy.random import seed
seed(1)
from tensorflow.random import set_seed
set_seed(2)

## Scenario 1
How well can a model trained on one type of disaster (e.g. floods, shootings, etc.) perform on similar events of the same type?

In [None]:
def get_data(category):
  """Load the train and test splits restricted to one disaster category.

  Reads 'train_data.csv' and 'test_data.csv' from the current working
  directory, drops rows without preprocessed text, encodes the
  informativeness label as 0/1 and keeps only the rows whose
  'categorization_type' equals `category`.

  :param category: value of the 'categorization_type' column to keep
  :return: X_train, y_train, X_test, y_test -- the X values are Series of
           preprocessed tweet text, the y values are one-hot arrays with two
           columns (column 1 corresponds to 'Related and informative')
  """
  labels_dict = {'Not informative': 0, 'Related and informative': 1}

  def _load_category(path):
    # The files must be read as tab-separated: a literal 't' separator would
    # split the header inside column names such as 'tweettext_proc', so none
    # of the columns used below could exist.
    df = pd.read_csv(path, header=0, sep='\t')
    df = df.dropna(subset=['tweettext_proc'])
    df['labels'] = df.cat_informativeness.replace(labels_dict)
    return df.loc[df['categorization_type'] == category]

  train = _load_category('train_data.csv')
  test = _load_category('test_data.csv')

  X_train = train['tweettext_proc']
  y_train = tf.keras.utils.to_categorical(train['labels'], num_classes=2)

  X_test = test['tweettext_proc']
  y_test = tf.keras.utils.to_categorical(test['labels'], num_classes=2)

  return X_train, y_train, X_test, y_test

### Defining the model
The BERT model consists of a pre-trained BERT encoding layer, a dropout layer for regularization and a dense layer using a sigmoid activation function. The model was partly based on [this model](https://www.kaggle.com/code/sanketsonu/bert-model-nlp-with-disaster-tweets) posted on Kaggle by Sanket Sonu.

The optimizer used is the Adam optimizer, which is used together with the binary crossentropy loss since it is a binary classification problem.

In [None]:
def create_bert(learning_rate=1e-3, dropout=0.1):
    """Build and compile the BERT-based tweet classifier.

    The model maps a raw string to two sigmoid outputs: TF-Hub BERT
    preprocessing -> TF-Hub BERT encoder -> dropout on the pooled output ->
    dense layer with 2 units.

    :param learning_rate: learning rate of the Adam optimizer
    :param dropout: dropout rate applied to the pooled BERT output
    :return: compiled tf.keras.Model (binary cross-entropy loss, categorical
             accuracy reported under the name 'accuracy')
    """
    # Input layer: one scalar string per example
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='Text')

    # Bert encoding
    # NOTE(review): no `trainable` argument is passed to the hub layers, so the
    # library default applies -- confirm whether fine-tuning the encoder is intended.
    bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3", name = "Bert_preprocessing")
    bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/4", name = "Bert_encoding")
    preprocessed_text = bert_preprocess(text_input)
    outputs = bert_encoder(preprocessed_text)

    # Classification head with dropout for regularization
    pooled = tf.keras.layers.Dropout(dropout, name='Dropout')(outputs['pooled_output'])
    class_scores = tf.keras.layers.Dense(2, activation='sigmoid', name='Classifier')(pooled)

    # Compile model; `metrics` is passed as a list, the form Model.compile documents
    model = tf.keras.Model(inputs=[text_input], outputs=[class_scores])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.CategoricalAccuracy(name='accuracy')])

    return model

### Evaluation

Evaluating the model using accuracy, precision, recall, F1-score and the area under the ROC curve (reported as `mauc`).

In [None]:
def evaluate(y_test, y_pred, trhold):
    """
    Evaluation function. Computes the area under the ROC curve from the raw
    prediction scores, then binarizes the scores with `trhold` and computes
    accuracy, precision, recall and F1 for the positive class (label 1).
    A per-class classification report and the AUC are printed.

    :param y_test: true labels (0 = not informative, 1 = informative)
    :param y_pred: predicted scores for the positive class; values strictly
                   greater than `trhold` are counted as class 1
    :param trhold: decision threshold applied to `y_pred`
    :return: evaluation metrics for classification: accuracy, precision, recall, f1_score and mauc
    """
    # Accept lists as well as arrays; the comparison below needs an ndarray.
    y_test = np.asarray(y_test)
    y_score = np.asarray(y_pred)

    # calculating mauc on the un-thresholded scores
    fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=1)
    mauc = auc(fpr, tpr)

    y_label = np.where(y_score > trhold, 1, 0)
    class_names = ['Not informative', 'Informative']

    # calculating accuracy_score, precision_score, recall_score, f1_score
    accuracy = accuracy_score(y_test, y_label)
    precision = precision_score(y_test, y_label)
    recall = recall_score(y_test, y_label)
    f1score = f1_score(y_test, y_label)

    # `labels` is given explicitly so the report still works when one of the
    # two classes is absent from both the true and the predicted labels.
    report = classification_report(y_test, y_label, labels=[0, 1], target_names=class_names)
    print(report)
    print('mauc: ', mauc)
    return accuracy, precision, recall, f1score, mauc

### Running the model
In scenario 1 the model is trained using data from one of three categories and tested on data from the same category.

In [None]:
def run_scenario1(learning_rate=1e-3, dropout=0.1, epochs=5, batch_size=16,
                  categories=('Earthquake', 'Floods', 'Derailment')):
  """Train and evaluate one model per disaster category (scenario 1).

  For every category a fresh model is created, trained on that category's
  training tweets (15% held out for validation) and evaluated on the same
  category's test tweets.

  :param learning_rate: learning rate passed to create_bert
  :param dropout: dropout rate passed to create_bert
  :param epochs: number of training epochs
  :param batch_size: training batch size
  :param categories: disaster categories to run, one model each
  :return: dict mapping each category to a dict with the keys 'accuracy',
           'precision', 'recall', 'f1score' and 'mauc'
  """
  metric_names = ('accuracy', 'precision', 'recall', 'f1score', 'mauc')
  results = {}

  for cat in categories:
    # Get data
    X_train, y_train, X_test, y_test = get_data(cat)

    # Training
    bert_model = create_bert(learning_rate, dropout)
    bert_model.fit(x=X_train, y=y_train, epochs=epochs, batch_size=batch_size, validation_split=.15)

    # Testing
    y_test_arg = np.argmax(y_test, axis=1)
    probs = np.asarray(bert_model.predict(X_test))
    # Score each tweet by the margin between the two class outputs. Thresholding
    # the margin at 0 gives the same decision as argmax (ties go to class 0),
    # while the ROC/AUC is computed from a continuous score, not from hard labels.
    margin = probs[:, 1] - probs[:, 0]

    print(cat)
    results[cat] = dict(zip(metric_names, evaluate(y_test_arg, margin, 0.0)))

  return results

In [None]:
# Train and evaluate scenario 1 with the default arguments of run_scenario1.
run_scenario1()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Earthquake
                 precision    recall  f1-score   support

Not informative       0.51      0.66      0.58       373
    Informative       0.76      0.63      0.69       627

       accuracy                           0.64      1000
      macro avg       0.64      0.65      0.63      1000
   weighted avg       0.67      0.64      0.65      1000

mauc:  0.645039359304916
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Floods
                 precision    recall  f1-score   support

Not informative       0.68      0.47      0.56       841
    Informative       0.73      0.86      0.79      1359

       accuracy                           0.71      2200
      macro avg       0.70      0.67      0.67      2200
   weighted avg       0.71      0.71      0.70      2200

mauc:  0.6679637839601932
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Derailment
                 precision    recall  f1-score   support

Not informative       

## Hyperparameter tuning for scenario 1

The best hyperparameters are found by randomly choosing a combination of values for the following hyperparameters:

* Learning Rate using log scale
* Number of epochs
* Batch size
* Dropout

In [None]:
# Dedicated generator for the random search, so the sampled search space does
# not depend on the global NumPy random state or on how often this cell is re-run.
rng = np.random.RandomState(2)
param_dist = {
    'dropout': [0.1, 0.3],
    # Ten candidate learning rates drawn on a log scale from [1e-5, 1e-2],
    # using `rng` rather than the global random state.
    'learning_rate': loguniform.rvs(1e-5, 1e-2, size=10, random_state=rng),
    'epochs': [2, 5, 10, 25],
    'batch_size': [16, 32, 64, 128],
}
# Ten hyperparameter combinations to try for scenario 1.
dict_parameters = ParameterSampler(param_distributions=param_dist, n_iter=10, random_state=rng)

In [None]:
# Run scenario 1 once for every sampled hyperparameter combination, printing
# the combination first so the training output below it can be attributed.
for parameters in dict_parameters:
    print("===============Now testing the following parameters===============")
    for caption, key in (("Learning rate: ", 'learning_rate'),
                         ("Dropout: ", 'dropout'),
                         ("Epochs: ", 'epochs'),
                         ("Batch size: ", 'batch_size')):
        print(caption, parameters[key])
    print()

    # The sampled keys are exactly the keyword arguments of run_scenario1.
    run_scenario1(**parameters)


Learning rate:  5.659770986133262e-05
Dropout:  0.1
Epochs:  25
Batch size:  16
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25

## Scenario 2

How well can a model trained on many types of disasters (e.g. floods, shootings, etc.) perform on one disaster type?

In [None]:
def get_train_data():
  """Load the full training split (all disaster categories).

  Reads 'train_data.csv' from the current working directory, drops rows
  without preprocessed text and encodes the informativeness label as 0/1.

  :return: X_train, y_train -- a Series of preprocessed tweet text and a
           one-hot array with two columns (column 1 corresponds to
           'Related and informative')
  """
  # The file must be read as tab-separated: a literal 't' separator would
  # split the header inside column names such as 'tweettext_proc'.
  df_train = pd.read_csv('train_data.csv', header=0, sep='\t')
  df_train = df_train.dropna(subset=['tweettext_proc'])

  labels_dict = {'Not informative': 0, 'Related and informative': 1}
  df_train['labels'] = df_train.cat_informativeness.replace(labels_dict)

  X_train = df_train['tweettext_proc']
  y_train = tf.keras.utils.to_categorical(df_train['labels'], num_classes=2)

  return X_train, y_train

In [None]:
def get_test_data(category):
  """Load the test split restricted to one disaster category.

  Reads 'test_data.csv' from the current working directory, drops rows
  without preprocessed text, encodes the informativeness label as 0/1 and
  keeps only the rows whose 'categorization_type' equals `category`.

  :param category: value of the 'categorization_type' column to keep
  :return: X_test, y_test -- a Series of preprocessed tweet text and a
           one-hot array with two columns (column 1 corresponds to
           'Related and informative')
  """
  # The file must be read as tab-separated: a literal 't' separator would
  # split the header inside column names such as 'tweettext_proc'.
  df_test = pd.read_csv('test_data.csv', header=0, sep='\t')
  df_test = df_test.dropna(subset=['tweettext_proc'])

  labels_dict = {'Not informative': 0, 'Related and informative': 1}
  df_test['labels'] = df_test.cat_informativeness.replace(labels_dict)

  test = df_test.loc[df_test['categorization_type'] == category]

  X_test = test['tweettext_proc']
  y_test = tf.keras.utils.to_categorical(test['labels'], num_classes=2)

  return X_test, y_test

### Running the model
In scenario 2 the model is trained using data from all categories and tested on data for three different categories separately. The same create_bert() and evaluate() functions as in scenario 1 are used.

In [None]:
def run_scenario2(learning_rate=1e-3, dropout=0.1, epochs=5, batch_size=16,
                  categories=('Earthquake', 'Floods', 'Derailment')):
  """Train one model on all categories and evaluate it per category (scenario 2).

  A single model is trained on the full training split (15% held out for
  validation) and then evaluated separately on the test tweets of each
  category in `categories`.

  :param learning_rate: learning rate passed to create_bert
  :param dropout: dropout rate passed to create_bert
  :param epochs: number of training epochs
  :param batch_size: training batch size
  :param categories: disaster categories to evaluate on
  :return: dict mapping each category to a dict with the keys 'accuracy',
           'precision', 'recall', 'f1score' and 'mauc'
  """
  # Get training data
  X_train, y_train = get_train_data()

  # Training
  bert_model = create_bert(learning_rate, dropout)
  bert_model.fit(x=X_train, y=y_train, epochs=epochs, batch_size=batch_size, validation_split=.15)

  metric_names = ('accuracy', 'precision', 'recall', 'f1score', 'mauc')
  results = {}

  for cat in categories:
    # Get test data
    X_test, y_test = get_test_data(cat)

    # Testing
    y_test_arg = np.argmax(y_test, axis=1)
    probs = np.asarray(bert_model.predict(X_test))
    # Score each tweet by the margin between the two class outputs. Thresholding
    # the margin at 0 gives the same decision as argmax (ties go to class 0),
    # while the ROC/AUC is computed from a continuous score, not from hard labels.
    margin = probs[:, 1] - probs[:, 0]

    print(cat)
    results[cat] = dict(zip(metric_names, evaluate(y_test_arg, margin, 0.0)))

  return results

In [None]:
# Train and evaluate scenario 2 with the default arguments of run_scenario2.
run_scenario2()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Earthquake
                 precision    recall  f1-score   support

Not informative       0.64      0.78      0.70       373
    Informative       0.85      0.74      0.79       627

       accuracy                           0.75      1000
      macro avg       0.74      0.76      0.75      1000
   weighted avg       0.77      0.75      0.76      1000

mauc:  0.7579584471781453
Floods
                 precision    recall  f1-score   support

Not informative       0.66      0.75      0.70       841
    Informative       0.83      0.76      0.79      1359

       accuracy                           0.76      2200
      macro avg       0.74      0.75      0.75      2200
   weighted avg       0.76      0.76      0.76      2200

mauc:  0.7540184387520026
Derailment
                 precision    recall  f1-score   support

Not informative       0.59      0.44      0.51       271
    Informative       0.81      0.89      0.85       729

      

### Hyperparameter tuning for scenario 2
The best hyperparameters are found by randomly choosing a combination of values for the following hyperparameters:

* Learning Rate using log scale
* Number of epochs
* Batch size
* Dropout

In [None]:
# Dedicated generator for the random search, so the sampled search space does
# not depend on the global NumPy random state or on how often this cell is re-run.
rng = np.random.RandomState(4)
param_dist = {
    'dropout': [0.1, 0.3],
    # Ten candidate learning rates drawn on a log scale from [1e-5, 1e-2],
    # using `rng` rather than the global random state.
    'learning_rate': loguniform.rvs(1e-5, 1e-2, size=10, random_state=rng),
    'epochs': [2, 5, 10, 25],
    'batch_size': [16, 32, 64, 128],
}
# Ten hyperparameter combinations to try for scenario 2.
dict_parameters = ParameterSampler(param_distributions=param_dist, n_iter=10, random_state=rng)

In [None]:
# Run scenario 2 once for every sampled hyperparameter combination, printing
# the combination first so the training output below it can be attributed.
for parameters in dict_parameters:
    print("===============Now testing the following parameters===============")
    for caption, key in (("Learning rate: ", 'learning_rate'),
                         ("Dropout: ", 'dropout'),
                         ("Epochs: ", 'epochs'),
                         ("Batch size: ", 'batch_size')):
        print(caption, parameters[key])
    print()

    # The sampled keys are exactly the keyword arguments of run_scenario2.
    run_scenario2(**parameters)