<a href="https://colab.research.google.com/github/karengarm/NLP_Disaster_Tweets_Classification/blob/main/02_LSTM_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_curve, auc
import re
import tensorflow as tf
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import ParameterSampler
from keras.layers.embeddings import Embedding

# Work from the mounted Drive root so that relative file paths resolve there.
dir_path = '/content/drive/MyDrive/'
os.chdir(dir_path)

def evaluate(y_test, y_pred, mlflow, threshold=0.5):
    """
    Score binary predictions, print a classification report and log the
    metrics to MLflow.

    The ROC AUC is computed on the raw scores; every other metric is computed
    on the labels obtained by thresholding those scores.

    :param y_test: true binary labels (0 = 'Not informative',
        1 = 'Related and informative')
    :param y_pred: predicted scores for the positive class, either 1-D or a
        single column (e.g. the output of a one-unit sigmoid layer)
    :param mlflow: object exposing ``log_metric(name, value)``
    :param threshold: scores strictly greater than this value are mapped to
        class 1, all others to class 0
    :return: tuple ``(accuracy, precision, recall, f1score, mauc)`` where
        precision, recall and f1score are arrays ordered [class 0, class 1]
    """
    class_ids = [0, 1]
    labels = ['Not informative', 'Related and informative']

    # Flatten so an (n, 1) model output and a 1-D label vector line up.
    scores = np.asarray(y_pred).ravel()

    fpr, tpr, _ = roc_curve(y_test, scores, pos_label=1)
    mauc = auc(fpr, tpr)

    y_pred = np.where(scores > threshold, 1, 0)

    # Passing the label list explicitly keeps the per-class arrays at length
    # two even when one class is absent from this evaluation slice; without
    # it, indexing [1] below (and target_names in the report) would fail.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=class_ids, average=None)
    recall = recall_score(y_test, y_pred, labels=class_ids, average=None)
    f1score = f1_score(y_test, y_pred, labels=class_ids, average=None)

    report = classification_report(y_test, y_pred, labels=class_ids,
                                   target_names=labels)
    print(report)

    mlflow.log_metric("auc_test", mauc)
    mlflow.log_metric("accuracy_test", accuracy)
    mlflow.log_metric("precision_NI", precision[0])
    mlflow.log_metric("precision_I", precision[1])
    mlflow.log_metric("recall_NI", recall[0])
    mlflow.log_metric("recall_I", recall[1])
    mlflow.log_metric("f1score_NI", f1score[0])
    mlflow.log_metric("f1score_I", f1score[1])
    return accuracy, precision, recall, f1score, mauc
    
def mlflow_log_parameters(cat, parameter):
  """Log the crisis type and the sampled hyperparameters through ``mlflow.log_param``.

  :param cat: crisis/categorization type the run refers to
  :param parameter: mapping holding the keys 'dropout', 'learning_rate',
      'epochs' and 'batch_size'
  """
  mlflow.log_param("crisis_type", cat)
  # Hyperparameters are logged under the same name as their key.
  for key in ("dropout", "learning_rate", "epochs", "batch_size"):
    mlflow.log_param(key, parameter[key])

def get_input(data, vocabulary_size, max_length, tokenizer=None):
  """Integer-encode texts and pad/truncate them to a fixed length.

  :param data: iterable of text documents
  :param vocabulary_size: ``num_words`` given to the Keras tokenizer when a
      new one is created
  :param max_length: length of every returned sequence
  :param tokenizer: optional, already fitted Keras ``Tokenizer``. When given
      it is used as-is (not refitted), so evaluation texts can be encoded
      with the same word index as the training texts. When ``None`` a new
      tokenizer is fitted on ``data``.
  :return: 2-D integer array with one row of ``max_length`` ids per document
  """
  if tokenizer is None:
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocabulary_size)
    tokenizer.fit_on_texts(data)
  sequences = tokenizer.texts_to_sequences(data)
  # Pad once, to the requested length. Keras' defaults (padding and
  # truncating at the front) are kept: for max_length <= 50 this yields
  # exactly what the former two-step padding with a fixed length of 50 did,
  # and for larger values documents are no longer cut to 50 tokens first.
  return tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length)

def create_lstm(vocabulary_size, max_length, ndropout, nlr):
  """Build and compile the binary LSTM classifier.

  Layers, in order: 50-dimensional embedding, spatial dropout (0.3), a
  256-unit LSTM, a 256-unit ReLU dense layer, dropout (0.3) and a single
  sigmoid output unit.

  :param vocabulary_size: number of rows of the embedding matrix
  :param max_length: length of the padded input sequences
  :param ndropout: value passed as the LSTM's ``dropout`` argument (the
      recurrent dropout is fixed at 0.2)
  :param nlr: learning rate of the Adam optimizer
  :return: compiled ``tf.keras`` model using binary cross-entropy loss and
      reporting accuracy, precision and recall
  """
  model = tf.keras.models.Sequential([
      # Use the tf.keras embedding so every layer comes from the same Keras
      # implementation as the Sequential container.
      tf.keras.layers.Embedding(vocabulary_size, 50, input_length=max_length),
      tf.keras.layers.SpatialDropout1D(0.3),
      tf.keras.layers.LSTM(256, dropout=ndropout, recurrent_dropout=0.2),
      tf.keras.layers.Dense(256, activation='relu'),
      tf.keras.layers.Dropout(0.3),
      tf.keras.layers.Dense(1, activation='sigmoid'),
  ])
  model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=nlr),
      loss='binary_crossentropy',
      metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
  )
  return model

def _plot_history_curves(mlflow, history, keys, curve_labels, title, xlabel,
                         ylabel, artifact, legend_loc='best'):
  """Draw the given history series on one figure, show it and log it to MLflow."""
  fig, ax = plt.subplots()
  for key, curve_label in zip(keys, curve_labels):
    ax.plot(history.history[key], label=curve_label)
  ax.set_title(title)
  ax.set_xlabel(xlabel)
  ax.set_ylabel(ylabel)
  ax.legend(loc=legend_loc)
  plt.show()
  mlflow.log_figure(fig, artifact)
  # This function runs once per training run; close the figure so open
  # figures do not pile up across the hyperparameter search.
  plt.close(fig)


def training_test_mae(mlflow, history):
  """Plot training vs. validation accuracy and loss per epoch and log both figures.

  :param mlflow: object exposing ``log_figure(figure, artifact_file)``
  :param history: Keras ``History`` whose ``history`` dict holds the keys
      'accuracy', 'val_accuracy', 'loss' and 'val_loss'
  """
  _plot_history_curves(
      mlflow, history,
      keys=('accuracy', 'val_accuracy'),
      curve_labels=('Training acc', 'Validation acc'),
      title='Training and validation accuracy',
      xlabel='Epochs', ylabel='accuracy',
      artifact="training_validation_accuracy.png")

  # The second curve is the validation loss, so it is labelled as such.
  _plot_history_curves(
      mlflow, history,
      keys=('loss', 'val_loss'),
      curve_labels=('train', 'validation'),
      title='model loss',
      xlabel='epoch', ylabel='loss',
      artifact="training_validation_loss.png",
      legend_loc='upper left')


In [None]:
!pip install mlflow
!databricks configure --host https://community.cloud.databricks.com/
import mlflow
import mlflow.keras

## Scenario 1

In [None]:
# Point MLflow at the Databricks tracking backend and select the experiment
# used for the Scenario 1 runs.
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Users/karengar@stanford.edu/LSTM_Scenario1_SpatialDropout")

In [None]:
# The separator must be a tab: a literal 't' would split inside column names
# such as 'tweettext_proc' and 'categorization_type' that are read later.
df_test = pd.read_csv('test_data.csv', header=0, sep='\t')
df_train = pd.read_csv('train_data.csv', header=0, sep='\t')

# One seeded generator drives both the learning-rate draws and the parameter
# sampler, so the whole random search is reproducible.
rng = np.random.RandomState(0)
param_dist = {
    'dropout': [0.1, 0.2, 0.3],
    # Ten candidate learning rates drawn log-uniformly from [1e-5, 1e-3].
    'learning_rate': loguniform.rvs(1e-5, 1e-3, size=10, random_state=rng),
    'epochs': [5, 8, 25, 50],
    'batch_size': [64, 128, 450, 512],
}
# Ten hyperparameter combinations sampled from the grid above.
dict_parameters = ParameterSampler(param_distributions=param_dist, n_iter=10, random_state=rng)
vocabulary_size = 20000
max_length = 50

In [None]:
# Scenario 1: for every sampled hyperparameter set, train one model per
# crisis type on that type's training tweets and test it on the same type.
for parameter in dict_parameters:
  print(parameter)
  for cat in df_test.categorization_type.unique():
    tmp = df_train[df_train['categorization_type'] == cat]
    if tmp.empty:
      # Nothing to train on for this crisis type.
      print('No training rows for', cat, '- skipped')
      continue
    X_train = tmp['tweettext_proc'].values
    y_train = np.where(tmp.cat_informativeness == 'Related and informative', 1, 0)
    tmp = df_test[df_test['categorization_type'] == cat]
    X_test = tmp['tweettext_proc'].values
    y_test = np.where(tmp.cat_informativeness == 'Related and informative', 1, 0)

    # Fit the vocabulary on the training tweets only and reuse it for the
    # test tweets: the embedding rows learned in training are tied to these
    # word ids, so a tokenizer fitted on the test set would feed the model
    # unrelated ids.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocabulary_size)
    tokenizer.fit_on_texts(X_train)

    # The context manager ends the run on exit; no explicit end_run() needed.
    with mlflow.start_run():
      mlflow_log_parameters(cat, parameter)
      padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
          tokenizer.texts_to_sequences(X_train), maxlen=max_length)
      # define the model
      model = create_lstm(vocabulary_size, max_length, parameter['dropout'], parameter['learning_rate'])
      # fit the model
      history = model.fit(padded_docs, y_train, epochs=parameter['epochs'],
                          batch_size=parameter['batch_size'], validation_split=.15)
      # Log the model only after fitting so the stored artifact is the trained one.
      mlflow.keras.log_model(model, "Lstm")
      training_test_mae(mlflow, history)
      # Testing
      padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
          tokenizer.texts_to_sequences(X_test), maxlen=max_length)
      y_pred = model.predict(padded_docs, verbose=True)
      accuracy, precision, recall, f1score, mauc = evaluate(y_test, y_pred, mlflow)

## Scenario 2

In [None]:
# Point MLflow at the Databricks tracking backend and select the experiment
# used for the Scenario 2 runs.
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Users/karengar@stanford.edu/LSTM_Scenario2_SpatialDropout")

In [None]:
# The separator must be a tab: a literal 't' would split inside column names
# such as 'tweettext_proc' and 'categorization_type' that are read below.
df_test = pd.read_csv('test_data.csv', header=0, sep='\t')
df_train = pd.read_csv('train_data.csv', header=0, sep='\t')

# One seeded generator drives both the learning-rate draws and the parameter
# sampler, so the sampled configuration is reproducible.
rng = np.random.RandomState(0)
param_dist = {
    'dropout': [0.1, 0.2, 0.3],
    # Ten candidate learning rates drawn log-uniformly from [1e-5, 1e-3].
    'learning_rate': loguniform.rvs(1e-5, 1e-3, size=10, random_state=rng),
    'epochs': [5, 8, 25],
    'batch_size': [64, 128, 450, 512],
}
# A single hyperparameter combination is sampled for this scenario.
dict_parameters = ParameterSampler(param_distributions=param_dist, n_iter=1, random_state=rng)
vocabulary_size = 20000
max_length = 50

# Scenario 2 trains on the tweets of all crisis types together.
X_train = df_train['tweettext_proc'].values
y_train = np.where(df_train.cat_informativeness == 'Related and informative', 1, 0)
padded_docs_train = get_input(X_train, vocabulary_size, max_length)

In [None]:
# Scenario 2: train one model on all crisis types, then test it on each type.

# Rebuild the training vocabulary so test tweets get the word ids the model
# was trained with. Fitting a tokenizer with the same settings on the same
# X_train reproduces the word index behind padded_docs_train.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(X_train)

for parameter in dict_parameters:
  print(parameter)
  # Parent run: holds the training curves and the trained model. Logging the
  # figures outside of any run would implicitly open one, and the per-type
  # start_run() calls below would then fail because a run is already active.
  with mlflow.start_run():
    mlflow_log_parameters("all", parameter)
    # define the model
    model = create_lstm(vocabulary_size, max_length, parameter['dropout'], parameter['learning_rate'])
    # fit the model
    history = model.fit(padded_docs_train, y_train, epochs=parameter['epochs'],
                        batch_size=parameter['batch_size'], validation_split=.15)
    training_test_mae(mlflow, history)
    # The same model is evaluated on every crisis type, so it is stored once.
    mlflow.keras.log_model(model, "Lstm")

    for cat in df_test.categorization_type.unique():
      tmp = df_test[df_test['categorization_type'] == cat]
      X_test = tmp['tweettext_proc'].values
      y_test = np.where(tmp.cat_informativeness == 'Related and informative', 1, 0)
      # Child run: one per crisis type, carrying that type's test metrics.
      with mlflow.start_run(nested=True):
        mlflow_log_parameters(cat, parameter)
        # Testing
        padded_docs = tf.keras.preprocessing.sequence.pad_sequences(
            tokenizer.texts_to_sequences(X_test), maxlen=max_length)
        y_pred = model.predict(padded_docs, verbose=True)
        accuracy, precision, recall, f1score, mauc = evaluate(y_test, y_pred, mlflow)