In [94]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from time_counter import time_counter
import keras_tuner as kt
import multiprocessing

In [95]:
vocab_size = 5000
def preproc(X,y) -> tuple:
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=123)
    tokenizer = Tokenizer(num_words=vocab_size)
    
    tokenizer.fit_on_texts(X_train)
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)
        
    max_seq_length = max(len(x) for x in X_train_seq)
    X_train_padded = pad_sequences(X_train_seq, maxlen=max_seq_length)
    X_test_padded = pad_sequences(X_test_seq, maxlen=max_seq_length)
    
    if y_train.nunique() != 2:
        y_train_cat = to_categorical(y_train, num_classes=y_train.nunique())
        y_test_cat = to_categorical(y_test, num_classes=y_test.nunique())
        return X_train_padded, X_test_padded, y_train_cat, y_test_cat, max_seq_length
    
    return X_train_padded, X_test_padded, np.array(y_train), np.array(y_test), max_seq_length

In [96]:
bbc_data = pd.read_csv("../data/clean_bbc_classification.csv")
sarc_data = pd.read_csv("../data/clean_sarcasm_classification.csv")

In [97]:
# Fine tuning params

num_iterations = 5
num_cv = 3

### LSTM on BBC News Classification
#### Simple LSTM

In [98]:
X = bbc_data['text']
y = bbc_data['label_ids']

X_train, X_test, y_train, y_test, maxlen = preproc(X,y)

In [99]:
num_cpus = multiprocessing.cpu_count()
num_cpus

8

In [100]:
embedding_dim = 128

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    LSTM(64),
    Dropout(0.5), 
    Dense(16, activation='relu'),  
    Dense(bbc_data['label_ids'].nunique(), activation='softmax') 
])

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(), 
              loss="categorical_crossentropy", metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1835, 128)         640000    
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 16)                1040      
                                                                 
 dense_3 (Dense)             (None, 5)                 85        
                                                                 
Total params: 690533 (2.63 MB)
Trainable params: 690533 (2.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [101]:
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [102]:
@time_counter
def model_fit():
    
    model.fit(X_train, y_train, epochs=100, validation_data=[X_test, y_test], callbacks=[early_stop])
    preds = model.predict(X_test)
    print("LSTM (BBC data)")
    print(classification_report(np.argmax(y_test,axis=1),np.argmax(preds, axis=1)))
    return preds

preds, runtime_lstm = model_fit()
print(f"Runtime: {runtime_lstm} seconds")
performance_lstm = classification_report(np.argmax(y_test,axis=1), np.argmax(preds, axis=1), output_dict=True)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
LSTM (BBC data)
              precision    recall  f1-score   support

           0       0.97      0.91      0.94        65
           1       0.95      0.99      0.97       103
           2       1.00      1.00      1.00       114
           3       0.89      0.97      0.93        88
           4       0.97      0.88      0.92        75

    accuracy                           0.96       445
   macro avg       0.96      0.95      0.95       445
weighted avg       0.96      0.96      0.96       445

Runtime: 402.97 seconds


#### Fine-tuned LSTM

In [103]:
def build_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=hp.Int('embedding_dim', min_value=64, max_value=256, step=32), 
                        input_length=maxlen))
    model.add(LSTM(units=hp.Int('units', min_value=32, max_value=256, step=32)))
    model.add(Dropout(rate=hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(units=hp.Int('dense_units', min_value=16, max_value=128, step=16), activation='relu'))
    model.add(Dense(bbc_data['label_ids'].nunique(), activation='softmax'))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
                  loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Initialize the tuner
tuner = kt.RandomSearch(build_model, 
                        objective='val_accuracy', 
                        max_trials=num_iterations, 
                        executions_per_trial=num_cv, 
                        directory='lstm', 
                        project_name='lstm_tuning_bbc')

In [104]:
@time_counter
def model_tuning():
    tuner.search(X_train, y_train, epochs=20, validation_data=(X_test, y_test), callbacks=[early_stop])
    best_model = tuner.get_best_models(num_models=1)[0]
    preds = best_model.predict(X_test)
    print("Fine-tuned LSTM (BBC data)")
    print(classification_report(np.argmax(y_test, axis=1), np.argmax(preds, axis=1)))
    return preds

preds, runtime_lstm_tuned = model_tuning()
print(f"Runtime: {runtime_lstm_tuned} seconds")
performance_lstm_tuned = classification_report(np.argmax(y_test, axis=1), np.argmax(preds, axis=1), output_dict=True)

Trial 5 Complete [00h 40m 17s]
val_accuracy: 0.9318352142969767

Best val_accuracy So Far: 0.9415730436642965
Total elapsed time: 05h 35m 45s
Fine-tuned LSTM (BBC data)
              precision    recall  f1-score   support

           0       0.95      0.94      0.95        65
           1       0.94      0.98      0.96       103
           2       0.98      0.99      0.99       114
           3       0.95      0.93      0.94        88
           4       0.93      0.91      0.92        75

    accuracy                           0.96       445
   macro avg       0.95      0.95      0.95       445
weighted avg       0.95      0.96      0.95       445

Runtime: 20148.4 seconds


In [105]:
data = pd.read_csv("../data/evaluation_data_2.csv")
data = pd.concat([data,
                  pd.DataFrame({
                      'Model': ['LSTM', 'LSTM fine-tuned'],
                      'Runtime': [runtime_lstm, runtime_lstm_tuned],
                      'Accuracy': [performance_lstm['accuracy'], performance_lstm_tuned['accuracy']],
                      'F1': [performance_lstm['weighted avg']['f1-score'], performance_lstm_tuned['weighted avg']['f1-score']],
                      'data': ['bbc news', 'bbc news']
                  })], ignore_index=True)
data

Unnamed: 0,Model,Runtime,Accuracy,F1,data
0,Decision Tree,0.73,0.849438,0.849514,bbc news
1,Decision Tree fine-tuned,8.08,0.617978,0.632672,bbc news
2,Decision Tree,14.38,0.730084,0.729078,sarcasm detection
3,Decision Tree fine-tuned,8.67,0.615828,0.567973,sarcasm detection
4,Random Forest,0.42,0.970787,0.970774,bbc news
5,Random Forest fine-tuned,8.9,0.959551,0.959489,bbc news
6,Random Forest,10.27,0.767121,0.764883,sarcasm detection
7,Random Forest fine-tuned,82.99,0.771139,0.770207,sarcasm detection
8,LSTM,402.97,0.957303,0.9571,bbc news
9,LSTM fine-tuned,20148.4,0.955056,0.95487,bbc news


### ON SARC DATA
#### Simple LSTM

In [106]:
sarc_data = sarc_data.dropna()
X = sarc_data['text']
y = sarc_data['is_sarcastic']

X_train, X_test, y_train, y_test, maxlen = preproc(X,y)

In [107]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    LSTM(64),
    Dropout(0.5), 
    Dense(16, activation='relu'),  
    Dense(1, activation='sigmoid') 
])

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(), 
              loss="binary_crossentropy", metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 73, 128)           640000    
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 16)                1040      
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 690465 (2.63 MB)
Trainable params: 690465 (2.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [108]:
@time_counter
def model_fit():
    model.fit(X_train, y_train, epochs=100, validation_data=[X_test, y_test], callbacks=[early_stop])
    preds = (model.predict(X_test) >= 0.5).astype(int)
    print("LSTM (Sarcasm Detection)")
    print(classification_report(y_test,preds))
    return preds

preds, runtime_lstm = model_fit()
print(f"Runtime: {runtime_lstm} seconds")
performance_lstm = classification_report(y_test,preds, output_dict=True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
LSTM (Sarcasm Detection)
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      2978
           1       0.78      0.79      0.78      2746

    accuracy                           0.79      5724
   macro avg       0.79      0.79      0.79      5724
weighted avg       0.79      0.79      0.79      5724

Runtime: 138.59 seconds


#### Fine-tuned LSTM

In [109]:
def build_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, 
                        output_dim=hp.Int('embedding_dim', min_value=64, max_value=256, step=32), 
                        input_length=maxlen))
    model.add(LSTM(units=hp.Int('units', min_value=32, max_value=256, step=32)))
    model.add(Dropout(rate=hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(units=hp.Int('dense_units', min_value=16, max_value=128, step=16), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
                  loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Initialize the tuner
tuner = kt.RandomSearch(build_model, 
                        objective='val_accuracy', 
                        max_trials=num_iterations, 
                        executions_per_trial=num_cv, 
                        directory='lstm_sarc', 
                        project_name='lstm_tuning_sarc')

In [110]:
@time_counter
def model_tuning():
    tuner.search(X_train, y_train, epochs=20, validation_data=(X_test, y_test), callbacks=[early_stop])
    best_model = tuner.get_best_models(num_models=1)[0]
    preds = (best_model.predict(X_test) >= 0.5).astype(int)
    print("Fine-tuned LSTM (Sarcasm detection data)")
    print(classification_report(y_test, preds))
    return preds

preds, runtime_lstm_tuned = model_tuning()
print(f"Runtime: {runtime_lstm_tuned} seconds")
performance_lstm_tuned = classification_report(y_test, preds, output_dict=True)

Trial 5 Complete [00h 21m 00s]
val_accuracy: 0.520265519618988

Best val_accuracy So Far: 0.520265519618988
Total elapsed time: 00h 55m 50s
Fine-tuned LSTM (Sarcasm detection data)
              precision    recall  f1-score   support

           0       0.52      1.00      0.68      2978
           1       0.00      0.00      0.00      2746

    accuracy                           0.52      5724
   macro avg       0.26      0.50      0.34      5724
weighted avg       0.27      0.52      0.36      5724

Runtime: 3356.2 seconds


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [111]:
data = pd.concat([data,
                  pd.DataFrame({
                      'Model': ['LSTM', 'LSTM fine-tuned'],
                      'Runtime': [runtime_lstm, runtime_lstm_tuned],
                      'Accuracy': [performance_lstm['accuracy'], performance_lstm_tuned['accuracy']],
                      'F1': [performance_lstm['weighted avg']['f1-score'], performance_lstm_tuned['weighted avg']['f1-score']],
                      'data': ['sarcasm detection', 'sarcasm detection']
                  })], ignore_index=True)
data

Unnamed: 0,Model,Runtime,Accuracy,F1,data
0,Decision Tree,0.73,0.849438,0.849514,bbc news
1,Decision Tree fine-tuned,8.08,0.617978,0.632672,bbc news
2,Decision Tree,14.38,0.730084,0.729078,sarcasm detection
3,Decision Tree fine-tuned,8.67,0.615828,0.567973,sarcasm detection
4,Random Forest,0.42,0.970787,0.970774,bbc news
5,Random Forest fine-tuned,8.9,0.959551,0.959489,bbc news
6,Random Forest,10.27,0.767121,0.764883,sarcasm detection
7,Random Forest fine-tuned,82.99,0.771139,0.770207,sarcasm detection
8,LSTM,402.97,0.957303,0.9571,bbc news
9,LSTM fine-tuned,20148.4,0.955056,0.95487,bbc news


In [112]:
data.to_csv("../data/evaluation_data_final.csv", index=False)