In [55]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from time import time

In [57]:
vocab_size = 5000
def preproc(X,y) -> tuple:
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=123)
    tokenizer = Tokenizer(num_words=vocab_size)
    
    tokenizer.fit_on_texts(X_train)
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)
        
    max_seq_length = max(len(x) for x in X_train_seq)
    X_train_padded = pad_sequences(X_train_seq, maxlen=max_seq_length)
    X_test_padded = pad_sequences(X_test_seq, maxlen=max_seq_length)
    
    if y_train.nunique() != 2:
        y_train_cat = to_categorical(y_train, num_classes=y_train.nunique())
        y_test_cat = to_categorical(y_test, num_classes=y_test.nunique())
        return X_train_padded, X_test_padded, y_train_cat, y_test_cat, max_seq_length
    
    return X_train_padded, X_test_padded, np.array(y_train), np.array(y_test), max_seq_length

In [58]:
bbc_data = pd.read_csv("../data/clean_bbc_classification.csv")
sarc_data = pd.read_csv("../data/clean_sarcasm_classification.csv")

In [59]:
X = bbc_data['text']
y = bbc_data['label_ids']

X_train, X_test, y_train, y_test, maxlen = preproc(X,y)

In [60]:
embedding_dim = 128

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    LSTM(64),
    Dropout(0.5), 
    Dense(32, activation='relu'), 
    Dense(16, activation='relu'),  
    Dense(bbc_data['label_ids'].nunique(), activation='softmax') 
])

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(), 
              loss="categorical_crossentropy", metrics=['accuracy'])
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 1835, 128)         640000    
                                                                 
 lstm_11 (LSTM)              (None, 64)                49408     
                                                                 
 dropout_11 (Dropout)        (None, 64)                0         
                                                                 
 dense_33 (Dense)            (None, 32)                2080      
                                                                 
 dense_34 (Dense)            (None, 16)                528       
                                                                 
 dense_35 (Dense)            (None, 5)                 85        
                                                                 
Total params: 692101 (2.64 MB)
Trainable params: 6921

In [61]:
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [62]:
t0 = time()
model.fit(X_train, y_train, epochs=100, validation_data=[X_test, y_test], callbacks=[early_stop])
preds = model.predict(X_test)
print("LSTM (BBC data)")
print(classification_report(np.argmax(y_test,axis=1),np.argmax(preds, axis=1)))
t1 = time()
runtime_lstm = round(t1-t0, 2)
print(f"Runtime: {runtime_lstm} seconds")
performance_lstm = classification_report(np.argmax(y_test,axis=1), np.argmax(preds, axis=1), output_dict=True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
LSTM (BBC data)
              precision    recall  f1-score   support

           0       0.98      0.89      0.94        65
           1       0.91      0.97      0.94       103
           2       0.96      0.96      0.96       114
           3       0.92      0.92      0.92        88
           4       0.91      0.91      0.91        75

    accuracy                           0.93       445
   macro avg       0.94      0.93      0.93       445
weighted avg       0.94      0.93      0.93       445

Runtime: 675.72 seconds


In [65]:
data = pd.read_csv("../data/evaluation_data_2.csv")
data = pd.concat([data,
                  pd.DataFrame({
                      'Model': ['LSTM'],
                      'Runtime': [runtime_lstm],
                      'Accuracy': [performance_lstm['accuracy']],
                      'F1': [performance_lstm['weighted avg']['f1-score']],
                      'data': ['bbc news']
                  })], ignore_index=True)
data

Unnamed: 0,Model,Runtime,Accuracy,F1,data
0,Decision Tree,0.7,0.840449,0.840706,bbc news
1,Decision Tree fine-tuned,14.45,0.817978,0.822761,bbc news
2,Decision Tree,11.28,0.725891,0.725137,sarcasm detection
3,Decision Tree fine-tuned,58.69,0.629804,0.590813,sarcasm detection
4,Random Forest,3.13,0.975281,0.975275,bbc news
5,Random Forest,63.46,0.763277,0.76107,sarcasm detection
6,Random Forest fine-tuned,465.43,0.776555,0.776098,sarcasm detection
7,LSTM,675.72,0.934831,0.934832,bbc news


### ON SARC DATA

In [66]:
sarc_data = sarc_data.dropna()
X = sarc_data['text']
y = sarc_data['is_sarcastic']

X_train, X_test, y_train, y_test, maxlen = preproc(X,y)

In [67]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    LSTM(64),
    Dropout(0.5), 
    Dense(32, activation='relu'), 
    Dense(16, activation='relu'),  
    Dense(1, activation='sigmoid') 
])

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(), 
              loss="binary_crossentropy", metrics=['accuracy'])
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 73, 128)           640000    
                                                                 
 lstm_13 (LSTM)              (None, 64)                49408     
                                                                 
 dropout_13 (Dropout)        (None, 64)                0         
                                                                 
 dense_39 (Dense)            (None, 32)                2080      
                                                                 
 dense_40 (Dense)            (None, 16)                528       
                                                                 
 dense_41 (Dense)            (None, 1)                 17        
                                                                 
Total params: 692033 (2.64 MB)
Trainable params: 6920

In [71]:
t0 = time()
model.fit(X_train, y_train, epochs=100, validation_data=[X_test, y_test], callbacks=[early_stop])
preds = (model.predict(X_test) >= 0.5).astype(int)
print("LSTM (Sarcasm Detection)")
print(classification_report(y_test,preds))
t1 = time()
runtime_lstm = round(t1-t0, 2)
print(f"Runtime: {runtime_lstm} seconds")
performance_lstm = classification_report(y_test,preds, output_dict=True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
LSTM (Sarcasm Detection)
              precision    recall  f1-score   support

           0       0.79      0.80      0.80      2978
           1       0.78      0.77      0.78      2746

    accuracy                           0.79      5724
   macro avg       0.79      0.79      0.79      5724
weighted avg       0.79      0.79      0.79      5724

Runtime: 115.17 seconds


In [72]:
data = pd.concat([data,
                  pd.DataFrame({
                      'Model': ['LSTM'],
                      'Runtime': [runtime_lstm],
                      'Accuracy': [performance_lstm['accuracy']],
                      'F1': [performance_lstm['weighted avg']['f1-score']],
                      'data': ['sarcasm detection']
                  })], ignore_index=True)
data

Unnamed: 0,Model,Runtime,Accuracy,F1,data
0,Decision Tree,0.7,0.840449,0.840706,bbc news
1,Decision Tree fine-tuned,14.45,0.817978,0.822761,bbc news
2,Decision Tree,11.28,0.725891,0.725137,sarcasm detection
3,Decision Tree fine-tuned,58.69,0.629804,0.590813,sarcasm detection
4,Random Forest,3.13,0.975281,0.975275,bbc news
5,Random Forest,63.46,0.763277,0.76107,sarcasm detection
6,Random Forest fine-tuned,465.43,0.776555,0.776098,sarcasm detection
7,LSTM,675.72,0.934831,0.934832,bbc news
8,LSTM,115.17,0.786688,0.786593,sarcasm detection


In [73]:
data.to_csv("../data/evaluation_data_final.csv", index=False)