In [1]:
import pandas as pd
import re
import nltk
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import numpy as np
from typing import List
from nltk.tokenize import word_tokenize

In [2]:
def filter_stop_words(text: List[str])->List[str]:
    stop_words = set(stopwords.words("english"))
    return [word for word in text if word not in stop_words]


dialogues = pd.read_csv('../code/emo_context_train.txt', sep="\t")
dialogues = dialogues[dialogues["label"]!="others"]
labels = dialogues.label
cols = ["turn1", "turn2", "turn3"]
dialogues["combined"] = dialogues[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
#dialogues["combined"] = dialogues["combined"].apply(str.lower).apply(word_tokenize).apply(filter_stop_words).apply(lambda x: " ".join(x))
dialogues["combined"] = dialogues["combined"].apply(str.lower).apply(word_tokenize).apply(lambda x: " ".join(x))
dialogues["seq_len"] = dialogues["combined"].apply(len)
print(dialogues.columns)

max_len = dialogues["seq_len"].max()
input_text = dialogues.combined.values

Index(['id', 'turn1', 'turn2', 'turn3', 'label', 'combined', 'seq_len'], dtype='object')


In [3]:
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import metrics
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import keras_tuner as kt
import math 
import sklearn
from sklearn import metrics

2023-05-01 00:36:57.708526: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
X = input_text
label_map = {'happy':0,'sad':1,'angry':2}
y = dialogues['label'].map(label_map)

y = np.asarray(y).astype('float32')
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X)

1        2
3        2
10       2
13       1
15       0
        ..
30146    0
30148    2
30150    1
30152    2
30156    1
Name: label, Length: 15212, dtype: int64
<class 'numpy.ndarray'>
15212


In [79]:
def build_model(hp):
    embed_dim = 128
    lstm_out = 196
    max_features = 2000
    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length = X.shape[1]))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    activation=hp.Choice("activation", ["relu", "softmax"])
    hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
    model.add(Dense(units=hp_units, activation='softmax'))
    model.add(Dense(10))
    #model.add(Dense(3,activation='softmax'))
    model.compile(optimizer='adam',loss = 'sparse_categorical_crossentropy',metrics=['accuracy'])
    print(model.summary())
    return model
build_model(kt.HyperParameters())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 67, 128)           256000    
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 67, 128)          0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 196)               254800    
                                                                 
 dense_2 (Dense)             (None, 32)                6304      
                                                                 
 dense_3 (Dense)             (None, 10)                330       
                                                                 
Total params: 517,434
Trainable params: 517,434
Non-trainable params: 0
________________________________________________

<keras.engine.sequential.Sequential at 0x7fbcb5865900>

In [9]:
hyperparameters = {
    'embed_dim': [32, 64, 128],
    'lstm_out': [32, 64, 128],
    'dense_units': [16, 32, 64],
    'dropout_rate': [0.2, 0.4, 0.6],
    'learning_rate': [1e-4, 1e-3, 1e-2]
}

# Define the model-building function
def build_model1(hp):
    max_features = 2000
    model = Sequential()
    model.add(Embedding(max_features, hp.Int('embed_dim', min_value=32, max_value=128, step=32), input_length=X.shape[1]))
    model.add(SpatialDropout1D(hp.Float('dropout_rate', min_value=0.2, max_value=0.6, step=0.1)))
    model.add(LSTM(hp.Int('lstm_out', min_value=32, max_value=128, step=32), dropout=0.2, recurrent_dropout=0.2))
    activation=hp.Choice("activation", ["relu", "softmax"])
    model.add(Dense(hp.Int('dense_units', min_value=16, max_value=64, step=16), activation=activation))
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')), metrics=['accuracy'])
    return model
                         
build_model1(kt.HyperParameters())

<keras.engine.sequential.Sequential at 0x7fd21f702350>

In [11]:
tuner = kt.Hyperband(build_model1,
                     objective='accuracy',
                     max_epochs=10,
                     factor=3,
                     overwrite=True,
                     directory='dir',
                     project_name='project')


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=16)
batch_size = 32
tuner.search(X_train, y_train, epochs=5)
tuner.search_space_summary()

Trial 27 Complete [00h 03m 24s]
accuracy: 0.9727175831794739

Best accuracy So Far: 0.9777302742004395
Total elapsed time: 00h 35m 38s

Search: Running Trial #28

Value             |Best Value So Far |Hyperparameter
128               |128               |embed_dim
0.5               |0.2               |dropout_rate
128               |64                |lstm_out
softmax           |softmax           |activation
64                |32                |dense_units
0.0076576         |0.0019265         |learning_rate
10                |10                |tuner/epochs
0                 |4                 |tuner/initial_epoch
0                 |2                 |tuner/bracket
0                 |2                 |tuner/round

Epoch 1/10
Epoch 2/10

In [None]:
tuner.results_summary()