### Load data

In [1]:
!wget  https://www.kaggle.com/api/v1/datasets/download/shashwatwork/consume-complaints-dataset-fo-nlp

--2025-01-19 11:22:53--  https://www.kaggle.com/api/v1/datasets/download/shashwatwork/consume-complaints-dataset-fo-nlp
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://storage.googleapis.com:443/kaggle-data-sets/1363681/2265312/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20250119%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250119T112254Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=9f66b6397a7273a6e2aa5a8e5449d32222ced9b917dcab0aac1e8ed7fbb0b5d7e25937254ee6a8a05e85f963a659c54b38b74c05e6bca2e398f1957623723d4e6157de80028e94fc7cd88175b4a9aa917b9ff69ace01073cf486710027dcdf715a0608d6ec81f24f2cb7be673f7c7c145dee2bd327e6a6a217cea0875360255c3cbfc4edc654444c902d617647284d26dff2e13a555b9fbc91ce1648f24833847b52fdb85b7ddc5ba7cd06be5549e8

In [2]:
# Import series of helper functions for the notebook
# Import helper function
!wget https://raw.githubusercontent.com/ositawisdomchinedu/helper-function/refs/heads/main/helper_function.py

from helper_function import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

--2025-01-19 11:22:57--  https://raw.githubusercontent.com/ositawisdomchinedu/helper-function/refs/heads/main/helper_function.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10715 (10K) [text/plain]
Saving to: ‘helper_function.py’


2025-01-19 11:22:57 (79.1 MB/s) - ‘helper_function.py’ saved [10715/10715]



In [3]:
# Unzip data
unzip_data("consume-complaints-dataset-fo-nlp")

### Visualizing a text dataset

In [4]:
import pandas as pd
df = pd.read_csv("complaints_processed.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,product,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...


In [5]:
print(f"Total number of complaints: {len(df)}")

Total number of complaints: 162421


In [6]:
# How many examples of each class?
df["product"].value_counts()

Unnamed: 0_level_0,count
product,Unnamed: 1_level_1
credit_reporting,91179
debt_collection,23150
mortgages_and_loans,18990
credit_card,15566
retail_banking,13536


In [7]:
# Shuffle dataframe
df = df.sample(frac=1, random_state=42) # shuffle with random_state=42 for reproducibility
df.head()

Unnamed: 0.1,Unnamed: 0,product,narrative
156566,156566,mortgages_and_loans,penfed asking copy driver license finalizing l...
1498,1498,credit_reporting,collection account removed credit report frank...
134991,134991,credit_reporting,bureau falsely reporting alleged debt fdcpa se...
56391,56391,mortgages_and_loans,va mortgage well fargo bank since meet conditi...
9067,9067,credit_reporting,bank xxxxi credit card mine


In [8]:
# check for missing values
df.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
product,0
narrative,10


In [9]:
# remove missing values
df = df.dropna()
df.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
product,0
narrative,0


In [10]:
# Let's visualize some random examples
import random
random_index = random.randint(0, len(df)-5) # create random indexes not higher than the total number of samples
for row in df[["narrative", "product"]][random_index:random_index+5].itertuples():
  _, narrative, product = row
  print(f"Product: {product}", f"Narrative: {narrative}", sep=" | ")
  print("---")


Product: credit_card | Narrative: citi bank closed credit card without asking gotten message submit complaint cfpb previous case id number one agent called discussed situation pretty sure even though account got closed bank even reopening failed still redeem point statement credit called supervisor citi thankyou point center told lost point really ridiculous
---
Product: mortgages_and_loans | Narrative: starting year ago tried reach reach sallie mae see start settling student loan trouble getting touch someone help get information account meantime grandfather loan harassed debt collector threatening sue giving specific much owed settle loan eventually figured debt got sold since reached get started repayment option lead shady website broken link even given disconnected contact number ongoing process year pointed credit repair service thought could help get bottom situation would talk service claimed loan shown none reported downloaded pdf reflects service working month get taken record

### Split data into training and test sets

In [11]:
from sklearn.model_selection import train_test_split

train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    df["narrative"], df["product"], test_size=0.2, random_state=42
)

In [12]:
train_sentences[:10], train_labels[:10]

(82729     shocked reviewed credit report found day late ...
 44103     found identity compromised wallet stolen movie...
 155634                            formerly date amount owed
 52565     trying absolute best fix issue credit report m...
 156845    several issue credit report need attention dea...
 120840    quicken sent loan estimate took back gave high...
 98201     credit score equifax dropped point one week re...
 30789     filing complaint following collection company ...
 149450    well fargo credit worse late payment year tryi...
 145642    transunion closed dispute never investigated d...
 Name: narrative, dtype: object,
 82729             credit_card
 44103        credit_reporting
 155634       credit_reporting
 52565        credit_reporting
 156845       credit_reporting
 120840    mortgages_and_loans
 98201        credit_reporting
 30789        credit_reporting
 149450        debt_collection
 145642       credit_reporting
 Name: product, dtype: object)

### Make Numeric Label(Label Encoder)

In [13]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the labels
train_label_encoded = label_encoder.fit_transform(train_labels.to_numpy())
test_label_encoded = label_encoder.transform(test_labels.to_numpy())

In [14]:
# Get class names and number of classes from LabelEncoder instance
num_classes = len(label_encoder.classes_)
class_names = label_encoder.classes_
num_classes, class_names

(5,
 array(['credit_card', 'credit_reporting', 'debt_collection',
        'mortgages_and_loans', 'retail_banking'], dtype=object))

### One Hot Label

In [15]:
# One hot encode labels
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder(sparse_output=False)
train_labels_one_hot = one_hot_encoder.fit_transform(train_labels.to_numpy().reshape(-1, 1))
test_labels_one_hot = one_hot_encoder.transform(test_labels.to_numpy().reshape(-1, 1))

train_labels_one_hot[:10], test_labels_one_hot[:10]

(array([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0.]]),
 array([[0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.]]))

In [16]:
train_label_encoded[:100]

array([0, 1, 1, 1, 1, 3, 1, 1, 2, 1, 2, 0, 0, 1, 1, 3, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 3, 1, 1, 0, 1, 1, 1, 2, 3, 1, 0, 1, 3, 0, 1, 3,
       1, 1, 1, 1, 1, 1, 1, 4, 1, 2, 1, 4, 4, 2, 0, 1, 1, 1, 3, 1, 1, 1,
       1, 4, 4, 1, 1, 3, 1, 1, 1, 1, 1, 4, 1, 4, 1, 1, 2, 4, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 0, 1])

In [17]:
# View the first 10 training sentences and their labels
train_sentences[:5], train_label_encoded[:5]

(82729     shocked reviewed credit report found day late ...
 44103     found identity compromised wallet stolen movie...
 155634                            formerly date amount owed
 52565     trying absolute best fix issue credit report m...
 156845    several issue credit report need attention dea...
 Name: narrative, dtype: object,
 array([0, 1, 1, 1, 1]))

In [18]:
train_sentences= train_sentences.to_numpy()
test_sentences = test_sentences.to_numpy()

In [19]:
train_sentences[:5]

array(['shocked reviewed credit report found day late payment sure happened believe made payment received statement thought monthly statement get',
       'found identity compromised wallet stolen movie theater didnt take proper measure afterwards got duplicate driver license copy social security card several year later im receiving letter letter several unpaid account never opened started investigate pulled credit check open account mine please remove',
       'formerly date amount owed',
       'trying absolute best fix issue credit report make sure everyone everything perfect course come last one amazing bank become known give dang anyone anything car loan company year perfect payment every month according affinity late day disputed late day even worse late payment year ago course affinity decides add way account ever late much due fault would neglect payment day paid every month year time',
       'several issue credit report need attention dealing current address actual name name 

In [20]:
# Find average number of tokens (words) in training narrative
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

80

In [21]:
# Find the max token
max([len(i.split()) for i in train_sentences])

2568

In [22]:
# Find the total number of words in all training sentences
total_words_in_train_sentences = sum(len(sentence.split()) for sentence in train_sentences)
print(total_words_in_train_sentences)

10420013


In [23]:
from collections import Counter

# Use Counter to find unique word frequencies
word_count = Counter(train_sentences)
num_unique_words = len(word_count)

print("Number of unique words:", num_unique_words)

Number of unique words: 101789


### Setup a Text Vectorizer

In [24]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Setup text vectorization with custom variables
max_vocab_length = num_unique_words
max_length = 80
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    standardize="lower_and_strip_punctuation",
                                    output_sequence_length=max_length)

In [25]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

In [26]:
# let's choose a random sentence from the training dataset and tokenize it
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
collection company answer back validity debt violated federal state fcra fdcpa pull credit report found receivable management system original creditor account filing complaint violation wrote letter validation certified mail return receipt also wrote credit bureau validity item receivable management system following show proof agreement original creditor collect alleged debt copy signed contract allegedly agreed pay charge medical bill reveals personal medical information violation health insurance portability accountability act rule prohibit disclosing individual protected health information unless specifically authorized creditor reporting history inaccurately last date reported last activity date wrong case u court appeal ninth circuit defamation causing financial injury fine extent damage incurred wronged party deemed court collection agency reporting date last activity instead date first delinquency collection agency validated debt still continue report credit burea

<tf.Tensor: shape=(1, 80), dtype=int64, numpy=
array([[  23,    9,  358,   34,  754,   13,  588,  163,  100,  121,  613,
         615,    2,    4,  179, 3134,  663,  197,  107,   75,    3,  552,
          62,  122,  737,   16,  325,  286,  133,  356,  339,   24,  737,
           2,   30,  754,   42, 3134,  663,  197,  134,  149,   84,  263,
         107,   75,  334,  193,   13,   89,  262,  170, 1100,  373,   52,
          43,  469,  112, 5260,  178,  469,    5,  122, 1090,  212, 5698,
        2854,   76,  752, 3187, 3509,  505, 1779, 1090,    5,  823,  807,
         343,   75,    7]])>

In [27]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 41720
Top 5 most common words: ['', '[UNK]', 'credit', 'account', 'report']
Bottom 5 least common words: ['aall', 'aaid', 'aafees', 'aaccount', 'aaarrrrgghhhhhh']


### Creating an Embedding using an Embedding Layer

In [28]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             embeddings_initializer="uniform",
                             name="embedding_0")

In [29]:
# Get a random sentence from training set
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
credit score dropped point basically couple day activity justify illegal tampering credit score credit report believe someone illegal tampered credit would like investigation done credit score corrected      

Embedded version:


<tf.Tensor: shape=(1, 80, 128), dtype=float32, numpy=
array([[[ 0.03963456, -0.01034046, -0.01585178, ..., -0.02006817,
          0.01519399,  0.0086733 ],
        [-0.04468935, -0.02270653, -0.02090411, ...,  0.04813968,
          0.04433249,  0.00546809],
        [-0.04253482,  0.04666607,  0.01810433, ...,  0.00967672,
         -0.0395952 , -0.01870422],
        ...,
        [-0.04568076, -0.00421392, -0.04913706, ..., -0.02715926,
          0.01133179,  0.02953133],
        [-0.04568076, -0.00421392, -0.04913706, ..., -0.02715926,
          0.01133179,  0.02953133],
        [-0.04568076, -0.00421392, -0.04913706, ..., -0.02715926,
          0.01133179,  0.02953133]]], dtype=float32)>

### Modelling of text dataset

#### Model 0: Naive Bayes(Baseline)

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

# Fit the pipeline to the training data
model_0.fit(train_sentences, train_label_encoded)

In [43]:
baseline_score = model_0.score(test_sentences, test_label_encoded)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 80.86%


In [44]:
# Make predictions
baseline_preds = model_0.predict(test_sentences)
baseline_preds[:20]


array([1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 1])

### Creating an evaluation function for our model experiments

In [45]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [46]:
# Get baseline results
baseline_results = calculate_results(y_true=test_label_encoded,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 80.8576794015331,
 'precision': 0.8138630867833732,
 'recall': 0.8085767940153311,
 'f1': 0.7949590552193432}

### Model 1: Conv1D

### Create datasets (as fast as possible)

In [33]:
# Turn our data into TensorFlow Datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels_one_hot))
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels_one_hot))

train_dataset, test_dataset

(<_TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(5,), dtype=tf.float64, name=None))>,
 <_TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(5,), dtype=tf.float64, name=None))>)

In [34]:
# Take the TensorSliceDataset's and turn them into prefetched batches
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

train_dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 5), dtype=tf.float64, name=None))>

In [None]:
len(train_dataset)

4061

In [47]:
tf.random.set_seed(42)
from tensorflow.keras import layers
model_1_embeeding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     name="embedding_1")

# Create 1-dimensional convolutional layer to model sequences
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_1_embeeding(x)
x = layers.Conv1D(filters=64, kernel_size=5, activation="relu")(x)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(5, activation="softmax")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_conv1d")

# compile the model
model_1.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Get a summary of the model
model_1.summary()

In [None]:
# Create directory to save TensorBoard logs
SAVE_DIR = "model_logs"

# Fit the model
model_1_history = model_1.fit(train_dataset,
                              epochs=5,
                              validation_data=test_dataset,
                              callbacks=[create_tensorboard_callback(SAVE_DIR, "conv1D")])


Saving TensorBoard log files to: model_logs/conv1D/20250119-115509
Epoch 1/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m826s[0m 203ms/step - accuracy: 0.8105 - loss: 0.5332 - val_accuracy: 0.8731 - val_loss: 0.3640
Epoch 2/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m877s[0m 207ms/step - accuracy: 0.8977 - loss: 0.3012 - val_accuracy: 0.8770 - val_loss: 0.3647
Epoch 3/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m833s[0m 205ms/step - accuracy: 0.9360 - loss: 0.2040 - val_accuracy: 0.8738 - val_loss: 0.4031
Epoch 4/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m864s[0m 206ms/step - accuracy: 0.9632 - loss: 0.1297 - val_accuracy: 0.8733 - val_loss: 0.4502
Epoch 5/5
[1m 864/4061[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m10:52[0m 204ms/step - accuracy: 0.9753 - loss: 0.0980

In [None]:
# Evaluate on whole validation dataset (we only validated on 10% of batches during training)
model_1.evaluate(test_dataset)

[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.8660 - loss: 0.5256


[0.5172379612922668, 0.866607129573822]

In [None]:
# Make predictions (our model outputs prediction probabilities for each class)
model_1_pred_probs = model_1.predict(test_dataset)
model_1_pred_probs

[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step


array([[2.3636135e-06, 9.9997085e-01, 2.6560290e-05, 3.7052782e-08,
        1.3552983e-07],
       [3.2481548e-05, 3.5342148e-01, 6.4636594e-01, 1.8000814e-04,
        1.6335161e-07],
       [3.2925282e-08, 9.9999982e-01, 1.6180432e-07, 3.0550023e-09,
        3.6953062e-12],
       ...,
       [8.1120834e-06, 9.8276353e-01, 1.6718645e-02, 5.0977466e-04,
        1.0007583e-11],
       [1.2233122e-01, 9.8960036e-03, 8.6755544e-01, 1.9795027e-04,
        1.9338198e-05],
       [2.1619359e-08, 9.9999678e-01, 3.2624591e-06, 4.2365600e-10,
        2.0185867e-10]], dtype=float32)

In [None]:
# Convert pred probs to classes
model_1_preds = tf.argmax(model_1_pred_probs, axis=1)
model_1_preds

<tf.Tensor: shape=(32483,), dtype=int64, numpy=array([1, 2, 1, ..., 1, 2, 1])>

In [None]:
# Explicitly place the tensor on the CPU using tf.identity
with tf.device("/CPU:0"):
  model_1_preds = tf.identity(model_1_preds)

# Convert to NumPy array
model_1_preds = model_1_preds.numpy()

# Get baseline results
model_1_results = calculate_results(y_true=test_label_encoded,
                                     y_pred=model_1_preds)
model_1_results

{'accuracy': 86.6607148354524,
 'precision': 0.8650127255016565,
 'recall': 0.8666071483545239,
 'f1': 0.863504618108457}

### Model 2 (Add more Layers to model 1)

In [None]:
tf.random.set_seed(42)
from tensorflow.keras import layers
model_2_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_2")

inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_2_embedding(x)
x = layers.Conv1D(filters=64, kernel_size=5, activation="relu", kernel_regularizer="l2")(x)
x = layers.Dropout(0.5)(x)  # Add dropout for regularization
x = layers.GlobalMaxPool1D()(x)
x = layers.Dense(128, activation="relu", kernel_regularizer="l2")(x)
x = layers.Dropout(0.5)(x)  # Additional dropout
outputs = layers.Dense(5, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_conv1d_dropout")

# Compile Conv1D model
model_2.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Get a summary of our 1D convolution model
model_2.summary()




In [None]:
# Fit the model
model_2_history = model_2.fit(train_dataset,
                              epochs=5,
                              validation_data=(test_dataset),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                     "Conv1D")])

Saving TensorBoard log files to: model_logs/Conv1D/20250118-142240
Epoch 1/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m604s[0m 148ms/step - accuracy: 0.7646 - loss: 0.8792 - val_accuracy: 0.8455 - val_loss: 0.5590
Epoch 2/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m618s[0m 147ms/step - accuracy: 0.8522 - loss: 0.5113 - val_accuracy: 0.8552 - val_loss: 0.5072
Epoch 3/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m603s[0m 149ms/step - accuracy: 0.8644 - loss: 0.4763 - val_accuracy: 0.8546 - val_loss: 0.5033
Epoch 4/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m585s[0m 140ms/step - accuracy: 0.8720 - loss: 0.4520 - val_accuracy: 0.8532 - val_loss: 0.4983
Epoch 5/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m628s[0m 141ms/step - accuracy: 0.8781 - loss: 0.4362 - val_accuracy: 0.8512 - val_loss: 0.4978


In [None]:
model_1.evaluate(test_dataset)

[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.8660 - loss: 0.5256


[0.5172379612922668, 0.866607129573822]

In [None]:
model_2_pred_probs = model_2.predict(test_dataset)
model_2_pred_probs

[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step


array([[0.03582832, 0.9733689 , 0.1482688 , 0.02982315, 0.00347476],
       [0.07495371, 0.8871956 , 0.52219373, 0.05292203, 0.01043321],
       [0.02562962, 0.9815835 , 0.09932404, 0.02884042, 0.00213011],
       ...,
       [0.02922964, 0.9552782 , 0.39827496, 0.05124141, 0.00258343],
       [0.5300608 , 0.411433  , 0.64823264, 0.04185761, 0.09817946],
       [0.03939976, 0.9724404 , 0.2005597 , 0.01992576, 0.00310837]],
      dtype=float32)

In [None]:
model_2_preds = tf.argmax(model_2_pred_probs, axis=1)
model_2_preds

<tf.Tensor: shape=(32483,), dtype=int64, numpy=array([1, 1, 1, ..., 1, 2, 1])>

In [None]:
# Explicitly place the tensor on the CPU using tf.identity
with tf.device("/CPU:0"):
  model_2_preds = tf.identity(model_1_preds)

# Convert to NumPy array
model_2_preds = model_2_preds.numpy()

# Get baseline results
model_2_results = calculate_results(y_true=test_label_encoded,
                                     y_pred=model_1_preds)
model_2_results

{'accuracy': 86.6607148354524,
 'precision': 0.8650127255016565,
 'recall': 0.8666071483545239,
 'f1': 0.863504618108457}

### Model 3: TensorFlow Hub Pretrained Sentence Encoder (Universal Sentence Encoder)

In [36]:
# We can use this encoding layer in place of our text_vectorizer and embedding layer
import tensorflow_hub as hub
tf_hub_embedding_layer = hub.KerasLayer("https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/2",)

tf_hub_embedding_layer._name = "universal_sentence_encoder"
tf_hub_embedding_layer.trainable = False

In [37]:
# Define feature extractor model using TF Hub layer
from tensorflow.keras import layers
inputs = layers.Input(shape=(), dtype=tf.string)
pretrained_embedding = layers.Lambda(
    lambda x: tf_hub_embedding_layer(x),
    output_shape=(512,),  # Universal Sentence Encoder outputs 512-dimensional vectors
    name="embedding_lambda"
)(inputs)
x = layers.Dense(128, activation="relu")(pretrained_embedding)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(64, activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(5, activation="softmax")(x)
model_3 = tf.keras.Model(inputs=inputs,
                        outputs=outputs)

# Compile the model
model_3.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Get a summary of the model
model_3.summary()

In [38]:
# Fit the model
SAVE_DIR = "model_logs"
model_3_history = model_3.fit(train_dataset,
                              epochs=5,
                              validation_data=(test_dataset),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                     "USE")])

Saving TensorBoard log files to: model_logs/USE/20250119-112717
Epoch 1/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 46ms/step - accuracy: 0.7252 - loss: 0.8211 - val_accuracy: 0.8293 - val_loss: 0.4722
Epoch 2/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 37ms/step - accuracy: 0.8087 - loss: 0.5432 - val_accuracy: 0.8365 - val_loss: 0.4523
Epoch 3/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 37ms/step - accuracy: 0.8183 - loss: 0.5179 - val_accuracy: 0.8399 - val_loss: 0.4414
Epoch 4/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 36ms/step - accuracy: 0.8212 - loss: 0.5080 - val_accuracy: 0.8426 - val_loss: 0.4341
Epoch 5/5
[1m4061/4061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 40ms/step - accuracy: 0.8257 - loss: 0.4970 - val_accuracy: 0.8446 - val_loss: 0.4306


In [39]:
model_3.evaluate(test_dataset)

[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 29ms/step - accuracy: 0.8450 - loss: 0.4324


[0.43057096004486084, 0.8446264266967773]

In [40]:
model_3_pred_probs = model_3.predict(test_dataset)
model_3_pred_probs

[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 43ms/step


array([[2.3896603e-03, 9.6783561e-01, 2.7564995e-02, 1.5653598e-03,
        6.4421195e-04],
       [5.7339990e-03, 6.4536905e-01, 3.3969244e-01, 8.3254930e-03,
        8.7909692e-04],
       [6.5575499e-04, 9.7824502e-01, 2.0072317e-02, 9.8442216e-04,
        4.2541171e-05],
       ...,
       [7.7899679e-04, 6.6349626e-01, 3.3038393e-01, 5.1258057e-03,
        2.1492194e-04],
       [1.0816603e-01, 2.9426959e-01, 5.8199835e-01, 9.1243582e-03,
        6.4416947e-03],
       [7.6606224e-04, 9.5830280e-01, 3.7234034e-02, 3.5240576e-03,
        1.7309877e-04]], dtype=float32)

In [41]:
model_3_preds = tf.argmax(model_3_pred_probs, axis=1)
model_3_preds

<tf.Tensor: shape=(32483,), dtype=int64, numpy=array([1, 1, 1, ..., 1, 2, 1])>

### Comparing the performance of each of our models

In [None]:
# Combine model results into a DataFrame
all_model_results = pd.DataFrame({"baseline": baseline_results,
                                  "naive_bayes": model_0_results,
                                   "conv1d": model_1_results,
                                  "conv1d_dropout": model_2_results,
                                  "tf_hub_sentence_encoder": model_3_results,
                                  })
all_model_results = all_model_results.transpose()
all_model_results

In [None]:
# Reduce the accuracy to same scale as other metrics
all_model_results["accuracy"] = all_model_results["accuracy"]/100

In [None]:
# Plot and compare all of the model results
all_model_results.plot(kind="bar", figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0));