Text Classification with Keras

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize import word_tokenize, regexp_tokenize

import tensorflow as tf

from sklearn.metrics import confusion_matrix, classification_report

In [30]:
reviews = pd.read_csv('complaints_preprocessed.csv').dropna()

reviews.head() 

Unnamed: 0,Consumer complaint narrative,Issue,Issue_Code
0,name complaint made error neither made third p...,Incorrect information on your report,3.0
1,search point websit legitim believ websit wher...,Fraud or scam,2.0
2,particular account state owe list credit repor...,Incorrect information on your report,3.0
3,suppli proof doctrin estoppel silenc engelhard...,Attempts to collect debt not owed,0.0
4,hello write regard account credit report belon...,Incorrect information on your report,3.0


In [31]:
reviews['Issue'].value_counts(dropna=False) 

Incorrect information on your report    18609
Attempts to collect debt not owed        5169
Communication tactics                    1278
Struggling to pay mortgage               1184
Fraud or scam                             861
Name: Issue, dtype: int64

In [32]:
X = reviews[['Consumer complaint narrative']]
y = reviews['Issue']

le = LabelEncoder().fit(y)
y = le.transform(y)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, random_state = 321, stratify = y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state = 321, stratify = y_train_val)

In [33]:
vect = TfidfVectorizer()

X_train_vec = vect.fit_transform(X_train['Consumer complaint narrative']).toarray()
X_val_vec = vect.transform(X_val['Consumer complaint narrative']).toarray()
X_test_vec = vect.transform(X_test['Consumer complaint narrative']).toarray() 

In [34]:
#Start with a sequential model
model = tf.keras.Sequential()

# Then add dense layer with relu activation
model.add(tf.keras.layers.Dense(units = 256,
                                input_shape = (vect.get_feature_names_out().shape[0],),
                                activation = 'relu'))
# And end with an output layer with sigmoid activation 
model.add(tf.keras.layers.Dense(5, activation = 'softmax'))

# Finally, compile the model
model.compile('adam', 'sparse_categorical_crossentropy', metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3) 

In [35]:
history = model.fit(X_train_vec, y_train,
                    validation_data = (X_val_vec, y_val),
                    epochs=100, callbacks=[callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [36]:
# n.argmax axis = 1
y_pred = np.argmax(model.predict(X_test_vec), axis = 1)

confusion_matrix(y_test, y_pred) 



array([[ 834,   66,   13,  375,    4],
       [  74,  219,    1,   26,    0],
       [  12,    3,  183,   17,    0],
       [ 363,   17,    5, 4255,   13],
       [  10,    3,    0,   27,  256]])

In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.65      0.65      1292
           1       0.71      0.68      0.70       320
           2       0.91      0.85      0.88       215
           3       0.91      0.91      0.91      4653
           4       0.94      0.86      0.90       296

    accuracy                           0.85      6776
   macro avg       0.82      0.79      0.81      6776
weighted avg       0.85      0.85      0.85      6776



In [40]:
le.inverse_transform([0]) 

array(['Attempts to collect debt not owed'], dtype=object)

In [39]:
le.inverse_transform([3]) 

array(['Incorrect information on your report'], dtype=object)

Text Classification Using an LSTM

In [41]:
X = [[x for x in regexp_tokenize(sentence.lower(), pattern = r"[a-z]+")] for sentence in reviews['Consumer complaint narrative'].tolist()] 

In [42]:
tokens = sorted(set([x for sentence in X for x in sentence]))

token_index = {word:i + 1 for i, word in enumerate(tokens)}
index_token = {i + 1:word for i, word in enumerate(tokens)}

# We need to reserve the zero token for when we pad our sequences.
vocab_size = len(token_index) + 1

In [43]:
X = [[token_index[word] for word in sentence] for sentence in X]

In [44]:
pd.Series([len(sentence) for sentence in X]).describe()

count    27101.000000
mean        79.047895
std        101.831073
min          1.000000
25%         26.000000
50%         48.000000
75%         93.000000
max       2785.000000
dtype: float64

In [45]:
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen = 100)

In [46]:
X[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,  8538,  2661,
        7796,  4572,  8638,  7796, 13080,  9428,  3411,  9557,  9606,
         507,  9638,  2635, 14664,  1082, 13650, 14081, 12022, 11560,
        8894,  1981, 14081,  9638,  6388,  6740,   790,  5712, 11659,
        8298, 12544,  3116,   131,  7328,  7007,  5480, 13577,  2475,
        9616, 10312,  2839, 10903,  8664,  5589, 14760,  2804, 10903,
         744,  2839, 10903,  3326,  9124,  1215,  9802,  3491,   131,
         908], dtype=int32)

In [50]:
y = reviews['Issue']

le = LabelEncoder().fit(y)
y = le.transform(y)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, random_state = 321, stratify = y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state = 321, stratify = y_train_val)

In [51]:
lstm_model = tf.keras.models.Sequential()

lstm_model.add(tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = 128, mask_zero = True))
lstm_model.add(tf.keras.layers.LSTM(32))
lstm_model.add(tf.keras.layers.Dense(32))
lstm_model.add(tf.keras.layers.Dense(5, activation = 'softmax'))

lstm_model.compile(optimizer = 'rmsprop', loss = 'sparse_categorical_crossentropy', metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [52]:
history = lstm_model.fit(X_train, y_train,
                    validation_data = (X_val, y_val),
                    epochs=100, callbacks=[callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


In [54]:
y_pred = np.argmax(lstm_model.predict(X_test_vec), axis = 1)

confusion_matrix(y_test, y_pred)



array([[1291,    0,    0,    1,    0],
       [ 320,    0,    0,    0,    0],
       [ 215,    0,    0,    0,    0],
       [4647,    0,    0,    6,    0],
       [ 296,    0,    0,    0,    0]])

In [55]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.19      1.00      0.32      1292
           1       0.00      0.00      0.00       320
           2       0.00      0.00      0.00       215
           3       0.86      0.00      0.00      4653
           4       0.00      0.00      0.00       296

    accuracy                           0.19      6776
   macro avg       0.21      0.20      0.06      6776
weighted avg       0.62      0.19      0.06      6776



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


LSTM layer Bidirectional

In [56]:
lstm_model = tf.keras.models.Sequential()

lstm_model.add(tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = 128, mask_zero = True))
lstm_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
lstm_model.add(tf.keras.layers.Dense(32))
lstm_model.add(tf.keras.layers.Dense(5, activation = 'softmax'))

lstm_model.compile(optimizer = 'rmsprop', loss = 'sparse_categorical_crossentropy', metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [57]:
history = lstm_model.fit(X_train, y_train,
                    validation_data = (X_val, y_val),
                    epochs=100, callbacks=[callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [58]:
y_pred = np.argmax(lstm_model.predict(X_test_vec), axis = 1)

confusion_matrix(y_test, y_pred) 



array([[ 635,  229,  427,    1,    0],
       [ 174,   58,   88,    0,    0],
       [  71,   36,  108,    0,    0],
       [2368, 1060, 1219,    6,    0],
       [  58,   44,  194,    0,    0]])

In [59]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.19      0.49      0.28      1292
           1       0.04      0.18      0.07       320
           2       0.05      0.50      0.10       215
           3       0.86      0.00      0.00      4653
           4       0.00      0.00      0.00       296

    accuracy                           0.12      6776
   macro avg       0.23      0.24      0.09      6776
weighted avg       0.63      0.12      0.06      6776



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
