Text Classification with Keras

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize import word_tokenize, regexp_tokenize

import tensorflow as tf

from sklearn.metrics import confusion_matrix, classification_report

In [6]:
reviews = pd.read_csv('complaints_preprocessed.csv').dropna()

reviews.head() 

Unnamed: 0,Consumer complaint narrative,Issue,Issue_Code
0,name complaint made error neither made third p...,Incorrect information on your report,3.0
1,search point websit legitim believ websit wher...,Fraud or scam,2.0
2,particular account state owe list credit repor...,Incorrect information on your report,3.0
3,suppli proof doctrin estoppel silenc engelhard...,Attempts to collect debt not owed,0.0
4,hello write regard account credit report belon...,Incorrect information on your report,3.0


In [7]:
reviews['Issue'].value_counts(dropna=False) 

Incorrect information on your report    5384
Attempts to collect debt not owed       1584
Struggling to pay mortgage               334
Communication tactics                    300
Fraud or scam                            227
Name: Issue, dtype: int64

In [8]:
X = reviews[['Consumer complaint narrative']]
y = reviews['Issue']

le = LabelEncoder().fit(y)
y = le.transform(y)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, random_state = 321, stratify = y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state = 321, stratify = y_train_val)

In [9]:
vect = TfidfVectorizer()

X_train_vec = vect.fit_transform(X_train['Consumer complaint narrative']).toarray()
X_val_vec = vect.transform(X_val['Consumer complaint narrative']).toarray()
X_test_vec = vect.transform(X_test['Consumer complaint narrative']).toarray() 

In [10]:
#Start with a sequential model
model = tf.keras.Sequential()

# Then add dense layer with relu activation
model.add(tf.keras.layers.Dense(units = 256,
                                input_shape = (vect.get_feature_names_out().shape[0],),
                                activation = 'relu'))
# And end with an output layer with sigmoid activation 
model.add(tf.keras.layers.Dense(5, activation = 'softmax'))

# Finally, compile the model
model.compile('adam', 'sparse_categorical_crossentropy', metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3) 

In [11]:
history = model.fit(X_train_vec, y_train,
                    validation_data = (X_val_vec, y_val),
                    epochs=100, callbacks=[callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


In [12]:
# n.argmax axis = 1
y_pred = np.argmax(model.predict(X_test_vec), axis = 1)

confusion_matrix(y_test, y_pred) 



array([[ 219,    9,    2,  163,    3],
       [  26,   38,    1,   10,    0],
       [   5,    0,   44,    8,    0],
       [  80,    1,    0, 1258,    7],
       [   2,    0,    0,   14,   68]])

In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.55      0.60       396
           1       0.79      0.51      0.62        75
           2       0.94      0.77      0.85        57
           3       0.87      0.93      0.90      1346
           4       0.87      0.81      0.84        84

    accuracy                           0.83      1958
   macro avg       0.83      0.72      0.76      1958
weighted avg       0.82      0.83      0.82      1958



In [14]:
le.inverse_transform([0]) 

array(['Attempts to collect debt not owed'], dtype=object)

In [15]:
le.inverse_transform([3]) 

array(['Incorrect information on your report'], dtype=object)

Text Classification Using an LSTM

In [16]:
X = [[x for x in regexp_tokenize(sentence.lower(), pattern = r"[a-z]+")] for sentence in reviews['Consumer complaint narrative'].tolist()] 

In [17]:
tokens = sorted(set([x for sentence in X for x in sentence]))

token_index = {word:i + 1 for i, word in enumerate(tokens)}
index_token = {i + 1:word for i, word in enumerate(tokens)}

# We need to reserve the zero token for when we pad our sequences.
vocab_size = len(token_index) + 1

In [18]:
X = [[token_index[word] for word in sentence] for sentence in X]

In [19]:
pd.Series([len(sentence) for sentence in X]).describe()

count    7829.000000
mean       77.850556
std       103.816742
min         1.000000
25%        25.000000
50%        45.000000
75%        88.000000
max      1671.000000
dtype: float64

In [20]:
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen = 100)

In [21]:
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 4854,
       1491, 4442, 2578, 4901, 4442, 7370, 5308, 1906, 5379, 5405,  271,
       5424, 1473, 8232,  592, 7647, 7897, 6754, 6495, 5022, 1111, 7897,
       5424, 3655, 3834,  412, 3258, 6562, 4734, 7054, 1744,   81, 4167,
       3992, 3117, 7616, 1390, 5410, 5800, 1597, 6137, 4916, 3185, 8300,
       1576, 6137,  391, 1597, 6137, 1856, 5151,  670, 5512, 1958,   81,
        487], dtype=int32)

In [22]:
y = reviews['Issue']

le = LabelEncoder().fit(y)
y = le.transform(y)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, random_state = 321, stratify = y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state = 321, stratify = y_train_val)

In [23]:
lstm_model = tf.keras.models.Sequential()

lstm_model.add(tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = 128, mask_zero = True))
lstm_model.add(tf.keras.layers.LSTM(32))
lstm_model.add(tf.keras.layers.Dense(32))
lstm_model.add(tf.keras.layers.Dense(5, activation = 'softmax'))

lstm_model.compile(optimizer = 'rmsprop', loss = 'sparse_categorical_crossentropy', metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [24]:
history = lstm_model.fit(X_train, y_train,
                    validation_data = (X_val, y_val),
                    epochs=100, callbacks=[callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [25]:
y_pred = np.argmax(lstm_model.predict(X_test), axis = 1)

confusion_matrix(y_test, y_pred) 



array([[ 186,   39,    2,  166,    3],
       [  20,   43,    6,    5,    1],
       [   2,    4,   41,    3,    7],
       [  83,    5,    5, 1246,    7],
       [   3,    1,    3,   18,   59]])

In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.47      0.54       396
           1       0.47      0.57      0.51        75
           2       0.72      0.72      0.72        57
           3       0.87      0.93      0.90      1346
           4       0.77      0.70      0.73        84

    accuracy                           0.80      1958
   macro avg       0.69      0.68      0.68      1958
weighted avg       0.80      0.80      0.80      1958



LSTM layer Bidirectional

In [27]:
lstm_model = tf.keras.models.Sequential()

lstm_model.add(tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = 128, mask_zero = True))
lstm_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
lstm_model.add(tf.keras.layers.Dense(32))
lstm_model.add(tf.keras.layers.Dense(5, activation = 'softmax'))

lstm_model.compile(optimizer = 'rmsprop', loss = 'sparse_categorical_crossentropy', metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [28]:
history = lstm_model.fit(X_train, y_train,
                    validation_data = (X_val, y_val),
                    epochs=100, callbacks=[callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


In [29]:
y_pred = np.argmax(lstm_model.predict(X_test), axis = 1)

confusion_matrix(y_test, y_pred) 



array([[ 215,   24,    4,  152,    1],
       [  24,   45,    1,    5,    0],
       [   1,    5,   40,    9,    2],
       [  87,    4,    2, 1247,    6],
       [   0,    4,    2,   19,   59]])

In [30]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.54      0.59       396
           1       0.55      0.60      0.57        75
           2       0.82      0.70      0.75        57
           3       0.87      0.93      0.90      1346
           4       0.87      0.70      0.78        84

    accuracy                           0.82      1958
   macro avg       0.75      0.69      0.72      1958
weighted avg       0.81      0.82      0.81      1958

