In [1]:
import theano
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)

import pandas as pd
# Load the complaints dataset from the working directory. The preview below
# shows a free-text 'Consumer complaint narrative' column and a 'Product'
# label column, plus an 'Unnamed: 0' column.
df = pd.read_csv('complaints.csv')
df.head()

Using gpu device 0: GeForce GTX 1060 3GB (CNMeM is disabled, cuDNN not available)
Using Theano backend.


Unnamed: 0,Consumer complaint narrative,Product
0,Received Capital One charge card offer XXXX. A...,Credit card
1,I do n't know how they got my cell number. I t...,Debt collection
2,I 'm a longtime member of Charter One Bank/RBS...,Credit card
3,"After looking at my credit report, I saw a col...",Credit reporting
4,I received a call from a XXXX XXXX from XXXX @...,Debt collection


In [2]:
import re

# Compiled once instead of building two tokenizers on every call. Raw strings
# avoid the invalid-escape warning that '\w' / '\d' trigger in plain literals.
_WORD_PATTERN = re.compile(r'\w+', re.UNICODE)
_ALL_DIGITS_PATTERN = re.compile(r'\d+\Z', re.UNICODE)


def complaint_to_words(comp):
    """Split a complaint narrative into lower-cased word tokens.

    Tokens are maximal runs of word characters (letters, digits, underscore).
    Tokens made up solely of digits are dropped; mixed tokens such as
    ``abc123`` are kept.

    Parameters
    ----------
    comp : str
        Complaint text. ``None`` or a float NaN (a missing value) is treated
        as an empty narrative.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order.
    """
    # Missing narratives have no tokens; NaN is the only float that is not
    # equal to itself.
    if comp is None or (isinstance(comp, float) and comp != comp):
        return []

    # A single pass replaces the original "w not in num" list scan, which was
    # quadratic in the number of tokens for number-heavy complaints.
    return [
        token.lower()
        for token in _WORD_PATTERN.findall(comp)
        if not _ALL_DIGITS_PATTERN.match(token)
    ]

In [3]:
# Flatten the tokens of every complaint into one corpus-wide list.
all_words = [
    token
    for narrative in df['Consumer complaint narrative']
    for token in complaint_to_words(narrative)
]

In [4]:
# (number of distinct tokens, total number of tokens) across all complaints.
len(set(all_words)), len(all_words)

(62943, 19974942)

In [5]:
# Give every distinct token an integer id. The vocabulary is sorted first
# because iterating a bare set of strings can yield a different order on each
# interpreter run (string hash randomization), which would change every id
# and make trained weights non-reproducible.
# NOTE(review): ids start at 0, which is also the value pad_sequences uses for
# padding by default, so the token with id 0 cannot be told apart from
# padding. Shifting ids to start at 1 would also require a larger Embedding
# input_dim, so it is left unchanged here.
index_dict = {word: idx for idx, word in enumerate(sorted(set(all_words)))}

# The original loop left the vocabulary size in `count`; keep that name bound.
count = len(index_dict)

In [6]:
# Encode each complaint as the list of integer ids of its tokens.
data_list = [
    [index_dict[token] for token in complaint_to_words(narrative)]
    for narrative in df['Consumer complaint narrative']
]

In [7]:
from sklearn import preprocessing
# Learn the product -> integer id mapping and apply it in a single step.
le = preprocessing.LabelEncoder()
df['Target'] = le.fit_transform(df['Product'])
df.head()

Unnamed: 0,Consumer complaint narrative,Product,Target
0,Received Capital One charge card offer XXXX. A...,Credit card,2
1,I do n't know how they got my cell number. I t...,Debt collection,4
2,I 'm a longtime member of Charter One Bank/RBS...,Credit card,2
3,"After looking at my credit report, I saw a col...",Credit reporting,3
4,I received a call from a XXXX XXXX from XXXX @...,Debt collection,4


In [8]:
from keras.utils.np_utils import to_categorical
# One-hot encode the integer class ids held in df['Target'].
y_binary = to_categorical(df['Target'].values)
# Displayed shape below is (number of complaints, number of classes).
y_binary.shape

(105504, 12)

In [9]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and later
# removed; prefer its replacement and fall back only on older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# The per-complaint id lists have different lengths, so the array is built
# with an explicit object dtype: recent numpy versions refuse to create a
# ragged array implicitly, while older ones produced the same object array.
X_train, X_test, y_train, y_test = train_test_split(
    numpy.array(data_list, dtype=object), y_binary,
    test_size=0.4, random_state=0)



In [10]:
# Check the container types returned by the split.
type(X_train), type(X_test)

(numpy.ndarray, numpy.ndarray)

In [11]:
# Token counts of the first three training complaints before padding.
len(X_train[0]), len(X_train[1]), len(X_train[2])

(28, 327, 433)

In [12]:
# Bring every sequence in both splits to the same fixed length:
# longer sequences are truncated, shorter ones are padded.
max_review_length = 750
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_review_length)
    for split in (X_train, X_test)
)

In [13]:
# After padding, the first three training sequences all have length max_review_length.
len(X_train[0]), len(X_train[1]), len(X_train[2])

(750, 750, 750)

In [14]:
from keras.layers import Dropout
from keras.layers import GRU

In [None]:
# create the model (LSTM variant)
# Vocabulary size comes from the word index rather than a hard-coded literal,
# so the Embedding stays in sync if the corpus or the tokenizer changes.
top_words = len(index_dict)
embedding_vecor_length = 100
# One output unit per one-hot column of the target matrix.
num_classes = y_train.shape[1]

model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Dropout(0.2))
# dropout_W / dropout_U are the Keras 1.x names for dropout on the input and
# recurrent connections respectively.
model.add(LSTM(30, dropout_W=0.2, dropout_U=0.2))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line.
model.summary()

# NOTE(review): the test split is also used as validation data here, so the
# later evaluation on X_test is not a fully held-out estimate.
# Keeping the History object avoids dumping its repr as the cell output and
# leaves the per-epoch metrics available for inspection.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=1, batch_size=64)

In [15]:
# create the model (GRU variant)
# Vocabulary size comes from the word index rather than a hard-coded literal,
# so the Embedding stays in sync if the corpus or the tokenizer changes.
top_words = len(index_dict)
embedding_vecor_length = 100
# One output unit per one-hot column of the target matrix.
num_classes = y_train.shape[1]

model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Dropout(0.2))
# dropout_W / dropout_U are the Keras 1.x names for dropout on the input and
# recurrent connections respectively.
model.add(GRU(32, dropout_W=0.2, dropout_U=0.2))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line.
model.summary()

# NOTE(review): the test split is also used as validation data here, so the
# later evaluation on X_test is not a fully held-out estimate.
# Keeping the History object avoids dumping its repr as the cell output and
# leaves the per-epoch metrics available for inspection.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=20, batch_size=128)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 750, 100)      6294300     embedding_input_1[0][0]          
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 750, 100)      0           embedding_1[0][0]                
____________________________________________________________________________________________________
gru_1 (GRU)                      (None, 32)            12768       dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 32)            0           gru_1[0][0]                      
___________________________________________________________________________________________

<keras.callbacks.History at 0x7f6787869c50>

In [16]:
# Score the trained model on the test split; with metrics=['accuracy'] the
# result is [loss, accuracy], so index 1 is the accuracy.
scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = scores[1]
print("Accuracy: %.2f%%" % (test_accuracy * 100))

Accuracy: 83.06%


In [17]:
# NOTE(review): this rebinds `scores` (previously the evaluate() result) to the
# model's predictions for X_test; the next cell takes argmax over axis 1 of it.
scores = model.predict(X_test, verbose=0)

In [18]:
# Predicted class id = position of the highest score in each row.
pred = numpy.argmax(scores, axis=1)
# y_test is overwritten in place with integer class ids. Only collapse it
# while it is still the 2-D one-hot matrix, so re-running this cell does not
# fail on an already-converted 1-D array.
if numpy.ndim(y_test) > 1:
    y_test = numpy.argmax(y_test, axis=1)

In [19]:
# Distinct integer class ids present in the Target column.
df.Target.unique()

array([ 2,  4,  3, 10,  6,  8,  0,  1,  5,  9,  7, 11])

In [20]:
# Map each integer class id back to its product name; le.classes_ is ordered
# by id, so enumerating it yields the (id, name) pairs directly.
mapping = dict(enumerate(le.classes_))
mapping

{0: 'Bank account or service',
 1: 'Consumer Loan',
 2: 'Credit card',
 3: 'Credit reporting',
 4: 'Debt collection',
 5: 'Money transfers',
 6: 'Mortgage',
 7: 'Other financial service',
 8: 'Payday loan',
 9: 'Prepaid card',
 10: 'Student loan',
 11: 'Virtual currency'}

In [21]:
from sklearn.metrics import classification_report
# Pass the class ids explicitly so every row name stays aligned with its id
# even when a rare class is missing from both y_test and pred; without
# `labels`, the report only covers ids that actually occur and the 12 names
# would no longer match them.
class_ids = list(range(len(le.classes_)))
target_names = list(le.classes_)
print(classification_report(y_test, pred, labels=class_ids, target_names=target_names))

                         precision    recall  f1-score   support

Bank account or service       0.76      0.74      0.75      3719
          Consumer Loan       0.68      0.64      0.66      2348
            Credit card       0.77      0.79      0.78      4954
       Credit reporting       0.87      0.88      0.88      8139
        Debt collection       0.83      0.83      0.83     10753
        Money transfers       0.66      0.53      0.59       377
               Mortgage       0.91      0.94      0.93      9217
Other financial service       0.00      0.00      0.00        71
            Payday loan       0.47      0.42      0.44       440
           Prepaid card       0.71      0.70      0.71       429
           Student loan       0.85      0.84      0.85      1751
       Virtual currency       0.00      0.00      0.00         4

            avg / total       0.83      0.83      0.83     42202



  'precision', 'predicted', average, warn_for)
