In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [3]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer,LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense,Activation,Dropout
from keras.preprocessing import text, sequence
from keras import utils

print(tf.__version__)

2.11.0


In [4]:
df=pd.read_csv('complaints_processed.csv',encoding='latin=1')
df.head()

Unnamed: 0.1,Unnamed: 0,product,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...


In [5]:
df

Unnamed: 0.1,Unnamed: 0,product,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...
...,...,...,...
162416,162416,debt_collection,name
162417,162417,credit_card,name
162418,162418,debt_collection,name
162419,162419,credit_card,name


In [6]:
df.isnull().sum()

Unnamed: 0     0
product        0
narrative     10
dtype: int64

In [7]:
col=['narrative','product']
df=df[col]
df=df[pd.notnull(df['narrative'])]
df.head()

Unnamed: 0,narrative,product
0,purchase order day shipping amount receive pro...,credit_card
1,forwarded message date tue subject please inve...,credit_card
2,forwarded message cc sent friday pdt subject f...,retail_banking
3,payment history missing credit report speciali...,credit_reporting
4,payment history missing credit report made mis...,credit_reporting


In [8]:
df.isnull().sum()

narrative    0
product      0
dtype: int64

In [9]:
df['product'].value_counts()

credit_reporting       91172
debt_collection        23148
mortgages_and_loans    18990
credit_card            15566
retail_banking         13535
Name: product, dtype: int64

In [10]:
train_size=int(len(df)*.8)
print("train size:%d" % train_size)
print("test size:%d" % (len(df)-train_size))

train size:129928
test size:32483


In [11]:
train_narrative = df['narrative'][:train_size]
train_product = df['product'][:train_size]

test_narrative = df['narrative'][train_size:]
test_product = df['product'][train_size:]

In [12]:
max_words=1000
tokenize=text.Tokenizer(num_words=max_words,char_level=False)

tokenize.fit_on_texts(train_narrative)
x_train = tokenize.texts_to_matrix(train_narrative)
x_test = tokenize.texts_to_matrix(test_narrative)

In [13]:
encoder = LabelEncoder()
encoder.fit(train_product)
y_train = encoder.transform(train_product)
y_test = encoder.transform(test_product)

In [14]:
num_classes = np.max(y_train)+1
y_train = utils.to_categorical(y_train,num_classes)
y_test = utils.to_categorical(y_test,num_classes)

In [15]:
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('t_test shape:', y_test.shape)

x_train shape: (129928, 1000)
x_test shape: (32483, 1000)
y_train shape: (129928, 5)
t_test shape: (32483, 5)


In [17]:
model=Sequential()
model.add(Dense(512,input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [19]:
batch_size=32
epochs=5

history=model.fit(x_train,y_train,batch_size=batch_size,verbose=1,validation_split=0.1)



In [20]:
score=model.evaluate(x_test,y_test,batch_size=batch_size,verbose=1)
print(score)
print('Test score:',score[0])
print('Test accuracy:',score[1])

[0.4095647931098938, 0.8486900925636292]
Test score: 0.4095647931098938
Test accuracy: 0.8486900925636292


In [23]:
text_labels=encoder.classes_

for i in range(10):
    prediction=model.predict(np.array([x_test[i]]))
    predicted_label=text_labels[np.argmax(prediction)]
    print(test_narrative.iloc[i][:50],"...")
    print('Actual label:' +test_product.iloc[i])
    print("Predicted label:" + predicted_label +"\n")

following copy email message sent yesterday top ex ...
Actual label:credit_card
Predicted label:credit_reporting

following detailed account distress frustration we ...
Actual label:mortgages_and_loans
Predicted label:mortgages_and_loans

following list event occurred national credit syst ...
Actual label:debt_collection
Predicted label:debt_collection

following list credit card sychrony bank follows c ...
Actual label:credit_card
Predicted label:credit_card

following short summary complaint filed today flor ...
Actual label:credit_reporting
Predicted label:credit_reporting

following timeline event detailing issue complaint ...
Actual label:credit_reporting
Predicted label:credit_reporting

following timeline event detailing issue complaint ...
Actual label:credit_reporting
Predicted label:credit_reporting

following timeline event detailing issue complaint ...
Actual label:credit_reporting
Predicted label:credit_reporting

following incorrect personal information please re ...
Actu