In [11]:
import pandas as pd
import numpy as np
from tensorflow import keras

In [12]:
df = pd.read_csv("consumer_complaints.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [13]:
df.columns

Index(['date_received', 'product', 'sub_product', 'issue', 'sub_issue',
       'consumer_complaint_narrative', 'company_public_response', 'company',
       'state', 'zipcode', 'tags', 'consumer_consent_provided',
       'submitted_via', 'date_sent_to_company', 'company_response_to_consumer',
       'timely_response', 'consumer_disputed?', 'complaint_id'],
      dtype='object')

In [14]:
df = df[['product','consumer_complaint_narrative']]

In [15]:
df.shape

(555957, 2)

In [16]:
df.dropna(axis=0,inplace=True)

In [17]:
df.shape

(66806, 2)

In [18]:
df['product'].value_counts()

Debt collection            17552
Mortgage                   14919
Credit reporting           12526
Credit card                 7929
Bank account or service     5711
Consumer Loan               3678
Student loan                2128
Prepaid card                 861
Payday loan                  726
Money transfers              666
Other financial service      110
Name: product, dtype: int64

In [19]:
df['consumer_complaint_narrative']

190126    XXXX has claimed I owe them {$27.00} for XXXX ...
190135    Due to inconsistencies in the amount owed that...
190155    In XX/XX/XXXX my wages that I earned at my job...
190207    I have an open and current mortgage with Chase...
190208    XXXX was submitted XX/XX/XXXX. At the time I s...
                                ...                        
553084    XXXX XXXX is reporting incorrectly, payments h...
553085    Reflecting incorrect payment status. Have been...
553086    I have been paying {$180.00} a month through d...
553090    I recently became aware that Amerisave Mortgag...
553096    Bank of America has demonstrated an on-going l...
Name: consumer_complaint_narrative, Length: 66806, dtype: object

In [13]:
# !pip install spacy
# !pip install nltk

In [1]:
import re
import spacy

In [5]:
# !python -m spacy download en_core_web_sm

In [6]:
spacy_nlp = spacy.load('en_core_web_sm')

In [51]:
doc = spacy_nlp(df['consumer_complaint_narrative'][190126])
tokens = ' '.join([token.lemma_ for token in doc if not token.is_stop])

In [52]:
tokens

'XXXX claim owe { $ 27.00 } XXXX year despite proof PAYMENT send : cancel check ownpaid INVOICE { $ 27.00 } ! \n continue insist owe collection agency . \n stop harassment bill pay year ago ? \n'

In [31]:
df['consumer_complaint_narrative'][190126]

'XXXX has claimed I owe them {$27.00} for XXXX years despite the PROOF of PAYMENT I sent them : canceled check and their ownPAID INVOICE for {$27.00}! \nThey continue to insist I owe them and collection agencies are after me. \nHow can I stop this harassment for a bill I already paid four years ago? \n'

In [34]:

' '.join([token.lemma_ for token in doc])

'XXXX have claim I owe they { $ 27.00 } for XXXX year despite the proof of PAYMENT I send they : cancel check and their ownpaid INVOICE for { $ 27.00 } ! \n they continue to insist I owe they and collection agency be after I . \n how can I stop this harassment for a bill I already pay four year ago ? \n'

In [35]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

In [36]:
REPLACE_BY_SPACE_RE

re.compile(r'[/(){}\[\]\|@,;]', re.UNICODE)

In [37]:
mytext = df['consumer_complaint_narrative'][190126]

In [39]:
re.sub(REPLACE_BY_SPACE_RE, "", string=mytext)

'XXXX has claimed I owe them $27.00 for XXXX years despite the PROOF of PAYMENT I sent them : canceled check and their ownPAID INVOICE for $27.00! \nThey continue to insist I owe them and collection agencies are after me. \nHow can I stop this harassment for a bill I already paid four years ago? \n'

In [56]:
def clean_text(text):
    text = text.lower()
    text = re.sub(REPLACE_BY_SPACE_RE, "", string=text)
    text = re.sub(BAD_SYMBOLS_RE,"", string=text)
    text = text.replace("x","")
    doc = spacy_nlp(text)
    lemmatized_no_stopwords = ' '.join([token.lemma_ for token in doc if not token.is_stop])
    
    return lemmatized_no_stopwords

In [57]:
df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].apply(clean_text)
df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].str.replace('\d+', '')

  df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].str.replace('\d+', '')


In [58]:
df.to_csv("consumer_complaints_preprocessed.csv",index=False)

In [60]:
df.head()

Unnamed: 0,product,consumer_complaint_narrative
190126,Debt collection,claim owe year despite proof payment send...
190135,Consumer Loan,inconsistency owe tell m t bank report credi...
190155,Mortgage,wage earn job decrease half know trouble h...
190207,Mortgage,open current mortgage chase bank # chase rep...
190208,Mortgage,submit time submit complaint deal rushmore...


# Apply LSTM model

In [3]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp39-cp39-win_amd64.whl (6.9 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Collecting scipy>=0.19.1
  Downloading scipy-1.7.1-cp39-cp39-win_amd64.whl (33.8 MB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-0.24.2 scipy-1.7.1 threadpoolctl-2.2.0


In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df = pd.read_csv("consumer_complaints_preprocessed.csv")

In [3]:
df[df['consumer_complaint_narrative'].isnull()]

Unnamed: 0,product,consumer_complaint_narrative
19849,Debt collection,
45598,Debt collection,
58667,Debt collection,


In [4]:
df.isnull().sum()

product                         0
consumer_complaint_narrative    3
dtype: int64

In [5]:
df.dropna(inplace=True,axis=0)

In [6]:
num_vocab = 5000
tokenizer = Tokenizer(num_words = num_vocab,lower=True, oov_token='OOV')
tokenizer.fit_on_texts(df['consumer_complaint_narrative'].values)
word_index = tokenizer.word_index
print(len(word_index))

53427


In [7]:
X = tokenizer.texts_to_sequences(df['consumer_complaint_narrative'].values)

In [8]:
check = X[0]
sentence = ' '
for c in check:
    word = list(word_index.keys())[list(word_index.values()).index(c)]
    sentence = sentence + ' ' + word
print(sentence)
# check for OOV

  claim owe year despite proof payment send cancel check OOV invoice continue insist owe collection agency stop harassment bill pay year ago


In [9]:
df['consumer_complaint_narrative'][0] #raw value

'  claim owe    year despite proof payment send   cancel check ownpaid invoice  continue insist owe collection agency stop harassment bill pay year ago'

In [10]:
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 100

X = pad_sequences(X,maxlen=MAX_SEQUENCE_LENGTH)

In [11]:
y = pd.get_dummies(df['product']).values

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.20, random_state = 42, stratify=y)

In [13]:
from tensorflow.keras.layers import SpatialDropout1D, Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Optimizer

In [14]:
X_train.shape

(53442, 250)

In [15]:
y_train.shape

(53442, 11)

In [16]:
model = Sequential()
model.add(Embedding(num_vocab, EMBEDDING_DIM, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.1))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64


In [17]:
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
model.save("keyword_model.h5")

In [20]:
import pickle
with open("tokenizer.pkl","wb") as myfile:
    pickle.dump(tokenizer, myfile)

In [21]:
with open("tokenizer.pkl","rb") as r:
    t = pickle.load(r)

In [23]:
t.texts_to_sequences(df['consumer_complaint_narrative'][0])

[[],
 [],
 [825],
 [1363],
 [2701],
 [2038],
 [109],
 [],
 [1778],
 [1812],
 [646],
 [],
 [],
 [],
 [],
 [2749],
 [646],
 [2701],
 [1997],
 [],
 [240],
 [646],
 [23],
 [1811],
 [2038],
 [452],
 [646],
 [],
 [1811],
 [1997],
 [1778],
 [1778],
 [1628],
 [],
 [1811],
 [2701],
 [2749],
 [109],
 [646],
 [1731],
 [452],
 [],
 [23],
 [646],
 [1731],
 [240],
 [],
 [],
 [],
 [825],
 [2701],
 [1731],
 [825],
 [646],
 [1363],
 [],
 [825],
 [3046],
 [646],
 [825],
 [1710],
 [],
 [1778],
 [1812],
 [1731],
 [1811],
 [2701],
 [2038],
 [240],
 [],
 [2038],
 [1731],
 [2004],
 [1778],
 [2038],
 [825],
 [646],
 [],
 [],
 [825],
 [1778],
 [1731],
 [452],
 [2038],
 [1731],
 [1760],
 [646],
 [],
 [2038],
 [1731],
 [23],
 [2038],
 [23],
 [452],
 [],
 [1778],
 [1812],
 [646],
 [],
 [825],
 [1778],
 [1363],
 [1363],
 [646],
 [825],
 [452],
 [2038],
 [1778],
 [1731],
 [],
 [2701],
 [1482],
 [646],
 [1731],
 [825],
 [2749],
 [],
 [23],
 [452],
 [1778],
 [1811],
 [],
 [3046],
 [2701],
 [1997],
 [2701],
 [23],
 [2

In [24]:
new_complaint = ['I am a victim of identity theft and someone stole my identity and personal information to open up a Visa credit card account with Bank of America. The following Bank of America Visa credit card account do not belong to me : XXXX.']
seq = t.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['Credit reporting, credit repair services, or other personal consumer reports', 'Debt collection', 'Mortgage', 'Credit card or prepaid card', 'Student loan', 'Bank account or service', 'Checking or savings account', 'Consumer Loan', 'Payday loan, title loan, or personal loan', 'Vehicle loan or lease', 'Money transfer, virtual currency, or money service', 'Money transfers', 'Prepaid card']
print(pred, labels[np.argmax(pred)])

[[6.0859401e-02 5.8444473e-03 8.7901622e-01 8.9431554e-03 1.8662320e-02
  5.8058678e-04 3.0937444e-03 5.5169128e-04 1.4589055e-04 2.2226304e-02
  7.6301374e-05]] Mortgage


In [25]:
MAX_SEQUENCE_LENGTH

250

In [33]:
y_train.shape[1]

11

In [32]:
len(pd.get_dummies(df['product']).columns)

11

In [31]:
df['consumer_complaint_narrative'][0]

'  claim owe    year despite proof payment send   cancel check ownpaid invoice  continue insist owe collection agency stop harassment bill pay year ago'