In [1]:
# https://www.kaggle.com/willianbecker/

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout
from sklearn.model_selection import train_test_split
import re
import numpy as np 
import pandas as pd
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))

Using TensorFlow backend.


In [2]:
import pandas as pd
df = pd.read_csv("../input/us-consumer-finance-complaints/consumer_complaints.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555957 entries, 0 to 555956
Data columns (total 18 columns):
date_received                   555957 non-null object
product                         555957 non-null object
sub_product                     397635 non-null object
issue                           555957 non-null object
sub_issue                       212622 non-null object
consumer_complaint_narrative    66806 non-null object
company_public_response         85124 non-null object
company                         555957 non-null object
state                           551070 non-null object
zipcode                         551452 non-null object
tags                            77959 non-null object
consumer_consent_provided       123458 non-null object
submitted_via                   555957 non-null object
date_sent_to_company            555957 non-null object
company_response_to_consumer    555957 non-null object
timely_response                 555957 non-null object
consumer_dis

In [4]:
print(df["product"].value_counts())

Mortgage                   186475
Debt collection            101052
Credit reporting            91854
Credit card                 66468
Bank account or service     62563
Consumer Loan               20990
Student loan                15839
Payday loan                  3877
Money transfers              3812
Prepaid card                 2470
Other financial service       557
Name: product, dtype: int64


In [5]:
# texto do usuario
df = df[df["consumer_complaint_narrative"].isnull() == False]

In [6]:
print(df["product"].value_counts())

Debt collection            17552
Mortgage                   14919
Credit reporting           12526
Credit card                 7929
Bank account or service     5711
Consumer Loan               3678
Student loan                2128
Prepaid card                 861
Payday loan                  726
Money transfers              666
Other financial service      110
Name: product, dtype: int64


In [7]:
df.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
190126,03/19/2015,Debt collection,"Other (i.e. phone, health club, etc.)",Cont'd attempts collect debt not owed,Debt was paid,XXXX has claimed I owe them {$27.00} for XXXX ...,,"Diversified Consultants, Inc.",NY,121XX,Older American,Consent provided,Web,03/19/2015,Closed with explanation,Yes,No,1290516
190135,03/19/2015,Consumer Loan,Vehicle loan,Managing the loan or lease,,Due to inconsistencies in the amount owed that...,,M&T Bank Corporation,VA,221XX,Servicemember,Consent provided,Web,03/19/2015,Closed with explanation,Yes,No,1290492
190155,03/19/2015,Mortgage,Conventional fixed mortgage,"Loan modification,collection,foreclosure",,In XX/XX/XXXX my wages that I earned at my job...,,Wells Fargo & Company,CA,946XX,,Consent provided,Web,03/19/2015,Closed with explanation,Yes,Yes,1290524
190207,03/19/2015,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,I have an open and current mortgage with Chase...,,JPMorgan Chase & Co.,CA,900XX,Older American,Consent provided,Web,03/19/2015,Closed with explanation,Yes,Yes,1290253
190208,03/19/2015,Mortgage,Conventional fixed mortgage,Credit decision / Underwriting,,XXXX was submitted XX/XX/XXXX. At the time I s...,,Rushmore Loan Management Services LLC,CA,956XX,Older American,Consent provided,Web,03/19/2015,Closed with explanation,Yes,Yes,1292137


In [8]:
# realiza a limpeza nos dados (lowecase, remocao de caracteres e stopwords)
remove_caracteres = re.compile('[^0-9a-z #+_]')
replace_espaco = re.compile('[/(){}\[\]\|@,;]')
df = df.reset_index(drop=True)

def pre_processamento(text):
    text = text.lower()
    text = remove_caracteres.sub('', text)
    text = replace_espaco.sub(' ', text)
    text = text.replace('x', '')
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

df['consumer_complaint_narrative'] = df['consumer_complaint_narrative'].apply(pre_processamento)

In [9]:
n_max_palavras = 5000
tamanho_maximo_sent = 250
embedding_dimensions = 100

tokenizer = Tokenizer(num_words=n_max_palavras, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['consumer_complaint_narrative'].values)
word_index = tokenizer.word_index
print(' %s tokens unicos.' % len(word_index))

 64786 tokens unicos.


In [11]:
X = tokenizer.texts_to_sequences(df['consumer_complaint_narrative'].values)
X = pad_sequences(X, maxlen=tamanho_maximo_sent)
print("shape X", X.shape)

shape X (66806, 250)


In [12]:
Y = pd.get_dummies(df["product"]).values
print("shape Y", Y.shape)

shape Y (66806, 11)


In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 42)
print(len(X_train))
print(len(X_test))

53444
13362


In [15]:
model = Sequential()
model.add(Embedding(n_max_palavras, embedding_dimensions, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(11, activation="softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 100)          500000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 250, 100)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 11)                1111      
Total params: 581,511
Trainable params: 581,511
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
epochs = 2
batch_size = 512

model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 48099 samples, validate on 5345 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7f010ed9d240>