In [1]:
import numpy as np
import pandas as pd
import cv2
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [2]:
# Load the complaint tickets into a DataFrame and preview the first eight rows.
file = pd.read_csv("Complaints_Tickets.csv")
file.head(n=8)

Unnamed: 0,Complaints,Product
0,Communication tactics,Debt collection
1,Cont'd attempts collect debt not owed,Debt collection
2,"Application, originator, mortgage broker",Mortgage
3,Other,Credit card
4,Cont'd attempts collect debt not owed,Debt collection
5,Communication tactics,Debt collection
6,Managing the loan or lease,Consumer loan
7,Communication tactics,Debt collection


In [3]:
# Class distribution: number of complaints per product category.
file.loc[:, "Product"].value_counts()

Debt collection            7494
Mortgage                   6612
Credit reporting           5717
Credit card                2830
Bank account or service    2609
Consumer loan              1314
Student loan                785
Payday loan                 348
Money transfers             232
Prepaid card                175
Other financial service      40
Name: Product, dtype: int64

In [4]:
# 1. Text normalization
# Start by inspecting column dtypes and non-null counts; the recorded output
# shows "Complaints" has two fewer non-null values than there are rows.
file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28156 entries, 0 to 28155
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Complaints  28154 non-null  object
 1   Product     28156 non-null  object
dtypes: object(2)
memory usage: 440.1+ KB


In [5]:
# Replace missing complaint texts with an empty string so every value is a str.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the file["Complaints"] selection is a chained in-place update, which under
# pandas Copy-on-Write only modifies a temporary object and leaves `file` as is.
file["Complaints"] = file["Complaints"].fillna("")
# Lower-cased working copy; the original text stays available in "Complaints".
file["clean_complaints"] = file["Complaints"].str.lower()
# Spot-check one row that contains punctuation.
file.loc[2, "clean_complaints"]

'application, originator, mortgage broker'

In [6]:
# Replace every character that is not a letter or digit with a single space.
file["clean_complaints"] = file["clean_complaints"].map(
    lambda text: re.sub("[^A-Za-z0-9]", " ", text)
)
file.loc[2, "clean_complaints"]

'application  originator  mortgage broker'

In [7]:
# Collapse runs of whitespace into one space, then trim leading and trailing
# whitespace, in a single pass over the column.
file["clean_complaints"] = file["clean_complaints"].map(
    lambda text: re.sub(r"\s+", " ", text).strip()
)

In [8]:
# Spot-check the same row after whitespace normalization.
file.loc[2, "clean_complaints"]

'application originator mortgage broker'

In [9]:
# Split each cleaned complaint into a list of word tokens.
file["clean_complaints"] = file["clean_complaints"].map(word_tokenize)
file.loc[0, "clean_complaints"]

['communication', 'tactics']

In [10]:
# Remove English stop words from every token list.
stop_words = stopwords.words("english")
# Membership is tested once per token, so look tokens up in a frozenset
# instead of scanning the stop-word list each time. `stop_words` itself is
# left as the list returned by NLTK.
stop_word_lookup = frozenset(stop_words)
file["clean_complaints"] = file["clean_complaints"].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_lookup]
)
file["clean_complaints"]

0                            [communication, tactics]
1               [cont, attempts, collect, debt, owed]
2         [application, originator, mortgage, broker]
3                                                  []
4               [cont, attempts, collect, debt, owed]
                             ...                     
28151           [cont, attempts, collect, debt, owed]
28152                           [taking, loan, lease]
28153    [loan, servicing, payments, escrow, account]
28154                  [stop, charges, bank, account]
28155                           [transaction, issues]
Name: clean_complaints, Length: 28156, dtype: object

In [11]:
# Reduce every token to its WordNet lemma (e.g. "tactics" -> "tactic" in the
# output recorded further down).
from nltk.stem import WordNetLemmatizer

lemmatize = WordNetLemmatizer()
file["clean_complaints"] = file["clean_complaints"].map(
    lambda tokens: [lemmatize.lemmatize(token) for token in tokens]
)

In [12]:
# Join each token list back into one space-separated string.
file["clean_complaints"] = file["clean_complaints"].map(
    lambda tokens: " ".join(str(token) for token in tokens)
)
file["clean_complaints"]

0                          communication tactic
1                cont attempt collect debt owed
2        application originator mortgage broker
3                                              
4                cont attempt collect debt owed
                          ...                  
28151            cont attempt collect debt owed
28152                         taking loan lease
28153     loan servicing payment escrow account
28154                  stop charge bank account
28155                         transaction issue
Name: clean_complaints, Length: 28156, dtype: object

###### Text Vectorization using TF-IDF

In [13]:
# Hold out 15% of the cleaned complaints as a test set (fixed seed), then fit
# the TF-IDF vocabulary on the training texts only and reuse it for the test texts.
X_train, X_test, y_train, y_test = train_test_split(
    file["clean_complaints"],
    file["Product"],
    test_size=0.15,
    random_state=2,
)
tfidf = TfidfVectorizer(use_idf=True)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

In [15]:
# Show the five highest TF-IDF weights of the first training document,
# labelled by their vocabulary terms.
(
    pd.DataFrame(
        X_train_vec[0].T.todense(),
        index=tfidf.get_feature_names_out(),
        columns=["TFIDF"],
    )
    .sort_values(by="TFIDF", ascending=False)
    .head(5)
)

Unnamed: 0,TFIDF
false,0.582563
representation,0.582563
statement,0.566782
problem,0.0
owed,0.0


##### Using Deep Learning for classification

In [19]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, LSTM, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping


In [21]:
# Step 1: build a word index over the cleaned complaints.
max_tokens = 1000  # value passed as num_words; also reused as the embedding input size
tokenizer = Tokenizer(
    num_words=max_tokens,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    lower=True,
)
tokenizer.fit_on_texts(file["clean_complaints"].values)
# Size of the learned vocabulary (158 in the recorded run, well below max_tokens).
len(tokenizer.word_index)

158

In [25]:
# Turn each complaint into a sequence of token ids, then bring every sequence
# to the same fixed length so they form a single 2-D array.
pad_len = 15
# Despite its name, `padded_sent` holds the raw sequences; padding happens below.
padded_sent = tokenizer.texts_to_sequences(file["clean_complaints"].values)
X = pad_sequences(padded_sent, maxlen=pad_len)
X.shape

(28156, 15)

In [24]:
# One-hot encode the product labels: one column per product category.
# The dtype is pinned to float32 so the targets are numeric regardless of the
# installed pandas version (get_dummies defaults to bool columns from pandas
# 2.0 onwards), which is what the softmax / cross-entropy model below consumes.
y = pd.get_dummies(file["Product"], dtype="float32")
y.shape

(28156, 11)

In [26]:
# 85/15 split of the padded sequences and their one-hot labels (fixed seed).
# This rebinds X_train / X_test / y_train / y_test from the earlier TF-IDF split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=2,
)
print(X_train.shape, y_train.shape)

(23932, 15) (23932, 11)


In [27]:
# Build the classifier: token ids -> 64-d embeddings -> LSTM -> softmax.
model = Sequential(
    [
        # Embedding table sized by max_tokens; input length is the padded
        # sequence length taken from the training array.
        Embedding(max_tokens, 64, input_length=X_train.shape[1]),
        SpatialDropout1D(0.2),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        # One output unit per one-hot label column, derived from the data
        # rather than hard-coded (11 in the recorded run).
        Dense(y_train.shape[1], activation="softmax"),
    ]
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 64)            64000     
                                                                 
 spatial_dropout1d (SpatialD  (None, 15, 64)           0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 128)               98816     
                                                                 
 dense (Dense)               (None, 11)                1419      
                                                                 
Total params: 164,235
Trainable params: 164,235
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Multi-class training setup: Adam optimizer, categorical cross-entropy on the
# one-hot labels, accuracy reported as the metric.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
# Early-stopping callback watching val_loss: an epoch counts as an improvement
# only if val_loss changes by at least 0.01; patience is 3 epochs.
keras_callbacks = EarlyStopping(
    monitor="val_loss",
    patience=3,
    min_delta=0.01,
)

In [31]:
# Train for 4 epochs in batches of 64 with the early-stopping callback attached.
# NOTE: the held-out test split is also passed as validation data, so the
# val_* metrics that drive early stopping are not an independent estimate of
# generalization.
model.fit(
    X_train,
    y_train,
    epochs=4,
    batch_size=64,
    callbacks=[keras_callbacks],  # Keras documents `callbacks` as a list of callbacks
    validation_data=(X_test, y_test),
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x16aef614880>