### Analysis On The Raw Complaints Text From the CFPB to Determine the Topics that Bank of America Customers Complain About Most.

In [1]:
#Dependencies
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

### Data Exploration and Cleaning

In [2]:
#Importing the csv file
file = ('./CFPB-Data.csv')
df = pd.read_csv(file, encoding="iso-8859-1")
df

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,3/4/17,Bank account or service,Checking account,"Account opening, closing, or management",,My wife and I visited the Chase Bank branch at...,,JPMORGAN CHASE & CO.,KY,423XX,,Consent provided,Web,3/4/17,Closed with explanation,Yes,Yes,2371744
1,4/12/17,Bank account or service,Checking account,Problems caused by my funds being low,,I have documentation that shows that US Bank w...,Company has responded to the consumer and the ...,U.S. BANCORP,CA,928XX,,Consent provided,Web,4/12/17,Closed with explanation,Yes,No,2431565
2,3/4/17,Credit card,,Balance transfer,,Around XX/XX/XXXX I accepted a credit card off...,,JPMORGAN CHASE & CO.,TN,376XX,Older American,Consent provided,Web,3/4/17,Closed with monetary relief,Yes,No,2371616
3,4/14/17,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,Regarding PMI : we paid up front at closing to...,Company has responded to the consumer and the ...,"WELLS FARGO BANK, NATIONAL ASSOCIATION",TX,750XX,,Consent provided,Web,4/14/17,Closed with explanation,Yes,No,2436277
4,4/8/17,Consumer Loan,Vehicle lease,Taking out the loan or lease,,I have leased a vehicle XX/XX/2015 from an aut...,Company has responded to the consumer and the ...,ALLY FINANCIAL INC.,CA,926XX,,Consent provided,Web,4/8/17,Closed with explanation,Yes,No,2426136
5,3/31/17,Mortgage,Home equity loan or line of credit,"Loan servicing, payments, escrow account",,In XX/XX/2005 I obtained a home equity mortgag...,Company has responded to the consumer and the ...,"BANK OF AMERICA, NATIONAL ASSOCIATION",MP,554XX,,Consent provided,Web,4/3/17,Closed with explanation,Yes,No,2412732
6,4/4/17,Bank account or service,Checking account,"Making/receiving payments, sending money",,I have been deprived of my income as Bank of A...,Company has responded to the consumer and the ...,"BANK OF AMERICA, NATIONAL ASSOCIATION",MA,024XX,,Consent provided,Web,4/4/17,Closed with explanation,Yes,No,2419713
7,4/5/17,Bank account or service,Savings account,"Account opening, closing, or management",,Ally Bank turned me down for deceased relative...,Company has responded to the consumer and the ...,ALLY FINANCIAL INC.,IL,600XX,,Consent provided,Web,4/5/17,Closed with explanation,Yes,No,2420825
8,3/23/17,Consumer Loan,Pawn loan,Applied for loan/did not receive money,,Several years ago I got a line of credit loan ...,Company has responded to the consumer and the ...,"WELLS FARGO BANK, NATIONAL ASSOCIATION",PA,182XX,Servicemember,Consent provided,Web,3/23/17,Closed with explanation,Yes,No,2401871
9,4/16/17,Bank account or service,Checking account,"Account opening, closing, or management",,I submitted a complaint in XX/XX/XXXX regardin...,Company has responded to the consumer and the ...,U.S. BANCORP,WI,532XX,,Consent provided,Web,4/16/17,Closed with non-monetary relief,Yes,No,2437217


In [3]:
#Checking the shape of the data
df.shape

(35897, 18)

In [4]:
# to view the columns and the corresponding data types in each column, use the enumerate function:
for index, (name, dtype) in enumerate(zip(df.columns, df.dtypes)):
    print('{}.{}({})'.format(index,name,dtype))
    

0.Date received(object)
1.Product(object)
2.Sub-product(object)
3.Issue(object)
4.Sub-issue(object)
5.Consumer complaint narrative(object)
6.Company public response(object)
7.Company(object)
8.State(object)
9.ZIP code(object)
10.Tags(object)
11.Consumer consent provided?(object)
12.Submitted via(object)
13.Date sent to company(object)
14.Company response to consumer(object)
15.Timely response?(object)
16.Consumer disputed?(object)
17.Complaint ID(int64)


In [5]:
#Renamed two columns from the data frame
df_new = df.rename(columns={'Date received':'Date_received',
                            'Consumer complaint narrative':'Consumer_complaint_narrative'})

In [6]:
df_new.head()

Unnamed: 0,Date_received,Product,Sub-product,Issue,Sub-issue,Consumer_complaint_narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,3/4/17,Bank account or service,Checking account,"Account opening, closing, or management",,My wife and I visited the Chase Bank branch at...,,JPMORGAN CHASE & CO.,KY,423XX,,Consent provided,Web,3/4/17,Closed with explanation,Yes,Yes,2371744
1,4/12/17,Bank account or service,Checking account,Problems caused by my funds being low,,I have documentation that shows that US Bank w...,Company has responded to the consumer and the ...,U.S. BANCORP,CA,928XX,,Consent provided,Web,4/12/17,Closed with explanation,Yes,No,2431565
2,3/4/17,Credit card,,Balance transfer,,Around XX/XX/XXXX I accepted a credit card off...,,JPMORGAN CHASE & CO.,TN,376XX,Older American,Consent provided,Web,3/4/17,Closed with monetary relief,Yes,No,2371616
3,4/14/17,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,Regarding PMI : we paid up front at closing to...,Company has responded to the consumer and the ...,"WELLS FARGO BANK, NATIONAL ASSOCIATION",TX,750XX,,Consent provided,Web,4/14/17,Closed with explanation,Yes,No,2436277
4,4/8/17,Consumer Loan,Vehicle lease,Taking out the loan or lease,,I have leased a vehicle XX/XX/2015 from an aut...,Company has responded to the consumer and the ...,ALLY FINANCIAL INC.,CA,926XX,,Consent provided,Web,4/8/17,Closed with explanation,Yes,No,2426136


In [7]:
#Create a subset of the dataframe with varibles of interest
df_new_sub = df_new[['Date_received','Product','Consumer_complaint_narrative','Company']]
df_new_sub

Unnamed: 0,Date_received,Product,Consumer_complaint_narrative,Company
0,3/4/17,Bank account or service,My wife and I visited the Chase Bank branch at...,JPMORGAN CHASE & CO.
1,4/12/17,Bank account or service,I have documentation that shows that US Bank w...,U.S. BANCORP
2,3/4/17,Credit card,Around XX/XX/XXXX I accepted a credit card off...,JPMORGAN CHASE & CO.
3,4/14/17,Mortgage,Regarding PMI : we paid up front at closing to...,"WELLS FARGO BANK, NATIONAL ASSOCIATION"
4,4/8/17,Consumer Loan,I have leased a vehicle XX/XX/2015 from an aut...,ALLY FINANCIAL INC.
5,3/31/17,Mortgage,In XX/XX/2005 I obtained a home equity mortgag...,"BANK OF AMERICA, NATIONAL ASSOCIATION"
6,4/4/17,Bank account or service,I have been deprived of my income as Bank of A...,"BANK OF AMERICA, NATIONAL ASSOCIATION"
7,4/5/17,Bank account or service,Ally Bank turned me down for deceased relative...,ALLY FINANCIAL INC.
8,3/23/17,Consumer Loan,Several years ago I got a line of credit loan ...,"WELLS FARGO BANK, NATIONAL ASSOCIATION"
9,4/16/17,Bank account or service,I submitted a complaint in XX/XX/XXXX regardin...,U.S. BANCORP


In [8]:
#But I'm more interested in Consumer_complaint_narrative against Bank of America
bank_of_america_file = df_new_sub[df_new_sub['Company']=='BANK OF AMERICA, NATIONAL ASSOCIATION']
bank_of_america_file.head()

Unnamed: 0,Date_received,Product,Consumer_complaint_narrative,Company
5,3/31/17,Mortgage,In XX/XX/2005 I obtained a home equity mortgag...,"BANK OF AMERICA, NATIONAL ASSOCIATION"
6,4/4/17,Bank account or service,I have been deprived of my income as Bank of A...,"BANK OF AMERICA, NATIONAL ASSOCIATION"
10,3/31/17,Other financial service,I bought a money order from XXXX XXXX to pay a...,"BANK OF AMERICA, NATIONAL ASSOCIATION"
11,4/5/17,Credit card,My business debit cards were stolen from my ma...,"BANK OF AMERICA, NATIONAL ASSOCIATION"
13,4/14/17,Bank account or service,I have opened the account of Bank of America a...,"BANK OF AMERICA, NATIONAL ASSOCIATION"


In [9]:
# To extract the Consumer_complaint_narrative (documents)
documents = bank_of_america_file['Consumer_complaint_narrative']
documents.head()

5     In XX/XX/2005 I obtained a home equity mortgag...
6     I have been deprived of my income as Bank of A...
10    I bought a money order from XXXX XXXX to pay a...
11    My business debit cards were stolen from my ma...
13    I have opened the account of Bank of America a...
Name: Consumer_complaint_narrative, dtype: object

In [10]:
#Create a list of the documents
documents_list = list(documents)

In [11]:
#Number of the list
documents_list_num = len(documents_list)
documents_list_num

7196

### Feature Extraction

In [12]:
# the sklearn has a future extraction functionality that does the heavy lifting.
no_features = 1000

In [13]:
# NMF is able to use tf-idf
#tf-idf will scale down the impact of tokens that occur very frequently in a corpus that are empirically less informative

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [14]:
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

### Fitting both the NMF and LDA Algorithms on the Text Data.

In [15]:
# both algorithms cannot determine the number of topics, so the value must be set when running the algorithm
no_topics = 10

In [16]:
# NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
nmf

NMF(alpha=0.1, beta=1, eta=0.1, init='nndsvd', l1_ratio=0.5, max_iter=200,
  n_components=10, nls_max_iter=2000, random_state=1, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [17]:
# LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
lda

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=10, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

### Create a Method to Display the Topics that Bank of America Customers Comlain About Most

In [18]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]])
)

In [19]:
no_top_words = 10

In [20]:
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
xxxx bank america 2015 00 2016 debt property received letter
Topic 1:
told called said did money phone asked time number xxxx
Topic 2:
xx xxxx 2016 2017 received 00 bankruptcy letter bofa report
Topic 3:
loan mortgage modification home bank america property foreclosure sale documents
Topic 4:
credit card report america reporting score cards bank charge debt
Topic 5:
00 fee overdraft fees charged charge balance 35 charges account
Topic 6:
boa xxxx account cfpb funds years complaint transfer rep mortgage
Topic 7:
account bank america closed checking accounts open money opened close
Topic 8:
payment late payments pay month paid make mortgage 00 minimum
Topic 9:
check deposited deposit funds checks bank cash money cleared cashed


In [21]:
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
xxxx bank america sale property mortgage foreclosure loan documents home
Topic 1:
credit bank america report account card information reporting debt reported
Topic 2:
xxxx bank account america told check called did money xx
Topic 3:
card charges xxxx 00 charge bank credit transaction america debit
Topic 4:
loan bank america xxxx mortgage home modification time years told
Topic 5:
checks insurance 00 escrow paid taxes tax company bank pay
Topic 6:
xxxx xx bofa bank america 00 mortgage number complaint case
Topic 7:
payment 00 payments late balance pay credit paid month bank
Topic 8:
boa mortgage program bankruptcy hamp attorney payments appeal filed banks
Topic 9:
account bank xxxx 00 america fee fees checking charged money


### Create Dataframe for Both Results and Compare The First 10 Topics

In [22]:
##Dataframe for the NMF Algorithm results
data_nmf = {
    'topics':['topic 0','topic 1','topic 2','topic 3','topic 4','topic 5','topic 6','topic 7','topic 8','topic 9'],
    'documents_nmf':['xxxx 2015 00 bank wells fargo received court debt letter',
                'account checking bank wells fargo accounts closed money opened funds',
                'xx xxxx 2016 2017 received 00 letter wf date dated',
                'wells mortgage fargo home modification foreclosure sale property documents house',
                'credit card report wells fargo reporting score debt balance cards',
            'payment payments late month mortgage monthly pay make paid escrow',
                '00 fees overdraft fee charged balance charge 35 charges transactions',
                'told said called did money asked phone bank time just',
                'check funds deposited deposit hold checks bank wells fargo cashed',
                'loan modification rate loans income years student payments refinance application']
    
}
df_topics_nmf = pd.DataFrame(data_nmf, columns=['topics','documents_nmf'])
df_topics_nmf

Unnamed: 0,topics,documents_nmf
0,topic 0,xxxx 2015 00 bank wells fargo received court d...
1,topic 1,account checking bank wells fargo accounts clo...
2,topic 2,xx xxxx 2016 2017 received 00 letter wf date d...
3,topic 3,wells mortgage fargo home modification foreclo...
4,topic 4,credit card report wells fargo reporting score...
5,topic 5,payment payments late month mortgage monthly p...
6,topic 6,00 fees overdraft fee charged balance charge 3...
7,topic 7,told said called did money asked phone bank ti...
8,topic 8,check funds deposited deposit hold checks bank...
9,topic 9,loan modification rate loans income years stud...


In [23]:
# Dataframe for the LDA Algorithm results
data_lda = {
    'topics':['topic 0','topic 1','topic 2','topic 3','topic 4','topic 5','topic 6','topic 7','topic 8','topic 9'],
    'documents_lda':['00 xxxx payment payments wells balance fargo month paid pay',
                'xxxx wells fargo mortgage loan property home bank documents foreclosure',
                'xxxx loan wells home fargo modification mortgage help time told',
                'xx xxxx sale short offer date house mortgage property received',
                'account fargo wells credit debt accounts report information reporting xxxx',
                'wf xxxx late fargo insurance wells fees services car pay',
                'xxxx wells fargo received letter sent number bank 2015 called',
                'account wells fargo bank check money xxxx funds 00 checking',
                'loan wells fargo credit xxxx mortgage rate payments years loans',
                'card xxxx told credit wells fargo did account called said']
    
}
df_topics_lda = pd.DataFrame(data_lda, columns=['topics','documents_lda'])
df_topics_lda

Unnamed: 0,topics,documents_lda
0,topic 0,00 xxxx payment payments wells balance fargo m...
1,topic 1,xxxx wells fargo mortgage loan property home b...
2,topic 2,xxxx loan wells home fargo modification mortga...
3,topic 3,xx xxxx sale short offer date house mortgage p...
4,topic 4,account fargo wells credit debt accounts repor...
5,topic 5,wf xxxx late fargo insurance wells fees servic...
6,topic 6,xxxx wells fargo received letter sent number b...
7,topic 7,account wells fargo bank check money xxxx fund...
8,topic 8,loan wells fargo credit xxxx mortgage rate pay...
9,topic 9,card xxxx told credit wells fargo did account ...


In [24]:
#Merging both dataframes
nmf_lda_merge = pd.merge(df_topics_nmf,df_topics_lda, on='topics')
nmf_lda_merge

Unnamed: 0,topics,documents_nmf,documents_lda
0,topic 0,xxxx 2015 00 bank wells fargo received court d...,00 xxxx payment payments wells balance fargo m...
1,topic 1,account checking bank wells fargo accounts clo...,xxxx wells fargo mortgage loan property home b...
2,topic 2,xx xxxx 2016 2017 received 00 letter wf date d...,xxxx loan wells home fargo modification mortga...
3,topic 3,wells mortgage fargo home modification foreclo...,xx xxxx sale short offer date house mortgage p...
4,topic 4,credit card report wells fargo reporting score...,account fargo wells credit debt accounts repor...
5,topic 5,payment payments late month mortgage monthly p...,wf xxxx late fargo insurance wells fees servic...
6,topic 6,00 fees overdraft fee charged balance charge 3...,xxxx wells fargo received letter sent number b...
7,topic 7,told said called did money asked phone bank ti...,account wells fargo bank check money xxxx fund...
8,topic 8,check funds deposited deposit hold checks bank...,loan wells fargo credit xxxx mortgage rate pay...
9,topic 9,loan modification rate loans income years stud...,card xxxx told credit wells fargo did account ...


### To Discover Patterns in the Text Data I Applied the K-means Clustering Technique

Recall, with the tf-idf matrix above, I can run a K-means clustering algorithms to discover the hidden structure within the complaints data. 
Set initial number of clusters to 5. 

In [25]:
#Import dependencies
from sklearn.cluster import KMeans

In [26]:
#Initialize number of clusters
num_clusters = 5

In [27]:
km = KMeans(n_clusters=num_clusters)

In [28]:
#recall
tfidf = tfidf_vectorizer.fit_transform(documents)

In [29]:
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

In [30]:
km.fit(tfidf_matrix)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [31]:
clusters = km.labels_.tolist()

### Create a Dataframe of Document and Cluster

In [51]:

topics = {'document':documents,'cluster':clusters}

frame = pd.DataFrame(topics, columns = ['document','cluster'])

frame.head()

Unnamed: 0,document,cluster
5,In XX/XX/2005 I obtained a home equity mortgag...,1
6,I have been deprived of my income as Bank of A...,0
10,I bought a money order from XXXX XXXX to pay a...,2
11,My business debit cards were stolen from my ma...,0
13,I have opened the account of Bank of America a...,0


In [33]:
#number of documents per cluster (clusters from 0 to 4)
frame['cluster'].value_counts()

1    1890
0    1740
2    1541
4    1261
3     764
Name: cluster, dtype: int64