In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
cf=pd.read_csv("mortgag.csv",names=["complaintdate","product","subproduct" ,"issue","subissue","consumer_narrative","company_public_response","company","state" ,"zip","tags","companyconsent","via","send_to_company_date","company_respons e_consumer","timely","consumer_disputed","id"],skiprows=1)

In [4]:
cf=cf[~cf.consumer_narrative.isnull()]
cf[['consumer_narrative']]

Unnamed: 0,consumer_narrative
9,Started the refinance of home mortgage process...
10,"In XXXX, I and my ex-husband applied for a ref..."
11,Mortgage was transferred to Nationstar as of X...
26,Need to move into a XXXX facility. Can no long...
36,I had an FHA loan at US Bank that was paid off...
38,I went through a divorce several years ago and...
42,Select Portfolio Servicing has been deceptive ...
55,I got recent modification ( XXXX/XXXX/2015 ) f...
58,I was late on my mortgage payments and decided...
59,Requested a payoff quote by fax and certified ...


In [5]:
from nltk.tokenize import RegexpTokenizer 
tokenizer = RegexpTokenizer(r'\w+') 
cf["tokens"] = cf["consumer_narrative"].apply(tokenizer.tokenize) 
cf.head()

Unnamed: 0,complaintdate,product,subproduct,issue,subissue,consumer_narrative,company_public_response,company,state,zip,tags,companyconsent,via,send_to_company_date,company_respons e_consumer,timely,consumer_disputed,id,tokens
9,11/18/2016,Mortgage,Conventional fixed mortgage,Settlement process and costs,,Started the refinance of home mortgage process...,,AMERICAN NEIGHBORHOOD MORTGAGE,NJ,088XX,,Consent provided,Web,11/21/2016,Closed with monetary relief,No,No,2216206,"[Started, the, refinance, of, home, mortgage, ..."
10,07/16/2015,Mortgage,Conventional fixed mortgage,"Application, originator, mortgage broker",,"In XXXX, I and my ex-husband applied for a ref...",,HSBC NORTH AMERICA HOLDINGS INC.,IL,625XX,,Consent provided,Web,07/16/2015,Closed with explanation,Yes,No,1472017,"[In, XXXX, I, and, my, ex, husband, applied, f..."
11,04/27/2016,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,Mortgage was transferred to Nationstar as of X...,,NATIONSTAR MORTGAGE,CA,954XX,,Consent provided,Web,04/27/2016,Closed with explanation,Yes,Yes,1898476,"[Mortgage, was, transferred, to, Nationstar, a..."
26,06/30/2016,Mortgage,Conventional fixed mortgage,Credit decision / Underwriting,,Need to move into a XXXX facility. Can no long...,,"VANDERBILT MORTGAGE & FINANCE, INC",GA,300XX,Older American,Consent provided,Web,07-01-2016,Closed with explanation,Yes,No,1992309,"[Need, to, move, into, a, XXXX, facility, Can,..."
36,07/20/2015,Mortgage,FHA mortgage,"Loan servicing, payments, escrow account",,I had an FHA loan at US Bank that was paid off...,Company chooses not to provide a public response,U.S. BANCORP,TN,370XX,,Consent provided,Web,07/20/2015,Closed with explanation,Yes,No,1474887,"[I, had, an, FHA, loan, at, US, Bank, that, wa..."


In [6]:
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 
stemmer = PorterStemmer() 
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer();

In [7]:
mortgage_dict=['refinance','borrower','mortgage','lender','reporting','refinanced','financed','finance']

In [8]:
def stemandlemma(consumer_narrative):
    preprocess_list = list()
    words = consumer_narrative.lower().split()
    for w in words:
        if(w not in mortgage_dict):
            preprocess_list.append( lemmatizer.lemmatize(w) )
        else:
            preprocess_list.append(w)
    return " ".join(preprocess_list)

In [9]:
def removeNER(consumer_narrative):
    preprocess_list = list()
    words = consumer_narrative.lower().split()
    for w in words:
        preprocess_list.append( stemmer.stem(w) )
    return preprocess_list

In [10]:
def preprocess_consumer_narrative( consumer_narrative):
    preprocess_list = list()
    for text in consumer_narrative:
        letters_only = re.sub("[^a-zA-Z]", " ", text) 
        letters_only = re.sub("XX","",letters_only)
        words = letters_only.lower().split()                             
        stops = set(stopwords.words("english"))                  
        meaningful_words = [w for w in words if not w in stops]
        preprocess_list.append(" ".join( meaningful_words ))
    return preprocess_list  

In [11]:
cf['consumer_newnarrative'] = preprocess_consumer_narrative( cf.consumer_narrative)
cf['consumer_newnarrative'] = cf['consumer_newnarrative'].map(stemandlemma)

In [12]:
cf.iloc[7].consumer_newnarrative

'got recent modification mortgage ocwen service house located tx ocwen adding something principal think much money added please help clarify kind charge addition mortgage loan thanks'

In [13]:
from sklearn.model_selection import train_test_split
cf_consumer_narrative = pd.DataFrame(cf['consumer_newnarrative'])
y =cf['issue'] 
X_train, X_test, y_train, y_test = train_test_split(cf_consumer_narrative, y, test_size=0.3)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer 
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 
train_data_features = vectorizer.fit_transform(X_train.consumer_newnarrative)

In [15]:
train_data_features.shape

(27813, 5000)

In [16]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [17]:
model = tree.DecisionTreeClassifier()

In [18]:
model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [19]:
model = model.fit(train_data_features, y_train)

In [20]:
test_data_features = vectorizer.transform(X_test.consumer_newnarrative)

In [21]:
result=model.predict(test_data_features)

In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,result)

0.45289824679137658

In [29]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, result)
print(confusion_matrix)

[[ 406    5   58   37   81    0    2    2  169  241    0    0  120   26
    36    1]
 [   5    0    1    1    0    0    0    0    4    3    0    0    5    1
     1    0]
 [  62    0    5    6   18    0    0    0   37   52    0    0   23    8
     9    0]
 [  45    0    6   14   13    0    0    0   27   49    0    0   14    4
    12    0]
 [  83    0   20   17   38    0    0    1   62  101    0    1   29    7
    19    0]
 [   0    0    0    0    0    0    0    0    0    2    0    0    1    0
     0    0]
 [   1    0    0    0    0    0    0    0    0    1    0    0    0    0
     0    0]
 [   5    0    0    0    3    0    0    2    8   21    2    0    1    3
     8    0]
 [ 156    1   31   25   41    0    0    5 1801  631    3    2  112  302
   105    1]
 [ 173    2   40   39   87    0    0   13  623 2760    5    0  149  109
   433    3]
 [   0    0    0    0    0    0    0    0    0    6    0    0    1    0
     0    0]
 [   0    0    0    0    0    0    0    0    0    0    0    0    

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1,\
                            stop_words = 'english', \
                            max_features = 5000)
train_data_features = vectorizer.fit_transform(X_train.consumer_newnarrative)

In [34]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,result)

0.44803288314738698

In [None]:
//////////////////////////////////////////////Logistic Regression///////////////////////////////////////////////////////////////

In [36]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(penalty='l2',C=1)

In [37]:
model

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
model = model.fit(train_data_features, y_train)

In [39]:
test_data_features = vectorizer.transform(X_test.consumer_newnarrative)

In [40]:
result=model.predict(test_data_features)

In [41]:
from sklearn.metrics import accuracy_score 
accuracy_score(y_test,result)

0.61177753544165758

In [42]:
from sklearn.metrics import classification_report
print(classification_report(y_test, result))

                                                                                  precision    recall  f1-score   support

                                        Application, originator, mortgage broker       0.54      0.56      0.55      1184
                                                         Applying for a mortgage       0.00      0.00      0.00        21
                     Applying for a mortgage or refinancing an existing mortgage       0.00      0.00      0.00       220
                                                           Closing on a mortgage       0.50      0.01      0.01       184
                                                  Credit decision / Underwriting       0.23      0.01      0.02       378
                         Credit monitoring or identity theft protection services       0.00      0.00      0.00         3
                                                     Improper use of your report       0.00      0.00      0.00         2
                       

  'precision', 'predicted', average, warn_for)


In [43]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, result)
print(confusion_matrix)

[[ 662    0    0    0    4    0    0    0  208  239    0   70    0    1
     0]
 [  11    0    0    0    1    0    0    0    4    5    0    0    0    0
     0]
 [ 115    0    0    0    6    0    0    0   36   55    0    5    2    1
     0]
 [  53    0    0    1    0    0    0    0   35   67    0   25    0    3
     0]
 [ 160    0    0    0    5    0    0    0   68  128    0   15    1    1
     0]
 [   0    0    0    0    0    0    0    0    0    3    0    0    0    0
     0]
 [   1    0    0    0    0    0    0    0    0    1    0    0    0    0
     0]
 [   1    0    0    0    0    0    0    0    9   42    0    0    0    1
     0]
 [  29    0    0    0    0    0    0    0 2581  580    0   15    7    4
     0]
 [  41    0    0    0    4    0    0    0  480 3881    0   19    6    5
     0]
 [   0    0    0    0    0    0    0    0    0    7    0    0    0    0
     0]
 [ 125    0    0    1    1    0    0    0  172  256    0  128    2    0
     0]
 [   9    0    0    0    0    0    0    

In [44]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1,\
                            stop_words = 'english', \
                            max_features = 5000) 
train_data_features = vectorizer.fit_transform(X_train.consumer_newnarrative)

In [51]:
from sklearn.metrics import accuracy_score 
accuracy_score(y_test,result)

0.61177753544165758