In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df=pd.read_csv('2012-2013_cleaned.csv', index_col=0)

In [3]:
y=df['loan_status']

In [4]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation and digits
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if (char not in string.punctuation) and (not char.isdigit())]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word.lower() for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [16]:
words=df['emp_title'][1]

In [22]:
words

'Team Leadern Customer Ops & Systems'

In [25]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

lemmatizer.lemmatize('Leadern')

'Systems'

In [15]:
bow_transformer = CountVectorizer(analyzer=text_process).fit(df['emp_title'])


In [16]:
print(len(bow_transformer.vocabulary_))

44964


In [17]:
df['emp_title'][3]

'Area Sales Manager'

In [18]:
bow4 = bow_transformer.transform([df['emp_title'][3]])
print(bow4)
print(bow4.shape)

  (0, 2331)	1
  (0, 24545)	1
  (0, 35033)	1
(1, 44964)


In [19]:
print(bow_transformer.get_feature_names()[2331])
print(bow_transformer.get_feature_names()[24545])
print(bow_transformer.get_feature_names()[35033])

area
manager
sales


In [20]:
messages_bow=bow_transformer.transform(df['emp_title'])

In [51]:
messages_bow.shape

(176414, 44964)

In [52]:
print(messages_bow[3])

  (0, 2331)	1
  (0, 24545)	1
  (0, 35033)	1


In [53]:
columns=bow_transformer.get_feature_names()

In [54]:
tfidf_transformer = TfidfTransformer().fit(messages_bow)

In [55]:
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

(176414, 44964)


In [56]:
print(messages_tfidf[3])

  (0, 35033)	0.5458201127835374
  (0, 24545)	0.41312461761961544
  (0, 2331)	0.7289776778459085


In [57]:
X_train, X_test, y_train, y_test = train_test_split(messages_tfidf, y, test_size=0.3)

In [58]:
nlp = MultinomialNB().fit(X_train, y_train)

In [59]:
predictions=nlp.predict(X_test)

In [60]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))


[[44678   113]
 [ 8105    29]]
             precision    recall  f1-score   support

        0.0       0.85      1.00      0.92     44791
        1.0       0.20      0.00      0.01      8134

avg / total       0.75      0.84      0.78     52925



In [61]:
columns

['aa',
 'aaa',
 'aaaa',
 'aaatlchealth',
 'aabco',
 'aabpch',
 'aac',
 'aacoa',
 'aacres',
 'aadco',
 'aadociate',
 'aadsen',
 'aadusd',
 'aadvanced',
 'aadvantage',
 'aadvisory',
 'aaf',
 'aafes',
 'aafirs',
 'aai',
 'aakash',
 'aalas',
 'aalfs',
 'aallcare',
 'aam',
 'aamc',
 'aamco',
 'aamcore',
 'aand',
 'aap',
 'aapcho',
 'aappliance',
 'aapt',
 'aar',
 'aarai',
 'aaron',
 'aarons',
 'aarp',
 'aashto',
 'aassistant',
 'aasu',
 'aaustin',
 'aauto',
 'aav',
 'aaviation',
 'ab',
 'aba',
 'abacus',
 'abadata',
 'abair',
 'abatment',
 'abaxis',
 'abb',
 'abba',
 'abbadabbas',
 'abbae',
 'abbate',
 'abbco',
 'abbett',
 'abbeville',
 'abbey',
 'abbott',
 'abbottnorthweternhospital',
 'abbottt',
 'abbvie',
 'abc',
 'abcbs',
 'abcdisney',
 'abco',
 'abctv',
 'abcusd',
 'abd',
 'abdeck',
 'abdo',
 'abe',
 'abear',
 'abec',
 'abeinsa',
 'abel',
 'abell',
 'abelman',
 'abengoa',
 'abeo',
 'abercrombie',
 'aberdeen',
 'abertsons',
 'abes',
 'abf',
 'abftransportation',
 'abhow',
 'abi',
 'abig

In [63]:
df2=pd.read_csv('2012-2013_cleaned2.csv')

In [83]:
df2.dropna(subset=['title'], inplace=True)

In [84]:
X_train, X_test, y_train, y_test = train_test_split(df2['title'], df2['loan_status'], test_size=0.3)

In [85]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [87]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_process at 0x1a0f4a5bf8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [88]:
predictions = pipeline.predict(X_test)

In [89]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))


[[45392   121]
 [ 8561    17]]
             precision    recall  f1-score   support

        0.0       0.84      1.00      0.91     45513
        1.0       0.12      0.00      0.00      8578

avg / total       0.73      0.84      0.77     54091



In [92]:
predictions.sum()

138.0

In [6]:
df.drop(columns=['emp_title','issue_d','zip_code','loan_status'], inplace=True)

In [9]:
df.to_sparse()

Unnamed: 0,loan_amnt,int_rate,installment,sub_grade,emp_length,annual_inc,dti,delinq_2yrs,earliest_cr_line,fico_range_low,...,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY
0,12000.0,7.62,373.94,32,3.0,96500.0,12.61,0.0,123,705.0,...,0,0,1,0,0,0,0,0,0,0
1,27050.0,10.99,885.46,28,10.0,55000.0,22.87,0.0,326,730.0,...,0,0,0,0,0,0,0,0,0,0
2,12000.0,11.99,398.52,27,10.0,130000.0,13.03,0.0,193,715.0,...,0,0,0,0,0,0,0,0,0,0
3,28000.0,7.62,872.52,32,5.0,325000.0,18.55,0.0,229,745.0,...,0,0,0,0,0,0,0,0,0,0
4,27600.0,19.97,730.78,15,6.0,73000.0,23.13,1.0,294,665.0,...,0,0,0,0,0,0,0,0,0,0
5,12000.0,10.99,392.81,28,4.0,60000.0,4.62,0.0,48,720.0,...,0,0,0,0,0,0,0,0,0,0
6,11100.0,14.98,384.68,22,10.0,90000.0,3.73,1.0,150,690.0,...,0,0,0,0,0,0,0,0,0,0
7,12000.0,13.53,407.40,25,10.0,40000.0,16.94,0.0,182,660.0,...,0,0,0,0,0,0,0,0,0,0
8,9750.0,13.98,333.14,24,1.0,26000.0,25.12,0.0,83,670.0,...,0,0,0,0,0,0,0,0,0,0
9,4800.0,10.99,157.13,28,2.0,39600.0,2.49,0.0,220,755.0,...,0,0,1,0,0,0,0,0,0,0
