In [1]:
#Purpose: We're going to build a classifier to predict whether each loan from
#the kiva dataset will be defaulted or paid

In [2]:
import pandas as pd

The following is the same setup as the start of the LDA topic modeling workbook

In [39]:
#import kiva dataset (requested loans)
kiva_df = pd.read_csv("kiva_cleaned.csv")

In [40]:
#there happen to be some rows with nothing in them...remove them
kiva_df = kiva_df.dropna()
kiva_df = kiva_df.reset_index(drop=True)

In [41]:
kiva_df.head()
#en_clean is our text field of primary interest

Unnamed: 0,loan_id,status,sector,en,country,gender,loan_amount,nonpayment,en_clean
0,0,defaulted,Agriculture,<i>This description is written by Rowland Amul...,Kenya,M,500.0,lender,"Robert, 40, is married and has 6 children. In ..."
1,1,defaulted,Food,<i>This description is written by Rowland Amul...,Kenya,F,500.0,lender,"Petronilla, 30, was deserted by her husband an..."
2,2,defaulted,Food,<i>This description was written by Richard Maz...,Kenya,M,500.0,lender,"Tom Mung'ahu, 45, is married and has 6 childre..."
3,3,defaulted,Services,<i>This description was written by Rowland Amu...,Kenya,F,500.0,lender,"Benedina, 42, is married and has 4 girls. In a..."
4,4,defaulted,Construction,<i>This description was written by Rowland Amu...,Kenya,M,500.0,lender,"Vincent Ondego 40, is married and has 8 childr..."


In [42]:
#import text mining packages
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import unidecode

#define stop words (notice we include spanish due to its df presence)
stop_words = set(stopwords.words('english') + stopwords.words('spanish'))

#lemmatizer
lemmer = WordNetLemmatizer()

#preprocessing function
def preprocess(x):
    x = x.lower() #lower case everything
    
    x = re.sub(r'[^\w\s]', '', x) #remove any punctuation
    
    x = unidecode.unidecode(x)
    
    x = re.sub(r'\d+', '', x) #remove any numbers
    
    x = [lemmer.lemmatize(w) for w in x.split() if w not in stop_words]
        
    return ' '.join(x)

kiva_df['en_clean_pre'] = kiva_df['en_clean'].apply(preprocess)

In [43]:
#check out first couple rows now
kiva_df.head(2)
#observations: notice that en_clean_pre now shows post-preprocessing (we have
#a list of terms)

Unnamed: 0,loan_id,status,sector,en,country,gender,loan_amount,nonpayment,en_clean,en_clean_pre
0,0,defaulted,Agriculture,<i>This description is written by Rowland Amul...,Kenya,M,500.0,lender,"Robert, 40, is married and has 6 children. In ...",robert married child addition family take care...
1,1,defaulted,Food,<i>This description is written by Rowland Amul...,Kenya,F,500.0,lender,"Petronilla, 30, was deserted by her husband an...",petronilla deserted husband responsible upbrin...


In [44]:
#original text field
kiva_df.iloc[0, :].en_clean

'Robert, 40, is married and has 6 children. In addition to his family of 8, he takes care of his mother and 5 brothers. Robert started by planting vegetables and selling at the local market. He then diversified with a tea nursery which is more profitable. Given a loan of $500, Robert will be able to improve his activities by buying fertilizers, pesticides, a pump and seedlings. The pump will remove the uncertainties of the weather. The current capital in his venture is Ksh 10500 or $150, which does not allow Robert to maximize his potential. Robert completed high school and has never got employment. He got apprentice training from a tea extension officer. He is a go getter, and his main hobby is teaching music.'

In [45]:
#new (preprocessed) text field
kiva_df.iloc[0, :].en_clean_pre

'robert married child addition family take care mother brother robert started planting vegetable selling local market diversified tea nursery profitable given loan robert able improve activity buying fertilizer pesticide pump seedling pump remove uncertainty weather current capital venture ksh allow robert maximize potential robert completed high school never got employment got apprentice training tea extension officer go getter main hobby teaching music'

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

#set up TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_df = 0.5, min_df = 0.05,
                            max_features = 1000, ngram_range = [1,3])

#parameters: max_df is the max % of documents a word can be in
            #min_df is the min % of documents a word can be in
            #max_features means only keep the x amount of frequent words
            #ngram_range specifies what level of n-grams to keep
            
#pass in our data
dtm = vectorizer.fit_transform(kiva_df['en_clean_pre'])

In [47]:
dtm.shape #observations: 270 features were kept

(6802, 270)

In [48]:
#let's look at what these features were
vectorizer.get_feature_names()
#notice quite a few interesting words, some n-grams as well

['able',
 'active',
 'active member',
 'activity',
 'add',
 'additional',
 'age',
 'ago',
 'allow',
 'also',
 'always',
 'amount',
 'another',
 'applied',
 'applied loan',
 'area',
 'attend',
 'back',
 'basic',
 'bean',
 'began',
 'belief',
 'better',
 'born',
 'bought',
 'boy',
 'business selling',
 'buy',
 'buying',
 'capital',
 'care',
 'cement',
 'child school',
 'childrens',
 'church',
 'city',
 'client',
 'clothes',
 'clothing',
 'college',
 'community',
 'continue',
 'cost',
 'could',
 'cow',
 'currently',
 'customer',
 'dairy',
 'dairy cow',
 'daughter',
 'day',
 'decided',
 'demand',
 'different',
 'domingo',
 'dominican',
 'dream',
 'due',
 'earn',
 'educate',
 'education',
 'enable',
 'enough',
 'entrepreneur',
 'every',
 'every day',
 'expand',
 'expand business',
 'expense',
 'experience',
 'explains',
 'family',
 'farm',
 'farmer',
 'farming',
 'father',
 'fee',
 'feed',
 'first',
 'first loan',
 'five',
 'five child',
 'food',
 'four',
 'four child',
 'friend',
 'fruit',

In [67]:
#create bow and join to our original kiva features
bow_df = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names(), index=kiva_df.index)

kiva_df_bow = pd.concat([kiva_df, bow_df], axis=1)
kiva_df_bow.shape #6802 rows and 280 variables (270 from bow)

(6802, 280)

In [68]:
#let's create some lexicon features from the text
import textstat

#how many characters does it have
kiva_df_bow['len'] = kiva_df_bow['en_clean'].apply(lambda x: len(x))
#how many syllables
kiva_df_bow['syllable_count'] = kiva_df_bow['en_clean'].apply(lambda x: textstat.syllable_count(x))
#flesch reading metric (how difficult a passage is to understand)
kiva_df_bow['flesch_reading_ease'] = kiva_df_bow['en_clean'].apply(lambda x: textstat.flesch_reading_ease(x))

In [69]:
#preview updated df
kiva_df_bow.head()
#observations: we see our original features, bow features, and 3 new lexicon features

Unnamed: 0,loan_id,status,sector,en,country,gender,loan_amount,nonpayment,en_clean,en_clean_pre,...,would,would like,year ago,year old,year old married,yearold,young,len,syllable_count,flesch_reading_ease
0,0,defaulted,Agriculture,<i>This description is written by Rowland Amul...,Kenya,M,500.0,lender,"Robert, 40, is married and has 6 children. In ...",robert married child addition family take care...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,719,187,67.15
1,1,defaulted,Food,<i>This description is written by Rowland Amul...,Kenya,F,500.0,lender,"Petronilla, 30, was deserted by her husband an...",petronilla deserted husband responsible upbrin...,...,0.114361,0.0,0.0,0.0,0.0,0.0,0.0,1096,275,73.47
2,2,defaulted,Food,<i>This description was written by Richard Maz...,Kenya,M,500.0,lender,"Tom Mung'ahu, 45, is married and has 6 childre...",tom mungahu married child child attending elem...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,688,175,74.49
3,3,defaulted,Services,<i>This description was written by Rowland Amu...,Kenya,F,500.0,lender,"Benedina, 42, is married and has 4 girls. In a...",benedina married girl addition family also tak...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,591,150,65.12
4,4,defaulted,Construction,<i>This description was written by Rowland Amu...,Kenya,M,500.0,lender,"Vincent Ondego 40, is married and has 8 childr...",vincent ondego married child beside family vin...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,659,172,76.01


In [72]:
#dropping columns that have nothing to do with textual loan, primarily 
#for illustration purposes (can also keep them)
kiva_df_bow = kiva_df_bow.drop(['loan_id', 'sector', 'country', 'gender', 
                                'loan_amount', 'nonpayment', 'en', 'en_clean',
                                'en_clean_pre'], axis=1)

In [73]:
from sklearn.model_selection import train_test_split

y = kiva_df_bow['status'] #our label/target
X = kiva_df_bow.drop(['status'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1024)

In [74]:
#decision tree
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state = 1024, min_samples_split = 10,
                            min_samples_leaf = 10, max_depth = 6)

clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=10, min_samples_split=10,
                       random_state=1024)

In [75]:
#use the model on our test data
y_pred_dt = clf.predict(X_test)

In [76]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

#observations: Not too bad, 75% accuracy

[[532 294]
 [129 746]]
              precision    recall  f1-score   support

   defaulted       0.80      0.64      0.72       826
        paid       0.72      0.85      0.78       875

    accuracy                           0.75      1701
   macro avg       0.76      0.75      0.75      1701
weighted avg       0.76      0.75      0.75      1701



In [77]:
#can print out tree
from sklearn.tree import export_text

print(export_text(clf))

|--- feature_192 <= 0.05
|   |--- feature_76 <= 0.02
|   |   |--- feature_193 <= 0.05
|   |   |   |--- feature_246 <= 0.06
|   |   |   |   |--- feature_31 <= 0.16
|   |   |   |   |   |--- feature_67 <= 0.26
|   |   |   |   |   |   |--- class: paid
|   |   |   |   |   |--- feature_67 >  0.26
|   |   |   |   |   |   |--- class: defaulted
|   |   |   |   |--- feature_31 >  0.16
|   |   |   |   |   |--- feature_270 <= 619.00
|   |   |   |   |   |   |--- class: defaulted
|   |   |   |   |   |--- feature_270 >  619.00
|   |   |   |   |   |   |--- class: defaulted
|   |   |   |--- feature_246 >  0.06
|   |   |   |   |--- feature_148 <= 0.07
|   |   |   |   |   |--- feature_163 <= 0.10
|   |   |   |   |   |   |--- class: defaulted
|   |   |   |   |   |--- feature_163 >  0.10
|   |   |   |   |   |   |--- class: paid
|   |   |   |   |--- feature_148 >  0.07
|   |   |   |   |   |--- feature_11 <= 0.11
|   |   |   |   |   |   |--- class: defaulted
|   |   |   |   |   |--- feature_11 >  0.11
|   | 