In [11]:
#Purpose: We're going to use LDA (latent Dirichlet allocation) to analyze the
#kiva dataset (requests for loans) and to extract relevant topics from
#the loan requests

In [None]:
import pandas as pd

In [12]:
#load the cleaned kiva dataset (requested loans) into a dataframe
kiva_df = pd.read_csv(filepath_or_buffer="kiva_cleaned.csv")

In [13]:
kiva_df.info()
#observations: nearly 7000 rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6818 entries, 0 to 6817
Data columns (total 9 columns):
loan_id        6818 non-null int64
status         6818 non-null object
sector         6818 non-null object
en             6818 non-null object
country        6818 non-null object
gender         6818 non-null object
loan_amount    6818 non-null float64
nonpayment     6818 non-null object
en_clean       6802 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 479.5+ KB


In [14]:
#some rows contain missing values (en_clean is null in 16 of them) -- drop
#those rows, then renumber the index from 0 so it is contiguous again
kiva_df = kiva_df.dropna().reset_index(drop=True)

In [15]:
kiva_df.head()
#en_clean is our text field of primary interest

Unnamed: 0,loan_id,status,sector,en,country,gender,loan_amount,nonpayment,en_clean
0,0,defaulted,Agriculture,<i>This description is written by Rowland Amul...,Kenya,M,500.0,lender,"Robert, 40, is married and has 6 children. In ..."
1,1,defaulted,Food,<i>This description is written by Rowland Amul...,Kenya,F,500.0,lender,"Petronilla, 30, was deserted by her husband an..."
2,2,defaulted,Food,<i>This description was written by Richard Maz...,Kenya,M,500.0,lender,"Tom Mung'ahu, 45, is married and has 6 childre..."
3,3,defaulted,Services,<i>This description was written by Rowland Amu...,Kenya,F,500.0,lender,"Benedina, 42, is married and has 4 girls. In a..."
4,4,defaulted,Construction,<i>This description was written by Rowland Amu...,Kenya,M,500.0,lender,"Vincent Ondego 40, is married and has 8 childr..."


In [19]:
#import text mining packages
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import unidecode

#stop words: the English list combined with the Spanish one (Spanish is
#included because it also shows up in the df)
stop_words = set(stopwords.words('english')).union(stopwords.words('spanish'))

#WordNet-based lemmatizer used by preprocess() below
lemmer = WordNetLemmatizer()

#preprocessing function
def preprocess(x):
    """Normalize one text into a space-separated string of lemmatized terms.

    Steps, in order: lower-case the text, delete punctuation, transliterate
    to ASCII with unidecode, delete runs of digits, split on whitespace,
    drop tokens found in ``stop_words`` and lemmatize the remaining tokens
    with ``lemmer``.

    NOTE(review): stop words are matched only after punctuation and accents
    have been stripped, so stop-list entries that contain an apostrophe or
    an accented letter may never match -- confirm whether that is intended.

    x: the raw text (a str).
    Returns the cleaned text as a single str.
    """
    text = re.sub(r'[^\w\s]', '', x.lower())  #lower-case, then strip punctuation
    text = unidecode.unidecode(text)          #transliterate to plain ASCII
    text = re.sub(r'\d+', '', text)           #strip any numbers

    lemmas = []
    for token in text.split():
        if token in stop_words:
            continue
        lemmas.append(lemmer.lemmatize(token))

    return ' '.join(lemmas)

#run every en_clean text through preprocess() and store it in a new column
kiva_df['en_clean_pre'] = kiva_df['en_clean'].map(preprocess)

In [21]:
#check out first couple rows now
kiva_df.head(2)
#observations: notice that en_clean_pre now shows the post-preprocessing text
#(a single space-separated string of lemmatized terms, not a Python list)

Unnamed: 0,loan_id,status,sector,en,country,gender,loan_amount,nonpayment,en_clean,en_clean_pre
0,0,defaulted,Agriculture,<i>This description is written by Rowland Amul...,Kenya,M,500.0,lender,"Robert, 40, is married and has 6 children. In ...",robert married child addition family take care...
1,1,defaulted,Food,<i>This description is written by Rowland Amul...,Kenya,F,500.0,lender,"Petronilla, 30, was deserted by her husband an...",petronilla deserted husband responsible upbrin...


In [23]:
#original text field
kiva_df.iloc[0, :].en_clean

'Robert, 40, is married and has 6 children. In addition to his family of 8, he takes care of his mother and 5 brothers. Robert started by planting vegetables and selling at the local market. He then diversified with a tea nursery which is more profitable. Given a loan of $500, Robert will be able to improve his activities by buying fertilizers, pesticides, a pump and seedlings. The pump will remove the uncertainties of the weather. The current capital in his venture is Ksh 10500 or $150, which does not allow Robert to maximize his potential. Robert completed high school and has never got employment. He got apprentice training from a tea extension officer. He is a go getter, and his main hobby is teaching music.'

In [24]:
#new (preprocessed) text field
kiva_df.iloc[0, :].en_clean_pre

'robert married child addition family take care mother brother robert started planting vegetable selling local market diversified tea nursery profitable given loan robert able improve activity buying fertilizer pesticide pump seedling pump remove uncertainty weather current capital venture ksh allow robert maximize potential robert completed high school never got employment got apprentice training tea extension officer go getter main hobby teaching music'

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

#set up TF-IDF vectorizer
#NOTE(review): LDA is formulated over raw term counts; consider feeding it
#CountVectorizer output instead if the tf-idf weighting was not a deliberate
#choice -- confirm
vectorizer = TfidfVectorizer(max_df = 0.5, min_df = 0.05,
                            max_features = 1000, ngram_range = (1, 3))

#parameters: max_df drops terms found in more than this fraction of documents
#            min_df drops terms found in fewer than this fraction of documents
#            max_features caps the vocabulary at the x most frequent terms
#            ngram_range is (min_n, max_n): here unigrams through trigrams;
#            it has to be a tuple -- recent scikit-learn versions validate
#            the parameter type and reject a list

#pass in our data
dtm = vectorizer.fit_transform(kiva_df['en_clean_pre'])

In [28]:
dtm.shape #observations: 270 features were kept

(6802, 270)

In [30]:
#let's look at what these features were
#(get_feature_names() was removed in scikit-learn 1.2; use its replacement,
#get_feature_names_out(), whenever the installed version provides it)
list(vectorizer.get_feature_names_out()
     if hasattr(vectorizer, "get_feature_names_out")
     else vectorizer.get_feature_names())
#notice quite a few interesting words, some n-grams as well

['able',
 'active',
 'active member',
 'activity',
 'add',
 'additional',
 'age',
 'ago',
 'allow',
 'also',
 'always',
 'amount',
 'another',
 'applied',
 'applied loan',
 'area',
 'attend',
 'back',
 'basic',
 'bean',
 'began',
 'belief',
 'better',
 'born',
 'bought',
 'boy',
 'business selling',
 'buy',
 'buying',
 'capital',
 'care',
 'cement',
 'child school',
 'childrens',
 'church',
 'city',
 'client',
 'clothes',
 'clothing',
 'college',
 'community',
 'continue',
 'cost',
 'could',
 'cow',
 'currently',
 'customer',
 'dairy',
 'dairy cow',
 'daughter',
 'day',
 'decided',
 'demand',
 'different',
 'domingo',
 'dominican',
 'dream',
 'due',
 'earn',
 'educate',
 'education',
 'enable',
 'enough',
 'entrepreneur',
 'every',
 'every day',
 'expand',
 'expand business',
 'expense',
 'experience',
 'explains',
 'family',
 'farm',
 'farmer',
 'farming',
 'father',
 'fee',
 'feed',
 'first',
 'first loan',
 'five',
 'five child',
 'food',
 'four',
 'four child',
 'friend',
 'fruit',

In [32]:
#now we can apply LDA
from sklearn.decomposition import LatentDirichletAllocation

#configure the LDA model, one hyper-parameter per line
lda_model = LatentDirichletAllocation(
    n_components=20,          #number of topics we want to identify
    doc_topic_prior=None,     #None -> let scikit-learn pick its default prior
    topic_word_prior=None,    #None -> let scikit-learn pick its default prior
    max_iter=200,
    learning_method="batch",
    random_state=1024,        #fixed seed for reproducibility
    n_jobs=2,
    verbose=0,
)

#fit the model on the document-term matrix
lda_output = lda_model.fit(dtm)

In [34]:
#check how well it works, look at log likelihood
lda_model.score(dtm)
#observations: (higher values are better)

-223934.59825649523

In [35]:
#perplexity score as well
lda_model.perplexity(dtm)
#observations: (lower values are better)

545.3753103152957

In [36]:
#theta matrix: one row per document, obtained by transforming the dtm
theta = pd.DataFrame(data=lda_model.transform(dtm))

In [37]:
theta.head()
#observations: columns are our 20 topics and rows are all our documents
#each entry is the share of that document attributed to that topic
#(each row sums to ~1)
#ex: topic 0 is hardly present at all in document 0 (0.009)
#ex: topic 7 dominates document 4 (0.82)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.835625,0.008651,0.008651,0.008651
1,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.029796,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.837044,0.007398,0.007398
2,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.243813,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.607688,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825
3,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.80431,0.010299,0.010299,0.010299
4,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.821479,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396


In [38]:
#beta matrix: one row per topic, taken from the fitted model's components_
beta = pd.DataFrame(data=lda_model.components_)

In [39]:
beta.head()
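#observations: shows us how strongly each word is weighted in each topic
#words are columns and rows are topics; the values are unnormalized weights
#(not probabilities), which is why many of them are larger than 1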

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,260,261,262,263,264,265,266,267,268,269
0,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
1,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
2,14.300158,0.05,0.05,0.241028,0.223391,2.343714,0.05,2.091495,0.05,13.9998,...,0.05,10.16035,5.554609,3.237253,0.946131,1.007789,2.001923,0.05,1.167519,0.443704
3,4.316497,0.730651,0.05,3.171225,4.121851,0.836469,1.629435,7.81208,3.326142,6.754259,...,0.05,8.038774,4.762683,2.535545,2.016488,5.113907,7.684086,1.222312,4.539481,3.665516
4,34.877355,0.740758,0.05,4.790612,3.798218,6.497496,4.56763,37.430035,16.636247,45.824983,...,0.05,33.452215,10.244747,23.988039,11.035844,34.714183,58.693398,11.685873,1.70333,4.462702


In [41]:
#this code will give us which words are in which topics nicely

#get_feature_names() was removed in scikit-learn 1.2 in favour of
#get_feature_names_out(); use whichever one this installation provides
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

n_top_terms = 6 #how many of the highest-weighted terms to show per topic

termss = list()
for topic_id, topic in enumerate(lda_model.components_):
    #argsort is ascending, so reverse it and keep the first n_top_terms indices
    top_idx = topic.argsort()[::-1][:n_top_terms]
    #NOTE(review): terms are joined with single spaces, so multi-word n-grams
    #cannot be told apart from neighbouring unigrams in the result
    terms = ' '.join([feature_names[i] for i in top_idx])
    termss.append(terms)

#one row per topic: its id and its top terms
topic_summary = pd.DataFrame({'TopicID': range(0, len(termss)), 'Terms': termss})

In [42]:
topic_summary
#observations: here are our 20 topics and the top 6 words in that topic to
#give us an idea of what they are about
#ex: topic 1 seems to be about grocery products, topic 3 seems like the DR

Unnamed: 0,TopicID,Terms
0,0,fee applied farming dairy repay married child
1,1,rice bean grocery store basic milk
2,2,second second loan first loan first kiva product
3,3,santo domingo santo domingo community entrepre...
4,4,product home store merchandise old year old
5,5,expand expand business father living fee need
6,6,stock primary school able primary school increase
7,7,group usd member school woman group woman
8,8,boy entrepreneur community husband product cus...
9,9,fee applied farming dairy repay married child
