In [1]:
import pandas as pd
import gensim
import nltk
import gensim.corpora as corpora
from gensim.models import LdaModel, CoherenceModel, Phrases
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import re

In [2]:
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
import pandas as pd
data = pd.read_csv('kiva_train.csv')

In [5]:
data.head()

Unnamed: 0,id,country,en,gender,loan_amount,nonpayment,sector,status
0,1,Ecuador,<h4>Business Description</h4> \r\n <p> Don Mau...,M,825,lender,Food,1
1,2,Dominican Republic,Rosa Iris is a brilliant entrepreneur who sell...,F,450,partner,Retail,0
2,3,Kenya,Sirote is married with six children. Two of he...,F,600,lender,Agriculture,1
3,4,Kenya,David Mwangi Kimani is 33 years old and marri...,M,650,lender,Food,1
4,5,Dominican Republic,Nilda is a very persistent woman who has learn...,F,325,partner,Food,0


In [6]:
data.shape

(5454, 8)

In [7]:
# Initialize stopwords and lemmatizer
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# 1. Preprocessing Function (Simplified with Bigram within the function)
def preprocess(text):
    if not isinstance(text, str):
        text = ''
    
    # Remove special characters and numbers
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    
    # Convert to lowercase and tokenize
    tokens = simple_preprocess(text.lower(), deacc=True)
    
    # Lemmatize and remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # Create bigrams
    bigram = Phrases([tokens], min_count=5, threshold=100)
    tokens = bigram[tokens]
    
    return tokens

In [8]:
# Apply preprocessing
data['processed_en'] = data['en'].apply(preprocess)

In [9]:
print(data['en'][0])

<h4>Business Description</h4> 
 <p> Don Mauro co-owns a restaurant alongside his wife; it is located at an excellent location in one of the principal streets of his neighborhood.  The restaurant is open for breakfast, lunch, and dinner and also sells empanadas [corn cakes]. </p>

<h4>Loan Use</h4> 
<p> He will use the loan proceeds to better his restaurant by buying more fixed assets such as: chairs, tables and other furniture as needed. </p>

<h4> Personal Information</h4> 
<p> He is 49 years old, he is married and has three sons, his house is built of cement. His principal goal is to grow his business by reinvesting the revenue obtained. </p>




 
<p> <b>Translated from Spanish by Felipe Salcedo, Kiva Volunteer.</b><p>


In [10]:
print(data['processed_en'][0])

['business', 'description', 'mauro', 'co', 'owns', 'restaurant', 'alongside', 'wife', 'located', 'excellent', 'location', 'one', 'principal', 'street', 'neighborhood', 'restaurant', 'open', 'breakfast', 'lunch', 'dinner', 'also', 'sell', 'empanadas', 'corn', 'cake', 'loan', 'use', 'use', 'loan', 'proceeds', 'better', 'restaurant', 'buying', 'fixed', 'asset', 'chair', 'table', 'furniture', 'needed', 'personal', 'information', 'year', 'old', 'married', 'three', 'son', 'house', 'built', 'cement', 'principal', 'goal', 'grow', 'business', 'reinvesting', 'revenue', 'obtained', 'translated', 'spanish', 'felipe', 'salcedo', 'kiva', 'volunteer']


In [11]:
# Create Dictionary and Corpus
dictionary = corpora.Dictionary(data['processed_en'])
# Filter out extremes
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in data['processed_en']]

In [12]:
# Build LDA Model
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=5, 
                     random_state=100,
                     update_every=1,
                     chunksize=4000,
                     passes=10,
                     alpha='auto',
                     eta='auto',
                     per_word_topics=True)

In [13]:
# 4. Print the 25 most significant terms per topic
for i in range(5):
    print(f'Topic {i+1}:')
    print([term for term, _ in lda_model.show_topic(i, topn=25)])
    print('\n')

Topic 1:
['u', 'school', 'family', 'buy', 'able', 'two', 'purchase', 'also', 'income', 'requesting', 'expand', 'married', 'customer', 'stock', 'due', 'need', 'many', 'one', 'pay', 'kenya', 'clothes', 'market', 'husband', 'would', 'shop']


Topic 2:
['group', 'cow', 'dairy', 'milk', 'member', 'school', 'usd', 'married', 'farming', 'woman', 'buy', 'income', 'month', 'church', 'font', 'also', 'maize', 'repay', 'two', 'farm', 'per', 'farmer', 'active', 'weec', 'mary']


Topic 3:
['group', 'member', 'able', 'school', 'one', 'woman', 'family', 'small', 'married', 'stock', 'community', 'esperanza', 'first', 'buy', 'span', 'also', 'started', 'life', 'two', 'support', 'selling', 'pay', 'pemci', 'steer', 'three']


Topic 4:
['rice', 'farmer', 'land', 'baba', 'sector', 'area', 'also', 'crop', 'water', 'program', 'part', 'bank', 'rural', 'mifex', 'communal', 'people', 'many', 'santa', 'month', 'lucia', 'farm', 'production', 'association', 'finance', 'use']


Topic 5:
['product', 'buy', 'work', 'st

In [14]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics using pyLDAvis
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)

# Display the pyLDAvis visualization
pyLDAvis.display(vis)

In [15]:
# Get the topic distribution for each document
topic_distributions = []
for bow in corpus:
    topic_distribution = lda_model.get_document_topics(bow, minimum_probability=0)
    topic_distributions.append([weight for _, weight in topic_distribution])

# Create a DataFrame
topic_df = pd.DataFrame(topic_distributions, columns=[f'Topic_{i+1}' for i in range(lda_model.num_topics)])

# Add the processed text to the DataFrame
topic_df.insert(0, 'Processed_Text', data['processed_en'])

# Add a "Dominant_Topic" column with the name of the topic with the highest weight
topic_df['Dominant_Topic'] = topic_df.iloc[:, 1:].idxmax(axis=1)

topic_df

Unnamed: 0,Processed_Text,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Dominant_Topic
0,"[business, description, mauro, co, owns, resta...",0.001262,0.024441,0.001353,0.000504,0.972439,Topic_5
1,"[rosa, iris, brilliant, entrepreneur, sell, si...",0.001079,0.000682,0.001158,0.000431,0.996649,Topic_5
2,"[sirote, married, six, child, two, child, marr...",0.001046,0.000661,0.995980,0.000418,0.001896,Topic_3
3,"[david, mwangi, kimani, year, old, married, on...",0.994863,0.000829,0.001406,0.000524,0.002378,Topic_1
4,"[nilda, persistent, woman, learned, strive, be...",0.001173,0.000742,0.047621,0.000469,0.949994,Topic_5
...,...,...,...,...,...,...,...
5449,"[mary, kanyi, married, five, child, attended, ...",0.001238,0.383899,0.612124,0.000495,0.002244,Topic_3
5450,"[irene, general, store, sell, clothes, school,...",0.000779,0.000493,0.000836,0.000311,0.997581,Topic_5
5451,"[kevin, year, old, orphan, lost, parent, hiv, ...",0.491629,0.348973,0.154791,0.000832,0.003775,Topic_1
5452,"[business, description, last, twelve, year, se...",0.000870,0.000550,0.000933,0.043122,0.954524,Topic_5


In [16]:
topic_df['Dominant_Topic'].value_counts()

Dominant_Topic
Topic_5    2618
Topic_1    1233
Topic_3     881
Topic_2     596
Topic_4     126
Name: count, dtype: int64