In [83]:
import pandas as pd
from gensim.models import TfidfModel, LdaModel, CoherenceModel, Nmf
from pprint import pprint

In [84]:
df = pd.read_csv('feed.csv')

In [85]:
projects = []

for i in range(len(df)):
    project = str(df['project_description'][i]) + str(df['fields_of_science'][i]) + str(df['keywords'][i])
    projects.append(project)

In [19]:
import helpers

projects_clean = helpers.preprocess_documents(projects)
projects_tfidf, projects_bow, dictionary = helpers.get_tfidf_representation(projects_clean)

In [92]:
number_of_topics = 12

In [93]:
lda_model = LdaModel(corpus=projects_bow,
                     id2word=dictionary,
                     num_topics=number_of_topics,
                     passes=10,
                     iterations=100)


In [94]:
# Coherence Score for LDA Model with n topics
coherence_model_lda = CoherenceModel(model=lda_model, texts=projects_clean, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score of LDA model with {number_of_topics} topics: {coherence_lda:.2f}')


Coherence Score of LDA model with 12 topics: 0.34


In [None]:
topics = lda_model.show_topics(num_topics=10, num_words=10)
for topic in topics:
    print(topic)

In [45]:
lda_model.get_document_topics(projects_bow[61])

[(2, 0.9879956)]

In [46]:
pprint(projects[2])

('The purpose of SMAP-SOC is to provide BLM natural resource managers with an '
 'application to model the current and future distributions of freshwater '
 'species of concern (SOC), across large management regions. These models will '
 'be based on occurrences from both crowdsourced data and detections using '
 'environmental DNA (e-DNA). These occurrences will then be related to Earth '
 'observations and other spatial data so distributions can be predicted across '
 'landscapes.Animals, Ecology and environment,  Nature and outdoorsfreshwater, '
 'fish, Alaska, rivers, lakes, threatened and endangered')


In [80]:
# apply Non-negative matrix Factorizaion with TF-IDF and 15 topics
nmf_model = Nmf(corpus=projects_tfidf,
                id2word=dictionary,
                num_topics = number_of_topics,
                random_state = 42)
    

In [81]:
topics = nmf_model.show_topics(num_topics=number_of_topics, num_words=10)
for topic in topics:
    print(topic)

(0, '0.008*"wildlife" + 0.006*"weather" + 0.006*"climate" + 0.006*"survey" + 0.005*"bird" + 0.005*"data" + 0.005*"geology" + 0.005*"earth" + 0.004*"computer" + 0.004*"exploration"')
(1, '0.011*"air" + 0.009*"community" + 0.007*"environmental" + 0.007*"quality" + 0.006*"monitoring" + 0.006*"sensor" + 0.005*"water" + 0.005*"epa" + 0.005*"whale" + 0.005*"data"')
(2, '0.010*"park" + 0.008*"bioblitz" + 0.007*"national" + 0.007*"taxonomic" + 0.007*"explore" + 0.007*"organism" + 0.007*"group" + 0.006*"site" + 0.006*"water" + 0.005*"education"')
(3, '0.008*"capital" + 0.006*"region" + 0.006*"history" + 0.006*"historic" + 0.005*"park" + 0.005*"school" + 0.005*"student" + 0.005*"explore" + 0.005*"planet" + 0.005*"national"')
(4, '0.010*"forest" + 0.007*"bird" + 0.006*"national" + 0.006*"bioblitz" + 0.006*"pollinator" + 0.006*"monitoring" + 0.006*"specie" + 0.005*"park" + 0.005*"scientist" + 0.005*"insect"')


In [82]:
# Coherence Score for NMF Model
coherence_model_nmf = CoherenceModel(model=nmf_model, texts=projects_clean, dictionary=dictionary, coherence='c_v')
coherence_nmf = coherence_model_nmf.get_coherence()
print(f'Coherence Score of NMF model: {coherence_nmf:.2f}')


Coherence Score of NMF model: 0.33
