# Notebook for unsupervised learning and topic modeling with NLTK and Gensim

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import sklearn as sk
import nltk
import gensim
import spacy
import ast

In [None]:
# Load the phrase-annotated documents and parse the list-valued columns.
# The first two CSV columns are dropped (presumably leftover index columns
# from earlier saves — TODO confirm). `concepts_found` and `keyphrases` are
# read as strings and parsed back into Python objects with ast.literal_eval.
docs = (
    pd.read_csv('phraseadded.csv')
    .iloc[:, 2:]
    .assign(
        concepts_found=lambda frame: frame["concepts_found"].apply(ast.literal_eval),
        keyphrases=lambda frame: frame["keyphrases"].apply(ast.literal_eval),
    )
)

In [46]:
# Preview the first five rows to sanity-check the loaded columns.
docs.head(n=5)

Unnamed: 0,id,title,summary,keyphrases,concepts_found
0,cs-9308101v1,Dynamic Backtracking,Because of their occasional need to return to ...,"[(backtracking, 0.5558), (backtrack, 0.4264), ...","[backtracking, backtrack, search, tree, approa..."
1,cs-9308102v1,A Market-Oriented Programming Environment and ...,Market price systems constitute a well-underst...,"[(allocations, 0.4223), (allocation, 0.4192), ...","[allocations, allocation, distributed, market,..."
2,cs-9309101v1,An Empirical Analysis of Search in GSAT,We describe an extensive study of search in GS...,"[(satisfiability, 0.5788), (algorithms, 0.4699...","[satisfiability, algorithms, 3sat, proposition..."
3,cs-9311101v1,The Difficulties of Learning Logic Programs wi...,As real logic programmers normally use cut (!)...,"[(predicate, 0.4523), (logic, 0.3937), (declar...","[predicate, logic, declarative, extensional, c..."
4,cs-9311102v1,Software Agents: Completing Patterns and Const...,To support the goal of allowing users to recor...,"[(software, 0.438), (interactive, 0.437), (qui...","[software, interactive, quicktime, recording, ..."


## Part 1: LDA tuning and topic modeling with Gensim (the next part uses BERT and compares the results)

In [47]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models import LdaModel

In [48]:
# Tokenised documents: one entry of `concepts_found` per document.
texts = docs.loc[:, 'concepts_found']

# Build the token <-> integer-id vocabulary from the documents.
# NOTE(review): the name `dict` shadows the Python builtin for the rest of the
# kernel session. It is kept here because other cells reference it by this
# name; renaming it (e.g. to `dictionary`) must be done in every cell at once.
dict = Dictionary(documents=texts)

In [49]:
# Convert every document to its bag-of-words representation using the
# vocabulary held in `dict` (the gensim Dictionary, not the builtin).
corpus = list(map(dict.doc2bow, docs['concepts_found']))

In [51]:
# Sweep the number of topics: train one LDA model per candidate value and
# score it with c_v coherence. Models and scores are collected in parallel
# lists, in the same order as `topic_range`.
topic_range = range(2, 16)
coherence_scores, lda_models = [], []

for k in topic_range:
    # random_state is fixed at 0 and passes at 10 for every candidate k.
    lda = LdaModel(
        corpus=corpus,
        id2word=dict,
        num_topics=k,
        random_state=0,
        passes=10,
    )
    lda_models += [lda]

    coherence_model = CoherenceModel(
        model=lda,
        texts=texts,
        dictionary=dict,
        coherence='c_v',
    )
    score = coherence_model.get_coherence()
    coherence_scores += [score]

    print(f"Topics: {k}  →  Coherence: {score:.4f}")

Topics: 2  →  Coherence: 0.2460
Topics: 3  →  Coherence: 0.3856
Topics: 4  →  Coherence: 0.2990
Topics: 5  →  Coherence: 0.3117
Topics: 6  →  Coherence: 0.3559
Topics: 7  →  Coherence: 0.3321
Topics: 8  →  Coherence: 0.3621
Topics: 9  →  Coherence: 0.3800
Topics: 10  →  Coherence: 0.4019
Topics: 11  →  Coherence: 0.3881
Topics: 12  →  Coherence: 0.3936
Topics: 13  →  Coherence: 0.3976
Topics: 14  →  Coherence: 0.3971
Topics: 15  →  Coherence: 0.4331


In [None]:
# NOTE(review): nbformat is not referenced anywhere in this cell — presumably
# it is imported to confirm that plotly's notebook-rendering dependency is
# installed; TODO confirm, and consider moving it to the top import cell.
import nbformat

# Tabulate the sweep results: one row per candidate number of topics.
score_df = pd.DataFrame({
    "Num Topics": [*topic_range],
    "Coherence": coherence_scores,
})

# Interactive line chart of coherence against the number of topics.
# update_layout returns the same figure object, so the call can be chained.
fig = px.line(
    score_df,
    x="Num Topics",
    y="Coherence",
    title="📈 Coherence Score vs. Number of Topics",
    markers=True,
).update_layout(
    xaxis_title="Number of Topics",
    yaxis_title="Coherence Score (c_v)",
)
fig.show()

NameError: name 'pd' is not defined