# Notebook for unsupervised learning and topic modeling with NLTK and Gensim

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import sklearn as sk
import nltk
import gensim
import spacy
import ast

In [2]:
docs = pd.read_csv('/kaggle/input/docs-and-phrases/phraseadded.csv').iloc[:, 2:]
docs["concepts_found"] = docs["concepts_found"].apply(ast.literal_eval)
docs["keyphrases"] = docs["keyphrases"].apply(ast.literal_eval)

In [3]:
docs.head()

Unnamed: 0,id,title,summary,keyphrases,concepts_found
0,cs-9308101v1,Dynamic Backtracking,Because of their occasional need to return to ...,"[(backtracking, 0.5558), (backtrack, 0.4264), ...","[backtracking, backtrack, search, tree, approa..."
1,cs-9308102v1,A Market-Oriented Programming Environment and ...,Market price systems constitute a well-underst...,"[(allocations, 0.4223), (allocation, 0.4192), ...","[allocations, allocation, distributed, market,..."
2,cs-9309101v1,An Empirical Analysis of Search in GSAT,We describe an extensive study of search in GS...,"[(satisfiability, 0.5788), (algorithms, 0.4699...","[satisfiability, algorithms, 3sat, proposition..."
3,cs-9311101v1,The Difficulties of Learning Logic Programs wi...,As real logic programmers normally use cut (!)...,"[(predicate, 0.4523), (logic, 0.3937), (declar...","[predicate, logic, declarative, extensional, c..."
4,cs-9311102v1,Software Agents: Completing Patterns and Const...,To support the goal of allowing users to recor...,"[(software, 0.438), (interactive, 0.437), (qui...","[software, interactive, quicktime, recording, ..."


## Part 1: LDA tuning and topic modeling with Gensim (next part using BERT and comparing results)

In [4]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models import LdaModel

In [5]:
texts = docs['concepts_found']
dict = Dictionary(texts)

In [6]:
corpus = [dict.doc2bow(text) for text in docs['concepts_found']]

In [7]:
coherence_scores = []
topic_range = range(2, 16)
lda_models = []

for k in topic_range:
    lda = LdaModel(corpus=corpus, id2word=dict, num_topics=k, random_state=0, passes=10)
    lda_models.append(lda)

    coherence_model = CoherenceModel(model=lda, texts=texts, dictionary=dict, coherence='c_v')
    score = coherence_model.get_coherence()
    coherence_scores.append(score)
    
    print(f"Topics: {k}  →  Coherence: {score:.4f}")

Topics: 2  →  Coherence: 0.2460
Topics: 3  →  Coherence: 0.3856
Topics: 4  →  Coherence: 0.2990
Topics: 5  →  Coherence: 0.3117
Topics: 6  →  Coherence: 0.3559
Topics: 7  →  Coherence: 0.3321
Topics: 8  →  Coherence: 0.3621
Topics: 9  →  Coherence: 0.3800
Topics: 10  →  Coherence: 0.4019
Topics: 11  →  Coherence: 0.3881
Topics: 12  →  Coherence: 0.3936
Topics: 13  →  Coherence: 0.3976
Topics: 14  →  Coherence: 0.3971
Topics: 15  →  Coherence: 0.4331


In [8]:
# Plot coherence scores interactively
score_df = pd.DataFrame({"Num Topics": list(topic_range), "Coherence": coherence_scores})

fig = px.line(score_df, x="Num Topics", y="Coherence", title="📈 Coherence Score vs. Number of Topics",
              markers=True)
fig.update_layout(xaxis_title="Number of Topics", yaxis_title="Coherence Score (c_v)")
fig.show()

Use LDA k = 10 to start with

In [9]:
final_mod = lda_models[9]

In [10]:
# Assign the most probable topic to each document
def get_dominant_topic(model, corpus):
    dominant_topics = []
    for doc in corpus:
        topic_probs = model.get_document_topics(doc)
        if topic_probs:
            # Select topic with highest probability
            dominant_topic = max(topic_probs, key=lambda x: x[1])[0]
        else:
            dominant_topic = -1  # If no topic is assigned
        dominant_topics.append(dominant_topic)
    return dominant_topics

docs["gensim_lda_topic"] = get_dominant_topic(final_mod, corpus)


In [11]:
docs["gensim_lda_topic"]

0        3
1        2
2        1
3        4
4        5
        ..
85934    2
85935    8
85936    8
85937    6
85938    8
Name: gensim_lda_topic, Length: 85939, dtype: int64

In [12]:
docs_ints = docs['gensim_lda_topic'].to_list()
max(docs_ints) - min(docs_ints)

10

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

#Extracting top 20 words per topic
topics = [final_mod.show_topic(topicid=i, topn=20) for i in range(10)]

#Build a global vocabulary (sorted for consistent ordering)
vocab = sorted({ word for topic in topics for word, _ in topic })

#Map each vocab term to its column index
vocab2idx = { w: idx for idx, w in enumerate(vocab) }

#Create a topic-by-vocab weight matrix
topic_matrix = np.zeros((len(topics), len(vocab)))
for ti, topic in enumerate(topics):
    for word, weight in topic:
        topic_matrix[ti, vocab2idx[word]] = weight

#Compute pairwise cosine similarities
sim_matrix = cosine_similarity(topic_matrix)

#Print with labels
labels = [f"Topic {i+1}" for i in range(len(topics))]
sim_df = pd.DataFrame(sim_matrix, index=labels, columns=labels)
print(sim_df.round(3))


          Topic 1  Topic 2  Topic 3  Topic 4  Topic 5  Topic 6  Topic 7  \
Topic 1     1.000      0.0    0.215    0.000    0.007    0.099    0.000   
Topic 2     0.000      1.0    0.000    0.000    0.000    0.000    0.000   
Topic 3     0.215      0.0    1.000    0.000    0.000    0.124    0.000   
Topic 4     0.000      0.0    0.000    1.000    0.000    0.042    0.030   
Topic 5     0.007      0.0    0.000    0.000    1.000    0.000    0.000   
Topic 6     0.099      0.0    0.124    0.042    0.000    1.000    0.051   
Topic 7     0.000      0.0    0.000    0.030    0.000    0.051    1.000   
Topic 8     0.000      0.0    0.000    0.000    0.000    0.000    0.000   
Topic 9     0.032      0.0    0.041    0.000    0.026    0.144    0.000   
Topic 10    0.000      0.0    0.000    0.000    0.000    0.000    0.000   

          Topic 8  Topic 9  Topic 10  
Topic 1       0.0    0.032       0.0  
Topic 2       0.0    0.000       0.0  
Topic 3       0.0    0.041       0.0  
Topic 4       0.0 

In [14]:
for i in range(1, 11):
    print(f"\nTopic {i}:")
    print(final_mod.print_topic(i, topn=20))


Topic 1:
0.069*"optimization" + 0.062*"adversarial" + 0.045*"optimal" + 0.041*"algorithms" + 0.033*"kernel" + 0.030*"encoder" + 0.026*"estimators" + 0.025*"regularized" + 0.018*"algorithm" + 0.018*"detection" + 0.016*"manifold" + 0.015*"autoencoders" + 0.015*"estimator" + 0.014*"outliers" + 0.014*"boosting" + 0.013*"hyperparameters" + 0.012*"minimax" + 0.012*"adaptation" + 0.010*"optimizing" + 0.010*"images"

Topic 2:
0.102*"learning" + 0.046*"language" + 0.045*"generative" + 0.028*"deep" + 0.026*"neural" + 0.024*"languages" + 0.022*"translation" + 0.022*"text" + 0.021*"multilingual" + 0.021*"gradient" + 0.017*"nonparametric" + 0.015*"annotation" + 0.015*"ensemble" + 0.013*"annotated" + 0.013*"annotations" + 0.012*"adaptive" + 0.012*"lingual" + 0.012*"autoencoder" + 0.012*"texts" + 0.012*"memory"

Topic 3:
0.100*"models" + 0.077*"bayesian" + 0.047*"stochastic" + 0.043*"variational" + 0.038*"sampling" + 0.030*"inference" + 0.027*"likelihood" + 0.026*"probabilistic" + 0.024*"causal" + 0

In [17]:
rows = []
for topic_id in range(1, 11):
    # get_topic returns a list of (term, weight) tuples
    for rank, (term, weight) in enumerate(final_mod.show_topic(topic_id, topn=20), start=1):
        rows.append({
            "topic":    topic_id,
            "rank":     rank,
            "term":     term,
            "weight":   weight
        })
    
df_topics = pd.DataFrame(rows)
df_topics.to_csv("topics_1-10_terms.csv", index=False)
print("Saved topics to topics_1-10_terms.csv")

Saved topics to topics_1-10_terms.csv
