<a href="https://colab.research.google.com/github/namitabagri/Topic-Modelling-Based-Classifier/blob/main/themes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
import numpy as np
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import gensim

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
url = "https://raw.githubusercontent.com/namitabagri/Topic-Modelling-Based-Classifier/main/synthetic_app_issues.csv"
df = pd.read_csv(url)
df=df.drop('week_start_date',axis=1)

In [None]:
df.head()

Unnamed: 0,ticket,application,week_start_date,description
0,GMC3005398,excel,2002-05-15,issue in managing file permission in one drive...
1,GMC3491287,share point,2002-05-15,excel chart formatting resets automatically fo...
2,GMC3241859,excel,2002-05-15,issue in managing file permission in one drive...
3,GMC3289072,excel,2002-05-15,excel formula =SUM(A1:A5) not working properly...
4,GMC3573428,excel,2002-05-15,the file is not syncing on the one drive syste...


In [None]:
df['application'].unique()

array(['excel', 'share point', 'one drive', 'ppt'], dtype=object)

In [None]:
docs = df['description'].astype(str).tolist()

In [None]:
# Simple preprocessing
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words and len(t)>1]
    return " ".join(tokens)

docs_clean = [preprocess(d) for d in docs]


In [None]:
# Vectorize (BoW)
vectorizer = CountVectorizer(max_df=0.85, min_df=2, max_features=5000, ngram_range=(1,2))
dtm = vectorizer.fit_transform(docs_clean)  # document-term matrix (sparse)


In [None]:
# Fit LDA
n_topics = 4
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=50,
                                learning_method='batch',
                                doc_topic_prior=0.1,
                                random_state=42)
lda.fit(dtm)


In [None]:
# Document-topic probability matrix
doc_topic_matrix = lda.transform(dtm)
dominant_topic_per_doc = np.argmax(doc_topic_matrix, axis=1)

In [None]:
feature_names = vectorizer.get_feature_names_out()

def print_topics(model, n_top_words=10):
    for idx, topic in enumerate(model.components_):
        print(f"Topic #{idx}: ",
              [feature_names[i] for i in topic.argsort()[-n_top_words:]])

print_topics(lda)


Topic #0:  ['playing', 'ppt animations', 'shared contact', 'contact admin', 'user', 'mail', 'mail com', 'user mail', 'contact user', 'ppt']
Topic #1:  ['level', 'permission system', 'point due', 'upload', 'unable upload', 'system level', 'unable', 'user unable', 'upload file', 'user']
Topic #2:  ['contact admin', 'mail', 'user mail', 'mail com', 'admin', 'company com', 'company', 'admin company', 'excel', 'user']
Topic #3:  ['permission', 'one', 'one drive', 'drive', 'file', 'contact support', 'example', 'example com', 'support', 'support example']


In [None]:
topic_to_label = {
    0: "ppt",
    1: "share point" ,
    2: "excel",
    3: "one drive"
}

In [None]:
df['topic_number'] = dominant_topic_per_doc
df['topic_label'] = df['topic_number'].map(topic_to_label)

In [None]:
df

Unnamed: 0,ticket,application,week_start_date,description,topic_number,topic_label
0,GMC3005398,excel,2002-05-15,issue in managing file permission in one drive...,3,one drive
1,GMC3491287,share point,2002-05-15,excel chart formatting resets automatically fo...,2,excel
2,GMC3241859,excel,2002-05-15,issue in managing file permission in one drive...,3,one drive
3,GMC3289072,excel,2002-05-15,excel formula =SUM(A1:A5) not working properly...,2,excel
4,GMC3573428,excel,2002-05-15,the file is not syncing on the one drive syste...,0,ppt
...,...,...,...,...,...,...
195,GMC3609447,ppt,2002-05-15,ppt crashes when adding images from http://com...,0,ppt
196,GMC3713946,share point,2002-05-15,the excel sheet is locked for editing by admin...,2,excel
197,GMC3011455,one drive,2002-05-15,excel formula =SUM(A1:A5) not working properly...,2,excel
198,GMC3254224,one drive,2002-05-15,ppt crashes when adding images from http://com...,0,ppt


In [None]:
df['topic_label'].value_counts()

Unnamed: 0_level_0,count
topic_label,Unnamed: 1_level_1
excel,80
ppt,61
share point,37
one drive,22


In [None]:
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import numpy as np

# Prepare text data as tokenized documents again (same preprocessing as before)
texts = df['description'].apply(lambda x: x.split()).tolist()

# Create a Gensim dictionary & corpus
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Convert sklearn LDA topics to gensim format
topics = []
for topic_weights in lda.components_:
    top_word_ids = topic_weights.argsort()[:-11:-1]  # top 10 words
    topics.append([vectorizer.get_feature_names_out()[i] for i in top_word_ids])

# Calculate coherence (C_V)
coherence_model = CoherenceModel(topics=topics,
                                 texts=texts,
                                 dictionary=dictionary,
                                 coherence='c_v')
coherence_score = coherence_model.get_coherence()

print(f"Coherence Score: {coherence_score:.4f}")


Coherence Score: 0.7743
