In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
import gensim
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary
import numpy as np

em=pd.read_csv('englishmodel_cleaned.csv')
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(max_df=0.85,min_df=5,stop_words='english')

dtm=tfidf.fit_transform(em['StopWord Removal'])
dtm

<120927x8956 sparse matrix of type '<class 'numpy.float64'>'
	with 1075897 stored elements in Compressed Sparse Row format>

In [30]:
from sklearn.decomposition import NMF
nmf_model=NMF(n_components=20,random_state=1000)
nmf_model.fit(dtm)

def display_topics(model, feature_names, num_top_words):
    for index, topic in enumerate(model.components_):
        print(f"({index}, (", end='')
        top_features = [(feature_names[i], topic[i]) for i in topic.argsort()[-num_top_words:]]
        print(' + '.join([f'{weight:.3f}*" {word} "' for word, weight in reversed(top_features)]), end='')
        print("))")

# Display the topics
num_top_words = 10
feature_names = tfidf.get_feature_names_out()
display_topics(nmf_model, feature_names, num_top_words)

(0, (10.239*" good " + 0.205*" delivery " + 0.162*" really " + 0.156*" looking " + 0.143*" service " + 0.137*" packing " + 0.132*" products " + 0.114*" overall " + 0.113*" use " + 0.109*" performance "))
(1, (6.657*" nice " + 0.179*" products " + 0.103*" looking " + 0.076*" watch " + 0.059*" colour " + 0.052*" packing " + 0.049*" really " + 0.047*" comfortable " + 0.038*" look " + 0.038*" design "))
(2, (7.877*" product " + 0.253*" amazing " + 0.237*" satisfied " + 0.190*" useful " + 0.126*" great " + 0.113*" delivery " + 0.113*" worst " + 0.099*" use " + 0.098*" love " + 0.077*" loved "))
(3, (7.880*" quality " + 0.901*" poor " + 0.498*" sound " + 0.404*" low " + 0.300*" superb " + 0.245*" picture " + 0.166*" build " + 0.148*" expected " + 0.130*" cloth " + 0.122*" cheap "))
(4, (4.836*" money " + 3.938*" value " + 0.773*" waste " + 0.086*" dont " + 0.068*" superb " + 0.059*" performance " + 0.058*" tv " + 0.055*" great " + 0.051*" sound " + 0.050*" west "))
(5, (3.154*" love " + 2.03

In [3]:
topic_results=nmf_model.transform(dtm)
topic_results.argmax(axis=1)

array([2, 1, 3, ..., 3, 4, 3], dtype=int64)

In [4]:
em['Topic']=topic_results.argmax(axis=1)

In [5]:
mytopic_dict={
    0:'Size Availability',
    1:'Durability',
    2:'Discount and Promotion',
    3:'Comfortability',
    4:'Price'}

em['Topic Label']=em['Topic'].map(mytopic_dict)

In [6]:
# Extract top ten keywords for each topic
def get_top_keywords(model, feature_names, n_top_words):
    top_keywords = {}
    for idx, topic in enumerate(model.components_):
        top_features = [feature_names[i] for i in topic.argsort()[-n_top_words:]]
        top_keywords[idx] = ', '.join(reversed(top_features))
    return top_keywords


In [7]:
#DATA PREPROCESSING
# Apply the function to extract top ten keywords
top_keywords = get_top_keywords(nmf_model, tfidf.get_feature_names_out(), 10)

# Add top ten keywords to the DataFrame
em['Top Ten Keywords'] = em['Topic'].apply(lambda x: top_keywords[x])
# Count the frequency of each aspect
aspect_counts = em['Topic Label'].value_counts()

# Display the aspect counts
print("Aspect Counts:")
print(aspect_counts)

Aspect Counts:
Topic Label
Comfortability            48908
Discount and Promotion    28892
Size Availability         16950
Durability                15780
Price                     10397
Name: count, dtype: int64


In [8]:
pd.set_option('display.max_colwidth', None)
# Display the modified DataFrame
df = pd.DataFrame(em[['Topic', 'Topic Label', 'Top Ten Keywords']])
df.head()

Unnamed: 0,Topic,Topic Label,Top Ten Keywords
0,2,Discount and Promotion,"product, flipkart, awesome, happy, buy, price, excellent, thank, thanks, best"
1,1,Durability,"nice, products, flipkart, thank, looking, like, happy, really, thanks, small"
2,3,Comfortability,"quality, best, bad, poor, price, super, size, sound, low, small"
3,2,Discount and Promotion,"product, flipkart, awesome, happy, buy, price, excellent, thank, thanks, best"
4,2,Discount and Promotion,"product, flipkart, awesome, happy, buy, price, excellent, thank, thanks, best"


In [9]:
df.iloc[120910:120926]

Unnamed: 0,Topic,Topic Label,Top Ten Keywords
120910,3,Comfortability,"quality, best, bad, poor, price, super, size, sound, low, small"
120911,1,Durability,"nice, products, flipkart, thank, looking, like, happy, really, thanks, small"
120912,3,Comfortability,"quality, best, bad, poor, price, super, size, sound, low, small"
120913,3,Comfortability,"quality, best, bad, poor, price, super, size, sound, low, small"
120914,3,Comfortability,"quality, best, bad, poor, price, super, size, sound, low, small"
120915,3,Comfortability,"quality, best, bad, poor, price, super, size, sound, low, small"
120916,3,Comfortability,"quality, best, bad, poor, price, super, size, sound, low, small"
120917,3,Comfortability,"quality, best, bad, poor, price, super, size, sound, low, small"
120918,3,Comfortability,"quality, best, bad, poor, price, super, size, sound, low, small"
120919,3,Comfortability,"quality, best, bad, poor, price, super, size, sound, low, small"


In [31]:
# Coherence Score Calculation
# Prepare data for gensim
texts = [text.split() for text in em['normalized_text']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Create NMF topics
topics = [[feature_names[i] for i in topic.argsort()[-15:]] for topic in nmf_model.components_]
coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

# Cohesion Score Calculation with Sampling
def calculate_cohesion(dtm, topic_results, sample_size=1000):
    n_topics = topic_results.shape[1]
    cohesion_scores = []
    for topic in range(n_topics):
        topic_docs_indices = np.where(topic_results.argmax(axis=1) == topic)[0]
        if len(topic_docs_indices) > 1:
            sampled_indices = np.random.choice(topic_docs_indices, min(sample_size, len(topic_docs_indices)), replace=False)
            topic_docs = dtm[sampled_indices].toarray()
            similarity_matrix = cosine_similarity(topic_docs)
            np.fill_diagonal(similarity_matrix, 0)
            cohesion = np.mean(similarity_matrix)
            cohesion_scores.append(cohesion)
        else:
            cohesion_scores.append(0)  # Assign zero if only one document or none in the topic
    return np.mean(cohesion_scores)

cohesion_score = calculate_cohesion(dtm, topic_results)
print(f"Cohesion Score: {cohesion_score}")

Coherence Score: 0.4640647414665059
Cohesion Score: 0.07814361560902597
