In [36]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stopwords

In [37]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\Rashid
[nltk_data]     Iqbal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Rashid
[nltk_data]     Iqbal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load the data

In [38]:
reviews = pd.read_csv('reviews.csv')

reviews.head()

Unnamed: 0,content,score
0,I cannot open the app anymore,1
1,I have been begging for a refund from this app...,1
2,Very costly for the premium version (approx In...,1
3,"Used to keep me organized, but all the 2020 UP...",1
4,Dan Birthday Oct 28,1


#### Filter negative reviews

In [39]:
negative_reviews = reviews[reviews['score'].isin([1, 2])]

#### Preprocess the reviews

In [40]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stopwords.words('english')]
    return ' '.join(tokens)

In [41]:
negative_reviews.loc[:, 'cleaned_content'] = negative_reviews['content'].apply(preprocess_text)
preprocessed_reviews = negative_reviews[['content', 'cleaned_content']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_reviews.loc[:, 'cleaned_content'] = negative_reviews['content'].apply(preprocess_text)


#### Vectorize the cleaned reviews using TF-IDF

In [42]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(negative_reviews['cleaned_content'])

#### Apply K-means clustering

In [43]:
kmeans = KMeans(n_clusters=5, random_state=0)
categories = kmeans.fit_predict(tfidf_matrix)

#### Add the cluster labels to the DataFrame

In [44]:
negative_reviews = negative_reviews.assign(category=categories)

#### Find the most frequent terms in each cluster

In [45]:
def get_top_terms(cluster_center, terms, top_n=1):
    sorted_indices = cluster_center.argsort()[-top_n:]
    return [(terms[i], cluster_center[i]) for i in sorted_indices]

In [46]:
terms = vectorizer.get_feature_names_out()
topic_terms_list = []

In [47]:
terms = vectorizer.get_feature_names_out()
topic_terms_list = []
for i in range(5):
    cluster_center = kmeans.cluster_centers_[i]
    top_terms = get_top_terms(cluster_center, terms, top_n=1)
    term, _ = top_terms[0]
    frequency = (tfidf_matrix[negative_reviews['category'] == i].toarray() > 0).sum(axis=0)[vectorizer.vocabulary_[term]]
    topic_terms_list.append({'label': i, 'term': term, 'frequency': frequency})

In [48]:
topic_terms = pd.DataFrame(topic_terms_list)

In [49]:
print(preprocessed_reviews)

                                                 content  \
0                          I cannot open the app anymore   
1      I have been begging for a refund from this app...   
2      Very costly for the premium version (approx In...   
3      Used to keep me organized, but all the 2020 UP...   
4                                    Dan Birthday Oct 28   
...                                                  ...   
11940  I loved it until I realized that the very feat...   
11941  Gave it a test run and tried out the notificat...   
11942  Looks great but since installing, my device on...   
11943  This app looked good until I had to purchase i...   
11944                                           It's OK!   

                                         cleaned_content  
0                                       open app anymore  
1               begging refund app month nobody replying  
2      costly premium version approx indian rupees pe...  
3      used keep organized updates made mes

In [50]:
print(topic_terms)

   label     term  frequency
0      0     good        170
1      1     work        239
2      2  version        308
3      3      app       1382
4      4    tasks        334
