In [None]:
# openpyxl is required by pandas to write the .xlsx topic-frequency report;
# %pip installs into the environment of the running kernel.
%pip install pandas matplotlib wordcloud nltk gensim pyLDAvis openpyxl

In [None]:
# 0 Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from google.colab import files
from gensim.corpora import Dictionary

# Download the NLTK stop-word corpus and keep the English entries as a set
# so that membership checks during cleaning are fast.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}




In [None]:
#1 Upload file with reviews
# The CSV must provide the review text in a column named 'review'.
file_name = input("Enter filename with reviews. it should one column review").strip()
# file_name = 'Reviews_Mobile_01.csv'

df = pd.read_csv(file_name)
if 'review' not in df.columns:
    raise KeyError(f"Column 'review' not found in {file_name!r}; columns present: {list(df.columns)}")

# Drop rows without review text and renumber the remaining rows from 0, so that
# any per-review data built positionally later on lines up with the frame's index.
df = df.dropna(subset=['review']).reset_index(drop=True)
print("Number of Rows:")
print(df.shape[0])
print("SAMPLE ROWS:")
df.head()

In [None]:
# 2. Convert to lowercase
# Cast to str first: on a mixed-type column .str.lower() yields NaN for
# non-string values, and a NaN would break the whitespace split done when
# stop words are removed.
df['cleaned_text'] = df['review'].astype(str).str.lower()
df.head()


In [None]:
# 3. Remove punctuation
# Raw string so that \w and \s reach the regex engine untouched instead of being
# parsed as (invalid) string escape sequences.
df['cleaned_text'] = df['cleaned_text'].str.replace(r'[^\w\s]', '', regex=True)
df.head()

In [None]:
# 4. Remove stop words
# Split each review on whitespace, keep the tokens that are not stop words,
# and join the survivors back together with single spaces.
df['cleaned_text'] = df['cleaned_text'].apply(
    lambda review: ' '.join(token for token in review.split() if token not in stop_words)
)
df.head()

In [None]:
# 5. Generate Word Cloud
# Concatenate every cleaned review into one string and build the cloud from it.
text = ' '.join(df['cleaned_text'])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

# Render the cloud on its own axes with the axis decorations hidden.
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
plt.show()

# Save word cloud as an image
wordcloud.to_file("wordcloud.png")
# files.download("wordcloud.png")

In [None]:
## Think: is the number of samples sufficient for topic modelling? For a dataset this small, could we make sense of the reviews by manual inspection alone?

In [None]:
# Function to determine the optimal number of topics

def determine_optimal_topics(corpus, dictionary, texts, max_topics=10):
    """Pick the topic count in ``2..max_topics`` whose LDA model has the best c_v coherence.

    One LDA model is trained per candidate topic count (fixed ``random_state=100``)
    and scored with a ``CoherenceModel`` using the ``'c_v'`` measure.

    Args:
        corpus: Bag-of-words corpus passed to ``LdaModel``.
        dictionary: Token/id mapping passed to ``LdaModel`` and ``CoherenceModel``.
        texts: Tokenized documents passed to ``CoherenceModel``.
        max_topics: Largest topic count to try (inclusive); must be at least 2.

    Returns:
        The candidate topic count with the highest coherence score. Ties go to the
        smallest count. If every score is NaN, the smallest candidate (2) is returned.

    Raises:
        ValueError: If ``max_topics`` is smaller than 2, i.e. there is nothing to compare.
    """
    if max_topics < 2:
        raise ValueError(f"max_topics must be at least 2, got {max_topics}")

    candidate_counts = range(2, max_topics + 1)
    coherence_scores = []
    for num_topics in candidate_counts:
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=100)
        coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_scores.append(coherence_model.get_coherence())

    # NaN never compares as smaller or larger, so a NaN in first position would
    # "win" a plain max(); rank NaN scores below every real score instead.
    ranked_scores = [float('-inf') if pd.isna(score) else score for score in coherence_scores]
    best_position = ranked_scores.index(max(ranked_scores))
    return candidate_counts[best_position]

# Tokenize each cleaned review on whitespace, then build the gensim vocabulary
# and the bag-of-words representation of every review.
tokenized_text = df['cleaned_text'].apply(str.split)
dictionary = Dictionary(tokenized_text)
corpus = list(map(dictionary.doc2bow, tokenized_text))

# Ask user for the number of topics
# Surrounding whitespace is stripped so that e.g. " auto " is still recognised.
user_choice = input("Enter 'auto' to let the code determine the number of topics, or specify a number (2-10): ").strip()

if user_choice.lower() == 'auto':
    num_topics = determine_optimal_topics(corpus, dictionary, tokenized_text)
    print(f"Optimal number of topics determined: {num_topics}")
else:
    try:
        num_topics = int(user_choice)
    except ValueError:
        raise ValueError(f"Expected 'auto' or a whole number of topics, got {user_choice!r}") from None
    # The prompt documents 2 as the minimum; fail here with a clear message
    # rather than deep inside model training or visualization.
    if num_topics < 2:
        raise ValueError(f"Number of topics must be at least 2, got {num_topics}")

# Topic Modeling and Visualization
def topic_modeling(corpus, dictionary, num_topics, df, assignment_threshold=0.6):
    """Fit an LDA model, save a pyLDAvis report, and label every review with its topic.

    Args:
        corpus: Bag-of-words documents, one per row of ``df`` and in the same order.
        dictionary: Token/id mapping passed to ``LdaModel`` and pyLDAvis.
        num_topics: Number of LDA topics to fit.
        df: Reviews frame. It is not modified; an enriched copy is returned.
        assignment_threshold: Minimum probability the most likely topic must reach
            for a review to be assigned to it; otherwise the review gets ``-1``.

    Returns:
        Tuple ``(lda_model, topic_assignments, topic_probabilities_list, vis, df)``:
        the fitted model, the per-review topic id (or ``-1``), the per-review list of
        ``num_topics`` probabilities, the prepared pyLDAvis data, and a copy of ``df``
        with ``topic_assignment``, ``prob_topic_<i>`` and ``max_topic`` columns added.

    Side effects:
        Writes ``lda_topic_visualization.html`` and prints a confirmation message.
    """
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=100)

    # Generate pyLDAvis visualization and save as HTML
    vis = gensimvis.prepare(lda_model, corpus, dictionary)
    pyLDAvis.save_html(vis, 'lda_topic_visualization.html')
    print("LDA visualization saved as 'lda_topic_visualization.html'")

    # Topic assignments and probabilities
    topic_assignments = []
    topic_probabilities_list = []

    for doc_bow in corpus:
        # Each entry is a (topic_id, probability) pair.
        topic_probabilities = lda_model.get_document_topics(doc_bow, minimum_probability=0)

        # Place every probability at its topic id so the row always has num_topics
        # entries, even if the model leaves out topics with negligible probability.
        probabilities = [0.0] * num_topics
        for topic_id, prob in topic_probabilities:
            probabilities[topic_id] = prob
        topic_probabilities_list.append(probabilities)

        assigned_topic = -1  # -1 stands for the "None" topic
        if topic_probabilities:
            max_topic, max_prob = max(topic_probabilities, key=lambda pair: pair[1])
            if max_prob >= assignment_threshold:
                assigned_topic = max_topic
        topic_assignments.append(assigned_topic)

    # Reuse df's index so the probabilities line up row-for-row with the reviews,
    # whatever index labels df carries (e.g. gaps left by dropped rows).
    probabilities_df = pd.DataFrame(
        topic_probabilities_list,
        columns=[f'prob_topic_{i}' for i in range(num_topics)],
        index=df.index,
    )

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df['topic_assignment'] = topic_assignments
    df = pd.concat([df, probabilities_df], axis=1)
    # Most likely topic per review, regardless of the threshold, as the topic id text.
    df['max_topic'] = probabilities_df.idxmax(axis=1).apply(lambda col: col.replace('prob_topic_', ''))

    return lda_model, topic_assignments, topic_probabilities_list, vis, df

# Fit the model on the prepared corpus; df is rebound to the frame returned by
# topic_modeling, which carries the topic columns.
lda_model, topic_assignments, topic_probabilities_list, vis, df = topic_modeling(
    corpus=corpus,
    dictionary=dictionary,
    num_topics=num_topics,
    df=df,
)

# Persist the per-review results
df.to_csv('reviews_with_topics.csv', index=False)

# Save topic-word mapping: one column per topic id, one row per word position
# in the list returned by show_topic.
topic_words = {}
for topic_id in range(num_topics):
    topic_words[topic_id] = [word for word, _weight in lda_model.show_topic(topic_id)]
topic_words_df = pd.DataFrame.from_dict(topic_words, orient='index').T
topic_words_df.to_csv('topic_words_mapping.csv', index=False)

# Count how many reviews landed on each topic_assignment value (the "None" topic
# is stored as -1), ordered by topic value, and export the table to Excel.
topic_frequency = df['topic_assignment'].value_counts().sort_index()
topic_frequency_table = topic_frequency.rename_axis('Topic').reset_index(name='Review_Count')
topic_frequency_table.to_excel('topic_frequency_distribution.xlsx', index=False)
print("Topic frequency distribution saved to 'topic_frequency_distribution.xlsx'.")

print("Processing complete. The topics have been assigned to reviews with high-probability thresholds.")
