In [23]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import re
import nltk
from tqdm import tqdm
tqdm.pandas()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
nltk.download('stopwords')
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
reviews = pd.read_pickle('Pickle/reviews.pkl')

In [25]:
stop_words = set(stopwords.words('english'))

# Preprocess reviews
def preprocess(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\d+', '', text)   # Remove digits
    text = text.lower()               # Convert to lowercase
    words = text.split()
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return ' '.join(words)

In [26]:
reviews['cleaned_reviews'] = reviews['review_text'].progress_apply(preprocess)

100%|██████████| 1000000/1000000 [00:52<00:00, 19079.16it/s]


In [27]:
# Define a custom function to fit_transform with progress bar
def fit_transform_with_progress(vectorizer, data):
    tqdm.pandas(desc="Vectorizing Reviews")
    return vectorizer.fit_transform(data.progress_apply(lambda x: x))

# Vectorize the reviews
vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
review_vectors = fit_transform_with_progress(vectorizer, reviews['cleaned_reviews'])


Vectorizing Reviews: 100%|██████████| 1000000/1000000 [00:00<00:00, 1311198.18it/s]


In [None]:
# Tokenize reviews
tokenized_reviews = reviews['cleaned_reviews'].progress_apply(lambda x: x.split())

Vectorizing Reviews:  24%|██▍       | 239236/1000000 [00:01<00:04, 177823.56it/s]

In [None]:
dictionary = Dictionary(tqdm(tokenized_reviews, desc="Processing Tokens"))

In [None]:
corpus = [dictionary.doc2bow(text) for text in tqdm(tokenized_reviews, desc="Processing Corpus")]

In [None]:
# Function to fit LDA and compute coherence score with 50 topics
def compute_coherence_for_50_topics(corpus, dictionary):
    k = 50  # Fixed number of topics
    lda_model = LatentDirichletAllocation(n_components=k, random_state=42)
    
    # Fitting the LDA model
    lda_model.fit(review_vectors)
    
    # Computing coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_reviews, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    return lda_model, coherence_score

In [None]:
# Compute LDA and coherence score with 50 topics
lda_model, coherence_score = compute_coherence_for_50_topics(corpus=corpus, dictionary=dictionary)

print(f"Coherence Score for 50 Topics: {coherence_score}")

In [None]:
# Get the topic distribution for each review
topic_distributions = lda_model.transform(review_vectors)
reviews['topic_distribution'] = list(topic_distributions)
print("Topic Distributions:")
print(topic_distributions)
