# Milestone 3

# Data Cleaning

In [2]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('reviews.csv')

In [3]:
# Fill in the missing data
df['reviewCreatedVersion'] = df['reviewCreatedVersion'].fillna('Unknown')

# Drop the unnecessary columns
df = df.drop(['userImage', 'sortOrder'], axis=1)

# Part 1. Topic Modeling (15 pts)
1. (15 pts) Based on the topic modeling task discussed with your instructor, please write codes to
perform the task and try to optimize your results. In your report:

    - Paste the key line of your code and the screenshot of relevant results in your report.
    - Discuss the optimization (if any) you made to improve the model performance.
    - Write one paragraph to discuss the main findings or takeaways from your result.

In [50]:
# Import necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import TfidfModel, LsiModel
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.callbacks import PerplexityMetric


# Define stop words and lemmatizer
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# Define text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Remove stop words and words with length < 3
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join tokens back into text
    text = ' '.join(tokens)
    return text

# Apply text preprocessing to review content
df['content_clean'] = df['content'].apply(preprocess_text)

In [51]:
# Topic modeling using LSI from gensim
from gensim import corpora, models, similarities

# Tokenize documents
texts = [doc.split() for doc in df['content_clean']]

# Create dictionary and corpus
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [52]:
# Build LSI model
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi_model = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)

In [54]:
# Get the coherence & perplexity scores for the SVD model
from gensim.models import CoherenceModel
from gensim.models import LdaModel
from gensim.corpora import Dictionary

# Calculate coherence score
coherence_model = CoherenceModel(model=lsi_model, texts=texts, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print("Coherence score:", coherence_score)

Coherence score: 0.5228713037363295


In [66]:
# Optimize the number of topics using coherence score as the metric

# Tokenize documents
documents = [doc.split() for doc in df['content_clean']]

# Create a dictionary from the preprocessed documents
id2word = Dictionary(documents)

# Create a corpus from the preprocessed documents
corpus = [id2word.doc2bow(doc) for doc in documents]

# Build multiple LSI models with varying numbers of topics
min_topics = 2
max_topics = 20
step_size = 1
topics_range = range(min_topics, max_topics + step_size, step_size)

lsi_models = []
for num_topics in topics_range:
    lsi_model = LsiModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
    lsi_models.append(lsi_model)

# Calculate the coherence score for each LSI model
coherence_scores = []
for model in lsi_models:
    coherence_model = CoherenceModel(model=model, texts=documents, dictionary=id2word, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    coherence_scores.append(coherence_score)

# Select the model with the highest coherence score
optimal_model = lsi_models[coherence_scores.index(max(coherence_scores))]
num_topics = topics_range[coherence_scores.index(max(coherence_scores))]

print('Optimal number of topics:', num_topics)
print('Coherence score:', max(coherence_scores))

Optimal number of topics: 5
Coherence score: 0.5726300919828144


In [72]:
# Rebuild the LSI model with the optimal number of topics
lsi_model = LsiModel(corpus=corpus, id2word=id2word, num_topics=num_topics)

# Visualize the topics
from pprint import pprint
pprint(lsi_model.show_topics(num_words=5))

[(0, '0.701*"app" + 0.300*"n\'t" + 0.232*"task" + 0.154*"time" + 0.138*"like"'),
 (1, '0.612*"task" + -0.593*"app" + 0.311*"n\'t" + 0.158*"day" + 0.145*"list"'),
 (2,
  '-0.725*"n\'t" + 0.569*"task" + 0.211*"app" + -0.132*"calendar" + '
  '-0.108*"work"'),
 (3,
  '-0.444*"n\'t" + 0.379*"calendar" + 0.364*"time" + -0.321*"task" + '
  '0.241*"like"'),
 (4,
  '-0.713*"calendar" + 0.367*"time" + -0.277*"google" + 0.188*"habit" + '
  '-0.177*"task"')]
