In [2]:
import nltk as tk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd

tk.download('stopwords')
tk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/melchorbicalan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/melchorbicalan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# load cleaned data
trustpilot = pd.read_csv('../cleaned_trustpilot_reviews.csv')
bbb = pd.read_csv('../cleaned_bbb_reviews.csv')


In [4]:
# load the text data for nlp
trustpilot_corpus1 = trustpilot['Review Body'].values
trustpilot_corpus2 = trustpilot['Review Heading'].values
bbb_corpus = bbb['Review Body'].values


In [5]:

# consider cases where review body or review heading contains emojis
def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map
        "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed characters
        "]+", flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)
# consider cases where review heading contains ellipsis at the end. Ignore these instances
collect_valid_rhead = [remove_emojis(review) for review in trustpilot_corpus2 if review.endswith(('…','...')) == False]
# consider cases where review body is null
collect_valid_rbody = [remove_emojis(review) for review in trustpilot_corpus1 if type(review) == str]


In [6]:
# tokenize the texts
# each list is a multidimensional lists with varying lengths
tokenized_tp_rbody = [word_tokenize(review) for review in collect_valid_rbody]
tokenized_tp_rhead = [word_tokenize(review) for review in collect_valid_rhead]
tokenized_bbb_rbody = [word_tokenize(review) for review in bbb_corpus]

In [7]:
# implement stop words
stop_words = set(stopwords.words('english'))
len(stop_words)

198

In [8]:
#clean tokenized texts
def clean_text(text, stop_words):
    cleaned_text = []
    for word in text:
        if word.lower() not in stop_words:
            cleaned_text.append(word)
    return cleaned_text

cleaned_tp_rbody = [clean_text(review, stop_words) for review in tokenized_tp_rbody]
cleaned_tp_rhead = [clean_text(review, stop_words) for review in tokenized_tp_rhead]
cleaned_bbb_rbody = [clean_text(review, stop_words) for review in tokenized_bbb_rbody]


In [9]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tp_rbody = [[lemmatizer.lemmatize(word) for word in review] for review in cleaned_tp_rbody]
lemmatized_tp_rhead = [[lemmatizer.lemmatize(word) for word in review] for review in cleaned_tp_rhead]
lemmatized_bbb_rbody = [[lemmatizer.lemmatize(word) for word in review] for review in cleaned_bbb_rbody]


# VECTORIZATION

In [None]:
#join lemmatized words back again
lemmatized_tp_rbody_str = [' '.join(review) for review in lemmatized_tp_rbody]
lemmatized_tp_rhead_str = [' '.join(review) for review in lemmatized_tp_rhead]
lemmatized_bbb_rbody_str = [' '.join(review) for review in lemmatized_bbb_rbody]

In [31]:
bbb_reviews_text = lemmatized_bbb_rbody_str
trustpilot_reviews_text = lemmatized_tp_rbody_str + lemmatized_tp_rhead_str
len(bbb_reviews_text), len(trustpilot_reviews_text)

(128, 12734)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [42]:
vectorizer1 = TfidfVectorizer(
    max_df = 0.95,
    min_df = 2,
)
vectorizer2 = TfidfVectorizer(
    max_df = 0.95,
    min_df = 2,
)
X1 = vectorizer1.fit_transform(trustpilot_reviews_text)
X2 = vectorizer2.fit_transform(bbb_reviews_text)
X1.shape, X2.shape

((12734, 4632), (128, 400))

In [48]:
# NMF on Trustpilot
nmf1 = NMF(n_components=10, random_state=42)
W1 = nmf1.fit_transform(X1)
H1 = nmf1.components_

# NMF on BBB
nmf2 = NMF(n_components=10, random_state=42)
W2 = nmf2.fit_transform(X2)
H2 = nmf2.components_

In [50]:
feature_names1 = vectorizer1.get_feature_names_out()

for topic_idx, topic in enumerate(H1):
    top_words_idx = topic.argsort()[:-11:-1]
    top_words = [feature_names1[i] for i in top_words_idx]
    print(f"Trustpilot Topic #{topic_idx + 1}: {' '.join(top_words)}")

Trustpilot Topic #1: love absolutely pup puppy getting product monthly guy company snack
Trustpilot Topic #2: great product box company quality service experience value toys toy
Trustpilot Topic #3: box bark first amazing get month excited next wait super
Trustpilot Topic #4: toy treat quality month good new get fun pup one
Trustpilot Topic #5: barkbox first amazing pup best month happy always get every
Trustpilot Topic #6: service customer excellent amazing best product wonderful always ever company
Trustpilot Topic #7: loved first box absolutely puppy pup next wait treat bella
Trustpilot Topic #8: dog happy box absolutely excited get company know every monthly
Trustpilot Topic #9: awesome product company guy box baby toys experience job everyone
Trustpilot Topic #10: everything perfect pup got enjoyed box thank received next one


In [51]:
feature_names2 = vectorizer2.get_feature_names_out()

for topic_idx, topic in enumerate(H2):
    top_words_idx = topic.argsort()[:-11:-1]
    top_words = [feature_names2[i] for i in top_words_idx]
    print(f"BBB Topic #{topic_idx + 1}: {' '.join(top_words)}")

BBB Topic #1: response number sent getting call phone horrible email time cancel
BBB Topic #2: box purchasing sure treat toy dog three first service charged
BBB Topic #3: year notify no thought they renewing stuck couple want could
BBB Topic #4: address old change go next login try account email box
BBB Topic #5: garbage less torn messenger money text tecue apart obstruction debit
BBB Topic #6: issue pug 11th followed since ups back thing contact ve
BBB Topic #7: time someone manage reviewed anywhere ahold gift without purchased multiple
BBB Topic #8: double free offer first order october box told promotion paid
BBB Topic #9: keep dishonest impossible frustrating charging deserve credit rating cancel even
BBB Topic #10: talked cancelled help ppl fraudulent different trying horrible response service
