In [None]:
import nltk as tk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd

# Fetch every NLTK resource this notebook relies on so it runs on a fresh environment.
# word_tokenize needs the Punkt sentence-tokenizer models: older NLTK releases look up
# 'punkt', recent releases look up 'punkt_tab', so request both.
tk.download('punkt')
tk.download('punkt_tab')
# Stop-word lists used for filtering and the WordNet data used by WordNetLemmatizer.
tk.download('stopwords')
tk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/melchorbicalan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/melchorbicalan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Read the two cleaned review exports (Trustpilot first, then BBB) into DataFrames.
trustpilot, bbb = (
    pd.read_csv(csv_path)
    for csv_path in ('../cleaned_trustpilot_reviews.csv', '../cleaned_bbb_reviews.csv')
)


In [5]:
# Pull the free-text columns out of the DataFrames for the NLP steps:
# Trustpilot contributes both the review body and the review heading,
# BBB contributes the review body only.
trustpilot_corpus1, trustpilot_corpus2 = (
    trustpilot[column].values for column in ('Review Body', 'Review Heading')
)
bbb_corpus = bbb['Review Body'].values


In [6]:

# Review bodies and headings may contain emojis; strip them before tokenizing.
# The pattern is compiled once here rather than on every call to remove_emojis.
# Each range is an explicit emoji-related block: a single catch-all range such as
# U+24C2..U+1F251 would also delete ordinary text (CJK, Hangul, Kana, ...).
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map
    "\U0001F1E0-\U0001F1FF"  # Flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols & Pictographs
    "\U0001FA70-\U0001FAFF"  # Symbols & Pictographs Extended-A
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002702-\U000027B0"  # Dingbats
    "\U00002B50\U00002B55"   # Star and heavy circle
    "\U000024C2"             # Circled M
    "\U0001F200-\U0001F251"  # Enclosed ideographic supplement
    "\U0000FE0F"             # Variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji characters removed.

    Only characters inside the emoji-related ranges of ``_EMOJI_PATTERN`` are
    dropped; all other characters, including non-Latin scripts, are preserved.

    Parameters
    ----------
    text : str
        The review text to clean.

    Returns
    -------
    str
        ``text`` with every run of matched emoji characters replaced by ''.
    """
    return _EMOJI_PATTERN.sub('', text)
# Headings that end with an ellipsis ('…' or '...') are ignored.
# Null (non-string) headings are skipped as well, since they have no .endswith method.
collect_valid_rhead = [
    remove_emojis(review)
    for review in trustpilot_corpus2
    if isinstance(review, str) and not review.endswith(('…', '...'))
]
# Review bodies can be null; keep only the entries that are actual strings.
collect_valid_rbody = [
    remove_emojis(review)
    for review in trustpilot_corpus1
    if isinstance(review, str)
]


In [7]:
# Tokenize the texts.
# Each result is a list of token lists; the inner lists vary in length.
tokenized_tp_rbody = [word_tokenize(review) for review in collect_valid_rbody]
tokenized_tp_rhead = [word_tokenize(review) for review in collect_valid_rhead]
# bbb_corpus comes straight from the DataFrame column, so null (non-string)
# entries are skipped here, mirroring the guard applied to the Trustpilot bodies.
# NOTE(review): BBB bodies are not passed through remove_emojis the way the
# Trustpilot texts are; confirm that is intended.
tokenized_bbb_rbody = [
    word_tokenize(review) for review in bbb_corpus if isinstance(review, str)
]

In [8]:
# Build the English stop-word lookup as a set so membership checks are fast.
stop_words = {*stopwords.words('english')}
# Display how many stop words were loaded.
len(stop_words)

198

In [9]:
# Stop-word filtering for tokenized texts
def clean_text(text, stop_words):
    """Drop stop words from a tokenized text.

    A token is removed when its lowercase form is found in ``stop_words``;
    surviving tokens keep their original casing and order. Punctuation tokens
    are not treated specially and pass through unless listed in ``stop_words``.

    Parameters
    ----------
    text : iterable of str
        The tokens of one review.
    stop_words : container of str
        Lowercase words to filter out.

    Returns
    -------
    list of str
        The tokens that are not stop words.
    """
    return [token for token in text if token.lower() not in stop_words]

# Apply the stop-word filter to every tokenized review of each corpus, in the order
# Trustpilot bodies, Trustpilot headings, BBB bodies.
cleaned_tp_rbody, cleaned_tp_rhead, cleaned_bbb_rbody = (
    [clean_text(tokens, stop_words) for tokens in corpus]
    for corpus in (tokenized_tp_rbody, tokenized_tp_rhead, tokenized_bbb_rbody)
)


In [15]:
# Spot-check: display the stop-word-filtered tokens of the first Trustpilot review body.
cleaned_tp_rbody[0]

['signed',
 'BarkBox',
 'subscription',
 'Facebook',
 'Ad/link',
 '.',
 'box',
 'arrived',
 'every',
 'toy',
 'made',
 'China',
 '(',
 'vet',
 'strongly',
 'discouraged',
 'toys',
 'China',
 'dog',
 ')',
 '.',
 'refused',
 'exchange',
 'accept',
 'return',
 '.',
 'cancel',
 'subscription',
 'request',
 '.',
 'disappointed',
 'policies',
 'customer',
 'service',
 '.']

In [16]:
# Lemmatization: run every remaining token of every review through a single
# WordNetLemmatizer instance, using its default part-of-speech setting.
lemmatizer = WordNetLemmatizer()
lemmatized_tp_rbody = [list(map(lemmatizer.lemmatize, tokens)) for tokens in cleaned_tp_rbody]
lemmatized_tp_rhead = [list(map(lemmatizer.lemmatize, tokens)) for tokens in cleaned_tp_rhead]
lemmatized_bbb_rbody = [list(map(lemmatizer.lemmatize, tokens)) for tokens in cleaned_bbb_rbody]
