# Pre-processing input data
* Data
    * In this task, we first load the data provided in an Excel sheet. The data contain comments by reviewers on contact lenses that they purchased. We want to tokenize the data
    and then perform
    * Sentiment Analysis
    * Topic determination

In [1]:
%matplotlib inline
%timeit
import pandas as pd
import string
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sys
from collections import Counter
import random
import ast
import re
import scipy

sys.path.append('pymodules')
# This module contains some utility functions (Word2Vec, stop words, etc.)
import pymodules.preprocessing_class as pc

# gender guesser
import gender_guesser.detector as gd

# for dictionary method synonym finder using wordnet
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /Users/bmukund/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/bmukund/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
def find_wordnet_synonyms(word_list, type_of_word=None):
    """
    Collect the WordNet lemma names related to every word of the input.

    Each word is looked up with ``wn.synsets``; ``type_of_word`` is handed over
    as the ``pos`` argument (e.g. ``wn.NOUN``) so that the kind of word (noun,
    adjective, verb) is respected.
    It is assumed that the input ``word_list`` is a list.
    @return set with the lemma names of all synsets found
    """
    return {
        lemma.name()
        for candidate in word_list
        for synset in wn.synsets(candidate, pos=type_of_word)
        for lemma in synset.lemmas()
    }

In [3]:
def first_name(x):
    """
    Function to get the first name so that we can guess the gender
    * The value is converted to ``str`` and split on whitespace; the first
      token is taken as the first name.
    * Any digits are removed from that token. After removal there could be
      some misclassification, but that is accepted.
    * The result is capitalised (first letter upper case, rest lower case).

    @param x raw author value (any object)
    @return the cleaned first name, or "" when the value is missing
            (``None`` / float NaN) or holds no non-whitespace characters
    """
    # Missing values would otherwise be stringified and returned as the
    # "names" "None" / "Nan". NaN is the only float that differs from itself.
    if x is None or (isinstance(x, float) and x != x):
        return ""
    parts = str(x).split()
    if not parts:
        # Empty or whitespace-only input: indexing parts[0] would raise IndexError.
        return ""
    return re.sub(r'[0-9]+', "", parts[0]).capitalize()


In [4]:
def read_input(filename, sheet_name, filter_columns):
    """
    Load one worksheet of an Excel workbook and discard unwanted columns.

    @param filename path of the Excel file
    @param sheet_name name of the worksheet to read
    @param filter_columns column labels to drop from the loaded data
    @return DataFrame indexed by the 'REVIEW_DATE' column, without ``filter_columns``
    """
    loaded = pd.read_excel(filename, sheet_name=sheet_name, index_col='REVIEW_DATE')
    return loaded.drop(columns=filter_columns, axis=1)

In [None]:
# Workbook and worksheet that hold the review data
filename = "data/Master-data_Q42021.xlsx"
sheet_name = 'Scrubbed_data'
# We don't need these columns; they are dropped while loading
not_needed = [
    'OVERALL_RATING',
    'COMFORT_RATING',
    'VISION_RATING',
    'VALUE_FOR_MONEY',
    'PROS',
    'CONS',
    'ORIGINAL_SOURCE',
    'REPLY_FROM_ACCUVUE',
    'PRODUCT_LINK',
    'WEBSITE',
]

text_data = read_input(filename, sheet_name, not_needed)

In [None]:
# Guess the gender of every reviewer from the first name (gender_guesser
# package) and drop the names column afterwards.
#text_data['AUTHOR'] = text_data['AUTHOR'].astype(str)
gdx = gd.Detector()
text_data['GENDER'] = text_data['AUTHOR'].apply(first_name).map(gdx.get_gender)

# The author column is no longer needed
text_data.drop(columns='AUTHOR', inplace=True)

# Look at the gender counts to get a feeling for the data
text_data['GENDER'].value_counts()

In [None]:
# Consolidate the comments into one column
# Comments can occur both in the TITLE and in the COMMENTS columns.
# Missing values are filled BEFORE the cast to str: astype(str) turns NaN into
# the literal text "nan", after which fillna("") has nothing left to replace
# and the word "nan" would end up inside the comment text.
text_data['COMMENT'] = text_data['TITLE'].fillna("").astype(str) + " " + text_data['COMMENTS'].fillna("").astype(str)
text_data.drop(columns=['TITLE', 'COMMENTS'], inplace=True)

In [None]:
# clean rating
# replace N = No rating with 0. We do this because rating is assumed to be numeric, not categorical
# The result is assigned back to the column: replace(..., inplace=True) on a
# column selection acts on an intermediate object, is deprecated in recent
# pandas and leaves the frame unchanged under copy-on-write.
text_data['RATING'] = text_data['RATING'].replace('N', '0')
# convert rating to integers
text_data['RATING'] = text_data['RATING'].apply(int)

In [None]:
# display the current contents of text_data (notebook cell output)
text_data

### Tokenization <a class="anchor" id="first-bullet"></a>

In [None]:
## regex for tokenization
# Ref: http://sentiment.christopherpotts.net/code-data/happyfuntokenizing.py
# All fragments below are written in verbose-regex style (whitespace and
# "#" comments inside the pattern are ignored); they are OR-ed together into a
# single pattern and compiled with re.VERBOSE | re.I at the end of this cell.
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""

# The components of the tokenizer (earlier alternatives win over later ones):
regex_strings = (
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?
      \d{3}          # exchange
      [\-\s.]*
      \d{4}          # base
    )"""
    ,
    # Emoticons:
    emoticon_string
    ,
    # HTML tags:
    r"""<[^>]+>"""
    ,
    # Twitter username:
    r"""(?:@[\w_]+)"""
    ,
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
    ,
    # Remaining word types:
    r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """,
    # No inline "(?x)" here: once the fragments are joined the flag would sit
    # in the middle of the expression, which Python 3.11+ rejects with
    # re.error. Verbose mode is already enabled through flags=re.VERBOSE.
    # The punctuation class is written with escaped brackets and a trailing
    # hyphen so that ":-_" is not read as a character range.
    # NOTE(review): these two alternatives come after the catch-all (?:\S)
    # above, so they look unreachable; kept for reference.
    r"""
    \w+(?:-\w+)*           # preserve expressions with internal hyphens as single tokens
    | [\]\[.,;"'?():_`\-]  # preserve punctuation as separate tokens
    """
)
word_re = re.compile(pattern=r"""(%s)""" % "|".join(regex_strings), flags=re.VERBOSE | re.I)

In [None]:
# Hand the consolidated comments to the preprocessing helper class
comments_data = text_data['COMMENT']
prep_comments = pc.RawDocs(
    comments_data,                 # series of documents
    lower_case=True,               # whether to lowercase the text in the first cleaning step
    stopwords='long',              # type of stopwords to initialize
    contraction_split=True,        # whether to split contractions or not
    tokenization_pattern=word_re,  # custom tokenization pattern
)

In [None]:
# notice that the documents from the object are identical to the ones from the pandas series
#comments_data
# comments_data is indexed by REVIEW_DATE, so the i-th document is fetched by
# position with .iloc; an integer key passed to [] is a label lookup whose
# positional fallback is deprecated in recent pandas.
i = 0
print("Document from the pandas series:\n", comments_data.iloc[i])
print("\n-------------------------\n")
print("Document from preprocessing object:\n", prep_comments.docs[i])


In [None]:
# lower-case text, expand contractions and initialize stopwords list
# (as requested through the options passed to pc.RawDocs; the work itself is done inside that class)
prep_comments.basic_cleaning()

In [None]:
# explore an example after the basic cleaning has been applied
# (.iloc: comments_data is indexed by REVIEW_DATE, so the document is picked by position)
i = 0
print(comments_data.iloc[i])
print()
print(prep_comments.docs[i])


In [None]:
# now we can split the documents into tokens
# (the result is read from prep_comments.tokens in the following cells)
prep_comments.tokenize_text()

In [None]:
# compare a raw comment with its tokens
# (.iloc: comments_data is indexed by REVIEW_DATE, so the document is picked by position)
i = 0
print(comments_data.iloc[i])
print()
print(prep_comments.tokens[i])

In [None]:
# Custom punctuation list: every character of string.punctuation except the hyphen
punctuation = "".join(symbol for symbol in string.punctuation if symbol != "-")
punctuation

In [None]:
# clean the tokens using the custom punctuation string built in the previous cell
prep_comments.token_clean(length=2,                 # remove tokens with less than this number of characters
                 punctuation=punctuation,           # remove custom list of punctuation characters
                 numbers = True                     # remove numbers
                 )


In [None]:
# compare a raw comment with its tokens after token cleaning
# (.iloc: comments_data is indexed by REVIEW_DATE, so the document is picked by position)
i = 0
print(comments_data.iloc[i])
print()
print(prep_comments.tokens[i])


In [None]:
# print, in sorted order, the stopwords held by the preprocessing object
# (the stopword type 'long' was chosen when prep_comments was created)
print(sorted(prep_comments.stopwords))

In [None]:
# we need to specify that we want to remove the stopwords from the "tokens"
prep_comments.stopword_remove('tokens')

In [None]:
# compare a raw comment with its tokens after stopword removal
# (.iloc: comments_data is indexed by REVIEW_DATE, so the document is picked by position)
i = 0
print(comments_data.iloc[i])
print()
print(prep_comments.tokens[i])


In [None]:
# stemming (currently disabled)
# prep_comments.stem()

# apply lemmatization to all documents
# NOTE: this step is executed here and can take a very long time
prep_comments.lemmatize()

In [None]:
# compare all versions of the same raw sentences
# (.iloc: comments_data is indexed by REVIEW_DATE, so the document is picked by position)
i = 0
print(comments_data.iloc[i])
print()
print(prep_comments.tokens[i])
print()
# print(prep_comments.stems[i])
# print()
print(prep_comments.lemmas[i])


In [None]:
# rank the tokens by document frequency ('df') and show the first 15 entries of the ranking
prep_comments.get_term_ranking(items='tokens', score_type='df')
prep_comments.df_ranking['tokens'][:15]


TF-IDF gives more weight to words that occur frequently but in fewer documents. This seems to skew the ranking towards advertisement-like reviews (see output). It is not clear whether such weighting is relevant for this project.

In [None]:
# rank the tokens by tf-idf score and show the first 10 entries of the ranking
prep_comments.get_term_ranking(items='tokens', score_type='tfidf')
prep_comments.tfidf_ranking['tokens'][:10]


In [None]:
# last 10 entries of the document-frequency ranking
prep_comments.df_ranking['tokens'][-10:]

In [None]:
# last 15 entries of the tf-idf ranking
prep_comments.tfidf_ranking['tokens'][-15:]


In [None]:
# plot the first element of every document-frequency ranking entry, in ranking order
plt.figure(figsize=(10, 6))
plt.plot([entry[0] for entry in prep_comments.df_ranking['tokens']])
plt.xlabel("Term ranking")
plt.ylabel("Document frequency")
plt.title('Document frequency ranking')
plt.show()


In [None]:
# a log-log scale shows the power-law distribution (Zipf's law) more clearly
plt.figure(figsize=(10, 6))
plt.loglog([entry[0] for entry in prep_comments.df_ranking['tokens']])
plt.xlabel("log term ranking")
plt.ylabel("log document frequency")
plt.title('Document frequency ranking (log-log)')
plt.show()


In [None]:
# plot the first element of every tf-idf ranking entry, in ranking order
plt.figure(figsize=(10, 6))
plt.plot([entry[0] for entry in prep_comments.tfidf_ranking['tokens']])
plt.xlabel("Term ranking")
plt.ylabel("tf-idf")
plt.title('Tf-idf ranking')
plt.show()


### Vectorization

In [None]:
def do_nothing(doc):
    """
    Identity helper: hand ``doc`` back unchanged.

    It is passed to the sklearn vectorizer as ``preprocessor`` and ``tokenizer``
    to override sklearn's own preprocessing of the already tokenized documents.
    @return the very same object that was passed in
    """
    return doc


In [None]:
# create a CountVectorizer object using our preprocessed text
# NOTE(review): min_df is an absolute document count while max_df is a
# fraction; on a small corpus the two bounds can contradict each other.
count_vectorizer = CountVectorizer(encoding='utf-8',
                                   preprocessor=do_nothing,  # apply no additional preprocessing
                                   tokenizer=do_nothing,     # apply no additional tokenization
                                   token_pattern=None,       # unused with a custom tokenizer; None avoids sklearn's warning
                                   lowercase=False,
                                   strip_accents=None,
                                   stop_words=None,
                                   ngram_range=(1, 1),       # generate only unigrams
                                   analyzer='word',          # analysis at the word-level
                                   max_df=0.5,               # ignore tokens that have a higher document frequency (can be int or percent)
                                   min_df=500,               # ignore tokens that have a lower document frequency (can be int or percent)
                                   max_features=None,        # we could impose a maximum number of vocabulary terms
                                   )


In [None]:
# transform our preprocessed tokens into a document-term matrix
# (the vectorizer is fitted on prep_comments.tokens in the same call)
dt_matrix = count_vectorizer.fit_transform(prep_comments.tokens)
print(f"Document-term matrix created with shape: {dt_matrix.shape}")


In [None]:
# we can access a dictionary that maps between words and positions of the document-term matrix
# list(count_vectorizer.vocabulary_.items())[0:10]
# Its (word, position) pairs become a two-column frame: column 0 holds the
# word and column 1 the position. The frame's own row numbers are just the
# order in which the dictionary yields its items.
id_word_indexer = pd.DataFrame(count_vectorizer.vocabulary_.items())
id_word_indexer

### Dictionary methods
Dictionary methods aim to find synonyms of words and add them to the corpus. In our case we want to classify our corpus based on these four terms by identifying their synonyms in the text:
* Identify "product"
* Identify "service"
* Identify "quality"
* Identify "price"

### Finding synonyms using WordNet for the words

In [None]:
# Hand-picked seed words for each of the four categories
product_words = [
    'gadget', 'contraption', 'appliance', 'widget', 'equipment', 'contrivance',
    'gizmo', 'product', 'merchandise', 'ware', 'gismo',
]
service_words = [
    'service', 'assist', 'help', 'aid',
]
quality_words = [
    'quality', 'built', 'refurbish', 'comfort', 'relief',
]
price_words = [
    'price', 'money', 'cash', 'cheap', 'costly', 'pricey', 'discount',
    'payment', 'rebate', 'cost',
]

#### Now we use the synonym finder to generate an extra set of words for identifying classification words.
We then use the resulting set of words as tokens to be found in our corpus. Because the synonyms returned by the WordNet synonym finder may not be entirely suitable for automatic use,
we filter the result. In particular, some words can be used in different senses (noun, adjective, etc.) and could therefore point to different classifications. Given that we know the domain here,
we want to make sure that the synonyms we find do not add to the ambiguity.
For example, one of the synonyms for "ware" is "convenience" (as in a mode of convenience). But this could also mean "ease of use", which would come under "quality" and not under "product".
To avoid this, we manually filter the synonyms returned by WordNet.

In [None]:
# WordNet synonyms of the product seed words, restricted to nouns
syns_indicating_product = find_wordnet_synonyms(product_words, wn.NOUN)

In [None]:
# WordNet synonyms of the service seed words, restricted to nouns
syns_indicating_service = find_wordnet_synonyms(service_words, wn.NOUN)

In [None]:
# WordNet synonyms of the quality seed words, restricted to nouns
syns_indicating_quality = find_wordnet_synonyms(quality_words, wn.NOUN)

In [None]:
# WordNet synonyms of the price seed words, restricted to nouns
syns_indicating_price = find_wordnet_synonyms(price_words, wn.NOUN)

In [None]:
# The indicator tokens are the hand-picked seed lists themselves; the WordNet
# synonym sets (syns_indicating_*) are not merged in here.
# Each name below refers to the same list object as the corresponding *_words variable.
tokens_indicating_product = product_words
tokens_indicating_service = service_words
tokens_indicating_quality = quality_words
tokens_indicating_price = price_words

In [None]:
# In vocabulary_ the key is the feature word and the value is that word's
# column index; keep the indices of the words that indicate service.
service_indicator_token_ids = [
    column
    for term, column in count_vectorizer.vocabulary_.items()
    if term in tokens_indicating_service
]
print(f"{len(service_indicator_token_ids)} tokens found in vocabulary indicating service, {service_indicator_token_ids}")

In [None]:
# column indices of the vocabulary words that indicate product
product_indicator_token_ids = [
    column
    for term, column in count_vectorizer.vocabulary_.items()
    if term in tokens_indicating_product
]
print(f"{len(product_indicator_token_ids)} tokens found in vocabulary indicating product, {product_indicator_token_ids}")

In [None]:
# column indices of the vocabulary words that indicate quality
quality_indicator_token_ids = [
    column
    for term, column in count_vectorizer.vocabulary_.items()
    if term in tokens_indicating_quality
]
print(f"{len(quality_indicator_token_ids)} tokens found in vocabulary indicating quality, {quality_indicator_token_ids}")

In [None]:
# column indices of the vocabulary words that indicate price
price_indicator_token_ids = [
    column
    for term, column in count_vectorizer.vocabulary_.items()
    if term in tokens_indicating_price
]
print(f"{len(price_indicator_token_ids)} tokens found in vocabulary indicating price, {price_indicator_token_ids}")

In [None]:
# Per document, add up the counts of all tokens that indicate service;
# presumably this can be added to the data itself as a new column.
service_indicator_counts = dt_matrix.tocsr()[:, service_indicator_token_ids].sum(axis=1)
service_indicator_counts

In [None]:
# Per document, add up the counts of all tokens that indicate product;
# presumably this can be added to the data itself as a new column.
product_indicator_counts = dt_matrix.tocsr()[:, product_indicator_token_ids].sum(axis=1)
np.array(product_indicator_counts).ravel()

In [None]:
# Per document, add up the counts of all tokens that indicate price;
# presumably this can be added to the data itself as a new column.
price_indicator_counts = dt_matrix.tocsr()[:, price_indicator_token_ids].sum(axis=1)
np.array(price_indicator_counts).ravel()

In [None]:
# Per document, add up the counts of all tokens that indicate quality;
# presumably this can be added to the data itself as a new column.
quality_indicator_counts = dt_matrix.tocsr()[:, quality_indicator_token_ids].sum(axis=1)
np.array(quality_indicator_counts).ravel()

In [None]:
# index 14 - service, 15-product, 16-quality, 17-price
# NOTE(review): these column indices are hard-coded; they are only valid for
# the vocabulary of one particular fit - confirm against count_vectorizer.vocabulary_.
# The dense copy of the matrix is built once instead of once per index.
dense_counts = dt_matrix.toarray()
for idx in [14, 15, 16, 17]:
    # distinct count values in this column and how many documents show each of them
    unique, counts = np.unique(dense_counts[:, idx], return_counts=True)
    print(f"Index:{idx}, value:\n{np.asarray((unique, counts)).T}")

In [None]:
# Attach the per-document indicator counts to the data, one column per category
for column_name, indicator_counts in (
    ('service', service_indicator_counts),
    ('product', product_indicator_counts),
    ('quality', quality_indicator_counts),
    ('price', price_indicator_counts),
):
    text_data[column_name] = np.array(indicator_counts).ravel()
text_data

In [None]:
# Make sure RATING is an integer column before averaging it.
# 'N' (= no rating) is mapped to 0, the same convention as the earlier
# "clean rating" cell; the former in-place replace with '-1' on a column
# selection never changed the frame once RATING had been cleaned.
text_data['RATING'] = text_data['RATING'].replace('N', '0').apply(int)

# Group the reviews by their price-indicator count: comments are concatenated, ratings averaged.
# NOTE(review): 'sum' joins the comment strings without any separator.
data_agg = text_data.groupby(['price'], as_index=False).agg({'COMMENT': 'sum', 'RATING':'mean'})


In [None]:
# display the aggregation by price-indicator count
data_agg

In [None]:
# Group the reviews by their service-indicator count: comments are concatenated, ratings averaged
data_agg = text_data.groupby(['service'], as_index=False).agg(
    {
        'COMMENT': 'sum',
        'RATING': 'mean',
    }
)

In [None]:
# display the most recently computed aggregation
data_agg

In [None]:
## FOR SVM, we need to make a matrix with proper column names
# Also, we need another column that denotes the review
# We also need to normalize the data
# vocabulary_ maps word -> column index of the document-term matrix, so it is
# inverted to label the columns. The former lookup id_word_indexer.to_dict()[0]
# mapped the ROW NUMBER of id_word_indexer to a word; that only matches the
# column index if the dictionary happens to be ordered by index.
column_index_to_word = {column: word for word, column in count_vectorizer.vocabulary_.items()}
df_svm = pd.DataFrame(dt_matrix.toarray())
df_svm.rename(columns=column_index_to_word, inplace=True)
df_svm