# Pre-processing input data
* Data
    * In this task, we first load data given in Excel sheet. The data contains comments by viewers on contact lenses that they purchased. We want to tokenize the data
    and do
    * Sentiment Analysis
    * Topic determination

* We download nltk wordnet which provides us with a vocabulary of words, that we use to build our dictionary
* We also guess the gender using gender guesser package

In [None]:
%matplotlib inline
%timeit
import pandas as pd
import string
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sys
from collections import Counter
import random
import ast
import re
import scipy

# category
from pandas.api.types import CategoricalDtype

sys.path.append('pymodules')
# This class contains some utility functions Word2Vec, stop words etc. etc.
import pymodules.preprocessing_class as pc

# gender gueser
import gender_guesser.detector as gd

# for dictionary method synonym finder using wordnet
import nltk
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
try:
    nltk.data.find('corpora/omw-1.4')
except LookupError:
    nltk.download('omw-1.4')

from nltk.corpus import wordnet as wn

### Find synonyms to add to the vocabulary of words.
This is required for sentiment analysis so that most of the comment words by users are captured

In [None]:
def find_wordnet_synonyms(word_list, type_of_word=None):
    """
    Gather the WordNet lemma names of every word in the input.

    Each word of ``word_list`` (expected to be a list of words) is looked up with
    ``wn.synsets``; ``type_of_word`` is passed through as the ``pos`` argument
    (e.g. ``wn.NOUN``) to restrict the lookup to one type of word.
    @return set with the lemma names of all synsets found
    """
    return {
        lemma.name()
        for candidate in word_list
        for synset in wn.synsets(candidate, pos=type_of_word)
        for lemma in synset.lemmas()
    }

### Extract first name
* Used in guessing gender

In [None]:
def first_name(x):
    """
    Extract a capitalized first name from an author string so the gender can be guessed.

    * The input is converted to ``str`` and split on whitespace; the first piece is the first name.
    * Digits are removed from that piece. This may occasionally mangle a name, which is acceptable.
    * An empty or whitespace-only input yields ``""`` instead of raising ``IndexError``.
    @return the first name, first letter upper-cased and the rest lower-cased
    """
    pieces = str(x).split()
    if not pieces:
        # nothing to extract from "" or "   "
        return ""
    # remove reference to digits. Now after removal, there could be some misclassification, but that is ok ..
    without_digits = re.sub(r'[0-9]+', "", pieces[0])
    return without_digits.capitalize()


### Excel File reader
* The date is used as index
* Depending on time of the year, comments may vary. So we can group data by dates and do analysis if required

In [None]:
def read_input(filename, sheet_name, filter_columns):
    """
    Load one sheet of an Excel workbook and drop the columns that are not needed.

    The 'REVIEW_DATE' column becomes the index of the frame. The columns found in
    the sheet are printed before ``filter_columns`` are removed.
    @return the loaded DataFrame without ``filter_columns``
    """
    raw_frame = pd.read_excel(filename, sheet_name=sheet_name, index_col='REVIEW_DATE')
    print(f"Input columns:{raw_frame.columns}")
    return raw_frame.drop(columns=filter_columns, axis=1)

## Start EDA analysis of input data

In [None]:
# Workbook and sheet that hold the scrubbed review data
filename = "data/Master-data_Q42021.xlsx"
sheet_name = 'Scrubbed_data'
# Columns that play no part in this analysis; they are dropped while loading
not_needed = [
    'OVERALL_RATING',
    'COMFORT_RATING',
    'VISION_RATING',
    'VALUE_FOR_MONEY',
    'PROS',
    'CONS',
    'ORIGINAL_SOURCE',
    'REPLY_FROM_ACCUVUE',
    'PRODUCT_LINK',
    'WEBSITE',
]

text_data = read_input(filename, sheet_name, filter_columns=not_needed)

## Figure out the gender distribution
* It is interesting to note that the majority of comments are by females: about twice as many females as males have commented

In [None]:
# Guess the gender of each author from the first name with the gender_guesser package,
# then discard the names column.
gdx = gd.Detector()
author_first_names = text_data['AUTHOR'].apply(first_name)
text_data['GENDER'] = author_first_names.map(gdx.get_gender)

# AUTHOR is no longer needed once GENDER has been derived
text_data.drop(columns=['AUTHOR'], inplace=True)

In [None]:
# Check the gender counts just to see how the data looks like
text_data.GENDER.value_counts().plot(kind='bar')

## Extract comments of the users
* Comments are found in two columns 'COMMENTS' column and 'TITLE' column.
    * This is because when the user inputs his/her reviews, sometimes they put their comments in the title itself and not in the box for comments. We need to take care of that too.
* Consolidate the comments into one column called 'COMMENT'.
* Comments can occur both in title and in Comment columns.

In [None]:
# Consolidate the comments into one column.
# Comments can occur both in the TITLE and in the COMMENTS columns.
# Missing values are replaced BEFORE the cast to str: astype(str) turns NaN into the
# literal text "nan", after which fillna("") has nothing left to fill and "nan" would
# end up in the comment text as a token.
text_data['COMMENT'] = text_data['TITLE'].fillna("").astype(str) + " " + text_data['COMMENTS'].fillna("").astype(str)
text_data.drop(columns=['TITLE', 'COMMENTS'], inplace=True)

## Find user review rating
* This is our response variable, the output variable
* Ratings are curated so that they are all integers.
* Ratings vary from [0-5]

In [None]:
# clean rating
# replace N = No rating with 0. We do this because rating is assumed to be numeric, not categorical.
# The result is assigned back rather than calling replace(..., inplace=True) on the column
# selection: the in-place form works on an intermediate object and is not guaranteed to
# update text_data itself (pandas warns about this pattern).
text_data['RATING'] = text_data['RATING'].replace('N', '0')
# convert rating to integers
text_data['RATING'] = text_data['RATING'].apply(int)

### Plot rating

In [None]:
text_data['RATING']

In [None]:
text_data['RATING'].plot(kind='density')

In [None]:
import seaborn as sns
sns.displot(text_data, x="RATING")
## complicated plot ...
#ax = sns.countplot(x=text_data['RATING'], order=text_data['RATING'].value_counts(ascending=False).index)
#abs_values = text_data['RATING'].value_counts(ascending=False).values
#ax.bar_label(container=ax.containers[0], labels=abs_values)

#### Sentiment from User Ratings using threshold:
* Rating Values < threshold => Negative sentiment
* Rating Values = threshold => Neutral sentiment
* Rating Values > threshold => Positive sentiment

In [None]:
sentiment_threshold = 3

In [None]:
# attach sentiment to ratings
def find_sentiment(senti_threshold):
    """
    Build a function that maps a rating to a sentiment code.

    @param senti_threshold rating value that counts as neutral
    @return a one-argument function giving
            0 for rating <  senti_threshold (negative),
            1 for rating == senti_threshold (neutral),
            2 for rating >  senti_threshold (positive);
            a scalar rating yields a plain Python int, an array-like rating
            yields a numpy array of codes. When no condition holds (e.g. NaN)
            the code is 0, the default of ``np.select``.
    """
    def _find_sentiment(rating):
        choices = [0, 1, 2]
        conditions = [rating < senti_threshold, rating == senti_threshold, rating > senti_threshold]
        senti = np.asarray(np.select(conditions, choices))
        # For a scalar rating np.select hands back a 0-d array, which is not hashable;
        # unwrap it so the result can be used as a category value or dictionary key.
        if senti.ndim == 0:
            return senti.item()
        return senti
    return _find_sentiment

# Derive the sentiment code of every review from its rating and keep it with the data.
# NOTE(review): this cell used to read `df` and write to `df_svm`, neither of which exists at
# this point of the notebook (df_svm is only created further down), and it referred to the
# sentiment column under three different names ('SENTI', 'SENT', '_SENTIMENT_'). It now works
# on text_data and uses the single column name '_SENTIMENT_'.
rating_to_sentiment = find_sentiment(sentiment_threshold)
# int(...) makes sure every code is a plain, hashable integer before the category conversion
SENTIMENT_SERIES = text_data['RATING'].apply(lambda rating: int(rating_to_sentiment(rating))).astype('category')
text_data['_SENTIMENT_'] = SENTIMENT_SERIES.values
sentiments = text_data['_SENTIMENT_'].value_counts()
print(sentiments)

import seaborn as sns
# Bars are drawn in descending order of count. The tick labels are derived from the sentiment
# codes actually shown, so they stay correct whichever sentiment happens to be most frequent.
sentiment_order = list(sentiments.index)
sentiment_labels = {0: f'<{sentiment_threshold}', 1: f'={sentiment_threshold}', 2: f'>{sentiment_threshold}'}
ax = sns.countplot(x=text_data['_SENTIMENT_'], order=sentiment_order)
ax.bar_label(container=ax.containers[0], labels=sentiments.values)
ax.set_xticks(range(len(sentiment_order)))
ax.set_xticklabels([sentiment_labels[code] for code in sentiment_order])

### Check output

In [None]:
# display results
text_data

### Tokenization <a class="anchor" id="first-bullet"></a>
* Based on tokenization http://sentiment.christopherpotts.net/code-data/happyfuntokenizing.py
* A tokenizer is a function that splits a string of text into words based on some delimiter that separates the strings.
* In this use case, we use a regular expression to determine what a word is. Typically, comments can be equated to tweets
  in their brevity and also in the language used. Hence we use the same regular expressions as we would for parsing a tweet.

In [None]:
## regex for tokenization
# Ref: http://sentiment.christopherpotts.net/code-data/happyfuntokenizing.py
# All fragments below are written for verbose mode (whitespace and "# ..." comments inside the
# patterns are ignored); the flag is supplied once, via re.VERBOSE, when word_re is compiled.
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""

# The components of the tokenizer, tried in this order at every position:
regex_strings = (
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?
      \d{3}          # exchange
      [\-\s.]*
      \d{4}          # base
    )"""
    ,
    # Emoticons:
    emoticon_string
    ,
    # HTML tags:
    r"""<[^>]+>"""
    ,
    # Twitter username:
    r"""(?:@[\w_]+)"""
    ,
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
    ,
    # Remaining word types:
    r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """,
    # Hyphenated expressions and punctuation.
    # * The inline "(?x)" flag that used to open this fragment has been removed: once the
    #   fragments are joined it lands in the middle of the pattern, which Python 3.11+ rejects
    #   ("global flags not at the start of the expression"). Verbose mode is already enabled
    #   through re.VERBOSE below.
    # * The hyphen in the punctuation class is escaped; unescaped, ":-_" was a character range.
    # * Since the catch-all (?:\S) above is tried first, this fragment does not change which
    #   tokens are produced.
    r"""
    \w+(?:-\w+)*             # preserve expressions with internal hyphens as single tokens
    | [\]\[.,;"'?():\-_`]    # preserve punctuation as separate tokens
    """
)
word_re = re.compile(pattern=r"""(%s)""" % "|".join(regex_strings), flags=re.VERBOSE | re.I)

### Tokenize using regex
* We take the data consolidated in 'COMMENT' column of dataframe and tokenize it as follows
* We convert all text to lower case
* We remove commonly used stop words and prepositions such as 'in', 'with'. These stopwords have been compiled for us and we simply use a database of stop words
* We do a lazy tokenization, in that we take all the required inputs and create an object called 'RawDocs'
    * class RawDocs has a lot of functions other than tokenization, (for example, join two consecutive words (bi-gramming), lemmatization etc.)
* We use a list of "short" stop words. Further, we remove "not", "no" etc. from the stop words list because we need such words to be counted as legitimate words for sentiment analysis

In [None]:
# Hand the consolidated comments and the tokenization settings to a RawDocs object;
# the cleaning and tokenization steps are invoked on it in later cells.
comments_data = text_data.COMMENT
prep_comments = pc.RawDocs(comments_data,  # series of documents
                  lower_case=True,  # whether to lowercase the text in the first cleaning step
                  stopwords='short',  # type of stopwords to initialize
                  contraction_split=True,  # whether to split contractions or not
                  tokenization_pattern=word_re  # custom tokenization pattern
                  )

* We should notice that the RawDocs object created is identical to the input data

In [None]:
#comments_data
i = 0
# comments_data is indexed by review date, so the i-th comment is fetched by position with .iloc
print(f"Document data at index[{i}]:\n {comments_data.iloc[i]}")
print("\n-------------------------\n")
print(f"Document data after objectifying input with tokenization procedures added[{i}]:\n {prep_comments.docs[i]}")


### Basic cleaning
* Remove stop words. One can input custom stop words list as argument to the function in addition to default
    * Here we use standard stop words
* We expand contractions like "don't", "can't" etc. to make word explicit for NLP

In [None]:
# lower-case text, expand contractions and initialize stopwords list
prep_comments.basic_cleaning()

### Print the comments after basic cleaning

In [None]:
# explore an example after the basic cleaning has been applied
test_index = 0
print(f"Data at index[{test_index}] after basic cleaning:\n {prep_comments.docs[test_index]}")
test_index = -1
print(f"Data at index[{test_index}] after basic cleaning:\n {prep_comments.docs[test_index]}")

### Tokenize
* Now tokenize the data that is already cached in RawDocs object called prep_comments

In [None]:
# now we can split the documents into tokens
prep_comments.tokenize_text()

### print data after tokenization

In [None]:
test_index = 0
# comments_data is indexed by review date, so the comment is fetched by position with .iloc
print(f"Original comments at index[{test_index}]:\n {comments_data.iloc[test_index]}")
print(f"After tokenization, comments:\n {prep_comments.tokens[test_index]}")

### Remove punctuations
* remove tokens with less than TWO characters
* remove custom list of punctuation characters
* remove numbers
* keep hyphens (the hyphen is deliberately left out of the list of punctuation characters to remove)

In [None]:
# Every punctuation character except the hyphen is handed to token_clean
punctuation = string.punctuation.replace("-", "")
prep_comments.token_clean(
    length=2,                 # remove tokens with less than this number of characters
    punctuation=punctuation,  # remove custom list of punctuation characters
    numbers=True,             # remove numbers
)


### Check after removal of punctuations
* Note the removal of exclamation marks.

In [None]:
test_index = 22
# comments_data is indexed by review date, so the comment is fetched by position with .iloc
print(f"Original comments at index[{test_index}]:\n {comments_data.iloc[test_index]}")
# the tokens shown here have already been through token_clean, so label them accordingly
print(f"After punctuation removal, tokens:\n {prep_comments.tokens[test_index]}")

### Stopwords to be removed

In [None]:
# get the list of stopwords provided earlier
print(sorted(prep_comments.stopwords))

### Now remove stopwords
* tokens are cached inside the prep_comments object as 'tokens', i.e. prep_comments.tokens will give us back the tokenized user comment data

In [None]:
# we need to specificy that we want to remove the stopwords from the "tokens"
prep_comments.stopword_remove('tokens')

### Check after removal of stop words
* Note that numbers and exclamation marks have been removed, and some stop words like "that" and "my" have been removed

In [None]:
test_index = 42
# comments_data is indexed by review date, so the comment is fetched by position with .iloc
print(f"Original comments at index[{test_index}]:\n {comments_data.iloc[test_index]}")
# the tokens shown here have already had the stopwords removed, so label them accordingly
print(f"After stopword removal, tokens:\n {prep_comments.tokens[test_index]}")

### Lemmatize
From: https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html
"
The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. For instance:

am, are, is $\Rightarrow$ be

car, cars, car's, cars' $\Rightarrow$ car

The result of this mapping of text will be something like:
the boy's cars are different colors $\Rightarrow$ the boy car be differ color

However, the two words differ in their flavor.

Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes.

Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.

If confronted with the token saw, stemming might return just s, whereas lemmatization would attempt to return either see or saw depending on whether the use of the token was as a verb or a noun. The two may also differ in that stemming most commonly collapses derivationally related words, whereas lemmatization commonly only collapses the different inflectional forms of a lemma.

Linguistic processing for stemming or lemmatization is often done by an additional plug-in component to the indexing process, and a number of such components exist, both commercial and open-source.
The most common algorithm for stemming English, and one that has repeatedly been shown to be empirically very effective, is Porter's algorithm (Porter, 1980).
"

* We do lemmatization, which is a slow process compared to stemming in our work here

In [None]:
# apply lemmatization (SLOW)
prep_comments.lemmatize()

### Check lemmatization
* Note 'shipping' has been lemmatized to 'ship'

In [None]:
# compare all versions of the same raw sentences
test_index = 22
# comments_data is indexed by review date, so the comment is fetched by position with .iloc
print(f"Original comments at index[{test_index}]:\n {comments_data.iloc[test_index]}")
print(f"After tokenization, comments:\n {prep_comments.tokens[test_index]}")
print(f"After lemmatization, comments:\n {prep_comments.lemmas[test_index]}")


### Document term matrix
There is a power law distribution of tokens in all documents. This is also called the Zipf's law. So, in order to treat tokens on equal footing for language analysis, we weigh the tokens.
The weights for each token are consolidated in something called as a document term matrix. We will see how we give weights to tokens taking note of Zipf's law.

From Wikipedia: [Document-term matrix](https://en.wikipedia.org/wiki/Document-term_matrix)
"
A document-term matrix is a mathematical matrix that describes the frequency of terms that occur in a collection of documents.
In a document-term matrix, rows correspond to documents in the collection and columns correspond to terms.
This matrix is a specific instance of a document-feature matrix where "features" may refer to other properties of a document besides terms.
It is also common to encounter the transpose, or term-document matrix where documents are the columns and terms are the rows.
They are useful in the field of natural language processing and computational text analysis.
"
* We study the document frequency in two different ways.
(a) Term Frequency (TF, DF)
(b) Term Frequency-Inverse Document Frequency (TF-IDF)
"
Term frequency
Suppose we have a set of English text documents and wish to rank them by which document is more relevant to the query, "the brown cow". A simple way to start out is by eliminating documents that do not contain all three words "the", "brown", and "cow", but this still leaves many documents. To further distinguish them, we might count the number of times each term occurs in each document; the number of times a term occurs in a document is called its term frequency. However, in the case where the length of documents varies greatly, adjustments are often made (see definition below). The first form of term weighting is due to Hans Peter Luhn (1957) which may be summarized as:
    'The weight of a term that occurs in a document is simply proportional to the term frequency.'

Inverse document frequency
Because the term "the" is so common, term frequency will tend to incorrectly emphasize documents which happen to use the word "the" more frequently, without giving enough weight to the more meaningful terms "brown" and "cow". The term "the" is not a good keyword to distinguish relevant and non-relevant documents and terms, unlike the less-common words "brown" and "cow". Hence, an inverse document frequency factor is incorporated which diminishes the weight of terms that occur very frequently in the document set and increases the weight of terms that occur rarely.

Karen Spärck Jones (1972) conceived a statistical interpretation of term-specificity called Inverse Document Frequency (idf), which became a cornerstone of term weighting:

'The specificity of a term can be quantified as an inverse function of the number of documents in which it occurs.'
"

### TF-IDF gives more weight to words that occur frequently but in fewer documents. This seems to skew the ranking towards advertisement-like reviews. Based on observations in our use case, it is not clear whether such weighting is relevant for this project.
* We will use a simple count vectorizer to create the document term matrix, i.e. we simply count the number of occurrences of each token and leave it at that.

### Term Frequency on our data

In [None]:
prep_comments.get_term_ranking(items='tokens', score_type='df')
prep_comments.df_ranking['tokens'][:10]

### Term Frequency-Inverse Document Frequency on our document
* Note how the ranking now is different from Term Frequency ranking above
* Note also that the terms that are ranked high *seem* to indicate advertisement words, product descriptions etc.
    * This is why we chose not to go with TF-IDF document matrix formulation

In [None]:
prep_comments.get_term_ranking(items='tokens', score_type='tfidf')
prep_comments.tfidf_ranking['tokens'][:10]

### Plot of TF, TF-IDF for our document
* The frequency of words is inversely proportional to their rank. (Zipf's law)
    * i.e. if the ranking of a token is low (low number), the frequency of that token is high (fairly obvious)


### Term Frequency - Zipf's law plot

In [None]:
# First element of every ranking entry, in ranking order
ranked_document_frequencies = [entry[0] for entry in prep_comments.df_ranking['tokens']]
plt.figure(figsize=(10, 6))
plt.plot(ranked_document_frequencies)
plt.title('Document frequency ranking')
plt.xlabel("Term ranking")
plt.ylabel("Document frequency")
plt.show()


### Term Frequency LOG-LOG plot
* Power law yields a straight line in log-log plot

In [None]:
# On log-log axes a power-law distribution (Zipf's law) shows up as a straight line
ranked_document_frequencies = [entry[0] for entry in prep_comments.df_ranking['tokens']]
plt.figure(figsize=(10, 6))
plt.loglog(ranked_document_frequencies)
plt.title('Document frequency ranking (log-log)')
plt.xlabel("log term ranking")
plt.ylabel("log document frequency")
plt.show()


### Term Frequency-Inverse Document Frequency plot

In [None]:
# First element of every tf-idf ranking entry, in ranking order
ranked_tfidf_scores = [entry[0] for entry in prep_comments.tfidf_ranking['tokens']]
plt.figure(figsize=(10, 6))
plt.plot(ranked_tfidf_scores)
plt.title('Tf-idf ranking')
plt.xlabel("Term ranking")
plt.ylabel("tf-idf")
plt.show()


In [None]:
# Scores are shifted by one so that a score of zero can still be drawn on a log axis
shifted_tfidf_scores = [1 + entry[0] for entry in prep_comments.tfidf_ranking['tokens']]
plt.figure(figsize=(10, 6))
plt.loglog(shifted_tfidf_scores)
plt.title('Tf-idf ranking (log-log)')
plt.xlabel("log(Term ranking)")
plt.ylabel("log(tf-idf)")
plt.show()

### Use CountVectorizer() for creating document term matrix for our project
* We apply no additional preprocessing as we have already "pre-processed" the comments (tokenization, removal of stop words, removal of punctuations etc.)
* We generate unigrams, i.e. each word as its own (in contrast to two words at a time joined together) for EDA purposes
    * When we undertake the real analysis, we do add bigrams to our token set in order to capture sentiments. This is essential because it lets us capture sentiments such as "good-service" or "bad-product", which is not possible if we take each word separately.
* In order to curb noise in our token analysis, we ignore tokens that occur in fewer than 5 documents
    * These frequency of occurances values can be adjusted by experimenting with various values.
    * In this preliminary analysis, we chose this number to show case capabilities. In production, we can decide on the exact numbers

In [None]:
# simple auxiliary function to override the preprocessing done by sklearn
def do_nothing(doc):
    """Return ``doc`` unchanged; used to switch off sklearn's own preprocessing and tokenization."""
    return doc

# create a CountVectorizer object using our preprocessed text
count_vectorizer = CountVectorizer(encoding='utf-8',
                                   preprocessor=do_nothing,  # apply no additional preprocessing
                                   tokenizer=do_nothing,     # apply no additional tokenization
                                   token_pattern=None,       # not used with a custom tokenizer; None avoids sklearn's "will not be used" warning
                                   lowercase=False,
                                   strip_accents=None,
                                   stop_words=None,
                                   ngram_range=(1, 1),       # generate unigrams only
                                   analyzer='word',          # analysis at the word-level
                                   #max_df=0.5,              # ignore tokens that have a higher document frequency (can be int or percent)
                                   min_df=5,                 # ignore tokens that have a lower document frequency (can be int or percent)
                                   max_features=None,        # we could impose a maximum number of vocabulary terms
                                   )

### Output document-term matrix

In [None]:
# transform our preprocessed tokens into a document-term matrix
dt_matrix = count_vectorizer.fit_transform(prep_comments.tokens)
print(f"Document-term matrix created with shape: {dt_matrix.shape}")

### Associate words with positions in the matrix and print them
* The value in column "1" indicates the column number of the word in the document term matrix

In [None]:
# vocabulary_ maps every token to its position in the document-term matrix;
# tabulate the pairs with the token in column 0 and the position in column 1
vocabulary_pairs = count_vectorizer.vocabulary_.items()
id_word_indexer = pd.DataFrame(vocabulary_pairs)
id_word_indexer

In [None]:
## Each lookup selects the rows of id_word_indexer whose token (column 0) equals the topic word,
## takes the first matching row of .values and reads entry 1 of that row, which holds the
## position of the token in the document term matrix.
this_row = 0
required_value_index = 1
token_column = id_word_indexer[0]
# find index of each topic token
product_index = id_word_indexer.loc[token_column == 'product'].values[this_row][required_value_index]
service_index = id_word_indexer.loc[token_column == 'service'].values[this_row][required_value_index]
quality_index = id_word_indexer.loc[token_column == 'quality'].values[this_row][required_value_index]
price_index = id_word_indexer.loc[token_column == 'price'].values[this_row][required_value_index]

### Dictionary methods
We do a simple analysis of our comments by looking for words that denote the topics:
* "product"
* "service"
* "quality"
* "price"
In order to find words that correspond to above topics, we find synonyms of words above and add to our vocabulary of words. We then use this comprehensive vocabulary to extract words in each document that correspond to the above words

* We use publicly available NLTK based WordNet to get synonyms of words

### Finding synonyms using WordNet for the words
This is a two step process. (This could be a multi-step process, but we get enough synonym words for two step process and we stop)

Step 1: Find synonyms using WordNet for "product", "service", "quality" and "price"

Step 2: Use the resulting synonyms as starting set of words and find "more" synonyms.

  * Because the synonym words from the wordnet synonym finder may not be entirely suitable to be used automatically we filter the result (synonyms) obtained.
    Particularly, some words could be used in different sense (noun, adjective etc.) and only humans could determine different classification.
    Given that we know the domain here, we want to make sure that the synonyms we find don't add to the ambiguity.
    * For example, one of the synonyms for "ware" is "convenience" (as in mode of convenience).
          But this could also mean "ease of use", which would come under "quality" and not under "product", In order to avoid this, we manually filter out the output synonyms of wordnet results

In [None]:
product_words = ['gadget', 'contraption', 'appliance', 'widget', 'equipment', 'contrivance', 'gizmo', 'product', 'merchandise', 'ware', 'gismo']
service_words = ['service', 'assist', 'help', 'aid']
quality_words = ['quality', 'built', 'refurbish', 'comfort', 'relief']
price_words = ['price', 'money', 'cash', 'cheap', 'costly', 'pricey', 'discount', 'payment', 'rebate', 'cost']

Once we find the synonym words, we use these words to search each document (i.e., each row, each comment) and count the occurrences of each for simple analysis later

### Product synonyms
* Only nouns are considered.

In [None]:
syns_indicating_product = find_wordnet_synonyms(product_words, wn.NOUN)

### Service synonyms

In [None]:
syns_indicating_service = find_wordnet_synonyms(service_words, wn.NOUN)

### Quality synonyms

In [None]:
syns_indicating_quality = find_wordnet_synonyms(quality_words, wn.NOUN)

### Price synonyms

In [None]:
syns_indicating_price = find_wordnet_synonyms(price_words, wn.NOUN)

### Find all tokens that correspond to "price, service, quality, product"

In [None]:
# The tokens searched for in the comments are the hand-written word lists themselves.
# Each name below is an alias of the corresponding list, not a copy.
# NOTE(review): the syns_indicating_* sets computed from WordNet in the preceding cells are
# not used here; confirm whether the manually filtered synonyms were meant to be merged in.
tokens_indicating_product = product_words
tokens_indicating_service = service_words
tokens_indicating_quality = quality_words
tokens_indicating_price = price_words

### Determine counts of each of the above topics in each comment made by the user.
* This information can be used for classification of documents that talk about 'service' or 'product' for example

### Find all token-ids corresponding to "product, price, quality, service" so we can add them up

In [None]:
# vocabulary_ maps token -> column of the document-term matrix; keep the columns of the service tokens
service_indicator_token_ids = [
    column
    for token, column in count_vectorizer.vocabulary_.items()
    if token in tokens_indicating_service
]
print(f"{len(service_indicator_token_ids)} tokens found in vocabulary indicating service, {service_indicator_token_ids}")

In [None]:
# vocabulary_ maps token -> column of the document-term matrix; keep the columns of the product tokens
product_indicator_token_ids = [
    column
    for token, column in count_vectorizer.vocabulary_.items()
    if token in tokens_indicating_product
]
print(f"{len(product_indicator_token_ids)} tokens found in vocabulary indicating product, {product_indicator_token_ids}")

In [None]:
# vocabulary_ maps token -> column of the document-term matrix; keep the columns of the quality tokens
quality_indicator_token_ids = [
    column
    for token, column in count_vectorizer.vocabulary_.items()
    if token in tokens_indicating_quality
]
print(f"{len(quality_indicator_token_ids)} tokens found in vocabulary indicating quality, {quality_indicator_token_ids}")

In [None]:
# vocabulary_ maps token -> column of the document-term matrix; keep the columns of the price tokens
price_indicator_token_ids = [
    column
    for token, column in count_vectorizer.vocabulary_.items()
    if token in tokens_indicating_price
]
print(f"{len(price_indicator_token_ids)} tokens found in vocabulary indicating price, {price_indicator_token_ids}")

### Add up the number tokens in each document (i.e., for each row, each comment) that correspond to words "product, price,..." and its synonyms

In [None]:
# Keep only the columns of the service tokens and add them up per document (row);
# the totals can later be attached to the data as a new column
service_indicator_counts = dt_matrix.tocsr()[:, service_indicator_token_ids].sum(axis=1)
service_indicator_counts

In [None]:
# Keep only the columns of the product tokens and add them up per document (row);
# the totals can later be attached to the data as a new column
product_indicator_counts = dt_matrix.tocsr()[:, product_indicator_token_ids].sum(axis=1)
np.array(product_indicator_counts).ravel()

In [None]:
# Keep only the columns of the price tokens and add them up per document (row);
# the totals can later be attached to the data as a new column
price_indicator_counts = dt_matrix.tocsr()[:, price_indicator_token_ids].sum(axis=1)
np.array(price_indicator_counts).ravel()

In [None]:
# Keep only the columns of the quality tokens and add them up per document (row);
# the totals can later be attached to the data as a new column
quality_indicator_counts = dt_matrix.tocsr()[:, quality_indicator_token_ids].sum(axis=1)
np.array(quality_indicator_counts).ravel()

#### Determine counts of topics: "service, product, quality, price"
* We study these because it is important to know what customers think on these topics
* Counts of: service, product, quality, price

In [None]:
dt_matrix.toarray()

In [None]:
# Map the document-term-matrix position of each topic token back to the topic name
topic_index_name_map = {
    product_index: 'product',
    service_index: 'service',
    quality_index: 'quality',
    price_index: 'price',
}

### Check to see how many documents contains the topics words
* For each topic word ('price', 'service' etc.), how many documents contains 0, 1, 2, ... count of those words
* Just a sanity check to see whether we counted properly or not

In [None]:
# Sanity check: for every topic token, list how many documents contain it 0, 1, 2, ... times.
# Only the column of the topic token is converted to a dense array; converting the whole
# document-term matrix on every iteration is unnecessary work.
for idx in [product_index, service_index, quality_index, price_index]:
    topic_column = dt_matrix.tocsr()[:, idx].toarray().ravel()
    unique, counts = np.unique(topic_column, return_counts=True)
    print(f"Index:{idx}, topic: {topic_index_name_map[idx]}, values:\n{np.asarray((unique, counts)).T}")

### Add count of topics as separate columns in data for easier analysis

In [None]:
# Attach the per-document topic totals as flat columns, in the order service, product, quality, price
topic_totals = {
    'service': service_indicator_counts,
    'product': product_indicator_counts,
    'quality': quality_indicator_counts,
    'price': price_indicator_counts,
}
for topic_name, per_document_total in topic_totals.items():
    text_data[topic_name] = np.array(per_document_total).ravel()
text_data

### Let us find average rating, aggregated by 'price' of the product

In [None]:
# Number of comments and mean rating for every value of the 'price' token count
count_and_mean_rating = {'COMMENT': 'count', 'RATING': 'mean'}
data_agg_price = text_data.groupby(['price'], as_index=False).agg(count_and_mean_rating)
data_agg_price


### let us find average rating, aggregated by number of usages of word 'service'

In [None]:
# Number of comments and mean rating for every value of the 'service' token count
count_and_mean_rating = {'COMMENT': 'count', 'RATING': 'mean'}
data_agg_service = text_data.groupby(['service'], as_index=False).agg(count_and_mean_rating)
data_agg_service

### let us find average rating, aggregated by number of usages of the word 'quality'

In [None]:
# Number of comments and mean rating for every value of the 'quality' token count
count_and_mean_rating = {'COMMENT': 'count', 'RATING': 'mean'}
data_agg_quality = text_data.groupby(['quality'], as_index=False).agg(count_and_mean_rating)
data_agg_quality

In [None]:
text_data

### Let us find average rating by month

In [None]:
# Number of comments and mean rating per calendar month of the review date (the index of text_data).
# The month is kept as the index of the result: with as_index=False the index is discarded and,
# depending on the pandas version, the month label is lost with it, leaving rows that cannot
# be attributed to a month.
data_agg_time = text_data.groupby(pd.Grouper(freq='M')).agg({'COMMENT': 'count', 'RATING':'mean'})
data_agg_time

### Let us find rating aggregated by the gender of the author of the comments

In [None]:
# Number of comments and mean rating per guessed gender of the author.
# NOTE(review): the result is stored under the name data_agg_quality, which replaces the
# aggregation by 'quality' computed earlier; consider a dedicated name.
count_and_mean_rating = {'COMMENT': 'count', 'RATING': 'mean'}
data_agg_quality = text_data.groupby(['GENDER'], as_index=False).agg(count_and_mean_rating)
data_agg_quality

### Let us find rating aggregated by product and brand name

In [None]:
# Number of comments and mean rating per (PRODUCT, BRAND) combination.
# NOTE(review): the result is stored under the name data_agg_quality, which replaces whatever
# aggregation that name held before; consider a dedicated name.
count_and_mean_rating = {'COMMENT': 'count', 'RATING': 'mean'}
data_agg_quality = text_data.groupby(['PRODUCT', 'BRAND'], as_index=False).agg(count_and_mean_rating)
data_agg_quality

### A way to convert document matrix into a matrix with documents as rows and token names (instead of token id) as columns for further analysis

In [None]:
## FOR SVM, we need to make a matrix with proper column names
# Also, we need another column that denotes the review
# We also need to normalize the data
#
# id_word_indexer holds (token, matrix column) pairs in the iteration order of the vocabulary
# dictionary, and that order is not guaranteed to follow the matrix column numbers. Renaming
# by DataFrame row number (id_word_indexer.to_dict()[0]) can therefore attach the wrong token
# to a column. The mapping is built explicitly from column 1 (matrix column) to column 0 (token).
column_to_token = id_word_indexer.set_index(1)[0].to_dict()
df_svm = pd.DataFrame(dt_matrix.toarray())
df_svm.rename(columns=column_to_token, inplace=True)
df_svm