# Project Elissa - Machine Learning Methodology


In [1]:
import pandas as pd
import nltk

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("vader_lexicon")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yunan.wang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yunan.wang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yunan.wang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/yunan.wang/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
df = pd.read_csv("../data/reviews_en.csv", index_col=0)
df["body"] = df["body"].fillna("No comment")
df.head()


Unnamed: 0,asin,rating,title,country,date,body,body_en
0,B07BFMNKBJ,1,Pas reçue,France,2020-12-02,Ont me dis que je les reçue alors que non !,I was told that I received them when I didn&#3...
1,B07BFMNKBJ,1,Sans commentaire,France,2018-10-11,Sans commentaire,No comment
2,B07BFMNKBJ,5,Parfait 👌,France,2022-01-27,"J'utilise ce modèle depuis des années, je ne c...","I have been using this model for years, I no l..."
3,B07BFMNKBJ,4,Bien mais,France,2021-11-21,Légèrement moins cher qu’en grande surface mai...,Slightly cheaper than supermarkets but Pro Con...
4,B07BFMNKBJ,4,Bons tampons,France,2022-01-29,Rien de particulier à signaler.,Nothing special to report.


In [3]:
# Unescape some HTML characters in the text
from html import unescape

df["body"] = df["body"].apply(unescape)
df["body_en"] = df["body_en"].apply(unescape)


## Translation, Text Cleaning, Preprocessing

First, in order to tackle the issue of multiple existing languages in the reviews corpus that might make the following steps more difficult and prone to error, we will translate all texts to English with `Google Cloud Translation API`. Although information can be lost due to this _à priori_ operation, this sacrifice can be leveraged.


In [4]:
def batch_translate():
    from google.cloud import translate_v2 as translate
    from tqdm import tqdm
    from concurrent.futures import ThreadPoolExecutor

    raise ValueError("Add your google cloud credentials to the environment")

    translate_client = translate.Client()

    def translate_text(text, target="en"):
        result = translate_client.translate(text, target_language=target)
        return result["translatedText"]

    with ThreadPoolExecutor() as ex:
        translated = list(tqdm(ex.map(translate_text, df["body"]), total=len(df)))

    return translated


# batch_translate()
# df["body_en"] = translated
# df.to_csv("../data/reviews_en.csv")


In [5]:
def clean_text(text: str | None) -> str:
    """
    Clean text by removing non-alphabetic characters,
    shortwords and multiple spaces.
    """
    import re

    if text is None:
        return None

    # remove everything except alphabets (accents included)
    text = re.sub(r"[^a-zA-ZÀ-ÿ]", " ", text)
    # remove shortwords (length <= 3)
    text = " ".join([word for word in text.split() if len(word) > 3])
    # remove multiple spaces
    text = " ".join(text.split())
    # lowercase
    text = text.lower()

    return text


In [6]:
def remove_stopwords(text: str | None) -> str:
    """Remove stopwords from text."""
    stop_words = nltk.corpus.stopwords.words("english")

    if text is None:
        return None

    text = " ".join([word for word in text.split() if word not in stop_words])

    return text


In [7]:
def lemmatize_text(text: str) -> list[str]:
    """Lemmatize text."""

    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize

    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(text)
    words = [
        word for word in words if word not in nltk.corpus.stopwords.words("english")
    ]

    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    return lemmatized_words


## Aspect Definition

We conduct a Theme Modeling with Latent Dirichlet Allocation (LDA) for find out the most important themes and their related keywords in the reviews corpus.


In [8]:
def train_LDA(docs: pd.Series) -> dict[str, object]:
    """Train LDA model."""

    import gensim
    from gensim import corpora

    docs_preprocessed: pd.Series[list[str]] = (
        docs.apply(clean_text).apply(remove_stopwords).apply(lemmatize_text)
    )
    dictionary = corpora.Dictionary(docs_preprocessed)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs_preprocessed]
    LDA = gensim.models.ldamodel.LdaModel
    lda_model = LDA(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=10,
        random_state=42,
        chunksize=1000,
        passes=100,
    )

    topics_df = pd.DataFrame(
        {
            f"topic_{i+1}": [token for token, _ in lda_model.show_topic(i, topn=10)]
            for i in range(lda_model.num_topics)
        }
    )
    output = {
        "model": lda_model,
        "dictionary": dictionary,
        "doc_term_matrix": doc_term_matrix,
        "topics_df": topics_df,
    }
    return output


In [9]:
import pickle

# LDA_result = train_LDA(df["body_en"])
# LDA_result["topics_df"]
# with open("../data/LDA_result.pkl", "wb") as f:
#     pickle.dump(LDA_result, f)

with open("../data/LDA_result.pkl", "rb") as f:
    LDA_result = pickle.load(f)
LDA_result["topics_df"]


Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10
0,package,first,comfortable,product,false,good,pad,panty,flow,tampon
1,description,menstrual,smell,item,review,product,time,liner,well,brand
2,perfect,cup,water,adhesive,amazon,quality,using,size,heavy,product
3,edge,size,clean,ordered,playercards,great,period,towel,good,made
4,received,time,easy,hygiene,vsemetrics,price,like,comfortable,leak,cotton
5,wing,also,wash,bag,desktop,recommend,year,small,period,applicator
6,glass,disc,leakage,received,player,excellent,tampon,well,thin,plastic
7,arrived,like,soft,daughter,event,fold,love,wear,practical,organic
8,pack,feel,feel,order,inline,delivery,product,absorbent,night,price
9,pouch,little,wearing,purchase,clientid,value,used,little,protection,silicone


In [10]:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(
    topic_model=LDA_result["model"],
    corpus=LDA_result["doc_term_matrix"],
    dictionary=LDA_result["dictionary"],
)


## Keyword Extraction

With each of the aspect pre-defined from the aspects extracted from the previous theme modeling, we apply the TF-IDF Vectorizer algorithm to find out the most relevant and recurrent keywords around each aspect.


In [11]:
def generate_keywords_set(corpus: pd.Series, aspects: set[str]) -> dict[str, list[str]]:
    """Generate a set of keywords for each pre-defined aspect."""

    import nltk
    import warnings
    from nltk.corpus import stopwords
    from sklearn.feature_extraction.text import TfidfVectorizer

    warnings.filterwarnings("ignore")

    def preprocess_text(text):
        tokens = nltk.word_tokenize(text)
        tokens = [word.lower() for word in tokens if word.isalnum()]
        stop_words = set(stopwords.words("english"))
        tokens = [word for word in tokens if word not in stop_words]
        return tokens

    def extract_keywords(corpus, aspect) -> list[str]:
        vectorizer = TfidfVectorizer(tokenizer=preprocess_text)
        tfidf_matrix = vectorizer.fit_transform(corpus)

        feature_names = vectorizer.get_feature_names_out()
        aspect_indices = [
            i for i, word in enumerate(feature_names) if aspect.lower() in word
        ]
        top_indices = sorted(
            aspect_indices, key=lambda i: tfidf_matrix.sum(axis=0)[0, i], reverse=True
        )
        top_keywords = [feature_names[i] for i in top_indices]

        return top_keywords

    output = {}
    for aspect in aspects:
        keywords = extract_keywords(corpus, aspect)
        output[aspect] = keywords

    return output


In [12]:
import json

# ASPECTS = {"comfort", "absorb", "material", "size", "price", "leak", "package"}
# KEYWORDS_SET = generate_keywords_set(corpus=df["body_en"], aspects=ASPECTS)

# with open("../data/keywords_set.json", "w") as f:
#     json.dump(KEYWORDS_SET, f)

with open("../data/keywords_set.json", "r") as f:
    KEYWORDS_SET = json.load(f)


In [13]:
for aspect, keywords in KEYWORDS_SET.items():
    print(aspect, keywords)


price ['price', 'prices', 'priced', 'pricey', 'overpriced', 'priceless', 'pricewise']
leak ['leaks', 'leak', 'leakage', 'leaking', 'leaked', 'leaky', 'leakages', 'leakproof', 'receivedleaked', 'leakguard', 'toiletleaks', 'leakage3', 'leakageproblem']
absorb ['absorbent', 'absorb', 'absorbency', 'absorbs', 'absorbed', 'absorbant', 'absorbents', 'absorbing', 'absorbancy', 'absorbance', 'superabsorbent', 'absorbence', 'absorbable', 'absorbtion', 'absorbentwith', 'absorbe', 'absorbencycons', 'absorbante']
comfort ['comfortable', 'uncomfortable', 'comfort', 'discomfort', 'comfortably', 'procomfort', 'uncomfortably', 'largecomfortable', 'comfortablegood', 'comfortablebut', 'promisedcomfortable', 'discomforts', 'comforts', 'comforting', 'discomfortable', 'discomforting', 'notuncomfortable', 'comfortcontrary', 'uncomfortablely']
material ['material', 'materials', 'materials5', 'materiali']
package ['package', 'packaged', 'packages', 'packagedvery', 'unpackaged', 'discreetlypackaged']
size ['si

## Keyword-based Sentiment Analysis

For a span to have a positive label there are two conditions, it needs to contain a predefined keyword and be associated by a neutral or positive sentiment. We included the neutral sentiment because some comments not being "positive enough" for the algortihm like "confortable et agréable" were identified as neutral when they were positive. On the other hand, for a span to be categorized under a negative label the conditions are that it needs to be associated with an obvious negative sentiment AND must contain a negative keyword.


In [14]:
def analyze_sentiment(doc: str) -> dict[str, float]:
    """Analyze sentiment of a document."""

    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    sid = SentimentIntensityAnalyzer()
    scores = sid.polarity_scores(doc)

    return scores


In [15]:
def analyze_sentiment_by_aspect(doc: str, aspect: list[str]) -> tuple[int, int]:
    """Analyze sentiment score of a document by an aspect."""

    from nltk.tokenize import sent_tokenize

    doc = doc.lower()
    sentences = sent_tokenize(doc)

    pos = 0
    neg = 0

    for keyword in aspect:
        for sentence in sentences:
            if keyword in sentence:
                scores = analyze_sentiment(sentence)
                if (scores["pos"] + scores["neu"]) / 2 > scores["neg"]:
                    pos += 1
                else:
                    neg += 1

    return pos, neg


In [16]:
def construct_doc_aspect_score(
    doc: str, aspects: dict[str, list[str]]
) -> dict[str, tuple[int, int]]:
    """Construct a dictionary of document-aspect scores."""

    output = {}
    for aspect, keywords in aspects.items():
        output[aspect] = analyze_sentiment_by_aspect(doc, keywords)

    return output


In [17]:
def construct_score_df(
    docs: pd.Series, aspects: dict[str, list[str]], asins: pd.Series
) -> pd.DataFrame:
    """Construct a dataframe of document-aspect scores."""

    scores = [
        construct_doc_aspect_score(doc, aspects) for doc in docs if doc is not None
    ]
    df_pos = pd.DataFrame(
        {aspect: [score[aspect][0] for score in scores] for aspect in aspects}
    )
    df_neg = pd.DataFrame(
        {aspect: [score[aspect][1] for score in scores] for aspect in aspects}
    )

    df_pos.columns = [f"{aspect}_pos" for aspect in aspects]
    df_neg.columns = [f"{aspect}_neg" for aspect in aspects]
    df = pd.concat([df_pos, df_neg], axis=1)
    df["asin"] = asins
    grouped = df.groupby("asin").sum()

    return grouped


In [18]:
# scores = construct_score_df(df["body_en"], KEYWORDS_SET, asins=df["asin"])
# scores.to_csv("../data/scores.csv")

scores = pd.read_csv("../data/scores.csv", index_col=0)
scores.head()


Unnamed: 0_level_0,price_pos,leak_pos,absorb_pos,comfort_pos,material_pos,package_pos,size_pos,price_neg,leak_neg,absorb_neg,comfort_neg,material_neg,package_neg,size_neg
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
B00028O5RY,5,4,24,23,1,1,6,0,0,3,6,0,0,0
B000E3DXCA,4,4,3,0,0,0,1,0,7,4,0,0,0,0
B000FAG6X0,0,8,0,9,0,2,1,0,0,0,2,0,0,0
B000W6KKLQ,0,0,0,4,0,0,0,0,0,0,0,0,0,0
B000W6KKMK,1,0,2,4,0,0,0,0,0,0,0,0,0,0


## Scores Normalisation

1. **Issue identified**
   Since every product don't have the same number of reviews, products with a large number of reviews might naturally have higher absolute positive and negative scores, so normalizing these scores will allow for a more equitable comparison across products with varying numbers of reviews. Therefore, a normalization step was needed, in order to give more weight to products with a high a number of reviews, we decided to use the Weighted Normalization Method.

2. **Calculation of net scores**
   For each label, calculated the net score by subtracting the negative score from the positive score.

3. **Weighted Normalization of Net Scores**

Applied a weighted normalization formula to each net score:
weighted normalized score = net score / log (total number of reviews + 1)

This formula accounts for the total number of reviews to reduce the impact of products with very few reviews.

The logarithm (log) function diminishes the impact of the total number of reviews. The logarithmic scale ensures that the increase in impact is smaller with each additional review, which helps mitigate the issue of products with very few reviews having disproportionately high normalized scores.
The +1 in the formula avoids division by zero for products with no reviews.

4. **Min/Max method**
   After applying weighted normalization, the scores were further normalized using the min/max method. This step scales each score to a range between 0 and 1, making it easier to compare scores across different labels and products.

5. **Why is this approach relevant?**

- **Valuing More Reviews**: Products with a higher number of reviews tend to have a larger denominator in the normalization formula, which leads to a smaller overall normalized score compared to the raw net score. However, because the logarithmic function grows slowly (logarithmic growth), the impact of additional reviews diminishes. Therefore, while more reviews do impact the score, the impact of each additional review is less significant in larger sets of reviews.
- **Reducing the Impact of Few Reviews**: For products with very few reviews, the normalization has a more significant effect, as the logarithm of a small number is considerably less than the logarithm of a large number. This means that a single positive or negative review will have less impact on the weighted normalized score for a product with many reviews compared to a product with only a few.


In [19]:
products = pd.read_csv("../data/products.csv", index_col=0)
composite = scores.merge(products[["asin", "num_reviews"]], on="asin")
composite


Unnamed: 0,asin,price_pos,leak_pos,absorb_pos,comfort_pos,material_pos,package_pos,size_pos,price_neg,leak_neg,absorb_neg,comfort_neg,material_neg,package_neg,size_neg,num_reviews
0,B00028O5RY,5,4,24,23,1,1,6,0,0,3,6,0,0,0,623.0
1,B000E3DXCA,4,4,3,0,0,0,1,0,7,4,0,0,0,0,184.0
2,B000FAG6X0,0,8,0,9,0,2,1,0,0,0,2,0,0,0,10944.0
3,B000W6KKLQ,0,0,0,4,0,0,0,0,0,0,0,0,0,0,114.0
4,B000W6KKMK,1,0,2,4,0,0,0,0,0,0,0,0,0,0,82.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,B0CL3VMWJR,3,0,0,0,0,0,2,0,0,0,0,0,0,0,1.0
613,B0CL56DM4L,3,0,0,0,0,0,2,0,0,0,0,0,0,0,2.0
614,B0CL56VDZV,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2.0
615,B0CL9VMF6K,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0


In [21]:
categories = [x.replace("_pos", "") for x in composite.columns if "_pos" in x]

for x in categories:
    composite[f"{x}_scaled"] = (
        composite[f"{x}_pos"] - composite[f"{x}_neg"]
    ) / composite["num_reviews"]

composite


Unnamed: 0,asin,price_pos,leak_pos,absorb_pos,comfort_pos,material_pos,package_pos,size_pos,price_neg,leak_neg,...,package_neg,size_neg,num_reviews,price_scaled,leak_scaled,absorb_scaled,comfort_scaled,material_scaled,package_scaled,size_scaled
0,B00028O5RY,5,4,24,23,1,1,6,0,0,...,0,0,623.0,0.008026,0.006421,0.033708,0.027287,0.001605,0.001605,0.009631
1,B000E3DXCA,4,4,3,0,0,0,1,0,7,...,0,0,184.0,0.021739,-0.016304,-0.005435,0.000000,0.000000,0.000000,0.005435
2,B000FAG6X0,0,8,0,9,0,2,1,0,0,...,0,0,10944.0,0.000000,0.000731,0.000000,0.000640,0.000000,0.000183,0.000091
3,B000W6KKLQ,0,0,0,4,0,0,0,0,0,...,0,0,114.0,0.000000,0.000000,0.000000,0.035088,0.000000,0.000000,0.000000
4,B000W6KKMK,1,0,2,4,0,0,0,0,0,...,0,0,82.0,0.012195,0.000000,0.024390,0.048780,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,B0CL3VMWJR,3,0,0,0,0,0,2,0,0,...,0,0,1.0,3.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.000000
613,B0CL56DM4L,3,0,0,0,0,0,2,0,0,...,0,0,2.0,1.500000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000
614,B0CL56VDZV,0,0,0,0,1,0,0,0,0,...,0,0,2.0,0.000000,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000
615,B0CL9VMF6K,0,0,0,0,0,0,0,0,0,...,0,0,2.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
