# Project Elissa - NLP Playground


In [1]:
import re
import pandas as pd
from langdetect import detect, LangDetectException
import nltk
import spacy
import gensim
from gensim import corpora


In [2]:
df_products = pd.read_csv("../data/products.csv")
df_reviews = pd.read_csv("../data/reviews.csv")

df_reviews.dropna(subset=["body"], inplace=True)


## Language Detection


In [3]:
def detect_language(body: str | None) -> str:
    """Detect language of a string."""

    if body is None:
        return None
    try:
        body = str(body)
        return detect(body)
    except LangDetectException:
        return None


In [4]:
df_reviews["language"] = df_reviews["body"].apply(detect_language)

# Display languages that have more than 100 reviews
df_reviews["language"].value_counts().loc[lambda x: x > 100]


language
fr    3999
en    3572
de    2128
it    1322
es    1182
Name: count, dtype: int64

## Theme Modelling

- [Difference bewteen LSA and LDA - Medium](https://medium.com/@sujathamudadla1213/difference-between-lda-and-lsa-f7fefa6b4bfd)

We will proceed with only French for the time being.


In [5]:
LANG_EN = {"long": "english", "short": "en", "spacy": "en_core_web_sm"}
LANG = {"long": "french", "short": "fr", "spacy": "fr_core_news_sm"}


In [6]:
nltk.download("stopwords")
stop_words = nltk.corpus.stopwords.words(LANG["long"])

df = df_reviews[df_reviews["language"] == LANG["short"]].copy()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yunan.wang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Text Cleaning


In [7]:
def clean_text(text: str | None) -> str:
    """
    Clean text by removing non-alphabetic characters,
    shortwords and multiple spaces.
    """

    if text is None:
        return None

    # remove everything except alphabets (accents included)
    text = re.sub(r"[^a-zA-ZÀ-ÿ]", " ", text)
    # remove shortwords (length <= 3)
    text = " ".join([word for word in text.split() if len(word) > 3])
    # remove multiple spaces
    text = " ".join(text.split())
    # lowercase
    text = text.lower()

    return text


In [8]:
def remove_stopwords(text: str | None) -> str:
    """Remove stopwords from text."""

    if text is None:
        return None

    text = " ".join([word for word in text.split() if word not in stop_words])

    return text


In [9]:
doc_cleaned = df["body"].apply(clean_text).apply(remove_stopwords)


### LSA (Latent Semantic Analysis)


In [10]:
def train_LSA(texts: pd.Series, lang: str = LANG["long"]) -> pd.DataFrame:
    """Train Latent Semantic Analysis model to extract topics from texts."""

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD

    stopwords = nltk.corpus.stopwords.words(lang)
    vectorizer = TfidfVectorizer(
        stop_words=stopwords,
        max_features=1000,
        max_df=0.5,
        smooth_idf=True,
    )

    X = vectorizer.fit_transform(texts)
    svd_model = TruncatedSVD(
        n_components=10, algorithm="randomized", n_iter=100, random_state=122
    )
    svd_model.fit(X)
    terms = vectorizer.get_feature_names_out()

    topics = {}
    for i, comp in enumerate(svd_model.components_):
        terms_comp = zip(terms, comp)
        sorted_terms = sorted(terms_comp, key=lambda x: x[1], reverse=True)[:7]
        topics[f"Topic_{i}"] = [t[0] for t in sorted_terms]
    return pd.DataFrame(topics)


In [11]:
LSA_result = train_LSA(doc_cleaned)


In [12]:
LSA_result


Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9
0,très,produit,très,qualité,pratique,recommande,parfait,conforme,parfait,conforme
1,bien,conforme,bien,bonne,recommande,bien,conforme,bonne,très,très
2,produit,recommande,produit,prix,super,conforme,description,confortable,recommande,description
3,plus,qualité,pratique,rapport,conforme,description,prix,description,satisfaite,prix
4,qualité,efficace,joli,très,parfait,super,bien,taille,prix,satisfaite
5,recommande,tres,nettoie,excellent,très,elles,rapport,très,serviettes,plus
6,bonne,livraison,efficace,parfait,facile,prix,fait,recommande,confortable,rapport


### LDA (Latent Dirichlet Allocation)


In [13]:
!python -m spacy download {LANG["spacy"]}
nlp = spacy.load(LANG["spacy"], disable=["parser", "ner"])


Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m90.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [14]:
def lemmatize_text(
    nlp: spacy.language, text: str, tags=["NOUN", "ADJ", "VERB"]
) -> list[str]:
    """Lemmatize text."""

    doc = nlp(text)
    return [token.lemma_ for token in doc if token.pos_ in tags]


In [15]:
def train_LDA(docs: pd.Series) -> dict[str, object]:
    docs_lemmatized = docs.apply(lemmatize_text)
    dictionary = corpora.Dictionary(docs_lemmatized)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs_lemmatized]
    LDA = gensim.models.ldamodel.LdaModel
    lda_model = LDA(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=10,
        random_state=100,
        chunksize=1000,
        passes=50,
    )

    topics_df = pd.DataFrame(
        {
            f"topic_{i}": [token for token, _ in lda_model.show_topic(i, topn=10)]
            for i in range(lda_model.num_topics)
        }
    )
    output = {
        "model": lda_model,
        "dictionary": dictionary,
        "doc_term_matrix": doc_term_matrix,
        "topics_df": topics_df,
    }
    return output


In [16]:
LDA_result = train_LDA(doc_cleaned)
LDA_result["topics_df"]


Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
0,tampon,petit,mettre,qualité,super,produire,serviette,culotte,slip,grand
1,applicateur,taille,retirer,bon,absorbante,produit,hygiénique,flux,protège,petit
2,pratique,falloir,place,prix,protection,tre,fuite,taille,protèg,recevoir
3,utiliser,mettre,faire,parfaire,épais,cher,odeur,règle,marque,efficace
4,confortable,pouvoir,facile,rapport,joli,satisfaire,jetable,confortable,produit,recommander
5,marque,coupe,retraire,recommander,bel,paquet,produire,prendre,agréable,article
6,plastique,fois,noir,conforme,satisfait,odeur,pouvoir,petit,acheter,parfum
7,facile,utiliser,difficile,produire,confortable,recevoir,machine,fuite,prix,protection
8,produire,pratique,insertion,absorption,meilleur,recommander,protection,nuit,porter,confortable
9,recommander,coup,utilisation,rapide,sensible,commander,nuit,menstruel,petit,vêtement


In [17]:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(
    topic_model=LDA_result["model"],
    corpus=LDA_result["doc_term_matrix"],
    dictionary=LDA_result["dictionary"],
)


In [None]:
# TDOO: Finetune model
# https://zhuanlan.zhihu.com/p/604636818
# https://zhuanlan.zhihu.com/p/341449469


## Sentiment Analysis


In [25]:
# TODO: Find a better way to extract keywords
TAGS_KEYWORDS = {
    "Comfort": {
        "confort",
        "confortable",
        "aise",
        "douceur",
        "soft",
        "agréable",
        "ergonomique",
        "facile à utiliser",
        "sans gêne",
    },
    "Absorbency": {
        "absorption",
        "absorbe",
        "absorbant",
        "efficace",
        "capacité d'absorption",
        "haute absorption",
        "longue durée",
    },
    "Material": {
        "matériau",
        "material",
        "organique",
        "organic",
        "synthétique",
        "synthetic",
        "naturel",
        "hypoallergénique",
        "écologique",
        "durable",
        "planète",
        "allergies",
    },
    "Size and Fit": {
        "taille",
        "ajustement",
        "fit",
        "adaptable",
        "flexible",
        "ajustable",
        "sur mesure",
        "confortablement",
    },
    "Price": {
        "prix",
        "affordable",
        "pas cher",
        "économique",
        "value for money",
        "bon marché",
        "coût-efficace",
        "rentable",
        "abordable",
    },
    "Leak Protection": {
        "protection fuite",
        "anti-fuite",
        "étanche",
        "leak proof",
        "sécurisé",
        "protection",
        "fiable",
        "imperméable",
    },
    "Odor Control": {
        "contrôle odeur",
        "anti-odeur",
        "odor control",
        "neutralise",
        "sans odeur",
        "fraîcheur",
        "parfumé",
        "désodorisant",
    },
    "Discretion": {
        "discret",
        "discrétion",
        "emballage",
        "packaging",
        "subtil",
        "caché",
        "non visible",
        "élégant",
        "simple",
        "pratique",
        "inaperçu",
    },
}


In [70]:
def analyse_sentiment(doc: str) -> float:
    """Analyse sentiment of a text."""

    from textblob import TextBlob

    blob = TextBlob(doc)
    return blob.sentiment.polarity


def extract_sentiment_by_tag(keywords: set[str], doc: str) -> float:
    """Extract the sentiment around a tag, in the sentence in a text."""

    from nltk.tokenize import sent_tokenize

    doc = str(doc).lower()
    sentences = sent_tokenize(doc)

    sentiment = 0

    for sentence in sentences:
        if any(keyword in sentence for keyword in keywords):
            sentence_sentiment = 1 if analyse_sentiment(sentence) >= 0 else 0
            sentiment += sentence_sentiment

    return sentiment


def construct_doc_tag_score(
    doc: str, tags_keywords: dict[str, list[str]]
) -> dict[str, int]:
    """Construct a dictionary of tags and their sentiment score for a text."""

    doc = str(doc).lower()
    doc_score = {}

    for tag, keywords in tags_keywords.items():
        doc_score[tag] = extract_sentiment_by_tag(keywords, doc)

    return doc_score


In [66]:
# FIXME: check length issue

reviews_score = pd.DataFrame(
    [construct_doc_tag_score(doc, TAGS_KEYWORDS) for doc in df_reviews["body"]]
)

reviews_score["asin"] = df["asin"]
reviews_score


Unnamed: 0,Comfort,Absorbency,Material,Size and Fit,Price,Leak Protection,Odor Control,Discretion,asin
0,0,0,0,0,0,0,0,0,B07BFMNKBJ
1,0,0,0,0,0,0,0,0,B07BFMNKBJ
2,1,1,1,1,0,0,0,0,B07BFMNKBJ
3,1,0,0,0,0,0,0,0,B07BFMNKBJ
4,0,0,0,0,0,0,0,0,B07BFMNKBJ
...,...,...,...,...,...,...,...,...,...
12614,0,0,0,0,0,0,0,0,
12615,0,0,0,0,0,0,0,0,
12616,0,0,0,0,0,0,0,0,
12617,0,0,0,0,0,0,0,0,


In [71]:
df_scores = reviews_score.groupby("asin").sum()
df_scores


Unnamed: 0_level_0,Comfort,Absorbency,Material,Size and Fit,Price,Leak Protection,Odor Control,Discretion
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
B00028O5RY,1,2,1,0,0,0,0,0
B000E3DXCA,2,2,2,0,2,0,0,0
B000FAG6X0,2,0,0,0,0,0,0,0
B000W6KKMK,0,0,0,0,0,0,0,0
B000X29GY6,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
B0C1L89ZDR,0,1,0,0,1,1,0,0
B0C4TH74KS,0,0,0,0,0,0,0,0
B0C668XCZQ,0,0,0,0,0,0,0,0
B0CCF2B381,0,0,1,0,0,0,0,0
