# Project Elissa - NLP Playground


In [1]:
import re
import sys

import gensim
import nltk
import pandas as pd
import spacy
from gensim import corpora
from langdetect import DetectorFactory, LangDetectException, detect


In [2]:
# Load the product and review tables from their CSV files.
df_products = pd.read_csv("../data/products.csv")
df_reviews = pd.read_csv("../data/reviews.csv")

# Drop the reviews whose "body" is missing; df_reviews is modified in place.
df_reviews.dropna(subset=["body"], inplace=True)


## Language Detection


In [3]:
def detect_language(body: str | None) -> str:
    """Detect language of a string."""

    if body is None:
        return None
    try:
        body = str(body)
        return detect(body)
    except LangDetectException:
        return None


# langdetect is non-deterministic by default: short or ambiguous reviews can
# be labelled differently on every run. Fixing the seed before the first
# detection makes the language counts reproducible.
DetectorFactory.seed = 0
df_reviews["language"] = df_reviews["body"].apply(detect_language)


In [4]:
# Count reviews per detected language and keep the languages with more than 100
df_reviews["language"].value_counts().pipe(lambda counts: counts[counts > 100])


language
fr    4000
en    3576
de    2134
it    1323
es    1183
Name: count, dtype: int64

## Topic Modelling

- [Difference between LSA and LDA - Medium](https://medium.com/@sujathamudadla1213/difference-between-lda-and-lsa-f7fefa6b4bfd)

We will proceed with only French for the time being.


In [5]:
# Per-language settings: "long" is the NLTK stopword corpus name, "short" is
# the code compared against the detected language, and "spacy" is the spaCy
# pipeline to load.
LANG_EN = dict(long="english", short="en", spacy="en_core_web_sm")
LANG = dict(long="french", short="fr", spacy="fr_core_news_sm")


In [6]:
# Fetch the NLTK stopword corpus and read the list for the selected language.
nltk.download("stopwords")
stop_words = nltk.corpus.stopwords.words(LANG["long"])

# Work on an independent copy of the reviews detected as the selected language.
df = df_reviews.loc[df_reviews["language"].eq(LANG["short"])].copy()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yunan.wang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Text Cleaning


In [7]:
def clean_text(text: str | None) -> str:
    """
    Clean text by removing non-alphabetic characters,
    shortwords and multiple spaces.
    """

    if text is None:
        return None

    # remove everything except alphabets (accents included)
    text = re.sub(r"[^a-zA-ZÀ-ÿ]", " ", text)
    # remove shortwords (length <= 3)
    text = " ".join([word for word in text.split() if len(word) > 3])
    # remove multiple spaces
    text = " ".join(text.split())
    # lowercase
    text = text.lower()

    return text


In [8]:
def remove_stopwords(text: str | None) -> str:
    """Remove stopwords from text."""

    if text is None:
        return None

    text = " ".join([word for word in text.split() if word not in stop_words])

    return text


In [9]:
# Clean every review body first, then strip the stopwords from the result.
doc_cleaned = (
    df["body"]
    .map(clean_text)
    .map(remove_stopwords)
)


### LSA (Latent Semantic Analysis)


In [10]:
def train_LSA(
    texts: pd.Series,
    lang: str = LANG["long"],
    *,
    n_topics: int = 10,
    n_top_terms: int = 7,
) -> pd.DataFrame:
    """Train a Latent Semantic Analysis model to extract topics from texts.

    The texts are vectorised with TF-IDF (vocabulary capped at 1000 features,
    ``max_df=0.5``) and decomposed with a truncated SVD.

    Args:
        texts: Documents to analyse, one string per entry.
        lang: Name of the NLTK stopword corpus passed to the vectorizer.
        n_topics: Number of SVD components, i.e. topics, to extract.
        n_top_terms: Number of highest-weighted terms to report per topic.

    Returns:
        A DataFrame with one ``Topic_<i>`` column per component. Each column
        lists that component's terms in descending order of weight.
    """

    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(
        stop_words=nltk.corpus.stopwords.words(lang),
        max_features=1000,
        max_df=0.5,
        smooth_idf=True,
    )
    X = vectorizer.fit_transform(texts)

    # Fixed random_state keeps the randomized solver reproducible.
    svd_model = TruncatedSVD(
        n_components=n_topics, algorithm="randomized", n_iter=100, random_state=122
    )
    svd_model.fit(X)
    terms = vectorizer.get_feature_names_out()

    topics = {}
    for i, comp in enumerate(svd_model.components_):
        # Rank by the signed component weight, largest first.
        ranked = sorted(zip(terms, comp), key=lambda pair: pair[1], reverse=True)
        topics[f"Topic_{i}"] = [term for term, _ in ranked[:n_top_terms]]
    return pd.DataFrame(topics)


In [11]:
# Extract LSA topics from the cleaned reviews.
LSA_result = train_LSA(texts=doc_cleaned)


In [12]:
# Display the table of top terms per LSA topic.
LSA_result


Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9
0,très,produit,très,qualité,pratique,recommande,parfait,conforme,pratique,bien
1,bien,conforme,bien,bonne,recommande,conforme,conforme,bonne,conforme,bonne
2,produit,recommande,produit,prix,super,bien,description,description,bien,taille
3,plus,qualité,pratique,rapport,parfait,description,prix,confortable,description,parfait
4,qualité,efficace,absorbe,très,conforme,super,taille,taille,trop,recommande
5,recommande,tres,efficace,excellent,très,elles,fait,très,qualité,super
6,bonne,chargé,joli,parfait,facile,prix,trop,recommande,taille,trop


### LDA (Latent Dirichlet Allocation)


In [13]:
!python -m spacy download {LANG["spacy"]}
nlp = spacy.load(LANG["spacy"], disable=["parser", "ner"])


Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [14]:
def lemmatize_text(
    text: str, tags: tuple[str, ...] | list[str] = ("NOUN", "ADJ", "VERB")
) -> list[str]:
    """Lemmatize text, keeping only tokens with the requested POS tags.

    Args:
        text: Text to run through the module-level spaCy pipeline ``nlp``.
        tags: Coarse part-of-speech tags (``token.pos_`` values) to keep.
            The default is an immutable tuple, so it cannot be altered
            between calls the way a shared list default could.

    Returns:
        The lemma of every token whose ``pos_`` is in ``tags``, in document
        order.
    """

    doc = nlp(text)
    return [token.lemma_ for token in doc if token.pos_ in tags]


In [15]:
def train_LDA(
    docs: pd.Series,
    *,
    num_topics: int = 10,
    top_n: int = 10,
    passes: int = 50,
) -> dict[str, object]:
    """Train a Latent Dirichlet Allocation model to extract topics from texts.

    Each document is lemmatized with ``lemmatize_text`` and converted to a
    bag-of-words vector before the gensim LDA model is fitted.

    Args:
        docs: Documents to analyse, one string per entry.
        num_topics: Number of topics the model should learn.
        top_n: Number of words to report per topic in ``topics_df``.
        passes: Number of training passes over the corpus.

    Returns:
        A dict with the following keys:
            ``"model"``: the fitted gensim ``LdaModel``.
            ``"dictionary"``: the gensim ``Dictionary`` built from the
                lemmatized documents.
            ``"doc_term_matrix"``: the bag-of-words corpus, one list per
                document.
            ``"topics_df"``: a DataFrame with one ``topic_<i>`` column per
                topic, holding that topic's ``top_n`` words.
    """

    docs_lemmatized = docs.apply(lemmatize_text)
    dictionary = corpora.Dictionary(docs_lemmatized)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs_lemmatized]

    # Fixed random_state keeps the fitted topics reproducible between runs.
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=100,
        chunksize=1000,
        passes=passes,
    )

    topics_df = pd.DataFrame(
        {
            f"topic_{i}": [token for token, _ in lda_model.show_topic(i, topn=top_n)]
            for i in range(lda_model.num_topics)
        }
    )
    return {
        "model": lda_model,
        "dictionary": dictionary,
        "doc_term_matrix": doc_term_matrix,
        "topics_df": topics_df,
    }


In [16]:
# Fit LDA on the cleaned reviews, then display the top words of each topic.
LDA_result = train_LDA(docs=doc_cleaned)
LDA_result["topics_df"]


Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
0,pratique,pouvoir,produit,qualité,serviette,marque,culotte,slip,recommander,taille
1,confortable,falloir,conforme,bon,hygiénique,autre,flux,protège,dire,petit
2,produire,mettre,article,prix,protection,faire,taille,protèg,parfait,tre
3,recommander,petit,description,rapport,tampon,tampon,confortable,agréable,modèle,produit
4,utiliser,serviette,correspondre,amazon,applicateur,produit,serviette,prix,commander,lavage
5,fuite,premier,lingerie,cher,parfaire,long,règle,place,fort,prendre
6,tampon,utiliser,recevoir,grand,pouvoir,merci,fuite,boîte,prix,usage
7,super,fois,produire,produire,flux,sympa,nuit,acheter,colle,produire
8,facile,taille,tissu,produit,marque,normal,petit,porter,femme,grand
9,efficace,place,recommand,jolie,plastique,tre,épais,parfum,satisfait,commander


In [17]:
import pyLDAvis
import pyLDAvis.gensim_models

# Switch pyLDAvis to notebook mode before building the visualisation.
pyLDAvis.enable_notebook()

# Arguments in order: topic model, bag-of-words corpus, dictionary.
pyLDAvis.gensim_models.prepare(
    LDA_result["model"],
    LDA_result["doc_term_matrix"],
    LDA_result["dictionary"],
)
