# Project Elissa - NLP Playground


In [3]:
import re
import pandas as pd
from langdetect import detect, LangDetectException
import nltk
import spacy
import gensim
from gensim import corpora


In [5]:
df_products = pd.read_csv("../data/products.csv")
df_reviews = pd.read_csv("../data/reviews.csv")


## Language Detection


In [3]:
def detect_language(body: str | None) -> str:
    """Detect language of a string."""

    if body is None:
        return None
    try:
        return detect(body)
    except LangDetectException:
        return None


df_reviews["language"] = df_reviews["body"].apply(detect_language)


In [4]:
# Display languages that have more than 100 reviews
df_reviews["language"].value_counts().loc[lambda x: x > 100]


language
fr    3998
en    3575
de    2127
it    1325
es    1182
Name: count, dtype: int64

## Theme Modelling

- [Difference bewteen LSA and LDA - Medium](https://medium.com/@sujathamudadla1213/difference-between-lda-and-lsa-f7fefa6b4bfd)

We will proceed with only French for the time being.


In [5]:
LANG_EN = {"long": "english", "short": "en", "spacy": "en_core_web_sm"}
LANG = {"long": "french", "short": "fr", "spacy": "fr_core_news_sm"}


### Text Cleaning


In [16]:
nltk.download("stopwords")

df = df_reviews.query("language == 'en'")
stop_words = nltk.corpus.stopwords.words(LANG["long"])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yunan.wang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
def clean_text(text: str | None) -> str:
    """
    Clean text by removing non-alphabetic characters,
    shortwords and multiple spaces.
    """

    if text is None:
        return None

    # remove everything except alphabets (accents included)
    text = re.sub(r"[^a-zA-ZÀ-ÿ]", " ", text)
    # remove shortwords (length <= 3)
    text = " ".join([word for word in text.split() if len(word) > 3])
    # remove multiple spaces
    text = " ".join(text.split())
    # lowercase
    text = text.lower()

    return text


In [18]:
def remove_stopwords(text: str | None) -> str:
    """Remove stopwords from text."""

    if text is None:
        return None

    text = " ".join([word for word in text.split() if word not in stop_words])

    return text


In [19]:
doc_cleaned = df["body"].apply(clean_text).apply(remove_stopwords)


### LSA (Latent Semantic Analysis)


In [20]:
def train_LSA(texts: pd.Series, lang: str = LANG["long"]) -> dict[int, list[str]]:
    """Train Latent Semantic Analysis model to extract topics from texts."""

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD

    stopwords = nltk.corpus.stopwords.words(lang)
    vectorizer = TfidfVectorizer(
        stop_words=stopwords,
        max_features=1000,
        max_df=0.5,
        smooth_idf=True,
    )

    X = vectorizer.fit_transform(texts)
    svd_model = TruncatedSVD(
        n_components=10, algorithm="randomized", n_iter=100, random_state=122
    )
    svd_model.fit(X)
    terms = vectorizer.get_feature_names_out()

    topics = {}
    for i, comp in enumerate(svd_model.components_):
        terms_comp = zip(terms, comp)
        sorted_terms = sorted(terms_comp, key=lambda x: x[1], reverse=True)[:7]
        topics[i] = [t[0] for t in sorted_terms]
    return topics


In [21]:
LSA_result = train_LSA(doc_cleaned)


In [22]:
LSA_result


{0: ['product', 'good', 'great', 'like', 'comfortable', 'easy', 'cups'],
 1: ['good', 'product', 'great', 'quality', 'price', 'value', 'comfortable'],
 2: ['great', 'product', 'easy', 'comfortable', 'love', 'organic', 'tampons'],
 3: ['comfortable',
  'easy',
  'product',
  'clean',
  'menstrual',
  'remove',
  'effective'],
 4: ['product', 'disc', 'first', 'using', 'time', 'period', 'inside'],
 5: ['love',
  'organic',
  'tampons',
  'applicator',
  'plastic',
  'comfortable',
  'cotton'],
 6: ['love', 'easy', 'clean', 'quality', 'works', 'good', 'recommend'],
 7: ['love', 'size', 'well', 'comfortable', 'small', 'liners', 'soft'],
 8: ['easy', 'small', 'well', 'size', 'liners', 'plastic', 'would'],
 9: ['size', 'small', 'applicator', 'tampons', 'perfect', 'large', 'organic']}

### LDA (Latent Dirichlet Allocation)


In [23]:
!python -m spacy download {LANG["spacy"]}
nlp = spacy.load(LANG["spacy"], disable=["parser", "ner"])


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [24]:
def lemmatize_text(text: str, tags=["NOUN", "ADJ", "VERB"]) -> list[str]:
    """Lemmatize text."""

    doc = nlp(text)
    return [token.lemma_ for token in doc if token.pos_ in tags]


In [25]:
doc_lemmatized = doc_cleaned.apply(lemmatize_text)


In [35]:
def train_LDA(docs: pd.Series) -> dict[str, object]:
    docs_lemmatized = docs.apply(lemmatize_text)
    dictionary = corpora.Dictionary(docs_lemmatized)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs_lemmatized]
    LDA = gensim.models.ldamodel.LdaModel
    lda_model = LDA(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=10,
        random_state=100,
        chunksize=1000,
        passes=50,
    )

    topics_df = pd.DataFrame(
        {
            f"topic_{i}": [token for token, _ in lda_model.show_topic(i, topn=10)]
            for i in range(lda_model.num_topics)
        }
    )
    output = {
        "model": lda_model,
        "dictionary": dictionary,
        "doc_term_matrix": doc_term_matrix,
        "topics_df": topics_df,
    }
    return output


In [36]:
LDA_result = train_LDA(doc_cleaned)
LDA_result["topics_df"]


Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
0,product,false,liner,product,absorbent,tampon,use,clean,comfy,period
1,receive,review,pad,great,comfortable,applicator,cup,wash,wing,use
2,fall,playercard,thin,good,natural,organic,disc,smell,sticky,tampon
3,pack,vsemetric,stay,love,protection,brand,size,brand,amount,heavy
4,return,clientid,panty,recommend,arrive,one,small,use,moon,time
5,order,player,bag,use,flimsy,find,feel,work,ultra,feel
6,purchase,event,place,comfortable,pantie,plastic,try,good,usual,day
7,open,true,underwear,easy,chemical,make,menstrual,price,thick,hour
8,irritate,videoasinlist,stick,quality,unscented,cotton,easy,love,hygienic,leak
9,look,isinternal,wrap,price,package,use,make,skin,variety,take


In [39]:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(
    topic_model=LDA_result["model"],
    corpus=LDA_result["doc_term_matrix"],
    dictionary=LDA_result["dictionary"],
)
