<strong>
    <font color="#0E1117">
        Author: lprtk
    </font>
</strong>

<br/>
<br/>


<Center>
    <h1 style="font-family: Arial">
        <font color="#0E1117">
            NLP: topic modeling using word2vec & LDA
        </font>
    </h1>
</Center>

-------------------------------------------------------------------------------------------------------------------------------

<div style="margin: 10px;">
    <h2 style="font-family: Arial">
        <font color="#0E1117">
            Introduction & context
        </font>
    </h2>
</div>

<p align="justify">
    The objective is to extract information and value from large volumes of textual data using Natural Language Processing (NLP). This notebook focuses on two things: using the word2vec algorithm to represent the words of several documents and study the similarities between them, and combining word2vec with the unsupervised learning algorithm LDA to perform topic modeling, i.e. grouping the documents by topic and detailing the keywords of each document.
</p>

-------------------------------------------------------------------------------------------------------------------------------

<div style="margin: 10px;">
    <h2 style="font-family: Arial">
        <font color="#0E1117">
            Libraries import
        </font>
    </h2>
</div>

In [None]:
from gensim import corpora
from gensim.models import LdaMulticore, Word2Vec
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.tokenize import word_tokenize
import pandas as pd
#from pyLDAvis import gensim_models
#pyLDAvis.enable_notebook()
from pyTCTK import TextNet, WordNet
import seaborn as sns
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings("ignore")
from wordcloud import WordCloud

-------------------------------------------------------------------------------------------------------------------------------

<div style="margin: 10px;">
    <h2 style="font-family: Arial">
        <font color="#0E1117">
            Data import
        </font>
    </h2>
</div>

In [None]:
# Load the papers dataset from the comma-separated file "papers.csv"
# (path is relative to the notebook's working directory).
df_data = pd.read_csv("papers.csv", sep=",")

In [None]:
# Preview the first three rows of the raw data.
df_data.head(n=3)

-------------------------------------------------------------------------------------------------------------------------------

<div style="margin: 10px;">
    <h2 style="font-family: Arial">
        <font color="#0E1117">
            Text cleaning
        </font>
    </h2>
</div>

<p style="text-align: justify">
    We clean up the text in order to normalize it (lowercase, punctuation, etc.), remove all special characters and words that don't make sense or don't provide any information (stopwords) and then we transform words with a common root into a single word (lemmatization).
</p>

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            1) Lowercase
        </font>
    </h3>
</div>

In [None]:
# Step 1: normalise the case of the "text" column.
# Presumably pyTCTK returns the cleaned DataFrame, since the result replaces df_data.
text_cleaner = TextNet(data=df_data, column="text")
df_data = text_cleaner.lowercase()

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            2) Punctuation
        </font>
    </h3>
</div>

In [None]:
# Step 2: strip punctuation from the "text" column.
# NOTE(review): this runs before remove_url / remove_html / remove_email /
# remove_mention below. If those helpers rely on characters such as "://", "<",
# ">" or "@", they may no longer match anything afterwards - confirm against the
# pyTCTK implementation and consider running this step after them.
text_cleaner = TextNet(data=df_data, column="text")
df_data = text_cleaner.remove_punctuation()

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            3) Specific cleaning
        </font>
    </h3>
</div>

In [None]:
# Step 3a: drop URLs from the "text" column.
text_cleaner = TextNet(data=df_data, column="text")
df_data = text_cleaner.remove_url()

In [None]:
# Step 3b: drop HTML markup from the "text" column.
text_cleaner = TextNet(data=df_data, column="text")
df_data = text_cleaner.remove_html()

In [None]:
# Step 3c: drop e-mail addresses from the "text" column.
text_cleaner = TextNet(data=df_data, column="text")
df_data = text_cleaner.remove_email()

In [None]:
# Step 3d: drop digits from the "text" column.
text_cleaner = TextNet(data=df_data, column="text")
df_data = text_cleaner.remove_digit()

In [None]:
# Step 3e: drop mentions from the "text" column.
text_cleaner = TextNet(data=df_data, column="text")
df_data = text_cleaner.remove_mention()

In [None]:
# Step 3f: drop isolated single characters from the "text" column.
text_cleaner = TextNet(data=df_data, column="text")
df_data = text_cleaner.remove_single_character()

In [None]:
# Step 3g: pyTCTK's additional cleaning pass, called without extra regular
# expressions (add_regexs=None).
text_cleaner = TextNet(data=df_data, column="text")
df_data = text_cleaner.additional_cleaning(add_regexs=None)

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            4) Remove stopwords
        </font>
    </h3>
</div>

In [None]:
# Step 4: remove English stopwords. No custom stopwords are added or
# whitelisted, and the text is neither lower-cased nor de-accented here.
stopword_options = {
    "language": "english",
    "lowercase": False,
    "remove_accents": False,
    "add_stopwords": None,
    "remove_stopwords": None,
}
word_cleaner = WordNet(data=df_data, column="text")
df_data = word_cleaner.remove_stopword(**stopword_options)

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            5) Lemmatization process
        </font>
    </h3>
</div>

In [None]:
# Step 5: lemmatize the English text, without lower-casing or removing accents
# at this stage.
lemmatize_options = {
    "language": "english",
    "lowercase": False,
    "remove_accents": False,
}
word_cleaner = WordNet(data=df_data, column="text")
df_data = word_cleaner.lemmatize(**lemmatize_options)

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            6) Remove spaces
        </font>
    </h3>
</div>

In [None]:
# Step 6a: tidy up spaces left behind by the previous cleaning steps.
text_cleaner = TextNet(data=df_data, column="text")
df_data = text_cleaner.remove_space()

In [None]:
# Step 6b: tidy up remaining whitespace in the "text" column.
text_cleaner = TextNet(data=df_data, column="text")
df_data = text_cleaner.remove_whitespace()

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            7) Tokenization process
        </font>
    </h3>
</div>

In [None]:
# Step 7: split every cleaned document into a list of tokens with NLTK.
# word_tokenize is passed directly; wrapping it in a lambda adds nothing.
df_data["text_tokenized"] = df_data["text"].apply(word_tokenize)

In [None]:
# Preview the first three rows after cleaning and tokenization.
df_data.head(n=3)

-------------------------------------------------------------------------------------------------------------------------------

<div style="margin: 10px;">
    <h2 style="font-family: Arial">
        <font color="#0E1117">
            Vectorization using Word2vec
        </font>
    </h2>
</div>

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            1) Build model
        </font>
    </h3>
</div>

In [None]:
# Word2Vec hyper-parameters, gathered in one place so they are easy to tune.
w2v_params = {
    "min_count": 600,     # ignore words occurring fewer than 600 times in the corpus
    "window": 10,         # context window size
    "vector_size": 250,   # dimension of the word embeddings
    "alpha": 0.03,        # initial learning rate
    "min_alpha": 0.0007,  # final learning rate
    "workers": 4,         # training threads
    "seed": 42,
}

# NOTE(review): gensim documents that a fully reproducible run also requires a
# single worker thread (and a fixed PYTHONHASHSEED); with workers=4 the
# embeddings may differ slightly between runs - confirm whether that matters.
w2v_model = Word2Vec(sentences=df_data["text_tokenized"], **w2v_params)

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            2) Words similarity
        </font>
    </h3>
</div>

In [None]:
# Ten nearest neighbours of "estimator" in the embedding space.
# The word must have survived the min_count filter, otherwise gensim raises KeyError.
word_vectors = w2v_model.wv
sim_words = word_vectors.most_similar("estimator", topn=10)
print(sim_words)

In [None]:
# Ten nearest neighbours of "connectivity" in the embedding space.
# The word must have survived the min_count filter, otherwise gensim raises KeyError.
word_vectors = w2v_model.wv
sim_words_2 = word_vectors.most_similar("connectivity", topn=10)
print(sim_words_2)

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            3) Words visualization
        </font>
    </h3>
</div>

In [None]:
# Project every word vector of the Word2Vec vocabulary onto two dimensions with
# t-SNE and draw each word as an annotated point.

# Vocabulary words, in the order stored by the model.
labels = list(w2v_model.wv.key_to_index)

# One embedding per word. The previous code indexed `wv` with the whole
# `key_to_index` dict on every iteration, which returns the vectors of the full
# vocabulary each time and handed t-SNE a 3-D input instead of one row per word.
tokens = [w2v_model.wv[word] for word in labels]

# NOTE(review): recent scikit-learn releases require perplexity to be smaller
# than the number of samples and rename `n_iter` to `max_iter` - adjust if the
# installed version complains.
tsne_model = TSNE(perplexity=50, n_components=2, init="pca", n_iter=2000, random_state=23)
new_values = tsne_model.fit_transform(tokens)

# Split the 2-D embedding into its x and y coordinates.
x = [point[0] for point in new_values]
y = [point[1] for point in new_values]

plt.figure(figsize=(15, 13))
# One scatter call per word so that every point gets its own colour from the
# colour cycle, as before.
for label, x_coord, y_coord in zip(labels, x, y):
    plt.scatter(x_coord, y_coord)
    plt.annotate(
        label,
        xy=(x_coord, y_coord),
        xytext=(5, 2),
        textcoords="offset points",
        ha="right",
        va="bottom"
    )
plt.show()

-------------------------------------------------------------------------------------------------------------------------------

<div style="margin: 10px;">
    <h2 style="font-family: Arial">
        <font color="#0E1117">
            Topic modeling using LDA
        </font>
    </h2>
</div>

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            1) Create dictionary
        </font>
    </h3>
</div>

In [None]:
tokenized_papers = df_data["text_tokenized"]

# Vocabulary: maps every distinct token to an integer id.
dictionary = corpora.Dictionary(tokenized_papers)

# Bag-of-words corpus: one list of (token_id, count) pairs per paper.
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_papers))

In [None]:
# First 30 (token_id, count) pairs of the first document.
first_document_bow = doc_term_matrix[0]
print(first_document_bow[:30])

In [None]:
# Same bag-of-words for the first document, with token ids replaced by the
# tokens themselves: (token, count) pairs.
[
    [(dictionary[token_id], count) for token_id, count in document_bow]
    for document_bow in doc_term_matrix[:1]
]

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            2) Build model
        </font>
    </h3>
</div>

In [None]:
# LDA hyper-parameters, gathered in one place so they are easy to tune.
lda_params = {
    "num_topics": 4,          # the word-cloud cells below assume four topics (2 x 2 grid)
    "random_state": 42,
    "chunksize": 200,         # documents per training chunk
    "passes": 100,            # full passes over the corpus
    "per_word_topics": True,  # the dominant-topic cell below handles this output format
}

lda_model = LdaMulticore(corpus=doc_term_matrix, id2word=dictionary, **lda_params)

In [None]:
# Show the most significant words of each topic with their weights.
lda_model.print_topics()

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            3) Topics' coherence
        </font>
    </h3>
</div>

In [None]:
# Score the fitted topics with the "c_v" coherence measure, computed from the
# tokenized documents and the dictionary used to train the model.
coherence_settings = {
    "model": lda_model,
    "texts": df_data["text_tokenized"],
    "dictionary": dictionary,
    "coherence": "c_v",
}
coherence_model_lda = CoherenceModel(**coherence_settings)
coherence_lda = coherence_model_lda.get_coherence()

print(f"Coherence score: {round(coherence_lda, 4)}")

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            4) Topics' keywords
        </font>
    </h3>
</div>

In [None]:
# One word cloud per topic, built from the topic's keyword weights.

# One colour per topic, indexed by topic number.
list_colors = ["#17C37B", "#F92969", "#FACA0C", "#0D1117"]

# The colour callback reads the notebook-level loop variable `topic_idx` at the
# moment a cloud is generated, so each topic is drawn in its own colour.
wc = WordCloud(
    background_color="white",
    max_words=10,
    max_font_size=300,
    colormap="tab10",
    color_func=lambda *args, **kwargs: list_colors[topic_idx],
    prefer_horizontal=1.0
)

# Unformatted output: a list of (topic_id, [(word, weight), ...]) entries.
topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharey=True, dpi=160)

for topic_idx, ax in enumerate(axes.flatten()):
    # Make this subplot the current axes (the pyplot calls after the loop act
    # on the last one).
    fig.add_subplot(ax)
    keyword_weights = dict(topics[topic_idx][1])
    wc.generate_from_frequencies(keyword_weights)
    ax.imshow(wc)
    ax.set_title("Topic "+str(topic_idx), fontdict=dict(size=16))
    ax.axis("off")

plt.axis("off")
plt.subplots_adjust(wspace=0.1, hspace=0)
plt.margins(x=0, y=0)
plt.show()

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            5) Dominant topic for each document
        </font>
    </h3>
</div>

In [None]:
# Find the dominant topic of every document and record, per document, the topic
# id, its contribution to the document and the topic's keywords.
#
# Records are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every
# iteration.
topic_records = []
keywords_by_topic = {}  # topic id -> "word1, word2, ..." (computed once per topic)

for doc_topics in lda_model[doc_term_matrix]:
    # With per_word_topics enabled the model output is a tuple whose first
    # element is the (topic_id, probability) list; otherwise it is that list.
    row = doc_topics[0] if lda_model.per_word_topics else doc_topics

    if not row:
        # No topic reported for this document: keep a placeholder so the rows
        # stay aligned with df_data when the text column is attached below.
        topic_records.append({"topic": None, "contribution": None, "keywords": None})
        continue

    # Most probable topic; on ties max() keeps the first one, exactly like the
    # previous stable descending sort followed by taking the first element.
    topic_num, prop_topic = max(row, key=lambda pair: pair[1])

    if topic_num not in keywords_by_topic:
        keywords_by_topic[topic_num] = ", ".join(
            word for word, _ in lda_model.show_topic(topic_num)
        )

    topic_records.append(
        {
            "topic": int(topic_num),
            "contribution": round(prop_topic, 4),
            "keywords": keywords_by_topic[topic_num],
        }
    )

df_topics = pd.DataFrame(
    topic_records,
    columns=["topic", "contribution", "keywords"]
)

# add original text to the end of the output
df_topics = pd.concat([df_topics, df_data["text"]], axis=1)

In [None]:
# Preview the dominant topic, its contribution and keywords for the first three documents.
df_topics.head(n=3)

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            6) Topics' distribution
        </font>
    </h3>
</div>

In [None]:
# Bar chart of the number of documents whose dominant topic is each topic.
fig, ax = plt.subplots(figsize=(30, 13))

documents_per_topic = df_topics["topic"].value_counts()
documents_per_topic.plot(kind="bar", color="#17C37B", ax=ax)

ax.set_title("Topics' distribution by number of documents", size=18)
ax.set_xlabel("Topics", size=16)
ax.set_ylabel("Number of documents", size=16)
plt.xticks(rotation=0)
plt.subplots_adjust(hspace=0.3)
plt.show()

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            7) Word clouds of the documents' text by topic
        </font>
    </h3>
</div>

In [None]:
# Attach each document's dominant topic to the main DataFrame.
# The assignment aligns on the index, so df_topics and df_data must share it.
df_data["topic"] = df_topics["topic"]

In [None]:
# One word cloud per topic, built from the text of the documents whose dominant
# topic is that topic.

# One colour per topic, indexed by topic number.
list_colors = ["#17C37B", "#F92969", "#FACA0C", "#0D1117"]

# The colour callback reads the notebook-level loop variable `i` at the moment a
# cloud is generated, so the loop below must keep using `i` as its index.
wc = WordCloud(
    max_words=30,
    min_font_size=10,
    background_color="white",
    colormap="tab10",
    color_func=lambda *args, **kwargs: list_colors[i],
    stopwords=None,
    prefer_horizontal=1.0
)

fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharey=True, dpi=160)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)

    # Join the full text of the topic's documents. str(Series) was used before,
    # which yields pandas' truncated display (ellipses plus the
    # "Name: text, dtype: object" footer) rather than the documents themselves.
    topic_texts = df_data.loc[df_data["topic"] == i, "text"].dropna().astype(str)
    topic_text = " ".join(topic_texts)

    # WordCloud.generate raises when it finds no word, so a topic without any
    # document is left as an empty, titled panel.
    if topic_text.strip():
        wc.generate(topic_text)
        ax.imshow(wc)

    ax.set_title("Topic "+str(i), fontdict=dict(size=16))
    ax.axis("off")

plt.axis("off")
plt.subplots_adjust(wspace=0.1, hspace=0)
plt.margins(x=0, y=0)
plt.show()

<div style="margin: 10px;">
    <h3 style="font-family: Arial">
        <font color="#0E1117">
            8) Topics' visualization
        </font>
    </h3>
</div>

gensim_models.prepare(lda_model, doc_term_matrix, dictionary)