[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nils-holmberg/socs-qmd/blob/main/jnb/lab2_nlp2.ipynb)

In [None]:
# running pyldavis in google colab env
!pip install --upgrade pandas
# gensim topic modeling plotting tools
!pip install -q pyLDAvis

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# load text data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
#!pip install gdown
!gdown https://drive.google.com/uc?id=1EMzJxxoBaN_NbvF7xhoc09K82vQ6H_LX

In [None]:
# Read the downloaded spreadsheet; it is read without a header row, so the
# three columns are named explicitly here.
fp = "content.xlsx"
df = pd.read_excel(
    fp,
    header=None,
    names=['id', 'image', 'text'],
)
# preview the first rows
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Pull the `text` column out of the frame as a plain Python list of documents
corpus = df["text"].values.tolist()
# peek at the first two documents
corpus[:2]

In [None]:
# Toy corpus of five news headlines used for the small worked example below.
# Note: this rebinds `corpus`, replacing the list taken from the spreadsheet.
corpus = [
    "Rafael Nadal Joins Roger Federer in Missing U.S. Open",
    "Rafael Nadal Is Out of the Australian Open",
    "Biden Announces Virus Measures",
    "Biden's Virus Plans Meet Reality",
    "Where Biden's Virus Plan Stands",
]

In [None]:
# Download necessary NLTK data
import nltk
# fetch the NLTK 'popular' download collection before the corpus readers
# imported below are used
nltk.download('popular')
from nltk.corpus import stopwords  # stop-word lists; the English list is used by the cleaning cell
from nltk.stem.wordnet import WordNetLemmatizer  # lemmatizer used by the cleaning cell
import string  # string.punctuation is used by the cleaning cell

In [None]:
# define data cleaning function
stop = set(stopwords.words('english'))  # English stop words to drop
exclude = set(string.punctuation)       # punctuation characters to remove
lemma = WordNetLemmatizer()

def clean(doc):
    """Normalize one document into a space-separated string of lemmas.

    Steps, in order:
      1. lower-case the document and split it on whitespace;
      2. drop stop words;
      3. remove every punctuation character;
      4. lemmatize each remaining token with the WordNet lemmatizer.

    A token is compared against the stop-word list after stripping leading
    and trailing punctuation, so stop words with attached punctuation
    (e.g. "the," or "(and") are dropped too. Previously they survived the
    filter and re-appeared as plain stop words once step 3 had removed the
    punctuation.

    Args:
        doc: the document as a string.

    Returns:
        The cleaned document as a single string of space-separated tokens.
    """
    punctuation = string.punctuation
    kept = [
        token
        for token in doc.lower().split()
        if token.strip(punctuation) not in stop
    ]
    stop_free = " ".join(kept)
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

# clean data stored in a new list: one list of tokens per document
clean_corpus = [clean(doc).split() for doc in corpus]

In [None]:
# inspect the cleaned corpus: one list of tokens per document
clean_corpus

# document term matrix (gensim approach)

![](https://raw.githubusercontent.com/nils-holmberg/socs-qmd/main/res/img/nlp-image_0-259d7a671398a16dc7cdfe05d89d4880.png)


In [None]:
# Importing Gensim
import gensim
from gensim import corpora

In [None]:
# Build the gensim term dictionary for the corpus: every unique term found in
# clean_corpus is assigned an integer index.
dict_ = corpora.Dictionary(clean_corpus)

# printing the dictionary shows a short summary of it
print(dict_)

In [None]:
# The dictionary has 18 unique words in the cleaned corpus; list them one per line.
for term in dict_.values():
    print(term)

In [None]:
# Turn every tokenized document into its bag-of-words representation using the
# dictionary; together these form the document-term matrix.
doc_term_matrix = [dict_.doc2bow(tokens) for tokens in clean_corpus]
doc_term_matrix

# topic modeling (gensim approach)

In [None]:
# Short alias for gensim's LDA model class
from gensim.models.ldamodel import LdaModel as Lda

In [None]:
# Train a two-topic LDA model on the document-term matrix.
# random_state is fixed so that repeated runs use the same seed.
ldamodel = Lda(
    doc_term_matrix,
    num_topics=2,
    id2word=dict_,
    passes=1,
    random_state=0,
    eval_every=None,
)

In [None]:
# Show the topics of the model together with their indexes
# (the model above was trained with num_topics=2).
# We need to check manually whether the topics differ from one another.
ldamodel.print_topics()

In [None]:
# num_topics: how many topics to extract
# num_words: how many words to show per topic
top_words = ldamodel.print_topics(num_topics=2, num_words=5)
print(top_words)

In [None]:
# print the topic associations of every document, numbered from 0
for count, i in enumerate(ldamodel[doc_term_matrix]):
    print("doc : ",count,i)

# topic modeling optimization (gensim)

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
# Take the `text` column of the frame as a plain Python list of documents
data = df["text"].values.tolist()

In [None]:
# peek at the first two raw documents
print(data[:2])

In [None]:
import re

# Optional regex clean-up of the raw documents. It is disabled by default, so
# `data` is passed on unchanged unless this flag is set to True.
APPLY_TEXT_CLEANUP = False

def strip_noise(texts):
    """Return a cleaned copy of `texts`; every item is converted with str().

    Per document, in this order:
      1. replace the HTML entity "&#039;" with an apostrophe;
      2. remove e-mail-like tokens (anything containing "@");
      3. collapse runs of whitespace, including newlines, into one space;
      4. remove single quotes.

    The entity replacement is applied to the documents themselves. It used
    to be applied to df['text'] after `data` had already been extracted, so
    it never reached `data` and modified the frame in place instead.

    Args:
        texts: iterable of documents.

    Returns:
        list of str, one cleaned document per input item.
    """
    cleaned = []
    for sent in texts:
        sent = str(sent).replace("&#039;", "'")
        # Remove Emails
        sent = re.sub(r'\S*@\S*\s?', '', sent)
        # Remove new line characters
        sent = re.sub(r'\s+', ' ', sent)
        # Remove distracting single quotes
        sent = re.sub(r"\'", "", sent)
        cleaned.append(sent)
    return cleaned

if APPLY_TEXT_CLEANUP:
    data = strip_noise(data)
    print(data[:2])

In [None]:
# spacy for nlp analysis
import spacy

def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    Args:
        texts: iterable of documents. Entries that are not strings (for
            example missing values) are treated as empty documents, so the
            output stays aligned one-to-one with the input instead of
            raising inside spaCy.
        allowed_postags: values of `token.pos_` to keep. The default is an
            immutable tuple rather than a shared mutable list.

    Returns:
        list of str: one string per input document, made of the lemmas of
        the kept tokens joined by single spaces.
    """
    # the "parser" and "ner" pipeline components are disabled when loading
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    allowed = set(allowed_postags)
    # nlp.pipe processes the documents as a stream, in input order
    docs = nlp.pipe(text if isinstance(text, str) else "" for text in texts)
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed)
        for doc in docs
    ]

lemmatized_texts = lemmatization(data)
# first 50 characters of the first lemmatized document
print(lemmatized_texts[0][0:50])

In [None]:
def gen_words(texts):
    """Tokenize each document with gensim's simple_preprocess (deacc=True).

    Returns a list with one list of tokens per input document.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

data_words = gen_words(lemmatized_texts)

# first five tokens of the first document
print(data_words[0][0:5])

In [None]:
# dictionary mapping integer ids to the tokens found in data_words
id2word = corpora.Dictionary(data_words)

# bag-of-words representation of every document.
# Note: this rebinds `corpus`, which earlier cells used for the raw headlines.
corpus = [id2word.doc2bow(tokens) for tokens in data_words]

# first 20 entries of the first document's bag of words
print(corpus[0][0:20])


In [None]:
# look up the token stored under id 0
word = id2word[0]
print(word)

In [None]:
# token stored under id 0 in the dictionary
id2word[0]

In [None]:
# Human-readable view of the first document: (token, frequency) pairs
# instead of (token id, frequency) pairs
[
    [(id2word[token_id], freq) for token_id, freq in bow]
    for bow in corpus[:1]
]

In [None]:
# Fit one LDA model for each candidate number of topics (1 to 19), record its
# u_mass coherence, and plot coherence against the number of topics.
topics = list(range(1, 20))
score = []

for n_topics in topics:
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=n_topics,
        iterations=10,
        passes=10,
        random_state=100,
    )
    cm = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        dictionary=id2word,
        coherence='u_mass',
    )
    score.append(cm.get_coherence())

plt.plot(topics, score)
plt.xlabel('number of topics')
plt.ylabel('u_mass coherence score (-14, 14)')
plt.show()

In [None]:
# Same sweep over 1 to 19 topics, scored with c_v coherence; c_v is computed
# from the tokenized texts, which are passed in as `texts`.
topics = list(range(1, 20))
score = []

for n_topics in topics:
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=n_topics,
        iterations=10,
        passes=10,
        random_state=100,
    )
    cm = CoherenceModel(
        model=lda_model,
        texts=data_words,
        corpus=corpus,
        dictionary=id2word,
        coherence='c_v',
    )
    score.append(cm.get_coherence())

plt.plot(topics, score)
plt.xlabel('number of topics')
plt.ylabel('c_v coherence score (0-1)')
plt.show()

In [None]:
# Final model, fitted with the number of topics chosen from the coherence
# plots (4); the other settings match the sweep.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    iterations=10,
    passes=10,
    random_state=100,
)


In [None]:
# Print the topics of the final model (print_topics is called with -1)
lda_model.print_topics(-1)

# topic modeling visualization (gensim)

In [None]:
import pyLDAvis
# pyLDAvis 3.3 renamed its gensim helper module from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`. The install cell pulls the latest release, so try
# the new name first and fall back to the old one for older installations.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Visualize the topics of the final model
vis = gensimvis.prepare(lda_model, corpus, id2word)
# display() renders the visualization explicitly, so
# pyLDAvis.enable_notebook() is not needed here
pyLDAvis.display(vis)

In [None]:
# Save the report: write the prepared visualization `vis` to an HTML file
pyLDAvis.save_html(vis, 'lab2-nlp2-topics.html')

# topic modeling with gensim and pyldavis
- [https://nils-holmberg.github.io/cca-nlp/jnb/scom-gpols-topics.html](https://nils-holmberg.github.io/cca-nlp/jnb/scom-gpols-topics.html)