In [None]:
#!pip install gensim
#!pip install pyLDAvis

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# imported below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')


In [None]:
import sqlalchemy as sal
from sqlalchemy import text

import pandas as pd
import re

import nltk
from nltk.corpus import stopwords

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt


# Fetch the NLTK "stopwords" corpus that stopwords.words(...) reads later on.
nltk.download("stopwords")

In [None]:
#!pip install spacy

In [None]:

# If trouble importing spacy try:
# 1. Kill Jupyter Lab/Jupyter Notebook completely
# 2. Go to terminal and type
#    export KMP_DUPLICATE_LIB_OK=TRUE
# 3. Restart Jupyter and try the import again.

import spacy

In [None]:
import os
from urllib.parse import quote_plus

# Credentials come from the environment so they are not stored in the notebook
# (and do not leak through version control or shared copies).
db_user = os.environ.get("USNEWS_DB_USER")
db_password = os.environ.get("USNEWS_DB_PASSWORD")
if not db_user or not db_password:
    raise RuntimeError(
        "Set the USNEWS_DB_USER and USNEWS_DB_PASSWORD environment variables "
        "before running this notebook."
    )

# quote_plus escapes characters such as '@' or '/' that would otherwise break
# the URL syntax if they appear in the user name or password.
engine = sal.create_engine(
    f"postgresql+psycopg2://{quote_plus(db_user)}:{quote_plus(db_password)}"
    "@awesome-hw.sdsc.edu/postgres"
)

# One connection is opened here and shared by every query cell below.
conn = engine.connect()

In [None]:
# Schema query: list every column of the `usnewspaper` table together with its
# data type, read from the information_schema.columns catalog view.
sqlquery = text(
    """
SELECT
   table_name,
   column_name,
   data_type
FROM
   information_schema.columns
WHERE
   table_name = 'usnewspaper';
"""
)

result = conn.execute(sqlquery)

# Materialize the rows into a list so they can be shown as the cell output.
data = [i for i in result]
data

In [None]:
# Exploratory query: UNNEST yields one row per (article, keyword) pair, and only
# rows whose keyword equals 'CTE' or 'encephalopathy' are kept. ILIKE without
# wildcards acts as a case-insensitive equality test.
# NOTE(review): `result` is reassigned by the next query cell before these rows
# are read, so this query currently has no effect on the rest of the notebook.
sql_query = text(
    """
SELECT keyword, news, title, publishdate, src
    FROM (
        SELECT
            UNNEST(keywords) AS keyword, news, title, publishdate, src
        FROM
            usnewspaper
        ) AS k
WHERE keyword ILIKE 'CTE' OR keyword ILIKE 'encephalopathy';
"""
)
result = conn.execute(sql_query)

In [None]:
# Main corpus query: articles with non-null text whose `keywords` array contains
# every keyword of at least one of these sets:
#   {cte, lawsuit}, {nfl, helmet}, {nfl, brain}, {encephalopathy}
# `ARRAY[...] <@ keywords` is PostgreSQL's "is contained by" operator, and the
# comparison is case-sensitive. UNION removes duplicate rows across the four
# branches.
sql_query = text(
"""SELECT DISTINCT title, news, keywords 
    FROM usnewspaper 
    WHERE ARRAY['cte','lawsuit']::text[] <@ keywords and news is not null
UNION
SELECT DISTINCT title, news, keywords  
     FROM usnewspaper 
     WHERE ARRAY['nfl', 'helmet']::text[] <@ keywords and news is not null
UNION
SELECT DISTINCT title, news, keywords  
     FROM usnewspaper 
     WHERE ARRAY['nfl', 'brain']::text[] <@ keywords and news is not null
UNION
SELECT DISTINCT title, news, keywords 
    FROM usnewspaper 
    WHERE ARRAY['encephalopathy']::text[] <@ keywords AND news is not null;"""
)   
result = conn.execute(sql_query)

In [None]:
# Drain the query result into a plain list of rows.
data = list(result)

In [None]:
data[0:3]

In [None]:
len(data)

In [None]:
# OR this

# sql_query = text(
#     """
# SELECT keyword, news, title, publishdate, src
#     FROM (
#         SELECT
#             UNNEST(keywords) AS keyword, news, title, publishdate, src
#         FROM
#             usnewspaper
#         ) AS k
# WHERE keyword ILIKE ANY (ARRAY['CTE', 'encephalopathy']);
# """
# )
# result = conn.execute(sql_query)

In [None]:
# data2 = [i for i in result]

In [None]:
# data2

In [None]:
# OR this

# sql_query = text(
#     """
# SELECT
#     keywords, news, title, publishdate, src
# FROM usnewspaper
# WHERE keywords && (ARRAY['CTE', 'encephalopathy', 'cte', 'Encephalopathy']);
# """
# )
# result = conn.execute(sql_query)

In [None]:
# data3 = [i for i in result]
# len(data3)

In [None]:
# Column names follow the SELECT list of the corpus query: title, news, keywords.
df = pd.DataFrame(data, columns=["title", "news", "keywords"])

In [None]:
df.head()

In [None]:
#!python3 -m spacy download en_core_web_sm

## Perform Named Entity Recognition (NER) Using spaCy

In [None]:
# Load the small English model with all of its default components enabled;
# this pipeline is the one used for entity extraction in the cells below.
nlp_spacy = spacy.load("en_core_web_sm")

In [None]:
# Run the pipeline over every article body; list() keeps all resulting Doc
# objects in memory so they can be iterated again later.
docs = list(nlp_spacy.pipe(df["news"]))

In [None]:
# For each article, collect the distinct entity strings labelled ORG or PERSON.
# The set removes repeated mentions; sorting makes the result reproducible,
# because iteration order of a set of strings can differ between kernel
# sessions (string hash randomization).
list_of_ents = [
    sorted({ent.text for ent in doc.ents if ent.label_ in ("ORG", "PERSON")})
    for doc in docs
]

In [None]:
# Attach the per-article entity lists as a new column (modifies df in place).
df["named_entities"] = list_of_ents

In [None]:
df

## Now Perform LDA Topic Modeling

In [None]:
# English stop words from NLTK; remove_stopwords() reads this module-level name.
stop_words = stopwords.words("english")

In [None]:
# Substitutions applied to every article, in this order:
#   1. drop e-mail-like tokens (anything containing "@") plus one trailing space
#   2. collapse each run of whitespace (newlines included) into a single space
#   3-5. delete single quotes, backticks and acute accents
_SUBSTITUTIONS = (
    (r"\S*@\S*\s?", ""),
    (r"\s+", " "),
    ("'", ""),
    ("`", ""),
    ("´", ""),
)


def _scrub_article(article):
    """Apply every substitution in _SUBSTITUTIONS to one article, in order."""
    for pattern, replacement in _SUBSTITUTIONS:
        article = re.sub(pattern, replacement, article)
    return article


# Pull the article bodies out of the frame as a plain Python list, then clean.
data = [_scrub_article(sent) for sent in df.news.values.tolist()]

print(data[:1])

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.

    Args:
        sentences: Iterable of documents (anything accepted by ``str()``).

    Yields:
        The token list returned by ``simple_preprocess`` for each item, in
        input order.
    """

    def _tokenize(raw):
        # deacc=True strips accent marks from tokens; punctuation is dropped by
        # the tokenizer itself, independently of this flag.
        return gensim.utils.simple_preprocess(str(raw), deacc=True)

    yield from map(_tokenize, sentences)


# Tokenize every cleaned article; list() drains the generator so the result can
# be indexed and reused by later cells.
data_words = list(sent_to_words(data))

# Preview the tokens of the first article.
print(data_words[:1])

In [None]:
# Define functions for stopwords and lemmatization
def remove_stopwords(texts):
    """Return new token lists with every stop word filtered out.

    Args:
        texts: Iterable of tokenized documents (each an iterable of str).

    Returns:
        list[list[str]]: One list per input document, in input order, holding
        the tokens that are not in the module-level ``stop_words``. The input
        is not modified.
    """
    # Build the lookup set once per call: testing membership against the raw
    # list would scan it for every token of every document.
    stop_set = set(stop_words)
    return [[word for word in doc if word not in stop_set] for doc in texts]


def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each token list is re-joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``, which must be assigned before this
    function is called. Tag scheme: https://spacy.io/api/annotation

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Container of ``token.pos_`` values to keep. The
            default is an immutable tuple so it cannot be altered between
            calls.

    Returns:
        list[list[str]]: For every input document, in order, the ``lemma_`` of
        each token whose ``pos_`` is in ``allowed_postags``.
    """
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # paying the per-call overhead of nlp(...) for every document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined)
    ]


def bigrams_and_trigrams(texts):
    """Append detected phrase tokens to every document, in place.

    A gensim ``Phrases`` model is fitted on ``texts`` with ``min_count=2``.
    Each document is then passed through the model, and every resulting token
    that contains an underscore is appended to the end of that same document.

    Args:
        texts: List of token lists. The inner lists are modified in place.

    Returns:
        The same ``texts`` object that was passed in (not a copy).

    Note:
        Every token containing "_" is appended, not only newly formed phrases.
        NOTE(review): a second call on the returned list therefore presumably
        re-appends the phrases added by the first call — confirm this is
        intended.
    """
    phrase_model = gensim.models.Phrases(texts, min_count=2)
    for doc in texts:
        # Tokens joined by the phrase model carry an underscore.
        doc.extend([token for token in phrase_model[doc] if "_" in token])
    return texts

In [None]:
# Drop stop words from the tokenized articles.
data_words_nostops = remove_stopwords(data_words)

# Load the small English spaCy model with the parser and NER components
# disabled; lemmatization() reads this module-level `nlp`.
# Install the model with: python3 -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(
    data_words_nostops, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]
)

# bigrams_and_trigrams() appends to the inner lists in place and returns its
# argument, so data_lemmatized, data_bigrams and data_trigrams all refer to the
# same list object after these two calls.
# NOTE(review): the second pass presumably re-appends the "_" tokens that the
# first pass added — confirm whether that double counting is intended.
data_bigrams = bigrams_and_trigrams(data_lemmatized)
data_trigrams = bigrams_and_trigrams(data_bigrams)
# print(data_lemmatized[0])
# print(data_bigrams[0])
print(data_trigrams[0])

In [None]:
# Use the output of the phrase-detection step explicitly. data_lemmatized only
# contains the phrase tokens because bigrams_and_trigrams() mutates its
# argument in place; reading data_trigrams states the intended input directly.
texts = data_trigrams

# Create Dictionary: maps every unique token to an integer id.
id2word = corpora.Dictionary(texts)

# Term Frequency list: one bag-of-words (list of (token_id, count) pairs) per
# document.
corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in texts]

# View the first document's bag of words.
print(corpus[0])

In [None]:
# Build LDA model: fit 4 topics on the bag-of-words corpus. random_state is a
# fixed seed so repeated runs are comparable, and alpha="auto" asks gensim to
# learn the document-topic prior from the data instead of using a fixed value.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
    per_word_topics=True,
)

In [None]:
# Switch pyLDAvis to notebook display mode, then prepare the visualization data
# from the fitted model, the corpus and the dictionary.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)

In [None]:
vis

In [None]:
# Compute Perplexity
# log_perplexity() returns gensim's per-word likelihood bound, not the
# perplexity itself: for the bound, higher (closer to zero) is better. gensim
# derives its perplexity estimate as 2 ** (-bound), where lower is better.
# Both numbers are computed on the training corpus here, not on held-out data.
per_word_bound = lda_model.log_perplexity(corpus)
print("\nPer-word likelihood bound: ", per_word_bound)
print("Perplexity: ", 2 ** (-per_word_bound))

# Compute Coherence Score using the "c_v" measure over the tokenized texts.
coherence_model_lda = CoherenceModel(
    model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence="c_v"
)
coherence_lda = coherence_model_lda.get_coherence()
print("\nCoherence Score: ", coherence_lda)