In [1]:
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import json
import pandas as pd

In [2]:
# create tokenized document
def create_tokenized_document(file):
    with open(file) as f:
        data = json.load(f)
    tokenized_document = []
    for doc in data:
        tokenized_document.append(doc["summary"].split())
    return tokenized_document

In [3]:
# read the file list and loop through each file
texts = []
file_list = pd.read_csv("../data/meta_data/QnA_file_list.csv")
# file_list = file_list.head(3)
for file in file_list["file_name"]:
    body = create_tokenized_document("../data/QnA_data/" + file)
    texts = texts + body

In [4]:
len(texts)

600

In [5]:
texts[0]

['A',
 'rocket',
 'launched',
 'from',
 'southern',
 'Lebanon',
 'struck',
 'a',
 'building',
 'in',
 'the',
 'Shtula',
 'settlement',
 'in',
 'northern',
 'Israel,',
 'causing',
 'damage',
 'but',
 'no',
 'casualties,',
 'according',
 'to',
 'Israeli',
 'Army',
 'Radio.',
 'The',
 'Israeli',
 'army',
 'reported',
 'around',
 '60',
 'rockets',
 'fired',
 'from',
 'southern',
 'Lebanon',
 'since',
 'Friday',
 'morning,',
 'following',
 "Hezbollah's",
 'announcement',
 'of',
 'over',
 '10',
 'operations',
 'targeting',
 'Israeli',
 'sites',
 'near',
 'the',
 'Lebanon',
 'border.',
 'Since',
 'Wednesday,',
 'Israel',
 'has',
 'faced',
 'over',
 '425',
 'rocket',
 'attacks',
 'from',
 'southern',
 'Lebanon,',
 'resulting',
 'in',
 'significant',
 'forest',
 'fires.',
 'Tensions',
 'have',
 'escalated',
 'after',
 'the',
 'killing',
 'of',
 'Hezbollah',
 'leader',
 'Talib',
 'Sami',
 'Abdallah',
 'in',
 'an',
 'Israeli',
 'airstrike.',
 'Concurrently,',
 'Israel',
 'faces',
 'international'

In [6]:
texts[300]

['**Summary:**',
 'Following',
 'the',
 '2024',
 'Lok',
 'Sabha',
 'election',
 'results,',
 'both',
 'the',
 'NDA',
 'and',
 'INDIA',
 'alliances',
 'are',
 'celebrating.',
 'The',
 'BJP',
 'did',
 'not',
 'secure',
 'a',
 'clear',
 'majority',
 'but',
 'is',
 'excited',
 'about',
 "NDA's",
 'return',
 'to',
 'power',
 'for',
 'a',
 'third',
 'term.',
 'The',
 'INDIA',
 'alliance',
 'has',
 'also',
 'made',
 'a',
 'strong',
 'comeback',
 'as',
 'the',
 'opposition,',
 'increasing',
 'their',
 'seat',
 'count',
 'and',
 'morale.',
 'Both',
 'alliances',
 'have',
 'held',
 'important',
 'meetings',
 'to',
 'discuss',
 'future',
 'strategies',
 'and',
 'government',
 'formation.',
 'Key',
 'events',
 'include:',
 '-',
 'BJP-led',
 'NDA',
 'reviewing',
 'election',
 'results',
 'and',
 'discussing',
 'government',
 'formation.',
 '-',
 'Narendra',
 'Modi',
 'resigning',
 'as',
 'Prime',
 'Minister',
 'but',
 'set',
 'to',
 'take',
 'oath',
 'for',
 'a',
 'historic',
 'third',
 'term.',
 '

In [7]:
texts[500]

['In',
 'a',
 'recent',
 'international',
 'conference',
 'in',
 'Singapore,',
 'Xu',
 'Hui,',
 'a',
 'major',
 'general',
 'and',
 'dean',
 'at',
 'the',
 'National',
 'Defense',
 'University',
 'of',
 'China,',
 'falsely',
 'attributed',
 'a',
 'statement',
 'to',
 'U.S.',
 'President',
 'Joe',
 'Biden,',
 'which',
 'was',
 'originally',
 'fabricated',
 'by',
 'Russian',
 'officials.',
 'Xu',
 'suggested',
 'that',
 'Ukrainian',
 'President',
 'Zelensky',
 'should',
 'consider',
 'the',
 'value',
 'of',
 'Ukrainian',
 'lives',
 'and',
 'cease',
 'resistance',
 'against',
 'Russian',
 'invasion,',
 'sparking',
 'widespread',
 'criticism',
 'in',
 'China.',
 "Xu's",
 'comments',
 'were',
 'seen',
 'as',
 'urging',
 'Ukraine',
 'to',
 'surrender,',
 'leading',
 'to',
 'comparisons',
 'with',
 'historical',
 'traitor',
 'Wang',
 'Jingwei.',
 "Xu's",
 'misattribution',
 'involved',
 'a',
 'distorted',
 'version',
 'of',
 "Biden's",
 'support',
 'for',
 'Ukraine,',
 'which',
 'was',
 'actu

In [8]:
# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(texts)

# Filter out words that occur less than 2 documents, or more than 50% of the documents
dictionary.filter_extremes(no_below=2, no_above=0.5)

# Convert the dictionary to a bag of words corpus for each document
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model
lda_model = LdaModel(
    corpus, num_topics=3, id2word=dictionary, passes=15, random_state=42
)

# Print the topics with their top words
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}")

# Calculate the model's perplexity
perplexity = lda_model.log_perplexity(corpus)
print(f"\nPerplexity: {perplexity}")

# Calculate the coherence score
coherence_model_lda = CoherenceModel(
    model=lda_model, texts=texts, dictionary=dictionary, coherence="c_v"
)
coherence_lda = coherence_model_lda.get_coherence()
print(f"\nCoherence Score: {coherence_lda}")

Topic: 0 
Words: 0.010*"is" + 0.009*"has" + 0.009*"from" + 0.008*"BJP" + 0.007*"Congress" + 0.007*"Lok" + 0.007*"Sabha" + 0.007*"seats," + 0.006*"INDIA" + 0.006*"Minister"
Topic: 1 
Words: 0.024*"China" + 0.014*"South" + 0.011*"Chinese" + 0.011*"that" + 0.010*"China's" + 0.009*"U.S." + 0.009*"Sea" + 0.009*"Philippine" + 0.008*"Philippines" + 0.008*"at"
Topic: 2 
Words: 0.013*"Israeli" + 0.009*"President" + 0.008*"summit" + 0.008*"from" + 0.007*"Israel" + 0.007*"G7" + 0.007*"support" + 0.007*"Hamas" + 0.007*"leaders" + 0.006*"has"

Perplexity: -7.328045419638832

Coherence Score: 0.4711874349226703


In [9]:
# select the best topic for each document
topics = []
for i in range(len(corpus)):
    topics.append(max(lda_model.get_document_topics(corpus[i]), key=lambda x: x[1])[0])

In [10]:
import plotly.graph_objs as go
import pandas as pd
import json
import numpy as np
from sklearn.manifold import TSNE

pd.set_option("mode.copy_on_write", True)


def extract_summary(file):
    with open(file) as f:
        data = json.load(f)
    # Initialize a list to store the extracted values
    summary = []
    for item in data:
        if "summary" in item:
            s = item["summary"]
            summary.append(s)
    return summary


latent_vectors = pd.read_csv("../data/latent_vector/latent_vectors_5.csv", header=None)
# latent_vectors = latent_vectors.head(300)
# print(latent_vectors.shape)

tsne = TSNE(n_components=2, random_state=42)

v_2d = tsne.fit_transform(latent_vectors)
v_2d_df = pd.DataFrame(v_2d, columns=["x", "y"])

v_2d_df["topic"] = (
    ["Gaza"] * 100
    + ["India Election"] * 100
    + ["South China Sea Dispute"] * 100
    + ["India Election"] * 100
    + ["Gaza"] * 100
    + ["South China Sea Dispute"] * 100
)

v_2d_df["topic_lda"] = topics  # add the topic from LDA model

# read the summary from json file
summary = []

for file in file_list["file_name"]:
    s = extract_summary("../data/QnA_data/" + file)
    summary = summary + s
v_2d_df["summary"] = summary


# Function to format summaries for hover text
def format_summary(text):
    return text.replace("\n", "<br>")


v_2d_df["formatted_summary"] = v_2d_df["summary"].apply(format_summary)

# Plot using Plotly
# Create a color mapping for languages
color_map = {
    "Gaza": "blue",
    "South China Sea Dispute": "red",
    "India Election": "green",
}
v_2d_df["color"] = v_2d_df["topic"].map(color_map)

# Create a symbol mapping for languages
symbol_map = {
    0: "circle",
    1: "x",
    2: "triangle-up",
}

v_2d_df["symbol"] = v_2d_df["topic_lda"].map(symbol_map)
# print(v_2d_df["topic_lda"].unique())

# Create traces for each topic
traces = []
for topic in v_2d_df["topic"].unique():
    df = v_2d_df[v_2d_df["topic"] == topic]
    for topic_lda in df["topic_lda"].unique():
        df_lda = df[df["topic_lda"] == topic_lda]
        trace = go.Scatter(
            x=df_lda["x"],
            y=df_lda["y"],
            mode="markers",
            name=f"{topic} ({topic_lda})",
            marker=dict(color=color_map[topic], symbol=symbol_map[topic_lda], size=6),
            customdata=df_lda[["formatted_summary"]],
            # hovertemplate="<b>%{customdata[0]}</b><extra></extra>",
            hoverinfo="none",
        )
        traces.append(trace)

# Create the layout
layout = go.Layout(
    title=f"t-SNE Visualization of News Articles (d = 5)",
    xaxis=dict(title="t-SNE component 1"),
    yaxis=dict(title="t-SNE component 2"),
    showlegend=True,
)

# Create the figure
fig = go.Figure(data=traces, layout=layout)

# Customize the hover template to instruct user to hover
# fig.update_traces(
#    hovertemplate="<b>Hover over a point to see the summary</b><extra></extra>",
#    customdata=v_2d_df[["formatted_summary"]].values,
# )
# Update layout to adjust the width of the scatter plot
# fig.update_layout(width=1000)  # Set the width in pixels

# Save the plot as an HTML file with embedded JavaScript and CSS
html_content = f"""
<html>
<head>
    <title>Interactive Plot</title>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
    <style>
        #summary-box {{
            position: absolute;
            top: 10px;
            right: 10px;
            width: 30vw;
            max-height: 98vh;
            overflow-y: auto;
            background-color: white;
            border: 1px solid black;
            padding: 10px;
            z-index: 1000;
            white-space: pre-line;
        }}
    </style>
</head>
<body>
    <div id="plot">{fig.to_html(full_html=False, include_plotlyjs='cdn')}</div>
    <div id="summary-box">Hover over a point to see the summary here.</div>
    <script>
        document.addEventListener('DOMContentLoaded', function() {{
            const plotDiv = document.getElementById('plot');
            const plotElement = plotDiv.querySelector('.js-plotly-plot');
            const newWidth = window.innerWidth * 2 / 3;
            Plotly.relayout(plotElement, 'width', newWidth);
            plotElement.on('plotly_hover', function(data) {{
                if(data.points.length > 0) {{
                    const summary = data.points[0].customdata[0];
                    document.getElementById('summary-box').innerHTML = summary;
                }}
            }});
            plotElement.on('plotly_unhover', function(data) {{
                document.getElementById('summary-box').innerHTML = 'Hover over a point to see the summary here.';
            }});
            
        }});
    </script>
</body>
</html>
"""

# Save the HTML content to a file
with open("lda_topic_interactive_plot.html", "w") as f:
    f.write(html_content)