## 1.6

In [15]:
import json
import pandas as pd
from pathlib import Path

DATA_DIR = Path("../data/")

author_docs = pd.read_pickle(DATA_DIR / "author_docs_with_tokens.pkl")
docs_tokens_nostop = author_docs["tokens_nostop"].tolist()
docs_tokens_keepstop = author_docs["tokens_keepstop"].tolist()


In [None]:
# 根据 task 建议, 用保留 stopwords 的 tokens
docs_tokens = author_docs["tokens_keepstop"].tolist()

In [None]:
# !pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl (24.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting smart-open>=1.8.1
  Downloading smart_open-7.3.1-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.7/61.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wrapt
  Downloading wrapt-2.0.1-cp38-cp38-macosx_10_9_x86_64.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.5/60.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wrapt, smart-open, gensim
Successfully installed gensim-4.3.3 smart-open-7.3.1 wrapt-2.0.1


In [22]:
from gensim.models import LdaModel

def run_lda(alpha, beta, K=4):
    model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=K,
        alpha=alpha,
        eta=beta,     # gensim 用 eta 表示 Beta
        random_state=42,
        passes=20
    )
    return model


In [19]:
# 创建 dictionary & corpus
from gensim.corpora import Dictionary

dictionary = Dictionary(docs_tokens)
dictionary.filter_extremes(no_below=2, no_above=0.95)

corpus = [dictionary.doc2bow(doc) for doc in docs_tokens]

In [20]:
# Hyperparameter ranges
alpha_low  = 0.01
alpha_high = 1.0

beta_low   = 0.01
beta_high  = 1.0

In [23]:
# 4 symmetric runs
model_LL = run_lda(alpha_low,  beta_low)   # Low α, Low β
model_LH = run_lda(alpha_low,  beta_high)  # Low α, High β
model_HL = run_lda(alpha_high, beta_low)   # High α, Low β
model_HH = run_lda(alpha_high, beta_high)  # High α, High β


In [24]:
def print_topics(model, label):
    print(f"\n===== {label} =====")
    for tid, words in model.print_topics(num_words=10):
        print(f"Topic {tid}: {words}")

print_topics(model_LL, "Low α, Low β")
print_topics(model_LH, "Low α, High β")
print_topics(model_HL, "High α, Low β")
print_topics(model_HH, "High α, High β")



===== Low α, Low β =====
Topic 0: 0.051*"the" + 0.033*"climate" + 0.022*"paris" + 0.020*"for" + 0.015*"and" + 0.009*"climatechange" + 0.009*"from" + 0.008*"are" + 0.007*"change" + 0.006*"deal"
Topic 1: 0.043*"the" + 0.021*"for" + 0.019*"climate" + 0.015*"amp" + 0.014*"and" + 0.010*"our" + 0.009*"from" + 0.008*"climatechange" + 0.008*"paris" + 0.008*"are"
Topic 2: 0.039*"climatechange" + 0.034*"the" + 0.018*"for" + 0.018*"amp" + 0.018*"copparis" + 0.014*"freepresidentnasheed" + 0.013*"climate" + 0.013*"maldives" + 0.012*"and" + 0.011*"behind"
Topic 3: 0.038*"climate" + 0.027*"the" + 0.026*"climatechange" + 0.024*"cdnpoli" + 0.016*"amp" + 0.014*"design" + 0.014*"change" + 0.014*"green" + 0.010*"for" + 0.009*"actonclimate"

===== Low α, High β =====
Topic 0: 0.049*"the" + 0.032*"climate" + 0.020*"paris" + 0.019*"for" + 0.014*"and" + 0.010*"climatechange" + 0.008*"from" + 0.007*"are" + 0.007*"change" + 0.006*"amp"
Topic 1: 0.040*"the" + 0.020*"climate" + 0.019*"for" + 0.015*"amp" + 0.013*

The corpus is highly homogeneous

the dataset is:

small,

short texts (authors aggregated into small documents),

highly homogeneous (everyone talks about climate politics).

Therefore:

LDA cannot separate topics strongly,

α changes little in top-words,

β only slightly changes topic smoothness,

asymmetric α still shows the clearest effect.

Low β (0.01)  topics are sharper and more peaked.
Some words such as “climatechange”, “copparis”, “climate” become very dominant.

High β (1.0)  topics become more uniform, with more medium-probability words.
Rare words almost disappear; common words (the, climate, for, and) appear more.

In [25]:
# Asymmetric α
# when k=4
asym_alpha = [0.1, 0.5, 1.0, 2.0]
model_asym = run_lda(asym_alpha, beta_low)

print_topics(model_asym, "Asymmetric α (Beta low)")



===== Asymmetric α (Beta low) =====
Topic 0: 0.051*"the" + 0.034*"climate" + 0.023*"paris" + 0.020*"for" + 0.015*"and" + 0.009*"from" + 0.007*"are" + 0.007*"climatechange" + 0.007*"deal" + 0.007*"change"
Topic 1: 0.044*"the" + 0.022*"for" + 0.018*"climate" + 0.014*"and" + 0.014*"amp" + 0.010*"our" + 0.009*"paris" + 0.009*"from" + 0.008*"are" + 0.008*"with"
Topic 2: 0.036*"the" + 0.034*"climatechange" + 0.019*"for" + 0.018*"amp" + 0.017*"copparis" + 0.013*"and" + 0.012*"freepresidentnasheed" + 0.012*"maldives" + 0.011*"our" + 0.011*"behind"
Topic 3: 0.035*"climate" + 0.034*"the" + 0.028*"climatechange" + 0.016*"amp" + 0.013*"change" + 0.012*"cdnpoli" + 0.012*"for" + 0.009*"green" + 0.008*"design" + 0.007*"are"


## 1.7

In [27]:
# 训练多个k的LDA模型
from gensim import corpora, models

# 使用 NOT STOP 的或 KEEP STOP 的都行，但任务 1.6 建议保留停用词
tokens_list = docs_tokens_keepstop  

# 词典与语料
dictionary = corpora.Dictionary(tokens_list)
corpus = [dictionary.doc2bow(tokens) for tokens in tokens_list]

def train_lda(K, alpha='symmetric', eta='auto', passes=10, random_state=42):
    lda = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=K,
        alpha=alpha,
        eta=eta,
        random_state=random_state,
        passes=passes
    )
    return lda

# 选择几个 K
K_list = [5, 10, 20]
lda_models = {K: train_lda(K) for K in K_list}

lda_models


{5: <gensim.models.ldamodel.LdaModel at 0x1acf4a430>,
 10: <gensim.models.ldamodel.LdaModel at 0x1ae035e50>,
 20: <gensim.models.ldamodel.LdaModel at 0x19d3d3cd0>}

In [28]:
# 随机抽n对作者文档
import random
import pandas as pd
from pathlib import Path

N = 15
random.seed(0)

num_docs = len(author_docs)
pairs = set()

while len(pairs) < N:
    i, j = random.sample(range(num_docs), 2)
    if i > j:
        i, j = j, i
    pairs.add((i, j))

rows = []
for idx, (i, j) in enumerate(pairs):
    doc_i = author_docs.iloc[i]
    doc_j = author_docs.iloc[j]
    rows.append({
        "pair_id": idx,
        "doc_i_index": i,
        "doc_j_index": j,
        "author_i": doc_i["author_id"],
        "author_j": doc_j["author_id"],
        "text_i": doc_i["full_text"][:400],
        "text_j": doc_j["full_text"][:400],
        "label": ""  # 你之后手动填 0/1/2
    })

pairs_df = pd.DataFrame(rows)

out_path = Path("../data/topic_similarity_pairs.csv")
pairs_df.to_csv(out_path, index=False)

out_path


PosixPath('../data/topic_similarity_pairs.csv')

In [29]:
# 计算文档–topic 分布（对齐你的 corpus）
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def doc_topic_matrix(lda_model, corpus, K):
    theta = np.zeros((len(corpus), K))
    for d, bow in enumerate(corpus):
        for topic_id, prob in lda_model.get_document_topics(bow):
            theta[d, topic_id] = prob
    return theta


In [34]:
# 载入你的人工标注
labels_df = pd.read_csv("../data/topic_similarity_pairs.csv")
labels_df["label"] = labels_df["label"].astype(int)


In [35]:
all_results = []

for K, lda in lda_models.items():
    theta = doc_topic_matrix(lda, corpus, K)

    tmp_rows = []
    for _, row in labels_df.iterrows():
        i = int(row["doc_i_index"])
        j = int(row["doc_j_index"])
        label = int(row["label"])

        sim = cosine_similarity(theta[i].reshape(1, -1),
                                theta[j].reshape(1, -1))[0, 0]

        tmp_rows.append({
            "K": K,
            "pair_id": row["pair_id"],
            "label": label,
            "sim": sim
        })
    tmp = pd.DataFrame(tmp_rows)
    all_results.append(tmp)

results_df = pd.concat(all_results, ignore_index=True)
results_df.head()


Unnamed: 0,K,pair_id,label,sim
0,5,0,2,0.256352
1,5,1,2,0.999775
2,5,2,2,0.99667
3,5,3,1,0.025457
4,5,4,1,0.999625


In [36]:
summary = (
    results_df
    .groupby(["K", "label"])["sim"]
    .agg(["mean", "std", "count"])
    .reset_index()
)
summary


Unnamed: 0,K,label,mean,std,count
0,5,0,0.458135,0.647901,2
1,5,1,0.65281,0.406116,7
2,5,2,0.551745,0.49624,6
3,10,0,0.13957,0.197382,2
4,10,1,0.542684,0.508547,7
5,10,2,0.347679,0.363524,6
6,20,0,0.224758,0.317856,2
7,20,1,0.51433,0.488818,7
8,20,2,0.39738,0.480414,6


We found that the similarity estimates  depend on the choice of K.

With K = 5, topics were too coarse and many pairs—including unrelated ones—received high similarity.

With K = 20, topics became too fragmented, increasing noise and pushing similarities toward the middle.

K = 10 provided the most reasonable separation between unrelated and somewhat related pairs, although the model still struggled to distinguish between "somewhat related" and "very related".

This shows the difficulty of adapting clustering‐style evaluation to mixed‐membership models, as LDA’s topic proportions do not align perfectly with human semantic judgments.

## 1.8

Associate topic results with the author's metadata (Type or Stance) to determine whether different groups exhibit preferences for certain topics.

In [37]:
from gensim.corpora import Dictionary

dictionary = Dictionary(docs_tokens_keepstop)

corpus = [dictionary.doc2bow(doc) for doc in docs_tokens_keepstop]


In [38]:
from gensim.models import LdaModel

K = 10

lda10 = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=K,
    passes=10,
    random_state=42,
)

In [None]:
# Generate the author's topic distribution
topic_dist = [
    lda10.get_document_topics(bow, minimum_probability=0)
    for bow in corpus
]

In [40]:
import numpy as np

n_docs = len(topic_dist)
topic_matrix = np.zeros((n_docs, K))

for i, dist in enumerate(topic_dist):
    for topic_id, prob in dist:
        topic_matrix[i, topic_id] = prob


In [41]:
# Merge into author_docs (because it contains Type / Stance)
df = author_docs.copy()
for k in range(K):
    df[f"topic_{k}"] = topic_matrix[:, k]


In [43]:
# Calculate intergroup differences by type or stance
# by type
group_stats = df.groupby("Type")[ [f"topic_{k}" for k in range(K)] ].mean()
group_stats

# by stance
stance_stats = df.groupby("Stance")[ [f"topic_{k}" for k in range(K)] ].mean()
stance_stats

Unnamed: 0_level_0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
Stance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Against,2.1e-05,0.21309,0.086139,2.1e-05,2.1e-05,9.1e-05,2.1e-05,2.1e-05,0.700556,2.1e-05
For,0.026292,0.088645,0.458618,0.01271,0.004905,0.144121,0.00897,0.095286,0.137289,0.023164
Unclear,0.028752,0.11633,0.453647,0.0227,0.006164,0.130091,0.023356,0.078936,0.106604,0.03342


Based on an LDA model with K=10, I calculated the average topic distribution for documents across different stances (For / Against / Unclear). The results reveal that certain topics exhibit distinct preferences across different stances.

The “Against” group shows high concentration in topic_8 (0.70). Opponents tend to talk about one very specific theme, with extremely little thematic diversity.

The “For” group primarily focuses on topic_2 (0.46) and topic_5 (0.14).

Unclear accounts behave like a mixture of For and casual observers