In [1]:
import pandas as pd

# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this exact location exists. Consider a configurable data directory.
# NOTE(review): unpickling can execute arbitrary code; only load pickles that
# were produced by a trusted source.
AUTHOR_DOCS_PKL = r"C:\Users\54241\Social-data-mining\data\author_docs_with_tokens.pkl"

# Author-level documents together with their pre-computed token lists.
df = pd.read_pickle(AUTHOR_DOCS_PKL)
df.head()


Unnamed: 0,author_id,Type,Stance,full_text,tokens_nostop,tokens_keepstop
0,100189549,Advocacy actors,For,@N_Hulot #COP21 For the first time humanity is...,"[cop, first, time, humanity, walking, right, d...","[cop, for, the, first, time, humanity, walking..."
1,1011036600,Advocacy actors,For,Pics from our participation in #COP21 When #Gr...,"[pics, participation, cop, greenbusiness, join...","[pics, from, our, participation, cop, when, gr..."
2,102325445,Political actors,Unclear,Investing for a Greener Future: development fi...,"[investing, greener, future, development, fina...","[investing, for, greener, future, development,..."
3,102448827,Political actors,Unclear,#HighAmbitionCoalition on the way to #COP21 Pl...,"[highambitioncoalition, way, cop, plenary, fin...","[highambitioncoalition, the, way, cop, plenary..."
4,102634281,Advocacy actors,For,Read up on the latest news &amp; updates about...,"[read, latest, news, amp, updates, took, place...","[read, the, latest, news, amp, updates, about,..."


In [2]:
# List the column labels of the loaded frame.
df.columns

Index(['author_id', 'Type', 'Stance', 'full_text', 'tokens_nostop',
       'tokens_keepstop'],
      dtype='object')

In [3]:
import ast

# Turn string-encoded token lists back into real Python lists.
# The conversion is decided per cell instead of by peeking at `df.loc[0, ...]`:
#   * `.loc[0]` is a label lookup and raises KeyError on an empty frame or on
#     a frame whose index does not contain the label 0;
#   * cells that are already lists are passed through untouched, so the cell
#     is safe to re-run and tolerates a mix of parsed and unparsed rows.
# NOTE(review): presumably the strings appear when the frame was round-tripped
# through a text format such as CSV — confirm against the producer.
df["tokens_nostop"] = df["tokens_nostop"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
from gensim.corpora import Dictionary

# One token list per row of `df` (stop words already removed).
texts = list(df["tokens_nostop"])

# Vocabulary: maps every distinct token to an integer id.
dictionary = Dictionary(texts)

# Prune the vocabulary: drop tokens seen in fewer than 5 documents or in more
# than half of all documents.
dictionary.filter_extremes(no_above=0.5, no_below=5)

# Bag-of-words vectors, in the same order as the rows of `df`.
corpus = list(map(dictionary.doc2bow, texts))


In [None]:
from gensim.models import LdaModel
# Train the LDA model, letting gensim learn the document-topic prior (alpha)
# and the topic-word prior (eta) from the corpus.
#
# * num_topics is written out explicitly. Leaving it off silently falls back
#   to gensim's default of 100, which is the value the next cell reports.
#   NOTE(review): confirm that 100 topics is really intended for this corpus.
# * random_state pins the random initialisation so that Restart & Run All
#   reproduces the same topics (and therefore the same topic ids downstream).
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=100,
    alpha='auto',
    eta='auto',
    random_state=42,
)


In [6]:
# Number of topics the trained model ended up with.
lda_model.num_topics


100

In [7]:
# Print a short summary (five terms with their weights) for each topic that
# print_topics returns.
for topic_id, summary in lda_model.print_topics(num_words=5):
    print(f"Topic {topic_id}: {summary}\n")


Topic 3: 0.009*"energy" + 0.007*"coal" + 0.007*"map" + 0.007*"carbon" + 0.007*"sif"

Topic 94: 0.019*"cdnpoli" + 0.015*"globalwarming" + 0.013*"design" + 0.010*"green" + 0.010*"new"

Topic 72: 0.009*"agreement" + 0.008*"action" + 0.007*"live" + 0.007*"lpaa" + 0.006*"day"

Topic 34: 0.009*"action" + 0.006*"carbon" + 0.006*"must" + 0.005*"agreement" + 0.005*"need"

Topic 21: 0.022*"sif" + 0.011*"energy" + 0.010*"says" + 0.007*"global" + 0.006*"new"

Topic 51: 0.010*"energy" + 0.008*"live" + 0.007*"action" + 0.007*"agreement" + 0.006*"today"

Topic 38: 0.063*"behind" + 0.057*"copparis" + 0.053*"maldives" + 0.046*"bars" + 0.010*"please"

Topic 89: 0.017*"climateaction" + 0.013*"challenge" + 0.011*"coal" + 0.010*"photo" + 0.010*"thanks"

Topic 33: 0.012*"actonclimate" + 0.007*"global" + 0.007*"agreement" + 0.006*"unfccc" + 0.006*"live"

Topic 71: 0.008*"energy" + 0.008*"says" + 0.007*"global" + 0.007*"solar" + 0.006*"actonclimate"

Topic 25: 0.008*"agreement" + 0.008*"energy" + 0.008*"deal"

In [8]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()
# Prepare the visualisation from the trained model, the bag-of-words corpus
# and the dictionary the corpus was built with.
vis = gensimvis.prepare(lda_model, corpus, dictionary)
# Bare last expression so the notebook displays the prepared visualisation.
vis


In [None]:
import random
import numpy as np

def topic_intruder_test(lda_model, corpus, df, text_col="full_text", S=2, N=5,
                        in_topic_threshold=0.2, intruder_threshold=0.05, top_k=30):
    """
    Print document-intruder quizzes for manually judging topic coherence.

    Each round picks a random topic z, samples S documents that load strongly
    on z plus one "intruder" document that barely loads on z, shuffles them
    and prints the first 500 characters of each. The hidden labels are
    returned so the reader's guesses can be checked afterwards.

    Parameters
    ----------
    lda_model : trained topic model exposing `num_topics` and
        `get_document_topics(bow)`.
    corpus : list of bag-of-words documents; `corpus[i]` must correspond to
        the i-th row of `df` (rows are looked up by position).
    df : DataFrame with one document per row.
    text_col : name of the column holding the text to display.
    S : number of in-topic documents shown per round.
    N : number of rounds attempted.
    in_topic_threshold : a document counts as belonging to z when its
        probability for z is strictly greater than this value.
    intruder_threshold : a document is an intruder candidate when its
        probability for z is strictly less than this value (a topic missing
        from the model's output counts as 0.0).
    top_k : the in-topic documents are sampled from the `top_k` documents
        with the highest probability for z.

    Returns
    -------
    list of `(z, sample_docs)` tuples, one per completed round, where
    `sample_docs` is the shuffled list of `(label, text)` pairs and `label`
    is `"IN_TOPIC"` or `"INTRUDER"`. Rounds that cannot be built (too few
    in-topic documents or no intruder candidate) are reported and skipped,
    so the list may contain fewer than N entries.
    """
    topic_indices = list(range(lda_model.num_topics))
    results = []

    print(f"🔎 Running Topic Intruder Test for {N} rounds (S={S})...\n")

    # Inference does not depend on the sampled topic, so run it once up front
    # instead of once per round; keep a {topic_id: prob} dict per document.
    doc_topic_probs = [dict(lda_model.get_document_topics(bow)) for bow in corpus]
    texts = df[text_col]

    for _ in range(N):
        # 1. select a topic z randomly
        z = random.choice(topic_indices)

        # 2. documents that clearly belong to z, strongest first
        docs_in_topic = []
        for doc_idx, probs in enumerate(doc_topic_probs):
            prob_z = probs.get(z, 0.0)
            if prob_z > in_topic_threshold:
                docs_in_topic.append((doc_idx, prob_z))
        docs_in_topic.sort(key=lambda pair: pair[1], reverse=True)

        # Sample only from the strongest top_k; skip the round (and say so)
        # when that pool cannot supply S documents.
        candidate_pool = docs_in_topic[:top_k]
        if len(candidate_pool) < S:
            print(f"Skipping topic {z}: {len(candidate_pool)} in-topic document(s) available, {S} needed.\n")
            continue
        chosen_z_docs = random.sample(candidate_pool, S)

        # 3. documents that clearly do NOT belong to z (intruder candidates)
        docs_not_z = [
            doc_idx
            for doc_idx, probs in enumerate(doc_topic_probs)
            if probs.get(z, 0.0) < intruder_threshold
        ]
        if not docs_not_z:
            # random.choice would raise IndexError on an empty list.
            print(f"Skipping topic {z}: no intruder candidate found.\n")
            continue
        intruder_idx = random.choice(docs_not_z)

        # Collect the S in-topic documents plus the intruder, truncated for display.
        sample_docs = [("IN_TOPIC", texts.iloc[doc_idx][:500]) for doc_idx, _ in chosen_z_docs]
        sample_docs.append(("INTRUDER", texts.iloc[intruder_idx][:500]))

        # Shuffle so the position does not give the intruder away.
        random.shuffle(sample_docs)

        # Display the texts only; the labels stay hidden in `results`.
        print(f"📌 Test Round: Topic {z}")
        for position, (_, text) in enumerate(sample_docs):
            print(f"\n---- Document {position+1} ----")
            print(text)

        print("\n👉 Try to identify the intruder.\n")

        results.append((z, sample_docs))

    return results


In [10]:
# List the column labels again (same expression as the earlier cell);
# presumably a check of which text column to pass as `text_col` below.
df.columns

Index(['author_id', 'Type', 'Stance', 'full_text', 'tokens_nostop',
       'tokens_keepstop'],
      dtype='object')

In [11]:
# Attempt two intruder rounds with two in-topic documents each; the returned
# (topic, labelled documents) pairs are kept in `results`.
results = topic_intruder_test(lda_model, corpus, df, text_col="full_text", S=2, N=2)


🔎 Running Topic Intruder Test for 2 rounds (S=2)...

📌 Test Round: Topic 87

---- Document 1 ----
@adoptnegotiator No, we do not want to be mainstreamed into a polluted stream #COP21 .@kuminaidoo: There are many governments who talk nicely about human rights @#COP21, but violate them @ home! https://t.co/OKW8KVvc0L Jannie: How can the purpose of this agreement not be people. We call civil society to rise. #Stand4Rights #COP21 https://t.co/waips9chUc Press conference now! #HumanRights and #GenderEquality are the core, the ❤️, of the #COP21 agreement - Flavia Cherry https://t.co/lZgoCE5CUP Wha

---- Document 2 ----
... The "Window of Time" is closing, now. https://t.co/z7yDIMLXrk #COP21  #climatechange https://t.co/KaET1v8kcZ The "Window of Time" is closing, now. https://t.co/z7yDIMLXrk #COP21  #climatechange https://t.co/KaET1v8kcZ The "Window of Time" is closing, now. https://t.co/z7yDIMLXrk #COP21  #climatechange https://t.co/605qf2yhqP The "Window of Time" is closing, now. https://t.

In [None]:
import random

def topic_identification_test(lda_model, corpus, df, text_col="full_text", N=2,
                              in_topic_threshold=0.2, top_k=10, max_chars=600, topn=15):
    """
    Print a document next to the top words of the topic it was drawn from.

    Each round picks a random topic z, samples one document from the `top_k`
    documents that load most strongly on z, and prints the (truncated)
    document followed by the `topn` top words of z, so a reader can judge
    whether the topic describes the document.

    NOTE(review): only the sampled topic z is printed under the
    "Topic options" heading; no alternative topics are offered.

    Parameters
    ----------
    lda_model : trained topic model exposing `num_topics`,
        `get_document_topics(bow)` and `print_topic(topic_id, topn=...)`.
    corpus : list of bag-of-words documents; `corpus[i]` must correspond to
        the i-th row of `df` (rows are looked up by position).
    df : DataFrame with one document per row.
    text_col : name of the column holding the text to display.
    N : number of rounds attempted.
    in_topic_threshold : a document counts as belonging to z when its
        probability for z is strictly greater than this value.
    top_k : the document is sampled from the `top_k` strongest documents.
    max_chars : number of leading characters of the document to print.
    topn : number of top words printed for the topic.

    Returns
    -------
    None. Everything is printed. A round whose topic has no qualifying
    document prints a short notice and is skipped.
    """
    # Inference does not depend on the sampled topic: run it once and keep a
    # {topic_id: prob} dict per document instead of rebuilding it every round.
    doc_topic_probs = [dict(lda_model.get_document_topics(bow)) for bow in corpus]
    texts = df[text_col]

    for r in range(N):
        print("\n===================")
        print(f"🔎 Test Round {r+1}")
        print("===================\n")

        # 1. select a topic z randomly
        z = random.choice(range(lda_model.num_topics))

        # 2. documents belonging to topic z, strongest first
        docs_z = []
        for doc_idx, probs in enumerate(doc_topic_probs):
            prob_z = probs.get(z, 0)
            if prob_z > in_topic_threshold:
                docs_z.append((doc_idx, prob_z))
        docs_z.sort(key=lambda pair: pair[1], reverse=True)

        candidates = docs_z[:top_k]
        if not candidates:
            # The round header is already on screen, so explain the gap.
            print(f"Skipping topic {z}: no document above the threshold.\n")
            continue

        # 3. select 1 document among the strongest candidates for topic z
        doc_idx, _ = random.choice(candidates)
        doc_text = texts.iloc[doc_idx][:max_chars]

        # 4. display the document
        print("📄 Document:\n")
        print(doc_text)
        print("\n")

        # 5. display the top words of the sampled topic
        print("📌 Topic options:\n")
        print(lda_model.print_topic(z, topn=topn))


In [13]:
# Run two identification rounds; the function prints each round and returns nothing.
topic_identification_test(lda_model, corpus, df, text_col="full_text", N=2)



🔎 Test Round 1

📄 Document:

RT @Sir_David_King: Tackling #ClimateChange needs ambitious commitment from every country #BackClimateAction #COP21 https://t.co/L6VKZrt76u


📌 Topic options:

0.011*"auspol" + 0.010*"climateaction" + 0.008*"agreement" + 0.007*"deal" + 0.007*"global" + 0.006*"action" + 0.006*"new" + 0.005*"talks" + 0.005*"tcot" + 0.005*"leaders" + 0.005*"obama" + 0.005*"need" + 0.005*"day" + 0.005*"energy" + 0.005*"join"

🔎 Test Round 2

📄 Document:

Now we must move with urgency to deliver on promise of climate agreement. Read:  https://t.co/QHgZRq7nU2 #COP21 "We have an agreement. It's a good agreement. And you should all be proud." - Ban Ki-moon https://t.co/BGxFImU3rO #COP21 #ParisAgreement "The current level of ambition is the floor, not the ceiling." #UNSG Ban Ki-moon LIVE: https://t.co/BGxFImU3rO #COP21 The #ParisAgreement has been finalized! Watch the moment: https://t.co/3SMpLBPX9e #COP21 via @UNFCCC World climate accord hailed as turning point from fossil fuels h