In [1]:
import json
import pickle
from pathlib import Path
from datetime import datetime
from collections import Counter
from urllib.parse import urlparse

import pandas as pd


In [2]:
DATA_DIR = Path("../data")

# ---------- 1) Load accounts.tsv ----------
accounts_path = DATA_DIR / "accounts.tsv"
accounts_df = pd.read_csv(
    accounts_path,
    sep="\t",
    dtype=str,
    keep_default_na=False
)
accounts_df.columns = [c.strip() for c in accounts_df.columns]
if "author_id" not in accounts_df.columns:
    for alt in ["user_id", "id", "account_id"]:
        if alt in accounts_df.columns:
            accounts_df = accounts_df.rename(columns={alt: "author_id"})
            break

# ---------- 3) helpers to parse tweets.dat ----------
tweets_path = DATA_DIR / "tweets.dat"

def safe_get(d, *keys, default=None):
    cur = d
    for k in keys:
        if not isinstance(cur, dict) or k not in cur:
            return default
        cur = cur[k]
    return cur

def parse_created_at(ts):
    if not ts:
        return None
    try:
        return datetime.fromisoformat(ts.replace("Z", "+00:00"))
    except Exception:
        return None


## qualitative characterisation of Bot group

From the bot network in Task 2, we first identified the largest connected component (51 accounts).
Within this component, we then applied the Louvain algorithm and selected its largest community (36 accounts) as the meso-level structure to analyse.
This community shows a high level of synchronised posting behaviour, with multiple accounts sharing the same URLs within extremely short time windows.
Such coordination is unlikely for human-driven accounts, so this group appears to be the most suspicious and the most likely to contain automated or bot-like actors.

In [None]:
import pickle
from pathlib import Path

DATA_DIR = Path("../data")

# 1) read VertexClustering
BOT_COMMS_PATH = DATA_DIR / "G_bot_gc_comms.pkl"
with open(BOT_COMMS_PATH, "rb") as f:
    bot_gc_comms = pickle.load(f)   # igraph.clustering.VertexClustering

# 2) get underlying igraph.Graph
g_bot_gc_ig = bot_gc_comms.graph

# 3) select the community you want to analyze (k_bot = 0 → size=36)
k_bot = 0
bot_vertex_indices = bot_gc_comms[k_bot]   # this is a list of indices like [46, 7, 39, 40, ...]

# 4) use vertex index to look up vs["name"], get the real author_id
bot_user_ids = {
    str(g_bot_gc_ig.vs[idx]["name"])   # name is the original node (author_id)
    for idx in bot_vertex_indices
}

print("Selected bot community:", k_bot)
print("Number of users in bot group:", len(bot_user_ids))
print("Example *author_id*:", list(bot_user_ids)[:10])


Selected bot community: 0
Number of users in bot group: 36
Example *author_id*: ['1413949782', '4304243774', '567045831', '3628769299', '363399297', '193735085', '4010844315', '1022779850', '955688402', '3426754877']


After identifying the largest connected component (51 nodes) in the bot network, we applied the Louvain algorithm on this component and selected its largest community (36 nodes) as our meso-level structure.
This gave us a list of actors (author_id values) belonging to this group.

The next step was to extract all tweets written by these accounts.
To do this, we scanned the entire tweets.dat file and collected every tweet whose author_id matched one of the user IDs in the selected community.

In [None]:
def collect_tweets_for_users(user_ids, tweets_path, max_lines=None):
    user_ids = set(user_ids)
    rows = []

    with open(tweets_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if max_lines is not None and i >= max_lines:
                break
            line = line.strip()
            if not line:
                continue
            try:
                tw = json.loads(line)
            except json.JSONDecodeError:
                continue

            aid = str(tw.get("author_id", "")).strip()
            if aid not in user_ids:
                continue

            # basic fields
            created_at = tw.get("created_at")
            lang = tw.get("lang")
            text = tw.get("text", "")

            # URLs (may be empty)
            urls = safe_get(tw, "entities", "urls", default=[])
            expanded_urls = []
            domains = []
            if isinstance(urls, list):
                for u in urls:
                    uu = u.get("expanded_url") or u.get("url")
                    if uu:
                        expanded_urls.append(uu)
                        try:
                            domains.append(urlparse(uu).netloc)
                        except Exception:
                            domains.append(None)

            rows.append({
                "tweet_id": str(tw.get("id", "")),
                "author_id": aid,
                "created_at": created_at,
                "lang": lang,
                "text": text,
                "urls": expanded_urls,
                "domains": domains,
            })

    df = pd.DataFrame(rows)
    if not df.empty:
        df["created_at_parsed"] = df["created_at"].apply(parse_created_at)
    return df


In [37]:
# Bot group tweets
bot_tweets_df = collect_tweets_for_users(bot_user_ids, tweets_path)
print("Bot group tweets:", bot_tweets_df.shape)
display(bot_tweets_df.head())

Bot group tweets: (152633, 8)


Unnamed: 0,tweet_id,author_id,created_at,lang,text,urls,domains,created_at_parsed
0,675823000747380737,3292042493,2015-12-12T23:42:14.000Z,en,_RT_ jeremycorbyn: Paris #COP21 agreement is h...,[],[],2015-12-12 23:42:14+00:00
1,675822131360440320,2237585322,2015-12-12T23:38:46.000Z,en,RT @WRIClimate: From @ClimateMorgan -This agre...,[],[],2015-12-12 23:38:46+00:00
2,675820986541973504,4010844315,2015-12-12T23:34:13.000Z,es,#sassoufit RT fernandosolanas: En breve RFI me...,[],[],2015-12-12 23:34:13+00:00
3,675820854266216448,2237585322,2015-12-12T23:33:42.000Z,en,RT @CharlotteFrerot: This agreement should be ...,[],[],2015-12-12 23:33:42+00:00
4,675819395411742722,3292042493,2015-12-12T23:27:54.000Z,en,_RT_ ApurvGupta5: BarryGardiner COP21 Today we...,[],[],2015-12-12 23:27:54+00:00


In [38]:
def sample_tweets(df, n=10, label="group"):
    print(f"\n=== Sample tweets for {label} ===")
    if df.empty:
        print("No tweets.")
        return
    for _, row in df.sample(min(n, len(df))).iterrows():
        print("----")
        print("author_id:", row["author_id"])
        print("created_at:", row["created_at"])
        print("text:", row["text"])

sample_tweets(bot_tweets_df, n=5, label="Bot group")


=== Sample tweets for Bot group ===
----
author_id: 4339043357
created_at: 2015-12-02T20:54:36.000Z
text: @ViHuster Bravo ! Découvrez et partagez la vidéo de l’emblème #COP21 en 3D que vous avez contribué à créer https://t.co/A3totuSTu7
----
author_id: 4339043357
created_at: 2015-12-11T10:57:41.000Z
text: @APO_source Votre Tweet façonne un emblème #COP21 en 3D. Retweetez pour le découvrir dès qu’il sera prêt ! https://t.co/zLU6Mppbcm
----
author_id: 4339043357
created_at: 2015-12-02T14:32:26.000Z
text: @unatalie Bravo ! Découvrez et partagez la vidéo de l’emblème #COP21 en 3D que vous avez contribué à créer https://t.co/A3totuSTu7
----
author_id: 4339043357
created_at: 2015-12-02T15:03:24.000Z
text: @SaraVigil_ Bravo ! Découvrez et partagez la vidéo de l’emblème #COP21 en 3D que vous avez contribué à créer https://t.co/A3totuSTu7
----
author_id: 4339043357
created_at: 2015-12-02T19:10:34.000Z
text: @arildhermstad Bravo ! Découvrez et partagez la vidéo de l’emblème #COP21 en 3D que vou

The selected bot cluster (size = 36) shows very strong evidence of automated behavior.
All texts originate from the same author_id.
Highly repetitive template-based messages: Almost all tweets repeat the same short template in French (“Bravo ! Découvrez…”, “Retweetez…”, etc.) with minimal variation.

Once we had the tweets, we linked the accounts back to the accounts.tsv file.
This allowed us to inspect each actor’s annotated attributes(Type, Lang, and Stance) and use these annotations to qualitatively characterise the group.

In [None]:
def summarize_accounts(user_ids, accounts_df, label="group"):
    user_ids = set(user_ids)
    sub = accounts_df[accounts_df["author_id"].isin(user_ids)].copy()
    print(f"\n=== {label}: accounts summary ===")
    print("Total accounts with annotation:", len(sub), "/", len(user_ids))

    if not sub.empty:
        print("\nType:")
        display(sub["Type"].value_counts())

        print("\nLang:")
        display(sub["Lang"].value_counts())

        print("\nStance:")
        display(sub["Stance"].value_counts())

    return sub

bot_accounts_sub = summarize_accounts(bot_user_ids, accounts_df, label="Bot group")


=== Bot group: accounts summary ===
Total accounts with annotation: 2 / 36

Type:


Type
Unclear                1
Private individuals    1
Name: count, dtype: int64


Lang:


Lang
N/A    1
no     1
Name: count, dtype: int64


Stance:


Stance
Unclear    2
Name: count, dtype: int64

Low annotation coverage:
Only 2/36 accounts have manual labels (“Unclear”), and none are labeled as legitimate organizational actors, further suggesting that these accounts were not recognized as real actors.

In [40]:
def top_domains(df, top_n=20, label="group"):
    all_domains = []
    for lst in df["domains"].dropna():
        all_domains.extend([d for d in lst if d])
    cnt = Counter(all_domains)
    top = cnt.most_common(top_n)
    print(f"\n=== Top {top_n} domains for {label} ===")
    for dom, c in top:
        print(f"{dom:40s} {c}")
    return top

_ = top_domains(bot_tweets_df, top_n=20, label="Bot group")


=== Top 20 domains for Bot group ===
bit.ly                                   133858
twitter.com                              14790
ow.ly                                    119
youtu.be                                 85
rfi.my                                   75
buff.ly                                  46
ift.tt                                   23
goo.gl                                   18
unfccc.int                               14
www.facebook.com                         14
shar.es                                  14
fb.me                                    11
www.rfi.fr                               11
dai.ly                                   11
wp.me                                    10
www.gouvernement.fr                      10
support.twitter.com                      10
huff.to                                  9
owl.li                                   9
www.youtube.com                          8


URL repetition at extreme scale:
The group overwhelmingly posts shortened URLs (bit.ly, ow.ly), which appear more than 100k times in the group.
This level of repetition strongly suggests automated link broadcasting.

## qualitative characterisation of Ideology group

In the ideology network, we focus on the giant connected component and apply the Louvain algorithm to detect communities.
We treat each community as a meso-level structure, and we selected community 5 (8,634 accounts) for qualitative analysis.

In [None]:
IDE_COMMS_PATH = DATA_DIR / "G_ide_gc_comms.pkl"
with open(IDE_COMMS_PATH, "rb") as f:
    ide_gc_comms = pickle.load(f)

g_ide_gc_ig = ide_gc_comms.graph

k_ide = 5  # select community index
ide_vertex_indices = ide_gc_comms[k_ide]

ide_user_ids = {
    str(g_ide_gc_ig.vs[idx]["name"])
    for idx in ide_vertex_indices
}

print("Selected ideology community:", k_ide)
print("Number of users:", len(ide_user_ids))
print("Example author_ids:", list(ide_user_ids)[:10])


Selected ideology community: 5
Number of users: 8634
Example author_ids: ['634812478', '3192259786', '2485519698', '1947834289', '2273147136', '374470274', '190883866', '3445003276', '358448639', '3369695447']


In [None]:
# Ideology group tweets
ide_tweets_df = collect_tweets_for_users(ide_user_ids, tweets_path)
print("Ideology group tweets:", ide_tweets_df.shape)
display(ide_tweets_df.head())

Ideology group tweets: (58515, 8)


Unnamed: 0,tweet_id,author_id,created_at,lang,text,urls,domains,created_at_parsed
0,675827457023324160,190544832,2015-12-12T23:59:56.000Z,en,RT @ambafrancefj: 1.5°C clearly mentioned in #...,[],[],2015-12-12 23:59:56+00:00
1,675827390971437056,190544832,2015-12-12T23:59:40.000Z,en,"RT @COP21en: .@LaurentFabius: ""I see there is ...",[https://twitter.com/COP21en/status/6757446434...,[twitter.com],2015-12-12 23:59:40+00:00
2,675827386416541696,518918764,2015-12-12T23:59:39.000Z,es,"RT @WWFnoticias: HOY, el mundo marcó el princi...",[],[],2015-12-12 23:59:39+00:00
3,675827386060025857,239264304,2015-12-12T23:59:39.000Z,en,Paris climate deal welcomed in New Zealand - ...,[https://nz.news.yahoo.com/top-stories/a/30351...,[nz.news.yahoo.com],2015-12-12 23:59:39+00:00
4,675827385976098816,620217524,2015-12-12T23:59:39.000Z,en,"RT @JimHarris: Under Harper, Cda Was HATED for...",[http://bit.ly/1TInWBP],[bit.ly],2015-12-12 23:59:39+00:00


In [None]:
_ = top_domains(ide_tweets_df, top_n=20, label="Ideology group")


=== Top 20 domains for Ideology group ===
twitter.com                              23640
bit.ly                                   5848
ow.ly                                    1965
goo.gl                                   1056
buff.ly                                  672
www.youtube.com                          388
youtu.be                                 353
nyti.ms                                  318
bbc.in                                   284
wef.ch                                   276
u.afp.com                                265
unfccc.int                               251
fb.me                                    231
unfccc6.meta-fusion.com                  222
gu.com                                   217
www.theguardian.com                      208
wrld.bg                                  204
dlvr.it                                  198
amp.twimg.com                            179
cnn.it                                   176


The top domains are news/NGO/media platforms, not dominated by a single shortener.
This is consistent with organic user behavior.

In [44]:
ide_accounts_sub = summarize_accounts(ide_user_ids, accounts_df, label="Ideology group")


=== Ideology group: accounts summary ===
Total accounts with annotation: 21 / 8634

Type:


Type
Advocacy actors        8
Political actors       4
Business actors        3
Journalistic actors    3
Private individuals    2
Unclear                1
Name: count, dtype: int64


Lang:


Lang
en     14
es      4
de      1
fr      1
N/A     1
Name: count, dtype: int64


Stance:


Stance
Unclear    11
For        10
Name: count, dtype: int64

This meso-level structure shows the characteristics of a large, heterogeneous issue community around COP21. The accounts include advocacy organisations, political actors, journalists, and individual users.

Tweets are written in many languages (EN, ES, FR, DE, Farsi, Danish…), reflecting broad international participation.

The stance distribution suggests that many actors support climate negotiations or share updates without explicit stance.

In [None]:
sample_tweets(ide_tweets_df, n=5, label="Ideology group")


=== Sample tweets for Ideology group ===
----
author_id: 15423648
created_at: 2015-11-30T14:10:45.000Z
text: RT @HRMirzadeh: اقتصاد نفتی مهمترین ترمز برای اقدامات دولت #ایران درباره انتشار #کربن و #تغییرات_اقلیمی است. #COP21 #climatechange
----
author_id: 262871846
created_at: 2015-11-30T13:51:34.000Z
text: RT @AFD_France: Jour J pour la @COP21 ! Où nous trouver ? https://t.co/puLqBcUVHG #Pogramme #COP21 #GoCOP21 https://t.co/QPFcGdAvFs
----
author_id: 17758861
created_at: 2015-12-05T14:07:00.000Z
text: Lederen for @WWF's #COP21-delegation @TasneemEssop : nu skal regeringerne holde fast i ambitionerne, følge videnskaben #dkgreen #dkpol
----
author_id: 2545900439
created_at: 2015-12-09T01:47:56.000Z
text: RT @ghoberg: Freaky. I did a lot of reading before #COP21 and there was no indication that US, China, Canada would support 1.5,  https://t.…
----
author_id: 4163221342
created_at: 2015-12-04T17:03:50.000Z


The ideology community behaves like a genuine public/organisational conversation cluster, not an automated bot group.