## 1.1

In [1]:
import json
import pandas as pd
from pathlib import Path

DATA_DIR = Path("../data/")

# --- Read accounts.tsv ---
# Every column is read as text so large numeric IDs are not mangled, and
# empty fields stay as "" instead of becoming NaN.
accounts_path = DATA_DIR / "accounts.tsv"
accounts_df = pd.read_csv(
    accounts_path,
    sep="\t",
    dtype=str,            # keep IDs as strings
    keep_default_na=False # avoid NaN for empty fields
)

# Standardize column names: trim stray whitespace from the header, then make
# sure the account identifier column is called "author_id".
accounts_df.columns = [c.strip() for c in accounts_df.columns]
if "author_id" not in accounts_df.columns:
    for alt in ["id", "user_id", "account_id"]:
        if alt in accounts_df.columns:
            accounts_df = accounts_df.rename(columns={alt: "author_id"})
            break
    else:
        # Fail here with a clear message instead of letting a later
        # merge on "author_id" raise a less obvious error.
        raise KeyError(
            "accounts.tsv has no 'author_id' column (nor id/user_id/account_id); "
            f"found columns: {list(accounts_df.columns)}"
        )

print("=== accounts.tsv ===")
print(accounts_df.head())
print(accounts_df["Type"].value_counts())


=== accounts.tsv ===
             author_id                 Type Lang Stance
0              8508262  Private individuals   fr    For
1           3297659759      Advocacy actors   es    For
2  1351436889316683778  Journalistic actors   en    For
3            259352661      Advocacy actors   en    For
4             17158610      Advocacy actors   en    For
Type
Advocacy actors        632
Political actors       310
Journalistic actors    271
Business actors        241
Private individuals    227
Unclear                142
Scientific actors       90
                         9
Other                    9
Bots                     4
to-do                    1
Name: count, dtype: int64


In [2]:
# Location of the raw tweets file; parse_tweets reads it one JSON document per line.
tweets_path = DATA_DIR / "tweets.dat"

def parse_tweets(path, max_rows=None):
    """Parse a JSON-lines tweet file into a DataFrame.

    Each line of ``path`` is decoded as one JSON object. Lines that are not
    valid JSON (including blank lines), or that decode to something other
    than an object, are skipped.

    Parameters
    ----------
    path : str or path-like
        File to read (UTF-8).
    max_rows : int or None, default None
        Maximum number of *lines* to read from the start of the file
        (skipped lines count towards the limit). ``None`` reads everything;
        ``0`` reads nothing.

    Returns
    -------
    pandas.DataFrame
        One row per parsed tweet with the columns ``id``, ``author_id``,
        ``text``, ``created_at``, ``lang``, ``referenced_tweets`` and
        ``public_metrics``. Keys missing from a tweet become ``None``. The
        columns are present even when no tweet could be parsed.
    """
    fields = (
        "id",
        "author_id",
        "text",
        "created_at",
        "lang",
        "referenced_tweets",
        "public_metrics",
    )
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            # `is not None` so that max_rows=0 means "read nothing", not "no limit".
            if max_rows is not None and i >= max_rows:
                break
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # skip malformed (or blank) lines
                continue
            if not isinstance(obj, dict):
                # valid JSON but not a tweet object (e.g. a bare number or list)
                continue
            rows.append({name: obj.get(name) for name in fields})
    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(rows, columns=list(fields))

# If a full parse is too slow on a first run, pass e.g. max_rows=200_000
# to look at a subset first.
tweets_df = parse_tweets(path=tweets_path, max_rows=None)
print("Parsed tweets:", tweets_df.shape[0])
tweets_df.head()


Parsed tweets: 2260916


Unnamed: 0,id,author_id,text,created_at,lang,referenced_tweets,public_metrics
0,675827469119832066,1011975294,RT @MinisterTdB: Climate change won’t stop ove...,2015-12-12T23:59:59.000Z,en,"[{'type': 'retweeted', 'id': '6757779674700390...","{'retweet_count': 107, 'reply_count': 0, 'like..."
1,675827469006581760,255144027,RT @LaurenceTubiana: I just can believe it !we...,2015-12-12T23:59:59.000Z,en,"[{'type': 'retweeted', 'id': '6757815547450572...","{'retweet_count': 109, 'reply_count': 0, 'like..."
2,675827468775718912,214748274,RT @COP21en: We did it! #ParisAgreement is ado...,2015-12-12T23:59:59.000Z,en,"[{'type': 'retweeted', 'id': '6757487202442977...","{'retweet_count': 1204, 'reply_count': 0, 'lik..."
3,675827465378504705,449273927,RT @TheGlobalGoals: Incredible news for our wo...,2015-12-12T23:59:58.000Z,en,"[{'type': 'retweeted', 'id': '6757646325990440...","{'retweet_count': 110, 'reply_count': 0, 'like..."
4,675827465336434688,1601937732,RT @StopShenhua: “The people’s resolve is such...,2015-12-12T23:59:58.000Z,en,"[{'type': 'retweeted', 'id': '6757735596884541...","{'retweet_count': 49, 'reply_count': 0, 'like_..."


In [3]:
# Inner join: keep only tweets whose author appears in accounts.tsv and
# attach that account's columns to every tweet.
merged_df = pd.merge(tweets_df, accounts_df, how="inner", on="author_id")

print("=== merged tweets x accounts ===")
print("Total merged rows:", merged_df.shape[0])
merged_df.head()


=== merged tweets x accounts ===
Total merged rows: 161034


Unnamed: 0,id,author_id,text,created_at,lang,referenced_tweets,public_metrics,Type,Lang,Stance
0,675827426363121664,2350315591,RT @LaurenceTubiana: I just can believe it !we...,2015-12-12T23:59:49.000Z,en,"[{'type': 'retweeted', 'id': '6757815547450572...","{'retweet_count': 109, 'reply_count': 0, 'like...",Political actors,en,For
1,675826358082273280,2350315591,RT @RoyalSegolene: Après des années depuis le ...,2015-12-12T23:55:34.000Z,fr,"[{'type': 'retweeted', 'id': '6757499143165255...","{'retweet_count': 156, 'reply_count': 0, 'like...",Political actors,en,For
2,675815010434682880,2350315591,RT @RoyalSegolene: Une conscience universelle ...,2015-12-12T23:10:29.000Z,fr,"[{'type': 'retweeted', 'id': '6757507629416898...","{'retweet_count': 136, 'reply_count': 0, 'like...",Political actors,en,For
3,675814887000580097,2350315591,RT @malinimehra: Yes - the world will be watch...,2015-12-12T23:09:59.000Z,en,"[{'type': 'retweeted', 'id': '6758144821998960...","{'retweet_count': 1, 'reply_count': 0, 'like_c...",Political actors,en,For
4,675788617785221120,2350315591,RT @NatureNews: The key points of the Paris #c...,2015-12-12T21:25:36.000Z,en,"[{'type': 'retweeted', 'id': '6757702837693399...","{'retweet_count': 154, 'reply_count': 0, 'like...",Political actors,en,For


In [4]:
# Restrict the analysis to the two actor types being compared.
target_types = ["Advocacy actors", "Private individuals"]

is_target_type = merged_df["Type"].isin(target_types)
analysis_df = merged_df.loc[is_target_type]
print("Tweets from selected actor types:", analysis_df.shape[0])

analysis_df["Type"].value_counts()


Tweets from selected actor types: 77212


Type
Advocacy actors        49021
Private individuals    28191
Name: count, dtype: int64

In [5]:
# Write the filtered tweets next to the input data. The path is built from
# DATA_DIR (same directory as before) instead of repeating the "../data" literal.
analysis_df.to_csv(DATA_DIR / "analysis_data_advocacy_vs_individuals.csv", index=False)
print("Saved analysis data!")

Saved analysis data!


Tasks 1.2–1.4: 大家手工标注后，把结果填到他提供的 shared sheet 即可。

# TASK 1.5: reliability test

In [6]:
# %pip install krippendorff

In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
import krippendorff

In [8]:
DATA_DIR = Path("../data")

def load_coder_train(n):
    """Load coder ``n``'s annotation file for the training sample.

    Reads ``train(CODER{n}).csv`` from DATA_DIR as a semicolon-separated file
    with ``ID`` kept as a string, then renames the ``CODE`` column to
    ``CODE{n}``. All columns present in the file are kept.
    """
    coder_file = DATA_DIR / f"train(CODER{n}).csv"
    return pd.read_csv(coder_file, sep=";", dtype={"ID": str}).rename(
        columns={"CODE": f"CODE{n}"}
    )

# One frame per coder, loaded in coder order.
train_c1, train_c2, train_c3 = (load_coder_train(n) for n in (1, 2, 3))

print("Coder1 rows:", train_c1.shape[0])
print("Coder2 rows:", train_c2.shape[0])
print("Coder3 rows:", train_c3.shape[0])

train_c1.head()


Coder1 rows: 5000
Coder2 rows: 5000
Coder3 rows: 5000


Unnamed: 0,ID,CODE1,TEXT
0,672233899615850496,-1.0,Piece for @TheWorldPost: 2 wars connected by c...
1,672231727952072704,0.0,Soils and #Ocean Omitted From Paris #COP21 Age...
2,672231250329919488,0.0,"To reverse the Ephedrine Disinformation,@UN mu..."
3,672229503696019456,0.0,See photos &amp; highlights from #COP21 side e...
4,672229362708561921,0.0,Our own Ben Parr writes for SBS News on India'...


In [9]:
# Start from coder 1's frame, which also supplies the TEXT column.
train_merged = train_c1[["ID", "TEXT", "CODE1"]]

# Outer-join the CODE column of coders 2 and 3 on ID, so an ID that appears
# in any coder's file is kept.
for other_coder, code_col in ((train_c2, "CODE2"), (train_c3, "CODE3")):
    train_merged = pd.merge(
        train_merged, other_coder[["ID", code_col]], how="outer", on="ID"
    )

print("Merged rows:", train_merged.shape[0])
train_merged.head()


Merged rows: 5000


Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3
0,672233899615850496,Piece for @TheWorldPost: 2 wars connected by c...,-1.0,-1.0,0.0
1,672231727952072704,Soils and #Ocean Omitted From Paris #COP21 Age...,0.0,-1.0,0.0
2,672231250329919488,"To reverse the Ephedrine Disinformation,@UN mu...",0.0,1.0,1.0
3,672229503696019456,See photos &amp; highlights from #COP21 side e...,0.0,1.0,0.0
4,672229362708561921,Our own Ben Parr writes for SBS News on India'...,0.0,0.0,1.0


In [10]:
# Normalise the CODE columns to floats (-1 / 0 / 1) with NaN for missing
# annotations. Values go through a string round-trip so that blank or
# whitespace-only entries also end up as NaN.
for col in ("CODE1", "CODE2", "CODE3"):
    if col not in train_merged.columns:
        continue
    # Trim whitespace, then turn empty strings into NaN
    as_text = train_merged[col].astype(str).str.strip()
    # Float dtype is convenient for the alpha computation later on
    train_merged[col] = as_text.replace({"": np.nan}).astype(float)

train_merged.head()


Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3
0,672233899615850496,Piece for @TheWorldPost: 2 wars connected by c...,-1.0,-1.0,0.0
1,672231727952072704,Soils and #Ocean Omitted From Paris #COP21 Age...,0.0,-1.0,0.0
2,672231250329919488,"To reverse the Ephedrine Disinformation,@UN mu...",0.0,1.0,1.0
3,672229503696019456,See photos &amp; highlights from #COP21 side e...,0.0,1.0,0.0
4,672229362708561921,Our own Ben Parr writes for SBS News on India'...,0.0,0.0,1.0


In [11]:
# Number of non-missing annotations per coder
print(train_merged[["CODE1", "CODE2", "CODE3"]].count())

CODE1    133
CODE2    129
CODE3    159
dtype: int64


In [12]:
# Keep only tweets that at least two coders annotated (non-NaN codes)
n_annotations = train_merged[["CODE1", "CODE2", "CODE3"]].count(axis=1)
mask_at_least_two = n_annotations >= 2
reli_df = train_merged.loc[mask_at_least_two].copy()

print("Tweets with at least 2 annotations:", reli_df.shape[0])
reli_df.head()

Tweets with at least 2 annotations: 127


Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3
0,672233899615850496,Piece for @TheWorldPost: 2 wars connected by c...,-1.0,-1.0,0.0
1,672231727952072704,Soils and #Ocean Omitted From Paris #COP21 Age...,0.0,-1.0,0.0
2,672231250329919488,"To reverse the Ephedrine Disinformation,@UN mu...",0.0,1.0,1.0
3,672229503696019456,See photos &amp; highlights from #COP21 side e...,0.0,1.0,0.0
4,672229362708561921,Our own Ben Parr writes for SBS News on India'...,0.0,0.0,1.0


In [13]:
# Task 1.5
# Look at how many matches you have, and see if you understand the source of mismatches.
# For each row, count how many distinct codes were given (NaN is ignored).
def n_unique_codes(row):
    """Return the number of distinct non-missing values among CODE1..CODE3 of ``row``."""
    distinct = {
        row[col]
        for col in ("CODE1", "CODE2", "CODE3")
        if pd.notna(row[col])
    }
    return len(distinct)

reli_df["N_UNIQUE"] = reli_df.apply(n_unique_codes, axis=1)

# N_UNIQUE == 1 means every coder who annotated the tweet gave the same code
# (reli_df only holds tweets with at least two annotations).
has_disagreement = reli_df["N_UNIQUE"] > 1
n_all_agree = (reli_df["N_UNIQUE"] == 1).sum()
n_disagree = has_disagreement.sum()

print("All coders agree:", n_all_agree)
print("Coders disagree:", n_disagree)

# Show a few of the tweets the coders disagreed on
disagreements = reli_df.loc[has_disagreement].head(10)
disagreements[["TEXT", "CODE1", "CODE2", "CODE3"]]


All coders agree: 70
Coders disagree: 57


Unnamed: 0,TEXT,CODE1,CODE2,CODE3
0,Piece for @TheWorldPost: 2 wars connected by c...,-1.0,-1.0,0.0
1,Soils and #Ocean Omitted From Paris #COP21 Age...,0.0,-1.0,0.0
2,"To reverse the Ephedrine Disinformation,@UN mu...",0.0,1.0,1.0
3,See photos &amp; highlights from #COP21 side e...,0.0,1.0,0.0
4,Our own Ben Parr writes for SBS News on India'...,0.0,0.0,1.0
5,"A bit of substance, a bit of color, and hints ...",0.0,0.0,-1.0
8,ICYMI: @albericie flogs @GregHuntMP with a wet...,-1.0,-1.0,0.0
9,Initiatives to eliminate natural #deforestatio...,0.0,1.0,1.0
10,wendykoch: Is your city divesting from fossil ...,1.0,1.0,0.0
12,March in a #ClimateMarch on Nov 29? Think marc...,0.0,0.0,-1.0


In [14]:
# Krippendorff’s alpha
# Arrange CODE1–3 as a (n_coders, n_items) matrix: one row per coder.
code_matrix = reli_df[["CODE1", "CODE2", "CODE3"]].T.to_numpy()

# Ordinal level: the sentiment codes are ordered, -1 < 0 < 1.
alpha = krippendorff.alpha(reliability_data=code_matrix, level_of_measurement='ordinal')

print("Krippendorff’s alpha (train, all coders):", alpha)


Krippendorff’s alpha (train, all coders): 0.4996540462324015


In [15]:
# Same computation restricted to coders 1 and 2 (one matrix row per coder)
code_matrix_12 = reli_df[["CODE1", "CODE2"]].T.to_numpy()
alpha_12 = krippendorff.alpha(reliability_data=code_matrix_12, level_of_measurement='ordinal')
print("Alpha (coder1 vs coder2):", alpha_12)

Alpha (coder1 vs coder2): 0.4901411028191377


In [16]:
# Build one final label per tweet by majority vote over the coders
def majority_vote(row):
    """Combine CODE1..CODE3 of ``row`` into a single label.

    Missing (NaN) codes are ignored. Returns NaN when nobody annotated the
    row, the most frequent code when it is strictly more frequent than the
    runner-up (or is the only code), and 0.0 (neutral) when the top codes tie.
    """
    labels = [
        row[col]
        for col in ("CODE1", "CODE2", "CODE3")
        if pd.notna(row[col])
    ]
    if len(labels) == 0:
        return np.nan  # nobody annotated this row

    tally = pd.Series(labels).value_counts()

    # A tie between the two most frequent codes (e.g. [-1, 1]) is resolved
    # to 0 (neutral) by convention.
    top_is_tied = len(tally) > 1 and tally.iloc[0] <= tally.iloc[1]
    if top_is_tied:
        return 0.0

    # Clear winner: the most frequent code
    return tally.index[0]

# Label every row of the merged train set; rows without any annotation get NaN.
train_merged["CODE_FINAL"] = train_merged.apply(majority_vote, axis=1)

preview_cols = ["ID", "TEXT", "CODE1", "CODE2", "CODE3", "CODE_FINAL"]
train_merged[preview_cols].head()


Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3,CODE_FINAL
0,672233899615850496,Piece for @TheWorldPost: 2 wars connected by c...,-1.0,-1.0,0.0,-1.0
1,672231727952072704,Soils and #Ocean Omitted From Paris #COP21 Age...,0.0,-1.0,0.0,0.0
2,672231250329919488,"To reverse the Ephedrine Disinformation,@UN mu...",0.0,1.0,1.0,1.0
3,672229503696019456,See photos &amp; highlights from #COP21 side e...,0.0,1.0,0.0,0.0
4,672229362708561921,Our own Ben Parr writes for SBS News on India'...,0.0,0.0,1.0,0.0


In [17]:
# Save the merged training data for Part 2 (all rows of train_merged are written)
train_csv_path = DATA_DIR / "train.csv"
train_merged.to_csv(train_csv_path, index=False)
print("Saved merged train set.")

Saved merged train set.


In [18]:
# Repeat the same workflow for the test(CODER1–3).csv files

def load_coder_test(n):
    """Load coder ``n``'s annotation file for the test sample.

    Reads ``test(CODER{n}).csv`` from DATA_DIR as a semicolon-separated file
    with ``ID`` kept as a string, then renames the ``CODE`` column to
    ``CODE{n}``. All columns present in the file are kept.
    """
    coder_file = DATA_DIR / f"test(CODER{n}).csv"
    return pd.read_csv(coder_file, sep=";", dtype={"ID": str}).rename(
        columns={"CODE": f"CODE{n}"}
    )

# From here on, repeat the steps above: merge -> compute alpha -> build CODE_FINAL.
test_c1, test_c2, test_c3 = (load_coder_test(n) for n in (1, 2, 3))

print("Coder1 rows (test):", test_c1.shape[0])
print("Coder2 rows (test):", test_c2.shape[0])
print("Coder3 rows (test):", test_c3.shape[0])

test_c1.head()

Coder1 rows (test): 1000
Coder2 rows (test): 1000
Coder3 rows (test): 1000


Unnamed: 0,ID,CODE1,TEXT
0,675826762543140864,1.0,@AmedeusKizito I massive milestone in helping ...
1,675826724605730816,1.0,https://t.co/hZwEOmVXpW Time now for Ireland ...
2,675826603977515008,1.0,Very proud of my country today #France #Succes...
3,675826598772248576,-1.0,Brain farts of an angry and an ignorant man ht...
4,675826488755617793,1.0,I am honestly looking forward to spending a fe...


In [19]:
# Start from coder 1's frame, which also supplies the TEXT column.
test_merged = test_c1[["ID", "TEXT", "CODE1"]]

# Outer-join the CODE column of coders 2 and 3 on ID.
for other_coder, code_col in ((test_c2, "CODE2"), (test_c3, "CODE3")):
    test_merged = pd.merge(
        test_merged, other_coder[["ID", code_col]], how="outer", on="ID"
    )

print("Merged test rows:", test_merged.shape[0])
test_merged.head()

Merged test rows: 1000


Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3
0,675826762543140864,@AmedeusKizito I massive milestone in helping ...,1.0,1.0,1.0
1,675826724605730816,https://t.co/hZwEOmVXpW Time now for Ireland ...,1.0,1.0,1.0
2,675826603977515008,Very proud of my country today #France #Succes...,1.0,1.0,1.0
3,675826598772248576,Brain farts of an angry and an ignorant man ht...,-1.0,-1.0,-1.0
4,675826488755617793,I am honestly looking forward to spending a fe...,1.0,0.0,1.0


In [20]:
# Normalise the CODE columns to floats with NaN for missing annotations
# (blank / whitespace-only entries become NaN), then count annotations per coder.
for col in ("CODE1", "CODE2", "CODE3"):
    if col not in test_merged.columns:
        continue
    as_text = test_merged[col].astype(str).str.strip()
    test_merged[col] = as_text.replace({"": np.nan}).astype(float)

print(test_merged[["CODE1", "CODE2", "CODE3"]].count())


CODE1    101
CODE2     68
CODE3    109
dtype: int64


In [21]:
# Keep only the rows that at least two coders annotated
n_annotations_test = test_merged[["CODE1", "CODE2", "CODE3"]].count(axis=1)
mask_at_least_two = n_annotations_test >= 2
test_reli_df = test_merged.loc[mask_at_least_two].copy()

print("Test tweets with ≥2 annotations:", test_reli_df.shape[0])
test_reli_df.head()


Test tweets with ≥2 annotations: 101


Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3
0,675826762543140864,@AmedeusKizito I massive milestone in helping ...,1.0,1.0,1.0
1,675826724605730816,https://t.co/hZwEOmVXpW Time now for Ireland ...,1.0,1.0,1.0
2,675826603977515008,Very proud of my country today #France #Succes...,1.0,1.0,1.0
3,675826598772248576,Brain farts of an angry and an ignorant man ht...,-1.0,-1.0,-1.0
4,675826488755617793,I am honestly looking forward to spending a fe...,1.0,0.0,1.0


In [22]:
# Count how many tweets the coders agree / disagree on
def n_unique_codes(row):
    """Return the number of distinct non-missing values among CODE1..CODE3 of ``row``.

    NOTE: this redefines the n_unique_codes function from the train section.
    """
    present = (row[col] for col in ("CODE1", "CODE2", "CODE3"))
    return len({code for code in present if pd.notna(code)})

test_reli_df["N_UNIQUE"] = test_reli_df.apply(n_unique_codes, axis=1)

# N_UNIQUE == 1 means every coder who annotated the tweet gave the same code.
has_disagreement_test = test_reli_df["N_UNIQUE"] > 1
n_all_agree_test = (test_reli_df["N_UNIQUE"] == 1).sum()
n_disagree_test = has_disagreement_test.sum()

print("Test – all coders agree:", n_all_agree_test)
print("Test – coders disagree:", n_disagree_test)

disagreements_test = test_reli_df.loc[has_disagreement_test].head(10)
disagreements_test[["TEXT", "CODE1", "CODE2", "CODE3"]]


Test – all coders agree: 63
Test – coders disagree: 38


Unnamed: 0,TEXT,CODE1,CODE2,CODE3
4,I am honestly looking forward to spending a fe...,1.0,0.0,1.0
5,Retweeted Agence France-Presse (@AFP): Scient...,-1.0,0.0,0.0
6,My fav 100+ pics from #COP21 #climate protests...,0.0,1.0,1.0
7,Our CEO Dr. Chad Nelsen contributed to The Huf...,0.0,1.0,-1.0
9,Serendipity of #COP21 agreement on same day as...,1.0,1.0,-1.0
10,#COP21 #climatechange #ClimateRefugees @Berni...,1.0,1.0,-1.0
11,Nations strike historic deal on climate (hoax)...,-1.0,-1.0,1.0
13,Forests Emerge as a Big Winner in Paris Agreem...,1.0,1.0,0.0
14,‚ÄúIs it possible?‚Äù We have to find out. : h...,0.0,0.0,1.0
17,@WealthBlog_Ng #COP21 INDCs as communicated by...,0.0,0.0,1.0


In [23]:
# Krippendorff’s alpha for the test set
# One matrix row per coder, one column per tweet.
code_matrix_test = test_reli_df[["CODE1", "CODE2", "CODE3"]].T.to_numpy()

alpha_test = krippendorff.alpha(reliability_data=code_matrix_test, level_of_measurement='ordinal')

print("Krippendorff’s alpha (test, all coders):", alpha_test)

Krippendorff’s alpha (test, all coders): 0.5917497538560983


In [24]:
# Build the final label for the test set (majority vote)
def majority_vote(row):
    """Combine CODE1..CODE3 of ``row`` into a single label.

    Missing (NaN) codes are ignored. Returns NaN when nobody annotated the
    row, the most frequent code when it is strictly more frequent than the
    runner-up (or is the only code), and 0.0 when the top codes tie.

    NOTE: this redefines the majority_vote function from the train section.
    """
    given = [row[col] for col in ("CODE1", "CODE2", "CODE3")]
    labels = [code for code in given if pd.notna(code)]
    if len(labels) == 0:
        return np.nan

    tally = pd.Series(labels).value_counts()

    if len(tally) > 1 and not (tally.iloc[0] > tally.iloc[1]):
        # Tie (e.g. -1 and 1): by convention 0 (neutral / contested)
        return 0.0

    # Clear majority
    return tally.index[0]

# Label every row of the merged test set; rows without any annotation get NaN.
test_merged["CODE_FINAL"] = test_merged.apply(majority_vote, axis=1)

preview_cols = ["ID", "TEXT", "CODE1", "CODE2", "CODE3", "CODE_FINAL"]
test_merged[preview_cols].head()


Unnamed: 0,ID,TEXT,CODE1,CODE2,CODE3,CODE_FINAL
0,675826762543140864,@AmedeusKizito I massive milestone in helping ...,1.0,1.0,1.0,1.0
1,675826724605730816,https://t.co/hZwEOmVXpW Time now for Ireland ...,1.0,1.0,1.0,1.0
2,675826603977515008,Very proud of my country today #France #Succes...,1.0,1.0,1.0,1.0
3,675826598772248576,Brain farts of an angry and an ignorant man ht...,-1.0,-1.0,-1.0,-1.0
4,675826488755617793,I am honestly looking forward to spending a fe...,1.0,0.0,1.0,1.0


In [25]:
# Save the merged test data for Part 2 (all rows of test_merged are written)
test_csv_path = DATA_DIR / "test.csv"
test_merged.to_csv(test_csv_path, index=False)
print("Saved merged test with final code.")

Saved merged test with final code.
