## 1.1

In [1]:
import json
import pandas as pd
from pathlib import Path

DATA_DIR = Path("../data/")

# --- Read accounts.tsv ---
accounts_path = DATA_DIR / "accounts.tsv"
accounts_df = pd.read_csv(
    accounts_path,
    sep="\t",
    dtype=str,            # keep IDs as strings
    keep_default_na=False # avoid NaN for empty fields
)

# Standardize column names
accounts_df.columns = [c.strip() for c in accounts_df.columns]
if "author_id" not in accounts_df.columns:
    for alt in ["id", "user_id", "account_id"]:
        if alt in accounts_df.columns:
            accounts_df = accounts_df.rename(columns={alt: "author_id"})
            break

print("=== accounts.tsv ===")
print(accounts_df.head())
print(accounts_df["Type"].value_counts())


=== accounts.tsv ===
             author_id                 Type Lang Stance
0              8508262  Private individuals   fr    For
1           3297659759      Advocacy actors   es    For
2  1351436889316683778  Journalistic actors   en    For
3            259352661      Advocacy actors   en    For
4             17158610      Advocacy actors   en    For
Type
Advocacy actors        632
Political actors       310
Journalistic actors    271
Business actors        241
Private individuals    227
Unclear                142
Scientific actors       90
                         9
Other                    9
Bots                     4
to-do                    1
Name: count, dtype: int64


In [2]:
tweets_path = DATA_DIR / "tweets.dat"

def parse_tweets(path, max_rows=None):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if max_rows and i >= max_rows:
                break
            try:
                obj = json.loads(line)
                rows.append({
                    "id": obj.get("id"),
                    "author_id": obj.get("author_id"),
                    "text": obj.get("text"),
                    "created_at": obj.get("created_at"),
                    "lang": obj.get("lang"),
                    "referenced_tweets": obj.get("referenced_tweets"),
                    "public_metrics": obj.get("public_metrics"),
                })
            except Exception as e:
                # skip malformed lines
                continue
    return pd.DataFrame(rows)

# --- Read tweets.dat ---
tweets_df = parse_tweets(tweets_path, max_rows=None)
print("Parsed tweets:", len(tweets_df))
tweets_df.head()


Parsed tweets: 2260916


Unnamed: 0,id,author_id,text,created_at,lang,referenced_tweets,public_metrics
0,675827469119832066,1011975294,RT @MinisterTdB: Climate change won’t stop ove...,2015-12-12T23:59:59.000Z,en,"[{'type': 'retweeted', 'id': '6757779674700390...","{'retweet_count': 107, 'reply_count': 0, 'like..."
1,675827469006581760,255144027,RT @LaurenceTubiana: I just can believe it !we...,2015-12-12T23:59:59.000Z,en,"[{'type': 'retweeted', 'id': '6757815547450572...","{'retweet_count': 109, 'reply_count': 0, 'like..."
2,675827468775718912,214748274,RT @COP21en: We did it! #ParisAgreement is ado...,2015-12-12T23:59:59.000Z,en,"[{'type': 'retweeted', 'id': '6757487202442977...","{'retweet_count': 1204, 'reply_count': 0, 'lik..."
3,675827465378504705,449273927,RT @TheGlobalGoals: Incredible news for our wo...,2015-12-12T23:59:58.000Z,en,"[{'type': 'retweeted', 'id': '6757646325990440...","{'retweet_count': 110, 'reply_count': 0, 'like..."
4,675827465336434688,1601937732,RT @StopShenhua: “The people’s resolve is such...,2015-12-12T23:59:58.000Z,en,"[{'type': 'retweeted', 'id': '6757735596884541...","{'retweet_count': 49, 'reply_count': 0, 'like_..."


In [3]:
merged_df = tweets_df.merge(accounts_df, on="author_id", how="inner")

print("=== merged tweets x accounts ===")
print("Total merged rows:", len(merged_df))
merged_df.head()


=== merged tweets x accounts ===
Total merged rows: 161034


Unnamed: 0,id,author_id,text,created_at,lang,referenced_tweets,public_metrics,Type,Lang,Stance
0,675827426363121664,2350315591,RT @LaurenceTubiana: I just can believe it !we...,2015-12-12T23:59:49.000Z,en,"[{'type': 'retweeted', 'id': '6757815547450572...","{'retweet_count': 109, 'reply_count': 0, 'like...",Political actors,en,For
1,675826358082273280,2350315591,RT @RoyalSegolene: Après des années depuis le ...,2015-12-12T23:55:34.000Z,fr,"[{'type': 'retweeted', 'id': '6757499143165255...","{'retweet_count': 156, 'reply_count': 0, 'like...",Political actors,en,For
2,675815010434682880,2350315591,RT @RoyalSegolene: Une conscience universelle ...,2015-12-12T23:10:29.000Z,fr,"[{'type': 'retweeted', 'id': '6757507629416898...","{'retweet_count': 136, 'reply_count': 0, 'like...",Political actors,en,For
3,675814887000580097,2350315591,RT @malinimehra: Yes - the world will be watch...,2015-12-12T23:09:59.000Z,en,"[{'type': 'retweeted', 'id': '6758144821998960...","{'retweet_count': 1, 'reply_count': 0, 'like_c...",Political actors,en,For
4,675788617785221120,2350315591,RT @NatureNews: The key points of the Paris #c...,2015-12-12T21:25:36.000Z,en,"[{'type': 'retweeted', 'id': '6757702837693399...","{'retweet_count': 154, 'reply_count': 0, 'like...",Political actors,en,For


In [4]:
# 只保留英文推文
merged_en = merged_df[merged_df["lang"] == "en"].copy()
print("English tweets:", len(merged_en))


English tweets: 83685


In [5]:
def is_retweet(ref):
    """
    ref 是从 Twitter JSON 'referenced_tweets' 字段来的:
    - 可能是 None
    - 可能是 list of dicts, 每个 dict 有 type, id
    如果有任何一个 type == 'retweeted'，我们就认为这是转推
    """
    if isinstance(ref, list) and len(ref) > 0:
        return any(r.get("type") == "retweeted" for r in ref)
    return False

merged_en["is_retweet"] = merged_en["referenced_tweets"].apply(is_retweet)

# 去掉转推，只保留“自己发的”内容
merged_nort = merged_en[~merged_en["is_retweet"]].copy()
print("Non-retweet English tweets:", len(merged_nort))


Non-retweet English tweets: 51778


In [6]:
merged_en["is_retweet"].value_counts(normalize=True)

is_retweet
False    0.618725
True     0.381275
Name: proportion, dtype: float64

In [7]:
merged_nort["text_len"] = merged_nort["text"].str.split().str.len()
merged_long = merged_nort[merged_nort["text_len"] >= 5].copy()
print("After removing super-short tweets:", len(merged_long))

merged_long["text_len"].describe()


After removing super-short tweets: 51688


count    51688.000000
mean        15.541751
std          3.734447
min          5.000000
25%         13.000000
50%         16.000000
75%         18.000000
max         32.000000
Name: text_len, dtype: float64

In [8]:
tweet_counts = (
    merged_long
    .groupby("author_id")
    .size()
    .reset_index(name="n_tweets")
    .sort_values("n_tweets", ascending=False)
)

print("Number of authors:", len(tweet_counts))
tweet_counts.head()


Number of authors: 807


Unnamed: 0,author_id,n_tweets
508,3584633117,1881
309,2469606308,1230
546,389548615,1010
740,77254498,707
54,131079801,638


In [9]:
tweet_counts["n_tweets"].describe()


count     807.000000
mean       64.049566
std       120.383321
min         1.000000
25%         6.000000
50%        26.000000
75%        76.000000
max      1881.000000
Name: n_tweets, dtype: float64

In [None]:
TOP_N = 807  # 根据上面的 describe 调整
top_authors = tweet_counts.head(TOP_N)["author_id"].tolist()
len(top_authors)


807

In [11]:
top_df = merged_long[merged_long["author_id"].isin(top_authors)].copy()
print("Tweets from top authors:", len(top_df))
top_df.head()


Tweets from top authors: 51688


Unnamed: 0,id,author_id,text,created_at,lang,referenced_tweets,public_metrics,Type,Lang,Stance,is_retweet,text_len
96,674295122285232128,2350315591,Parliamentarians fired up for post-Paris actio...,2015-12-08T18:30:59.000Z,en,,"{'retweet_count': 2, 'reply_count': 0, 'like_c...",Political actors,en,For,False,12
97,674293549110833152,2350315591,GLOBE Parliamentarians fired up for post-Paris...,2015-12-08T18:24:44.000Z,en,,"{'retweet_count': 3, 'reply_count': 0, 'like_c...",Political actors,en,For,False,13
98,674292745893228544,2350315591,Parliamentarians fired up for post-Paris actio...,2015-12-08T18:21:33.000Z,en,,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Political actors,en,For,False,11
99,674205522741092352,2350315591,Parliamentarians assert themselves @COP21 w/ G...,2015-12-08T12:34:57.000Z,en,,"{'retweet_count': 1, 'reply_count': 1, 'like_c...",Political actors,en,For,False,13
101,674151832235728896,2350315591,Understanding the Paris End-Game - Excellent g...,2015-12-08T09:01:36.000Z,en,,"{'retweet_count': 3, 'reply_count': 0, 'like_c...",Political actors,en,For,False,16


In [12]:
author_docs = (
    top_df
    .groupby(["author_id", "Type", "Stance"])["text"]
    .apply(lambda xs: " ".join(xs))
    .reset_index(name="full_text")
)

print("Number of author-level docs:", len(author_docs))
author_docs.head()


Number of author-level docs: 807


Unnamed: 0,author_id,Type,Stance,full_text
0,100189549,Advocacy actors,For,@N_Hulot #COP21 For the first time humanity is...
1,1011036600,Advocacy actors,For,Pics from our participation in #COP21 When #Gr...
2,102325445,Political actors,Unclear,Investing for a Greener Future: development fi...
3,102448827,Political actors,Unclear,#HighAmbitionCoalition on the way to #COP21 Pl...
4,102634281,Advocacy actors,For,Read up on the latest news &amp; updates about...


In [13]:
out_path = DATA_DIR / "author_level_docs_en_notrt_min5_top807.csv"
author_docs.to_csv(out_path, index=False)
print("Saved:", out_path)

Saved: ../data/author_level_docs_en_notrt_min5_top807.csv


Keep only English

Remove reposts

Remove ultra-short text
Then count tweets by author based on this data and select the top N authors.

## 1.2


In [1]:
import re
import string
import pandas as pd
from pathlib import Path

DATA_DIR = Path("../data")

author_docs = pd.read_csv(DATA_DIR / "author_level_docs_en_notrt_min5_top807.csv")
print(len(author_docs))
author_docs.head()


807


Unnamed: 0,author_id,Type,Stance,full_text
0,100189549,Advocacy actors,For,@N_Hulot #COP21 For the first time humanity is...
1,1011036600,Advocacy actors,For,Pics from our participation in #COP21 When #Gr...
2,102325445,Political actors,Unclear,Investing for a Greener Future: development fi...
3,102448827,Political actors,Unclear,#HighAmbitionCoalition on the way to #COP21 Pl...
4,102634281,Advocacy actors,For,Read up on the latest news &amp; updates about...


In [2]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stop = set(stopwords.words("english"))

# 可以顺便加一些你觉得没信息量的短词
extra_stop = {"rt", "via", "cop21", "paris", "https", "http"}
stop |= extra_stop


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yuezhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def basic_tokenize(text):
    if not isinstance(text, str):
        return []
    
    # lower case
    text = text.lower()
    
    # remove urls
    text = re.sub(r"http\S+|www\.\S+", "", text)
    
    # remove mentions
    text = re.sub(r"@\w+", "", text)
    
    # remove hashtags symbol '#' but keep the word
    text = re.sub(r"#", "", text)
    
    # remove numbers
    text = re.sub(r"\d+", "", text)
    
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # split on whitespace
    tokens = text.split()
    
    # remove very short tokens (len<=2)
    tokens = [t for t in tokens if len(t) > 2]
    
    return tokens


In [4]:
# 去掉token
def tokenize_no_stop(text):
    tokens = basic_tokenize(text)
    tokens = [t for t in tokens if t not in stop]
    return tokens

author_docs["tokens_nostop"] = author_docs["full_text"].apply(tokenize_no_stop)
author_docs[["author_id", "tokens_nostop"]].head()


Unnamed: 0,author_id,tokens_nostop
0,100189549,"[cop, first, time, humanity, walking, right, d..."
1,1011036600,"[pics, participation, cop, greenbusiness, join..."
2,102325445,"[investing, greener, future, development, fina..."
3,102448827,"[highambitioncoalition, way, cop, plenary, fin..."
4,102634281,"[read, latest, news, amp, updates, took, place..."


In [5]:
# 保留停用词
def tokenize_keep_stop(text):
    return basic_tokenize(text)

author_docs["tokens_keepstop"] = author_docs["full_text"].apply(tokenize_keep_stop)
author_docs[["author_id", "tokens_keepstop"]].head()


Unnamed: 0,author_id,tokens_keepstop
0,100189549,"[cop, for, the, first, time, humanity, walking..."
1,1011036600,"[pics, from, our, participation, cop, when, gr..."
2,102325445,"[investing, for, greener, future, development,..."
3,102448827,"[highambitioncoalition, the, way, cop, plenary..."
4,102634281,"[read, the, latest, news, amp, updates, about,..."


In [6]:
docs_tokens_nostop = author_docs["tokens_nostop"].tolist()
docs_tokens_keepstop = author_docs["tokens_keepstop"].tolist()

print(len(docs_tokens_nostop), "documents")
print(docs_tokens_nostop[0][:30])  # 看看第一个文档前30个词


807 documents
['cop', 'first', 'time', 'humanity', 'walking', 'right', 'direction', 'december', 'date', 'engraved', 'history', 'cop', 'parisagreement', 'added', 'universal', 'rights', 'humanity', 'human', 'rights', 'cop', 'breaking', 'cop', 'climate', 'change', 'summit', 'reaches', 'deal', 'chance', 'change', 'world']


In [7]:
author_docs.to_pickle(DATA_DIR / "author_docs_with_tokens.pkl")


Converted the long text into a list of tokens.