In [21]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import nltk

In [4]:
DATA_DIR = Path("../data")
data_path = DATA_DIR / "tweets_account_Advocacy_Political.xlsx"
df = pd.read_excel(data_path)
df.head()

Unnamed: 0,tweet_id,author_id,tweet_text,posting_date,language,retweet_count,favorite_count,Type,account_language,Stance
0,675827426363121664,2350315591,RT @LaurenceTubiana: I just can believe it !we...,2015-12-12,en,109,0,Political actors,en,For
1,675827386416541696,518918764,"RT @WWFnoticias: HOY, el mundo marcó el princi...",2015-12-12,es,111,0,Advocacy actors,es,For
2,675827278295777280,314125926,RT @paris_climate: The #Paris agreement means ...,2015-12-12,en,63,0,Advocacy actors,en,For
3,675827060674310144,314125926,RT @UN_Spokesperson: #ParisAgreement: what was...,2015-12-12,en,431,0,Advocacy actors,en,For
4,675826885801193472,96750689,RT @BrighterGreenNY: Wanqing Zhou &amp; @gfc12...,2015-12-12,en,1,0,Advocacy actors,en,For


In [5]:
df["Type"].value_counts()

Type
Advocacy actors     49021
Political actors    27293
Name: count, dtype: int64

In [6]:
df.columns

Index(['tweet_id', 'author_id', 'tweet_text', 'posting_date', 'language',
       'retweet_count', 'favorite_count', 'Type', 'account_language',
       'Stance'],
      dtype='object')

## 1. Clean  Data

In [None]:
# 清洗推文文本的函数
def clean_tweet(text):
    if pd.isna(text):
        return ""

    # 1. 移除 RT @xxx:
    text = re.sub(r"^RT\s+@\w+[\:\.\,\!\?\-\;]*\s*", "", text)
    # 2. 移除 @用户名
    text = re.sub(r"[\.\(\)\[\]\{\}!?,;:'\"\-]*@\w+[\.\(\)\[\]\{\}!?,;:'\"\-]*", "", text)


    # 3. 去掉 URL
    text = re.sub(r"http\S+|www\.\S+", "", text)

    # # 4. 去掉 # 号但保留 hashtag 单词
    # text = text.replace("#", "")

    # === 处理 hashtag ===
    # 1）保留“在句子中的 hashtag” → 只去 #
    text = re.sub(r"(?<![A-Za-z])#([A-Za-z0-9_]+)[\.\,\!\?\:\;\-]*", "", text)

    # 2）删除“独立标签 hashtag”
    text = re.sub(r"(?<![A-Za-z])#([A-Za-z0-9_]+)", "", text)

    # 5. 多个空格合并为一个空格
    text = re.sub(r"\s+", " ", text).strip()

    return text


# 查看是否有重复文本数据
def duplicate_texts(df,col):
    len(df['tweet_text'].unique())
    print(f"总推文数: {len(df)}, 唯一推文数: {len(df[col].unique())}") 
    
    df = df.copy()
    df= df.sort_values(by=['author_id', 'posting_date'])
    return df.drop_duplicates(subset=['author_id', col], keep='last')


In [None]:
print(f"原始推文数: {df.shape}")
# ① 过滤英文
df_eng = df.loc[df["language"] == "en"]
print(f"英文推文数: {df_eng.shape}")

# ② 删除重复文本
df_dups = duplicate_texts(df_eng, "tweet_text")
print(f"去重后英文推文数: {df_dups.shape}")

# ③ 清洗 tweet_text
df_dups["clean_text"] = df_dups["tweet_text"].map(clean_tweet)
print(f"清洗后英文推文数: {df_dups.shape}")


原始推文数: (76314, 10)
英文推文数: (43108, 10)
总推文数: 43108, 唯一推文数: 38632
去重后英文推文数: (42650, 10)
清洗后英文推文数: (42650, 11)


## 2. Test sentiment variance 
* Advocacy actors show higher sentiment variance (stronger emotional extremes), while political actors produce more sentimentally stable content.
* 倡导类账号的情绪更极端，而政治类账号情绪更稳定

In [24]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

def predict_vader(text):
    score = sid.polarity_scores(text)['compound']
    if score > 0.05:
        return 1
    elif score < -0.05:
        return -1
    return 0

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\54241\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
df_dups['pred_vader'] = df_dups["clean_text"].apply(predict_vader)

In [None]:
# 按 Type 统计情绪标签数量
df_dups.groupby("Type")['pred_vader'].value_counts()


In [29]:
# 计算两组的方差
df_dups.groupby("Type")['pred_vader'].var()


Type
Advocacy actors     0.508350
Political actors    0.454317
Name: pred_vader, dtype: float64

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(data=df_dups, x='pred_vader', hue='Type')
plt.show()
