# Text data generalization

## 1. 读取推文信息，生成用于时间主题分析的数据集


In [None]:
import pandas as pd
import json
import os
# NOTE: hard-coded absolute working directory. The relative file names used
# later in this notebook (e.g. 'tweets.dat') are resolved against it.
os.chdir("D:/uppsala/16. data of social mining/data")

# Input file: one tweet JSON object per line.
file_path = 'tweets.dat'

# Column order of the resulting DataFrame. Passing it explicitly guarantees
# the columns exist even when no record could be loaded (missing or empty
# file), so later cells that index e.g. tweets['posting_date'] do not fail
# with a KeyError.
TWEET_COLUMNS = [
    'tweet_id',
    'author_id',
    'tweet_text',
    'posting_date',
    'language',
    'retweet_count',
    'favorite_count',
]


def parse_tweet_line(line):
    """Convert one raw line of the tweet file into a flat record.

    Args:
        line: One line of text, expected to contain a single JSON object.

    Returns:
        A dict with the keys listed in ``TWEET_COLUMNS``, or ``None`` when
        the line is not valid JSON or does not decode to a JSON object.
    """
    try:
        tweet_json = json.loads(line.strip())
    except json.JSONDecodeError:
        return None
    # Valid JSON that is not an object (e.g. a list or a bare number) has no
    # fields to extract; treat it like a malformed line.
    if not isinstance(tweet_json, dict):
        return None

    # "public_metrics" may be absent or explicitly null; both fall back to
    # zero counts instead of raising on ``None.get``.
    metrics = tweet_json.get("public_metrics")
    if not isinstance(metrics, dict):
        metrics = {}

    # Keep only the date part (YYYY-MM-DD) of a timestamp such as
    # "2015-12-12T23:59:59.000Z": everything before the 'T'.
    created_at_str = tweet_json.get('created_at')
    if isinstance(created_at_str, str) and created_at_str:
        posting_date = created_at_str.split('T')[0]
    else:
        posting_date = None

    return {
        'tweet_id': tweet_json.get('id'),
        'author_id': tweet_json.get('author_id'),
        'tweet_text': tweet_json.get('text'),
        'posting_date': posting_date,
        'language': tweet_json.get("lang"),
        'retweet_count': metrics.get("retweet_count", 0),
        'favorite_count': metrics.get("like_count", 0),
    }


# Records extracted from the file, one dict per successfully parsed tweet.
tweet_records = []

try:
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            record = parse_tweet_line(line)
            if record is None:
                # Best effort: report the bad line and keep going.
                print(f"Skipping badly formatted line: {line[:50]}...")
                continue
            tweet_records.append(record)
except FileNotFoundError:
    # Deliberately non-fatal: the notebook continues with an empty dataset.
    print(f"Error: The file '{file_path}' was not found.")

# Build the DataFrame with a fixed column set (see TWEET_COLUMNS).
tweets = pd.DataFrame(tweet_records, columns=TWEET_COLUMNS)

# Quick look at the result.
print("--- 推文数据集 ---")
print(tweets.head())
print(f"\n总记录数: {len(tweets)}")


--- 推文数据集 ---
             tweet_id   author_id  \
0  675827469119832066  1011975294   
1  675827469006581760   255144027   
2  675827468775718912   214748274   
3  675827465378504705   449273927   
4  675827465336434688  1601937732   

                                          tweet_text posting_date language  \
0  RT @MinisterTdB: Climate change won’t stop ove...   2015-12-12       en   
1  RT @LaurenceTubiana: I just can believe it !we...   2015-12-12       en   
2  RT @COP21en: We did it! #ParisAgreement is ado...   2015-12-12       en   
3  RT @TheGlobalGoals: Incredible news for our wo...   2015-12-12       en   
4  RT @StopShenhua: “The people’s resolve is such...   2015-12-12       en   

   retweet_count  favorite_count  
0            107               0  
1            109               0  
2           1204               0  
3            110               0  
4             49               0  

总记录数: 2260916


In [21]:
# Count tweets per posting date once and reuse the result for both the
# per-date table (ordered by date label) and the overall total.
# value_counts() leaves out missing values by default, so the total covers
# only rows that have a posting_date.
date_counts = tweets['posting_date'].value_counts()
print(date_counts.sort_index())
total_tweets = date_counts.sum()
print(f"总推文数量: {total_tweets}")

posting_date
2015-11-30    353140
2015-12-01    219577
2015-12-02    226315
2015-12-03    199680
2015-12-04    151656
2015-12-05    103554
2015-12-06     74992
2015-12-07    129884
2015-12-08    125392
2015-12-09    137159
2015-12-10    118877
2015-12-11    118304
2015-12-12    302386
Name: count, dtype: int64
总推文数量: 2260916


## 2. 生成推文 + 账户信息的数据集

* 为每条推文添加其发布者账户的信息：语言（Lang，载入后重命名为 account_language）、类型（Type）和立场（Stance）


In [13]:
# Load the tab-separated account table, reading author_id as text.
# The "Lang" column is renamed to "account_language" to keep it distinct from
# the "language" column of the tweets DataFrame.
accounts = pd.read_csv(
    "accounts.tsv",
    sep="\t",
    dtype={"author_id": str},
).rename(columns={"Lang": "account_language"})

print("✅ 已载入  accounts.tsv")
print(accounts.head(5))
print(accounts.shape)


✅ 已载入  accounts.tsv
             author_id                 Type account_language Stance
0              8508262  Private individuals               fr    For
1           3297659759      Advocacy actors               es    For
2  1351436889316683778  Journalistic actors               en    For
3            259352661      Advocacy actors               en    For
4             17158610      Advocacy actors               en    For
(1936, 4)


In [14]:

# === 5. Merge tweets with account information ===
# Inner join on the shared "author_id" column: only tweets whose author
# appears in the accounts table are kept.
tweets_with_accounts = tweets.merge(accounts, how="inner", on="author_id")

print("--- 合并后的数据集 ---")
print(tweets_with_accounts.head())
merged_row_count = len(tweets_with_accounts)
print(f"\n总记录数: {merged_row_count}")

--- 合并后的数据集 ---
             tweet_id   author_id  \
0  675827426363121664  2350315591   
1  675827386416541696   518918764   
2  675827278295777280   314125926   
3  675827253540954112   786625296   
4  675827250982428673   786625296   

                                          tweet_text posting_date language  \
0  RT @LaurenceTubiana: I just can believe it !we...   2015-12-12       en   
1  RT @WWFnoticias: HOY, el mundo marcó el princi...   2015-12-12       es   
2  RT @paris_climate: The #Paris agreement means ...   2015-12-12       en   
3  (La Nouvelle République):#COP21: Un coup de ma...   2015-12-12       fr   
4  (La Provence):#COP21: Un coup de marteau et to...   2015-12-12       fr   

   retweet_count  favorite_count                 Type account_language  \
0            109               0     Political actors               en   
1            111               0      Advocacy actors               es   
2             63               0      Advocacy actors               en

In [25]:
# Save the merged dataset without the DataFrame index. The "utf-8-sig"
# codec prefixes the file with a UTF-8 byte-order mark.
merged_csv_name = "tweets_with_accounts.csv"
tweets_with_accounts.to_csv(
    merged_csv_name,
    index=False,
    encoding="utf-8-sig",
)

In [24]:
# Count merged tweets per posting date once and reuse the result for both the
# per-date table (ordered by date label) and the overall total.
# value_counts() leaves out missing values by default, so the total covers
# only rows that have a posting_date.
merged_date_counts = tweets_with_accounts['posting_date'].value_counts()
print(merged_date_counts.sort_index())
total_tweets_accounts = merged_date_counts.sum()
print(f"推文账户合并信息: {total_tweets_accounts}")

posting_date
2015-11-30    17947
2015-12-01    21812
2015-12-02    13264
2015-12-03    12090
2015-12-04    11288
2015-12-05     8139
2015-12-06     5276
2015-12-07    11661
2015-12-08    10673
2015-12-09    12109
2015-12-10     9164
2015-12-11    10962
2015-12-12    16649
Name: count, dtype: int64
推文账户合并信息: 161034
