In [None]:
import pandas as pd
from pathlib import Path
import zipfile
import re

# 1.Load Data - entries,comments,likes

In [2]:
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.parents[1] / "project_data"
DATA_DIR

WindowsPath('d:/uppsala/16. data of social mining/project_data')

In [3]:
def read_tsv_from_zip(zip_path, inner_name, columns=None):
    with zipfile.ZipFile(zip_path, "r") as z:
        with z.open(inner_name) as f:
            return pd.read_csv(
                f,
                sep="\t",              # FriendFeed 是 TSV
                names=columns,
                header=None if columns else "infer",
                na_values="\\N",
                engine="python",
                encoding="utf-8",
                on_bad_lines="skip"
            )

In [4]:
entries_cols = [
    "PostID",
    "PostedBy",
    "SourceName",
    "SourceURL",
    "GeoX",
    "GeoY",
    "Timestamp",
    "Text",
    "NumImg",
    "ImgURL",
    "NumVid",
    "VideoURL"
]

entries1 = read_tsv_from_zip(
    DATA_DIR / "entries1.zip",
    "entries1.csv",          # zip 里真实文件名
    columns=entries_cols
)

entries1.head()


Unnamed: 0,PostID,PostedBy,SourceName,SourceURL,GeoX,GeoY,Timestamp,Text,NumImg,ImgURL,NumVid,VideoURL
0,e/29af803d670fb8d67692095f3ee623e6,newsroom1,Ottawa Citizen - News,http://www.ottawacitizen.com/index.html,,,2010-08-02 18:43:10,RIM 'will not compromise' BlackBerry security ...,1,http://www.ottawacitizen.com/will+compromise+...,0,
1,e/9c8413a376bec6389be4d46d0812c2bb,mehmetinnet,Mehmetin.Net,http://www.mehmetin.net,,,2010-08-02 19:15:52,Fethullah Gülen: Referandum siyasî olarak görü...,1,http://www.mehmetin.net/2010/08/fethullah-gul...,0,
2,e/2d658d97842a466a9513f587f85b0e59,mehmetergin,Mehmetin.Net,http://www.mehmetin.net/,,,2010-08-02 19:15:52,Fethullah Gülen: Referandum siyasî olarak görü...,1,http://www.mehmetin.net/2010/08/fethullah-gul...,0,
3,e/b269ab5d56be4e5e90c1954ecc1ef63a,afriki,,,,,2010-08-02 16:37:57,Мне тут недавно один дизайнер сказал: «Ну прот...,0,,0,
4,e/74f0a50c374a4ad6bddcbcc60c60cad9,hamsafar,,,,,2010-08-02 18:54:39,عرض سلام و خسته نباشيد:),0,,0,


In [5]:
entries1.shape

(3957478, 12)

In [6]:

# 非拉丁脚本
non_latin_re = re.compile(
    r"[\u0400-\u04FF"   # Cyrillic
    r"\u0600-\u06FF"   # Arabic
    r"\u4E00-\u9FFF"   # CJK
    r"]"
)

def contains_non_latin(text):
    return isinstance(text, str) and bool(non_latin_re.search(text))

# 非英文拉丁字符（西/德/瑞/法常见）
NON_EN_LATIN_CHARS = set("ñáéíóúüöäßåäöàâæçèéêëîïôœùûÿ")
def contains_non_english_latin_chars(text):
    if not isinstance(text, str):
        return False
    t = text.lower()
    return any(c in t for c in NON_EN_LATIN_CHARS)

# 非英文功能词
NON_EN_FUNCTION_WORDS = {
    # Spanish
    " el ", " la ", " los ", " las ", " una ", " que ", " por ", " para ",
    # German
    " der ", " die ", " das ", " und ", " nicht ", " mit ",
    # Swedish
    " och ", " att ", " det ", " som ", " är ",
    # French
    " le ", " la ", " les ", " des ", " une ", " pour ", " que "
}
def contains_non_english_function_words(text):
    if not isinstance(text, str):
        return False
    t = " " + text.lower() + " "
    return any(w in t for w in NON_EN_FUNCTION_WORDS)

def is_likely_english_latin(text):
    return not (
        contains_non_latin(text)
        or contains_non_english_latin_chars(text)
        or contains_non_english_function_words(text)
    )



In [7]:
def build_entries_english_once(
    entries):
    """
    一次性完成：
    1) 去重（PostedBy + Text）
    2) 拉丁语系英文快速过滤
    3) ASCII 粗筛
    4)（可选）langid 最终确认
    返回：最终英文 DataFrame
    """

    # 去重
    df = entries.drop_duplicates(subset=["PostedBy", "Text"])

    # 拉丁语系英文快速过滤
    df = df[df["Text"].map(is_likely_english_latin)]

    return df.reset_index(drop=True)


In [8]:
entries1_english = build_entries_english_once(
    entries1)
print(f"Entries before: {entries1.shape}  →  after English filtering: {entries1_english.shape}")


Entries before: (3957478, 12)  →  after English filtering: (2768782, 12)


In [9]:
entries2 = read_tsv_from_zip(
    DATA_DIR / "entries2.zip",
    "entries2.csv",          # zip 里真实文件名
    columns=entries_cols
)

entries2.head()

Unnamed: 0,PostID,PostedBy,SourceName,SourceURL,GeoX,GeoY,Timestamp,Text,NumImg,ImgURL,NumVid,VideoURL
0,e/4ec845b38287101deb6739d0fa64566d,starya,Lolcats 'n' Funny Pictures of Cats - I Can Has...,http://icanhascheezburger.com/,,,2010-08-24 20:00:38,"VIDEO: Kitteh Tricks - <a rel=""nofollow"" href=...",0,,0,
1,e/3ca878bcc2f5ed65c7217e4e8d1b6300,vkamutzki,Google Reader,http://www.google.com/reader/shared/0541245284...,,,2010-08-24 20:01:58,"Scene: Torontohenge Sunrise - <a rel=""nofollow...",0,,0,
2,e/19130fd6b5444f9ba321c58bdf7242a2,biznetsuk,HelloTxt,http://hellotxt.com/,,,2010-08-24 20:02:08,autobuynow.info/cars Car Buying: How to buy ne...,0,,0,
3,e/62945bae3a1433880c0a681081ee8cbb,monikap19,Books Nonfiction,http://bnmoon.blogspot.com/,,,2010-08-24 20:02:00,"Check Out Psychology for $89.28 - <a rel=""nofo...",0,,0,
4,e/41d2105e23fc70c1976387ab53f51c89,pendar,Google Reader,http://www.google.com/reader/shared/1316848039...,,,2010-08-24 20:02:07,رکورد گینس تایپ سریع پیام با استفاده از گوشی گ...,0,,0,


In [10]:
entries2_english = build_entries_english_once(
    entries2)
print(f"Entries before: {entries2.shape}  →  after English filtering: {entries2_english.shape}")    

Entries before: (3966523, 12)  →  after English filtering: (2709193, 12)


In [11]:
entries3 = read_tsv_from_zip(
    DATA_DIR / "entries3.zip",
    "entries3.csv",          # zip 里真实文件名
    columns=entries_cols
)

entries3.head()

Unnamed: 0,PostID,PostedBy,SourceName,SourceURL,GeoX,GeoY,Timestamp,Text,NumImg,ImgURL,NumVid,VideoURL
0,e/56dfd8c1beb24b52b33e5369a1004603,healthrockstar,Ping.fm,http://ping.fm/,,,2010-09-10 05:56:36,"HealthRockstar: weightloss,#fitness 11 News An...",0,,0,
1,e/6d231b54bd0289060de9be6905900961,aggregatore,Aggregatore di blog,http://aggregatore.seoguru.it,,,2010-09-10 05:30:00,Pop Star Coreana Cade Rovinosamente Sul Palco ...,0,,0,
2,e/4d38f93a832c4f1cb2e2446eb15e3888,afro2501,Ping.fm,http://ping.fm/,,,2010-09-10 05:56:36,"Interesting Page: Object moved <a rel=""nofollo...",0,,0,
3,e/0e38d17bf5a47b55ae5782c8a6d985bc,cleanthinking,Cleanthinking.de - Cleantech und Energie News,http://www.cleanthinking.de/,,,2010-09-10 05:56:24,"Hintergrund Geothermie: Die Quelle, die nie ve...",0,,0,
4,e/5cb0fbb41a0567b2e3f228d0d6cec52e,gnomebeatz,Twitter,http://twitter.com/questlove/statuses/24081170630,,,2010-09-10 05:56:36,"@<a rel=""nofollow"" href=""http://twitter.com/ma...",0,,0,


In [12]:
entries3_english = build_entries_english_once(
    entries3)
print(f"Entries before: {entries3.shape}  →  after English filtering: {entries3_english.shape}")    

Entries before: (4419187, 12)  →  after English filtering: (3010122, 12)


In [23]:
entries = pd.concat([entries1_english, entries2_english, entries3_english], ignore_index=True)
entries.shape

(8488097, 12)

In [14]:
comments_cols = [
    "PostID",
    "EntryID",
    "PostedBy",
    "SourceName",
    "SourceURL",
    "GeoX",
    "GeoY",
    "Timestamp",
    "Text",
    "NumImg",
    "ImgURL",
    "NumVid",
    "VideoURL",
]

comments = read_tsv_from_zip(
    DATA_DIR / "comments.zip",
    "commentAugSept.csv",          # zip 里真实文件名
    columns=comments_cols
)

In [None]:
comments.head()

In [15]:
comments_english = build_entries_english_once(
    comments)
print(f"Comments before: {comments.shape}  →  after English filtering: {comments_english.shape}")    

Comments before: (3749889, 13)  →  after English filtering: (1845109, 13)


In [16]:
likes_cols = [
    "UserID",
    "PostID",
    "Timestamp",
]
likes = read_tsv_from_zip(
    DATA_DIR / "likes.zip", 
    "likes.csv",          # zip 里真实文件名
    columns=likes_cols
)

In [None]:
likes

In [None]:
users_cols = [
    "UserID",
    "Type",
    "Name",
    "Reserved",
    "Description"]


import csv

users_zip = DATA_DIR / "users.zip"

with zipfile.ZipFile(users_zip, "r") as z:
    print(z.namelist())  # 先确认文件名

    with z.open("users.csv") as f:
        users = pd.read_csv(
            f,
            sep="|",                  # ⭐ 关键：竖线分隔
            names=users_cols,
            header=None,
            na_values=["\\N", "null"],
            engine="python",
            encoding="utf-8",
            on_bad_lines="skip",
            quoting=csv.QUOTE_NONE
        )


users.head()


['users.csv']


Unnamed: 0,UserID,Type,Name,Reserved,Description
0,2gwsk,user,Ming Quek,2a9209ad,
1,iroach,user,Isaac Roach,20948a19,
2,tonybraun,user,Tony Braun,f68e8b26,
3,junknajidyahoocom1,user,Junknajid Yahoocom,98a02bb6,
4,spaceastronautics,user,Space & Astronautics News,c3178cfa,"NASA, ESA, NOAA, USGS, astronomy, space, weath..."


In [19]:
users['Type'].value_counts()

Type
user     625682
group     46158
Name: count, dtype: int64

In [28]:
users.shape

(671840, 5)

In [29]:
users = users.drop_duplicates(subset=["UserID"])
users.shape

(651870, 5)

# 2.生成网络

In [21]:
import networkx as nx

In [33]:
entries[["PostID", "PostedBy"]].head()   


Unnamed: 0,PostID,PostedBy
0,e/29af803d670fb8d67692095f3ee623e6,newsroom1
1,e/38867ca7a6b80722345569a30c1cf2d4,sconfinando
2,e/36cfd2743b8a4385a374f79fcac358dd,robfelty
3,e/7dd779c201dbe9a9d2a97302cbf3640b,moonboy
4,e/265617049685412692ac555e2fa58b81,elisagianola


In [25]:
comments["PostedBy"]

0              koenigdublin
1              ilportalinux
2                guardianuk
3                      nahi
4                   miocaro
                 ...       
3749884           seyrsefer
3749885         miladabadan
3749886    adobeillustrator
3749887             darksad
3749888           newsroom1
Name: PostedBy, Length: 3749889, dtype: object

In [26]:
likes["UserID"]

0                  yasinde
1         socialnewsturkey
2                    janzu
3               mugecerman
4                  miocaro
                ...       
798107          harunguven
798108                 fsn
798109              younos
798110               sayeh
798111             sooshee
Name: UserID, Length: 798112, dtype: object

In [27]:
users["UserID"]

0                      2gwsk
1                     iroach
2                  tonybraun
3         junknajidyahoocom1
4          spaceastronautics
                 ...        
671835               wertigo
671836        iowaveincenter
671837               lamaron
671838                 yoite
671839          yuricashflow
Name: UserID, Length: 671840, dtype: object

# Step 1 创建无向图

In [42]:
G = nx.Graph() 

# Step 2 构建post_author_map

In [43]:
post_author_map = (
    entries[["PostID", "PostedBy"]]
    .drop_duplicates()
    .set_index("PostID")["PostedBy"]
    .to_dict())

# Step 3 统计 comment_count，无向

In [None]:
comment_pairs = []

for _, row in comments.iterrows():
    commenter = row["PostedBy"]
    post_id = row["EntryID"]

    # 只保留英文帖子
    if post_id not in post_author_map:
        continue

    post_author = post_author_map[post_id]

    # 去掉自评论
    if commenter == post_author:
        continue

    # 无向：排序保证 (u, v) 和 (v, u) 视为同一条边
    u, v = sorted([commenter, post_author])
    comment_pairs.append((u, v))

comment_df = (
    pd.DataFrame(comment_pairs, columns=["u", "v"])
    .value_counts()
    .reset_index(name="comment_count")
)
print(f"Unique comment edges: {comment_df.shape[0]}")

Unique comment edges: 184346


# Step 4 统计 like_count，无向


In [38]:
like_pairs = []

for _, row in likes.iterrows():
    liker = row["UserID"]
    post_id = row["PostID"]

    if post_id not in post_author_map:
        continue

    post_author = post_author_map[post_id]

    # 去掉自赞
    if liker == post_author:
        continue

    u, v = sorted([liker, post_author])
    like_pairs.append((u, v))
like_df = (
    pd.DataFrame(like_pairs, columns=["u", "v"])
    .value_counts()
    .reset_index(name="like_count")
)


In [44]:
print(f"Unique comment edges: {like_df.shape[0]}")

Unique comment edges: 128700


# Step 5 加入边属性：comment_count 和 like_count

In [40]:
edge_df = pd.merge(
    comment_df,
    like_df,
    on=["u", "v"],
    how="outer"
).fillna(0)

edge_df["comment_count"] = edge_df["comment_count"].astype(int)
edge_df["like_count"] = edge_df["like_count"].astype(int)
print(f"Total unique edges (comments + likes): {edge_df.shape[0]}")

Total unique edges (comments + likes): 258567


In [41]:
edge_df.head()

Unnamed: 0,u,v,comment_count,like_count
0,001bizst,nickcodipietro,0,1
1,00yedi,muratkaya,2,0
2,02blog,alicetwain,3,0
3,02blog,gferraresi,1,0
4,02blog,puntomaupunto,0,1


# Step 6 加入边节点属性

In [45]:
for _, r in edge_df.iterrows():
    G.add_edge(
        r["u"], r["v"],
        comment_count=int(r["comment_count"]),
        like_count=int(r["like_count"]),
        weight=int(r["comment_count"]) + int(r["like_count"])
    )

In [46]:
users = users.drop_duplicates(subset=["UserID"]).copy()
node_ids = list(G.nodes())

user_attrs_df = (
    users[users["UserID"].isin(node_ids)]
    .set_index("UserID")[["Type", "Name", "Description"]]   
)

node_attr_dict = user_attrs_df.to_dict("index")
nx.set_node_attributes(G, node_attr_dict)


In [47]:
# quick check
list(G.nodes(data=True))[:5]


[('001bizst', {}),
 ('nickcodipietro',
  {'Type': 'user',
   'Name': 'Nicola Codipietro',
   'Description': 'Architect & Industrial Designer, founder of global web-site http://nicolacodipietro.com & http://centrotropicalepiscine.it'}),
 ('00yedi', {}),
 ('muratkaya',
  {'Type': 'user', 'Name': 'murat kaya', 'Description': 'connecting people'}),
 ('02blog', {'Type': 'user', 'Name': '02blog', 'Description': nan})]

In [None]:
# 部分用户未出现在用户表中，暂时先保留，后续如果需要可以删除
'001bizst' in set(users["UserID"]) 

False

In [49]:
list(G.edges(data=True))[:5]


[('001bizst',
  'nickcodipietro',
  {'comment_count': 0, 'like_count': 1, 'weight': 1}),
 ('nickcodipietro',
  '13to1',
  {'comment_count': 2, 'like_count': 2, 'weight': 4}),
 ('nickcodipietro',
  '1551palermo',
  {'comment_count': 0, 'like_count': 1, 'weight': 1}),
 ('nickcodipietro',
  '1geeky',
  {'comment_count': 0, 'like_count': 1, 'weight': 1}),
 ('nickcodipietro',
  '1seahorse1',
  {'comment_count': 0, 'like_count': 1, 'weight': 1})]

# Step 7  保存图数据

In [50]:
import pickle

file_path_pkl = DATA_DIR / 'network_data.pkl'
with open(file_path_pkl, 'wb') as f: # 注意 'wb' (写入二进制)
    pickle.dump(G, f)