### 1. Check data types and column names

In [None]:
import pandas as pd
from pathlib import Path

# Quick look at the three raw inputs: line/row counts plus a short preview.
# NOTE(review): this cell reads from "../data/" while the preprocessing cells
# further down use Path("data") — confirm which location matches your layout.
data_path = Path("../data/")


def _count_lines(path):
    """Return the number of lines in the UTF-8 text file at *path*."""
    # The context manager closes the handle as soon as counting is finished.
    with open(path, "r", encoding="utf-8") as fh:
        return sum(1 for _ in fh)


def _print_head(path, n):
    """Print up to *n* leading lines of the UTF-8 text file at *path*, stripped."""
    with open(path, "r", encoding="utf-8") as fh:
        # zip() stops at whichever side runs out first, so a file shorter than
        # n lines prints only what it has instead of trailing blank lines.
        for _, line in zip(range(n), fh):
            print(line.strip())


# tweets.dat
tweets_path = data_path / "tweets.dat"
num_tweets = _count_lines(tweets_path)
print("=== tweets.dat ===")
print(f"Total tweets: {num_tweets:,} lines (≈ number of tweets)")
print("First 3 lines:")
_print_head(tweets_path, 3)
print("-" * 60)

# accounts.tsv
accounts_path = data_path / "accounts.tsv"
accounts_df = pd.read_csv(accounts_path, sep="\t")
print("=== accounts.tsv ===")
print(f"Total accounts: {len(accounts_df):,} rows")
print("Columns:", list(accounts_df.columns))
print("First 3 rows:")
display(accounts_df.head(3))
print("-" * 60)

# media_list.txt
media_list_path = data_path / "media_list.txt"
num_media = _count_lines(media_list_path)
print("=== media_list.txt ===")
print(f"Total media files: {num_media:,} lines (≈ number of images)")
print("First 5 lines:")
_print_head(media_list_path, 5)


### 2. Preprocessing
Construct an “image-level” table where each row represents one image (media), 
with its corresponding tweet, author, timestamp, engagement metrics, 
as well as account metadata (Type/Lang/Stance) and filename.

In [None]:
import json
from pathlib import Path
from datetime import datetime
import pandas as pd

DATA_DIR = Path("data")

# 1) Load accounts.tsv with every column kept as text.
#    dtype=str stops author_id from being parsed as a number (and shown in
#    scientific notation, e.g. 8.50e+06); keep_default_na=False keeps empty
#    cells as "" instead of NaN.
accounts_path = DATA_DIR / "accounts.tsv"
accounts_df = pd.read_csv(accounts_path, sep="\t", dtype=str, keep_default_na=False)

# Strip stray whitespace from the header names.
accounts_df = accounts_df.rename(columns=str.strip)

# If there is no author_id column, promote the first matching alternative name.
if "author_id" not in accounts_df.columns:
    fallback_name = next(
        (name for name in ("user_id", "id", "account_id") if name in accounts_df.columns),
        None,
    )
    if fallback_name is not None:
        accounts_df = accounts_df.rename(columns={fallback_name: "author_id"})

print("=== accounts.tsv (head) ===")
display(accounts_df.head())

In [None]:
# 2) Build media_list_df = [media_key, file_name] from media_list.txt.
#    media_key is the file name without its extension,
#    e.g. "3_456462992792498176.jpg" -> "3_456462992792498176".
media_list_path = DATA_DIR / "media_list.txt"
with open(media_list_path, "r", encoding="utf-8") as f:
    stripped_names = [raw.strip() for raw in f]

# Blank lines carry no file name and are left out.
media_rows = [
    {"media_key": Path(name).stem, "file_name": name}
    for name in stripped_names
    if name
]

media_list_df = pd.DataFrame(media_rows, columns=["media_key", "file_name"])
print("\n=== media_list.txt (head) ===")
display(media_list_df.head())

In [None]:
# 3) Locate tweets.dat; it is parsed below one JSON document per line.
#    Media layouts considered by the helpers:
#       v2: attachments.media_keys
#       v1: entities.media / extended_entities.media (id / id_str rewritten as
#           a media_key) — this path is commented out in extract_media_entries
tweets_path = DATA_DIR.joinpath("tweets.dat")

# The following helper functions (by Professor G) are used to parse tweet JSON,
# extract media info, and avoid KeyError or format errors.
def safe_get(d, *keys, default=None):
    """Walk nested dicts along *keys* and return the value found there.

    Returns *default* as soon as the current level is not a dict or does not
    contain the next key. With no keys, *d* itself is returned.
    """
    node = d
    for key in keys:
        if isinstance(node, dict) and key in node:
            node = node[key]
        else:
            return default
    return node

def parse_created_at(ts):
    """Parse an ISO-8601 ``created_at`` string into a ``datetime``.

    Args:
        ts: Timestamp text, commonly of the form "2015-12-12T23:59:59.000Z".

    Returns:
        The parsed ``datetime`` (timezone-aware when the text ends in "Z" or
        carries an explicit offset), or ``None`` when *ts* is empty, is not a
        string, or is not valid ISO-8601.
    """
    if not ts or not isinstance(ts, str):
        return None
    try:
        # A trailing "Z" is rewritten as an explicit UTC offset, which
        # fromisoformat() accepts on every Python version that provides it.
        return datetime.fromisoformat(ts.replace("Z", "+00:00"))
    except ValueError:
        # Only a format problem means "unparseable"; other errors should surface.
        return None

def extract_media_entries(tweet):
    """
    Collect the media attached to a tweet as [ {media_key, source}, ... ].

    Only the v2 field attachments.media_keys is read; every entry is tagged
    with source "v2_attachments". The v1 fallback (entities /
    extended_entities.media, rewritten as "3_" + id_str) exists below only as
    commented-out code, so it contributes no entries.
    When a media_key repeats within the tweet, a single entry is kept.
    """
    out = []

    # --- v2 path: attachments.media_keys ---
    v2_keys = safe_get(tweet, "attachments", "media_keys", default=[])
    if isinstance(v2_keys, list):
        out.extend(
            {"media_key": str(key), "source": "v2_attachments"}
            for key in v2_keys
        )

    # # --- v1 path: extended_entities.media / entities.media ---
    # # If no v2 exists, construct media_key from v1 (Twitter v1 usually has id/id_str only)
    # def add_from_media_list(media_list, tag):
    #     if isinstance(media_list, list):
    #         for m in media_list:
    #             mid = str(m.get("id_str") or m.get("id") or "").strip()
    #             mtype = m.get("type")
    #             if mid:
    #                 # Typically, v2 media_key looks like "3_<id>"; we use the same format for compatibility
    #                 mk = f"3_{mid}"
    #                 out.append({"media_key": mk, "source": tag})

    # ee_media = safe_get(tweet, "extended_entities", "media", default=None)
    # if ee_media:
    #     add_from_media_list(ee_media, "v1_extended_entities")

    # e_media = safe_get(tweet, "entities", "media", default=None)
    # if e_media:
    #     add_from_media_list(e_media, "v1_entities")

    # One entry per media_key: the dict keeps each key at its first position
    # and holds the entry seen last for that key.
    deduped = {entry["media_key"]: entry for entry in out}
    return list(deduped.values())

In [None]:
# Build one row per image from tweets.dat.
# For every tweet we read tweet_id, author_id, created_at, lang and the
# engagement metrics (retweet_count, reply_count, like_count, quote_count),
# then split the "tweet:media = 1:n" relationship into one row per image.
# The image is the base unit: later aggregations (by day or by account) count
# "number of images" or "engagements received by images".

# Fixed column order, also used to give an empty result a proper schema so the
# merges in the next step still find "media_key" and "author_id".
IMAGE_COLUMNS = [
    "media_key",
    "tweet_id",
    "author_id",
    "created_at",
    "date",
    "lang",
    "retweet_count",
    "reply_count",
    "like_count",
    "quote_count",
    "source_path",
]

image_rows = []
skipped_line_numbers = []  # 1-based numbers of lines that are not JSON objects

with open(tweets_path, "r", encoding="utf-8") as f:
    for ln, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        try:
            tw = json.loads(line)
        except json.JSONDecodeError:
            # Malformed line: skip it, but remember it for the report below.
            skipped_line_numbers.append(ln)
            continue
        if not isinstance(tw, dict):
            # Valid JSON that is not an object (e.g. a bare list or number)
            # has no tweet fields to read.
            skipped_line_numbers.append(ln)
            continue

        media_entries = extract_media_entries(tw)
        if not media_entries:
            continue  # This tweet has no images

        created_at = parse_created_at(tw.get("created_at"))
        metrics = tw.get("public_metrics")
        if not isinstance(metrics, dict):
            metrics = {}

        # Fields shared by every image of this tweet. "or ''" keeps a JSON
        # null id from turning into the literal string "None".
        tweet_fields = {
            "tweet_id": str(tw.get("id") or "").strip(),
            "author_id": str(tw.get("author_id") or "").strip(),
            "created_at": created_at,
            "date": created_at.date().isoformat() if created_at else None,
            "lang": tw.get("lang"),
            "retweet_count": metrics.get("retweet_count"),
            "reply_count": metrics.get("reply_count"),
            "like_count": metrics.get("like_count"),
            "quote_count": metrics.get("quote_count"),
        }

        for m in media_entries:
            image_rows.append({
                "media_key": m["media_key"],
                **tweet_fields,
                "source_path": m["source"],  # Record extraction source for quality checking
            })

images_df = pd.DataFrame(image_rows, columns=IMAGE_COLUMNS)

print("\n=== Extracted images from tweets (head) ===")
display(images_df.head())
print(f"Total images extracted: {len(images_df):,}")
if skipped_line_numbers:
    # Make data loss visible instead of dropping bad lines silently.
    print(
        f"Skipped {len(skipped_line_numbers):,} unparseable line(s); "
        f"first line numbers: {skipped_line_numbers[:5]}"
    )


In [None]:
# 4) Merge with media_list (to get file names) and accounts (to get Type/Lang/Stance)

# Remove columns added by an earlier run of this cell so that re-running it
# does not create suffixed duplicates (file_name_x / file_name_y, ...).
images_df = images_df.drop(
    columns=["file_name", "Type", "Stance", "account_lang"],
    errors="ignore",
)

# 4.1 media_key → file_name
# Keep one file per media_key: a repeated key in the lookup would otherwise
# multiply image rows and break the "one row per image" layout.
media_lookup = media_list_df.drop_duplicates(subset="media_key")
images_df = images_df.merge(
    media_lookup,
    how="left",
    on="media_key",
    validate="many_to_one",
)

# 4.2 Account metadata
acc_cols = ["author_id", "Type", "Lang", "Stance"]  # Keep required columns for the lab: author_id, Type, Lang, Stance
for c in acc_cols:
    if c not in accounts_df.columns:
        # Error tolerance: if any column is missing, create an empty one to prevent merge errors
        accounts_df[c] = ""

# One metadata row per author, for the same row-multiplication reason as above.
account_lookup = (
    accounts_df[acc_cols]
    .drop_duplicates(subset="author_id")
    .rename(columns={"Lang": "account_lang"})
)

# Align at the row level first; after this, we can perform:
#   groupby("date") for images_by_day
#   groupby("author_id") for images_by_account
images_df = images_df.merge(
    account_lookup,
    how="left",
    on="author_id",
    validate="many_to_one",
)

print("\n=== images_df after merge (head) ===")
display(images_df.head())


In [None]:
# Basic descriptive statistics, useful for writing the datasheet
print("\n=== Basic stats ===")
# Total number of images (rows)
print("Total images   :", len(images_df))
# Number of unique media_keys (should equal total images unless duplicates exist)
print("Unique media_key:", images_df["media_key"].nunique())
# Number of unique authors
print("Unique authors  :", images_df["author_id"].nunique())
# Date range. Rows whose created_at could not be parsed have date=None; they
# are dropped first because min()/max() on a column mixing strings and missing
# values can fail with a TypeError.
known_dates = images_df["date"].dropna()
print("Date range      :", known_dates.min(), "-", known_dates.max())
# Top 5 accounts by number of images
print("\nTop 5 accounts by image count:")
display(images_df["author_id"].value_counts().head(5).to_frame("image_count"))
# Top 5 dates by number of images
print("\nTop 5 dates by image count:")
display(images_df["date"].value_counts().head(5).to_frame("image_count"))


### 3. Generate two tables for subsequent analysis:
##### images_by_day.csv
Use groupby("date") to calculate daily statistics: number of images, total likes/retweets, and number of active accounts.
##### images_by_account.csv
Use groupby("author_id") to calculate per-account statistics: number of images and engagement summaries, along with Type, Stance, and account_lang.

In [None]:
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

# 1) images_by_day.csv — one row per calendar date
daily_groups = images_df.groupby("date", as_index=False)
images_by_day = daily_groups.agg(
    num_images=("media_key", "count"),        # images posted that day
    num_accounts=("author_id", "nunique"),    # distinct accounts posting images
    total_likes=("like_count", "sum"),        # likes summed over the day's images
    total_retweets=("retweet_count", "sum"),  # retweets summed over the day's images
)
images_by_day = images_by_day.sort_values("date")

images_by_day_path = output_dir / "images_by_day.csv"
images_by_day.to_csv(images_by_day_path, index=False)
print(f"Saved {images_by_day_path} ({len(images_by_day)} rows)")

# Preview the first few rows
display(images_by_day.head())


# 2) images_by_account.csv — one row per account with its metadata and
#    engagement totals, sorted by number of images (largest first).
images_by_account = (
    images_df
    # dropna=False keeps authors that have no row in accounts.tsv: after the
    # left merge their Type/Stance/account_lang are NaN, and the default
    # dropna=True would silently remove them from this table.
    .groupby(
        ["author_id", "Type", "Stance", "account_lang"],
        as_index=False,
        dropna=False,
    )
    .agg(
        num_images=("media_key", "count"),  # Number of images
        total_likes=("like_count", "sum"),
        total_retweets=("retweet_count", "sum"),
        total_replies=("reply_count", "sum"),
        total_quotes=("quote_count", "sum"),
    )
    .sort_values("num_images", ascending=False)
)

images_by_account_path = output_dir / "images_by_account.csv"
images_by_account.to_csv(images_by_account_path, index=False)
print(f"Saved {images_by_account_path} ({len(images_by_account)} rows)")

# Display first few rows
display(images_by_account.head())


### 4. Data Summary (for datasheet)

In [None]:
# Headline numbers for the datasheet.
print("\n=== Summary ===")
print(f"Total unique images : {images_df['media_key'].nunique():,}")
print(f"Total accounts      : {images_df['author_id'].nunique():,}")
# Rows with an unparseable created_at have date=None; drop them before
# min()/max(), which can fail on a column mixing strings and missing values.
known_dates = images_df["date"].dropna()
print(f"Date range          : {known_dates.min()} → {known_dates.max()}")
print(f"Missing file_name   : {images_df['file_name'].isna().sum()}")

# missing ratio by column (share of missing cells, highest first)
missing_ratio = images_df.isna().mean().sort_values(ascending=False).head(10)
print("\nTop 10 columns by missing ratio:")
display(missing_ratio.to_frame("missing_ratio"))
