# APIs continued

Imports and define base URL for bluesky

In [1]:
import requests 
import time
import json as js
import pandas as pd

BASE_URL = "https://api.bsky.app/xrpc"

## Recap
We will recap what we did last week with seach. Feel free to copy from your previous notebook.

In [2]:
endpoint = f"{BASE_URL}/app.bsky.feed.searchPosts"
headers = {"User-Agent": "EMAT-Teaching/1.0 (+contact@example.com)"}
params = {
    "q": "data science",
    "limit": 10,  # start small for demos
}

resp = requests.get(endpoint, params=params, headers=headers, timeout=30)

print("Status:", resp.status_code)

data = resp.json()

print("Top-level keys:", list(data.keys()))

Status: 200
Top-level keys: ['posts', 'cursor']


## Let us use pagination

In [3]:
# Collect records
collected = []
# cursor pointer
cursor = None

In [4]:
params = {
    "q": "fashion supply chain",
    "limit": 100,  # start small for demos
}
resp = requests.get(endpoint, params=params, headers=headers, timeout=30)

data = resp.json()

posts = data.get("posts", []) or data.get("feed", [])
# populate data
collected.extend(posts)
# Cursor
cursor = data.get("cursor")
# pause for 0.5 sec
time.sleep(0.5)
print(cursor)
print(len(collected))

100
68


Now get another batch, this time with cursor

In [5]:
params = {
    "q": "data science",
    "limit": 100,  # start small for demos
}
resp = requests.get(endpoint, params=params, headers=headers, timeout=30)

if cursor:
    print(f"Cursor value is {cursor}")
    params["cursor"] = cursor
else:
    print("No cursor returned; stopping before third call.")
    raise SystemExit

data = resp.json()

posts = data.get("posts", []) or data.get("feed", [])
# populate data
collected.extend(posts)
# Print total collection results
print(f"Collection size is {len(collected)}")

Cursor value is 100
Collection size is 168


## Flatten
Flatten key fields from Bluesky PostView objects.

In [6]:
rows = []
for p in posts:
    #print(js.dumps(p, indent=2))
    stats = {
        "post_uri": p.get("uri"),
        "post_cid": p.get("cid"),
        "text": p.get("record", {}).get("text"),
        "likeCount": p.get("likeCount"),
        "repostCount": p.get("repostCount"),
        "author_did": p.get("author").get("did"),
        "author_handle": p.get("author").get("handle"),
        "author_displayName": p.get("author").get("displayName"),
    }
    rows.append(stats)
posts_df = pd.DataFrame(rows)
posts_df.head(5)

Unnamed: 0,post_uri,post_cid,text,likeCount,repostCount,author_did,author_handle,author_displayName
0,at://did:plc:k6cj2pklvsil3uf6umfvwhwb/app.bsky...,bafyreignczcoggn6mazou36ees4mhisjxzsaoloqxfghn...,"""science says"" <insert the most egregious inte...",1,0,did:plc:k6cj2pklvsil3uf6umfvwhwb,rh3x.bsky.social,Rh3x
1,at://did:plc:n54nrdcpb2exew2suvqxqzd6/app.bsky...,bafyreielpgohxoc5ltmyg3lvjciz7pagosyifw4kfhsmy...,"The ""lies, damn lies, and statistics"" line usu...",0,0,did:plc:n54nrdcpb2exew2suvqxqzd6,talismancer.bsky.social,Talismancer
2,at://did:plc:rhkcyc46ubi523e47bhnkpbb/app.bsky...,bafyreicsqpcg25wmtku3fgb3o3rjrixgly3hb7yi6t3vn...,The Petrol Tank for AI Discovery Might be Runn...,0,0,did:plc:rhkcyc46ubi523e47bhnkpbb,nic221.bsky.social,Nicole Hennig
3,at://did:plc:skfjeknazykd6mx3jflmt5zu/app.bsky...,bafyreiag6hk4zt7yraumxv5fxow54u3nl6isxqvqyx4p4...,I had a great time facilitating a roundtable a...,1,1,did:plc:skfjeknazykd6mx3jflmt5zu,drsarahjwhite.bsky.social,Sarah J White
4,at://did:plc:ja7yvxxk52zxfzfb33j22i36/app.bsky...,bafyreia6jevlrl7o5tmvafpxsy2vu4xpt2qdllm3xfvr3...,🧪 News of the Week! New Species from the Deep ...,0,0,did:plc:ja7yvxxk52zxfzfb33j22i36,dropsofsciencenews.bsky.social,Drops of Science - News from Natural Sciences


## Merge posts data with Author

In [7]:
# Get a list of unique author dids
unique_dids = posts_df["author_did"].dropna().unique().tolist()


In [11]:
# Get author profiles for these dids
all_profiles = []
for d in unique_dids:
    #print(js.dumps(d, indent=2))
    params = []
    params.append(("actor", d))
    #print(d)
    r = requests.get(f"{BASE_URL}/app.bsky.actor.getProfile", params=params, timeout=30)
    data = r.json()
    #print(js.dumps(data, indent=2))

    # Append this profile in our list 
    all_profiles.append({
        "did": data.get("did"),
        "handle": data.get("handle"),
        "displayName": data.get("displayName"),
        "followersCount": data.get("followersCount"),
        "followsCount": data.get("followsCount"),
        "postsCount": data.get("postsCount"),
        "createdAt": data.get("createdAt"),
        "description": data.get("description"),
    })

all_profiles_df = pd.DataFrame(all_profiles)
# This will take a while to load !
all_profiles_df.head(5)

Unnamed: 0,did,handle,displayName,followersCount,followsCount,postsCount,createdAt,description
0,did:plc:k6cj2pklvsil3uf6umfvwhwb,rh3x.bsky.social,Rh3x,4,46,2,2025-02-23T13:58:07.445Z,Software engineer / Hacker / Satanist\nHe/Him ...
1,did:plc:n54nrdcpb2exew2suvqxqzd6,talismancer.bsky.social,Talismancer,42,11,110,2023-07-23T13:07:17.129Z,
2,did:plc:rhkcyc46ubi523e47bhnkpbb,nic221.bsky.social,Nicole Hennig,4849,395,3504,2023-10-08T21:17:09.967Z,E-learning dev & AI educator at U of Arizona L...
3,did:plc:skfjeknazykd6mx3jflmt5zu,drsarahjwhite.bsky.social,Sarah J White,1160,1858,554,2023-10-01T19:31:38.706Z,Conversation doctor
4,did:plc:ja7yvxxk52zxfzfb33j22i36,dropsofsciencenews.bsky.social,Drops of Science - News from Natural Sciences,40,3,97,2024-11-17T19:30:46.039Z,A selection of the latest Natural Sciences new...


## This is where we merge posts and profile


In [12]:
# Classic pandas stitch:
# merge joins rows from the two dataframes based on matching key values.
posts_enriched = posts_df.merge(
    # Adds "author_" to every column name in all_profiles_df
    # Why? To avoid name collisions (e.g., both dataframes could have handle, displayName) 
    # and to make the origin obvious: anything about the author now clearly starts with author_.
    all_profiles_df.add_prefix("author_"),
    # left_on="author_did": use posts_df["author_did"] as the join key on the left.
    left_on="author_did",
    # right_on="author_did": use the prefixed key from the right dataframe (formerly did).
    right_on="author_did",
    # how="left": a left join. Keep every row from posts_df (every post), 
    # even if there is no matching profile. If a profile is missing, 
    # the author columns become NaN. 
    # This is what you want for enrichment—don’t drop posts just because the profile lookup failed.
    how="left"
)

posts_enriched.head(5)

Unnamed: 0,post_uri,post_cid,text,likeCount,repostCount,author_did,author_handle_x,author_displayName_x,author_handle_y,author_displayName_y,author_followersCount,author_followsCount,author_postsCount,author_createdAt,author_description
0,at://did:plc:k6cj2pklvsil3uf6umfvwhwb/app.bsky...,bafyreignczcoggn6mazou36ees4mhisjxzsaoloqxfghn...,"""science says"" <insert the most egregious inte...",1,0,did:plc:k6cj2pklvsil3uf6umfvwhwb,rh3x.bsky.social,Rh3x,rh3x.bsky.social,Rh3x,4,46,2,2025-02-23T13:58:07.445Z,Software engineer / Hacker / Satanist\nHe/Him ...
1,at://did:plc:n54nrdcpb2exew2suvqxqzd6/app.bsky...,bafyreielpgohxoc5ltmyg3lvjciz7pagosyifw4kfhsmy...,"The ""lies, damn lies, and statistics"" line usu...",0,0,did:plc:n54nrdcpb2exew2suvqxqzd6,talismancer.bsky.social,Talismancer,talismancer.bsky.social,Talismancer,42,11,110,2023-07-23T13:07:17.129Z,
2,at://did:plc:rhkcyc46ubi523e47bhnkpbb/app.bsky...,bafyreicsqpcg25wmtku3fgb3o3rjrixgly3hb7yi6t3vn...,The Petrol Tank for AI Discovery Might be Runn...,0,0,did:plc:rhkcyc46ubi523e47bhnkpbb,nic221.bsky.social,Nicole Hennig,nic221.bsky.social,Nicole Hennig,4849,395,3504,2023-10-08T21:17:09.967Z,E-learning dev & AI educator at U of Arizona L...
3,at://did:plc:skfjeknazykd6mx3jflmt5zu/app.bsky...,bafyreiag6hk4zt7yraumxv5fxow54u3nl6isxqvqyx4p4...,I had a great time facilitating a roundtable a...,1,1,did:plc:skfjeknazykd6mx3jflmt5zu,drsarahjwhite.bsky.social,Sarah J White,drsarahjwhite.bsky.social,Sarah J White,1160,1858,554,2023-10-01T19:31:38.706Z,Conversation doctor
4,at://did:plc:ja7yvxxk52zxfzfb33j22i36/app.bsky...,bafyreia6jevlrl7o5tmvafpxsy2vu4xpt2qdllm3xfvr3...,🧪 News of the Week! New Species from the Deep ...,0,0,did:plc:ja7yvxxk52zxfzfb33j22i36,dropsofsciencenews.bsky.social,Drops of Science - News from Natural Sciences,dropsofsciencenews.bsky.social,Drops of Science - News from Natural Sciences,40,3,97,2024-11-17T19:30:46.039Z,A selection of the latest Natural Sciences new...


### Why a left join (not inner/right)?
	•	Left: Keep all posts; add profile data when available. (Best for enrichment.)
	•	Inner: Drop posts with missing profiles (bad for analysis coverage).
	•	Right: Keep all profiles, even if they have no posts (not your goal here).


# Add some analytics


In [13]:
# Useful analytics columns/examples:
posts_enriched["engagement"] = posts_enriched[["likeCount","repostCount"]].fillna(0).sum(axis=1)

In [14]:
posts_enriched["engagement"].head()

0    1
1    0
2    0
3    2
4    0
Name: engagement, dtype: int64

In [15]:
print(posts_enriched.shape)

(100, 16)


In [16]:
posts_enriched.head(5)

Unnamed: 0,post_uri,post_cid,text,likeCount,repostCount,author_did,author_handle_x,author_displayName_x,author_handle_y,author_displayName_y,author_followersCount,author_followsCount,author_postsCount,author_createdAt,author_description,engagement
0,at://did:plc:k6cj2pklvsil3uf6umfvwhwb/app.bsky...,bafyreignczcoggn6mazou36ees4mhisjxzsaoloqxfghn...,"""science says"" <insert the most egregious inte...",1,0,did:plc:k6cj2pklvsil3uf6umfvwhwb,rh3x.bsky.social,Rh3x,rh3x.bsky.social,Rh3x,4,46,2,2025-02-23T13:58:07.445Z,Software engineer / Hacker / Satanist\nHe/Him ...,1
1,at://did:plc:n54nrdcpb2exew2suvqxqzd6/app.bsky...,bafyreielpgohxoc5ltmyg3lvjciz7pagosyifw4kfhsmy...,"The ""lies, damn lies, and statistics"" line usu...",0,0,did:plc:n54nrdcpb2exew2suvqxqzd6,talismancer.bsky.social,Talismancer,talismancer.bsky.social,Talismancer,42,11,110,2023-07-23T13:07:17.129Z,,0
2,at://did:plc:rhkcyc46ubi523e47bhnkpbb/app.bsky...,bafyreicsqpcg25wmtku3fgb3o3rjrixgly3hb7yi6t3vn...,The Petrol Tank for AI Discovery Might be Runn...,0,0,did:plc:rhkcyc46ubi523e47bhnkpbb,nic221.bsky.social,Nicole Hennig,nic221.bsky.social,Nicole Hennig,4849,395,3504,2023-10-08T21:17:09.967Z,E-learning dev & AI educator at U of Arizona L...,0
3,at://did:plc:skfjeknazykd6mx3jflmt5zu/app.bsky...,bafyreiag6hk4zt7yraumxv5fxow54u3nl6isxqvqyx4p4...,I had a great time facilitating a roundtable a...,1,1,did:plc:skfjeknazykd6mx3jflmt5zu,drsarahjwhite.bsky.social,Sarah J White,drsarahjwhite.bsky.social,Sarah J White,1160,1858,554,2023-10-01T19:31:38.706Z,Conversation doctor,2
4,at://did:plc:ja7yvxxk52zxfzfb33j22i36/app.bsky...,bafyreia6jevlrl7o5tmvafpxsy2vu4xpt2qdllm3xfvr3...,🧪 News of the Week! New Species from the Deep ...,0,0,did:plc:ja7yvxxk52zxfzfb33j22i36,dropsofsciencenews.bsky.social,Drops of Science - News from Natural Sciences,dropsofsciencenews.bsky.social,Drops of Science - News from Natural Sciences,40,3,97,2024-11-17T19:30:46.039Z,A selection of the latest Natural Sciences new...,0


# Add each author’s recent posting rate

In [17]:
from collections import Counter
import datetime as dt

## Get author feed
Get recent posts for one actor

In [23]:
def recent_posting_rate(handle_or_did, sample=50):
    params = {"actor": handle_or_did, "limit": 50}
    r = requests.get(f"{BASE_URL}/app.bsky.feed.getAuthorFeed", params=params, timeout=30)
    data = r.json()
    feed = data.get("feed", [])
    timestamps = []
    for item in feed:
        post = item.get("post")
        if post and post.get("indexedAt"):
            timestamps.append(pd.to_datetime(post["indexedAt"], utc=True))
    if not timestamps:
        return {"recent_posts": 0, "per_day_7d": 0.0}
    # Simple rate: posts per day in last 7 days (rough estimate for demo)
    now = pd.Timestamp.utcnow()
    last7 = [t for t in timestamps if (now - t).days < 7]
    return {"recent_posts": len(timestamps), "per_day_7d": round(len(last7)/7.0, 2)}

# Compute small sample
sample_authors = posts_enriched["author_handle_x"].dropna().unique()[:10]
rates = []

for h in sample_authors:
    try:
        # The ** operator unpacks that dictionary into the outer dictionary. 
        rates.append({"author_handle": h, **recent_posting_rate(h)})
        time.sleep(0.2)
    except Exception as e:
        print(e)
        rates.append({"author_handle": h, "recent_posts": None, "per_day_7d": None})

rates_df = pd.DataFrame(rates)

rates_df.head(5)

Unnamed: 0,author_handle,recent_posts,per_day_7d
0,rh3x.bsky.social,2,0.14
1,talismancer.bsky.social,50,7.14
2,nic221.bsky.social,50,5.57
3,drsarahjwhite.bsky.social,50,0.71
4,dropsofsciencenews.bsky.social,50,0.57


### rates.append({"author_handle": h, **recent_posting_rate(h)})
`recent_posting_rate(h)` is a function that calls the Bluesky author feed endpoint (`app.bsky.feed.getAuthorFeed`) and calculates stats like:<br>
`{"recent_posts": 45, "per_day_7d": 2.1}`
<br>
The ** operator unpacks that dictionary into the outer dictionary.<br>
So the final dictionary looks like:<br>
```
{
  "author_handle": "@climate_news",
  "recent_posts": 45,
  "per_day_7d": 2.1
}
```
