# Followers Graphs

The goal here is to construct a follower graph on Mastodon using the `GET /api/v1/directory HTTP/1.1` endpoint

## Setup

In [2]:
import pandas as pd
import requests as re
import time
import os
import sqlite3
import json
import random
from mastodon import Mastodon
import networkx as nx

### Pandas Set up

In [None]:
# pandas options
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'

## Check how the followers thing is working

According to https://docs.joinmastodon.org/methods/directory/

The `GET /api/v1/directory HTTP/1.1` endpoint returns "A directory of profiles that your website is aware of."

I think this is exactly what we would want!



In [33]:
offset = 80

In [34]:
req_stem = "https://hci.social/api/v1/directory?limit=80&local=true&order=new&offset={offset}"
req_query = req_stem.format(offset=offset)
r = re.get(req_query)


try:
    profiles = r.json()
except Exception as e:
    print(e, offset)

In [35]:
print(profiles[0])

{'id': '109467428920172086', 'username': 'AKWILL', 'acct': 'AKWILL', 'display_name': 'Alicia', 'locked': False, 'bot': False, 'discoverable': True, 'group': False, 'created_at': '2022-12-06T00:00:00.000Z', 'note': '<p>PhD Candidate at UMSI studying mental health informatics, health equity, and UX practices. RPCV, knitter, lizard/cat mom, she/they. Probably proud of you 🌈💅🥳</p>', 'url': 'https://hci.social/@AKWILL', 'uri': 'https://hci.social/users/AKWILL', 'avatar': 'https://hci.social/avatars/original/missing.png', 'avatar_static': 'https://hci.social/avatars/original/missing.png', 'header': 'https://hci.social/headers/original/missing.png', 'header_static': 'https://hci.social/headers/original/missing.png', 'followers_count': 2, 'following_count': 5, 'statuses_count': 0, 'last_status_at': None, 'noindex': False, 'emojis': [], 'roles': [], 'fields': []}


In [36]:
len(profiles)

80

### Some helper functions

In [62]:
def select_relevant_fields(iterable): 
    """
    Just a map to pull certain things out of the json
    
    """
    smaller_json = list(map(lambda x: 
                            dict(
                                id=x["id"],
                                acct=x["acct"],
                                url=x["url"], # so we can tell which server!
                                created_at=x["created_at"],
                                followers_count=x["followers_count"],
                                following_count=x["following_count"],
                                statuses_count=x["statuses_count"],
                                last_status_at=x["last_status_at"]
                            ), 
                            iterable
                           )
                       )
    
    # add the server
    new_list = []
    for a_dict in smaller_json:
        # add the server name
        
        # need both because there are different urls for local versus not
        a_dict["instance"] = a_dict["url"].split("@")[0].split("/users/")[0]
        
        new_list.append(a_dict)
   
    return new_list

In [63]:
pd.DataFrame(select_relevant_fields(profiles))

Unnamed: 0,id,acct,url,created_at,followers_count,following_count,statuses_count,last_status_at,instance
0,109467428920172086,AKWILL,https://hci.social/@AKWILL,2022-12-06T00:00:00.000Z,2,5,0,,https://hci.social/
1,109467229458219426,chiplay,https://hci.social/@chiplay,2022-12-06T00:00:00.000Z,92,9,39,2023-10-13,https://hci.social/
2,109464463797526434,robotnik,https://hci.social/@robotnik,2022-12-06T00:00:00.000Z,0,6,0,,https://hci.social/
3,109463158338968179,labinthewild,https://hci.social/@labinthewild,2022-12-05T00:00:00.000Z,43,47,2,2023-02-09,https://hci.social/
4,109459389416719117,DianaF,https://hci.social/@DianaF,2022-12-05T00:00:00.000Z,20,72,1,2022-12-18,https://hci.social/
...,...,...,...,...,...,...,...,...,...
75,109367202126424699,DrSanchariDas,https://hci.social/@DrSanchariDas,2022-11-18T00:00:00.000Z,22,60,0,,https://hci.social/
76,109367007675028391,wendyboucher,https://hci.social/@wendyboucher,2022-11-18T00:00:00.000Z,0,2,0,,https://hci.social/
77,109366795394162940,arne,https://hci.social/@arne,2022-11-18T00:00:00.000Z,183,285,1,2022-12-20,https://hci.social/
78,109366768853363849,raydahl,https://hci.social/@raydahl,2022-11-18T00:00:00.000Z,210,322,256,2023-09-29,https://hci.social/


In [58]:
"https://bird.makeup/users/thelancetph".split("/users/")[0]

'https://bird.makeup'

In [59]:
"https://bird.makeup/users/thelancetph".split("@")[0].split("/users/")[0]

'https://bird.makeup'

## Do it again!

In [64]:
def pull_profiles(file_path="data/profiles/", local_only=True, offset=0, stride=80, write=True):
    """
    
    """
    profiles_df = pd.DataFrame()
    chunk_df = pd.DataFrame()

    req_stem = "https://hci.social/api/v1/directory?limit=80&local={local_only}&order=new&offset={offset}"
    req_query = req_stem.format(offset=offset, local_only="true" if local_only else "false")
    r = re.get(req_query)

    try:
        profiles = r.json()
    except Exception as e:
        print(e, offset)

    ii = 0

    while len(profiles) > 0:
        # keep going until you run out!

        this_profiles_chunk = pd.DataFrame(select_relevant_fields(profiles))

        # TODO: Don't concat these because we don't want huge CSVs! 
        # the newest thing is at the top, so concat old to the new
        profiles_df = pd.concat([this_profiles_chunk, profiles_df], axis=0)

        # Dataframe for 10 interations
        chunk_df = pd.concat([this_profiles_chunk, chunk_df], axis=0)

        # find out your new min_id
        most_recent_created_time = this_profiles_chunk["created_at"].max()

        if (ii + 1) % 10 == 0:
            print("caching results") 
            if write:
                chunk_df.to_csv(file_path + "{}_profile_chunk_{}.csv".format("local" if local_only else "global", ii), index=False)
            chunk_df = pd.DataFrame()

        ii += 1
        offset += stride

        print("I am so proud of you, the last profile was created at {}".format(most_recent_created_time))

        # don't get booted off the API
        time.sleep(1)

        # make another call
        req_query = req_stem.format(offset=offset, local_only="true" if local_only else "false")
        r = re.get(req_query)

        try:
            profiles = r.json()
        except Exception as e:
            print(e, offset)


    print("DONE! You did so good!!")
            
    if write:
        profiles_df.to_csv(file_path + "hci_social_profiles_{}.csv".format("local" if local_only else "global"), index=False)
    
    return profiles_df


In [53]:
local_profiles = pull_profiles(local_only=True)

I am so proud of you, the last profile was created at 2023-11-09T00:00:00.000Z
I am so proud of you, the last profile was created at 2022-12-06T00:00:00.000Z
I am so proud of you, the last profile was created at 2022-11-18T00:00:00.000Z
I am so proud of you, the last profile was created at 2022-11-11T00:00:00.000Z
I am so proud of you, the last profile was created at 2022-11-07T00:00:00.000Z
I am so proud of you, the last profile was created at 2022-11-04T00:00:00.000Z
I am so proud of you, the last profile was created at 2022-10-31T00:00:00.000Z
I am so proud of you, the last profile was created at 2022-10-27T00:00:00.000Z
I am so proud of you, the last profile was created at 2022-05-12T00:00:00.000Z
DONE! You did so good!!


In [65]:
# this might give us all? like mastodon.social?
all_profiles = pull_profiles(local_only=False)

I am so proud of you, the last profile was created at 2023-11-15T00:00:00.000Z
I am so proud of you, the last profile was created at 2023-11-15T00:00:00.000Z
I am so proud of you, the last profile was created at 2023-11-14T00:00:00.000Z
I am so proud of you, the last profile was created at 2023-11-14T00:00:00.000Z
I am so proud of you, the last profile was created at 2023-11-14T00:00:00.000Z
I am so proud of you, the last profile was created at 2023-11-14T00:00:00.000Z
I am so proud of you, the last profile was created at 2023-11-13T00:00:00.000Z
I am so proud of you, the last profile was created at 2023-11-13T00:00:00.000Z
I am so proud of you, the last profile was created at 2023-11-12T00:00:00.000Z
caching results
I am so proud of you, the last profile was created at 2023-11-12T00:00:00.000Z
I am so proud of you, the last profile was created at 2023-11-12T00:00:00.000Z
I am so proud of you, the last profile was created at 2023-11-11T00:00:00.000Z
I am so proud of you, the last profi

KeyboardInterrupt: 

Too many account here! I think instead, we should pick a big, but not huge instance and do it there!

## Compute following graph!

In [66]:
df = pd.read_csv("data/profiles/hci_social_profiles_local.csv")

In [162]:
df.head(10)

Unnamed: 0,id,acct,url,created_at,followers_count,following_count,statuses_count,last_status_at,instance
0,108286624813793740,cqz,https://hci.social/@cqz,2022-05-12T00:00:00.000Z,49,20,24,2023-11-03,https://hci.social/
1,108286142941695868,Gillian,https://hci.social/@Gillian,2022-05-12T00:00:00.000Z,368,271,36,2022-12-23,https://hci.social/
2,108286014897638184,AndreaHCI,https://hci.social/@AndreaHCI,2022-05-11T00:00:00.000Z,12,9,1,2022-12-26,https://hci.social/
3,108285815791532398,izd,https://hci.social/@izd,2022-05-11T00:00:00.000Z,39,22,37,2023-10-12,https://hci.social/
4,108285424167487793,shirinfeiz,https://hci.social/@shirinfeiz,2022-05-11T00:00:00.000Z,12,20,0,,https://hci.social/
5,108285175735137681,Helena,https://hci.social/@Helena,2022-05-11T00:00:00.000Z,93,89,11,2023-02-27,https://hci.social/
6,108284677476769450,mlam,https://hci.social/@mlam,2022-05-11T00:00:00.000Z,101,77,7,2023-04-12,https://hci.social/
7,108284650613162617,sukrit,https://hci.social/@sukrit,2022-05-11T00:00:00.000Z,291,255,269,2023-11-02,https://hci.social/
8,108284640329969016,parastoo,https://hci.social/@parastoo,2022-05-11T00:00:00.000Z,85,28,7,2023-09-09,https://hci.social/
9,108284496969846045,masmart,https://hci.social/@masmart,2022-05-11T00:00:00.000Z,118,218,26,2023-08-09,https://hci.social/


### Make a networkx thing

In [75]:
G = nx.DiGraph()

In [76]:
# these will be labelled by the index
G.add_nodes_from(list(df.iterrows()))

In [80]:
# make some id to index maps
index_to_local_id_map = {k: v["id"] for k, v in df.iterrows()}
local_id_to_index_map = {v: k for k, v in index_to_local_id_map.items()}


#### Show that we can pull followers/following

In [104]:
def select_relevant_following_fields(iterable): 
    """
    Just a map to pull certain things out of the json
    
    """
    smaller_json = list(map(lambda x: 
                            dict(
                                id=x["id"],
                                acct=x["acct"],
                                url=x["url"], # so we can tell which server!
                                created_at=x["created_at"],
                                followers_count=x["followers_count"],
                                following_count=x["following_count"],
                                statuses_count=x["statuses_count"],
                                last_status_at=x["last_status_at"]
                            ), 
                            iterable
                           )
                       )
    
    # add the server
    new_list = []
    for a_dict in smaller_json:
        # add the server name
        
        # need both because there are different urls for local versus not
        a_dict["instance"] = a_dict["url"].split("@")[0].split("/users/")[0]
        
        new_list.append(a_dict)
   
    return new_list

In [157]:
id = df.iloc[0]["id"]
id

108286624813793740

In [134]:
this_user_followers = []

In [135]:
followers_string = "https://hci.social/api/v1/accounts/{id}/followers?limit={limit}"

In [136]:
followers_query = followers_string.format(id=id, limit=80)
r = re.get(followers_query)

In [137]:
try:
    follower_list = r.json()
except Exception as e:
    print(e, offset)

this_user_followers.extend(follower_list)

In [138]:
len(this_user_followers)

80

In [126]:
str(r.headers["Link"]).split('>; rel="next",')[0].strip("<")

'https://hci.social/api/v1/accounts/108286142941695868/followers?limit=80&max_id=151166'

In [139]:
r = re.get(str(r.headers["Link"]).split('>; rel="next",')[0].strip("<"))

try:
    follower_list = r.json()
except Exception as e:
    print(e, offset)

    
this_user_followers.extend(follower_list)

In [140]:
len(this_user_followers)

160

In [143]:
r = re.get(str(r.headers["Link"]).split('>; rel="next",')[0].strip("<"))

try:
    follower_list = r.json()
except Exception as e:
    print(e, offset)

    
this_user_followers.extend(follower_list)

In [144]:
len(this_user_followers)

240

In [182]:

def get_this_user_network(user_id, which="following", verbose=False):
    """
    Params:
        which (str):
            one of followers or following
    
    
    """

    this_user_followers = []
    this_batch_follower_list = []

    followers_string = "https://hci.social/api/v1/accounts/{id}/{which}?limit={limit}"
    next_query = followers_string.format(id=user_id, limit=80, which=which)
    r = re.get(next_query)

    if verbose:
        print(next_query)

    
    # dumpster fire of a loop!

    while True:
        r = re.get(next_query)

        try:
            this_batch_follower_list = r.json()
            next_query = str(r.headers["Link"]).split('>; rel="next",')[0].strip("<")


        except Exception as e:
            print(e, id)
            break

        if len(this_batch_follower_list) == 0:
            break

        this_user_followers.extend(select_relevant_following_fields(this_batch_follower_list))

        if '; rel="prev"' in next_query:

            break

        if verbose:
            print(next_query)
        
    return this_user_followers
    


In [183]:
test = get_this_user_network(df.iloc[7]["id"], which="following")

len(test)

255

In [174]:
len(this_user_followers)

291