In [12]:
import pandas as pd
import numpy as np
import requests as re
import time
import os
import sqlite3
import json
import random
# from mastodon import Mastodon
import networkx as nx

import matplotlib.pyplot as plt

In [13]:
# pandas options
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'

In [80]:
def select_relevant_following_fields(iterable, instance): 
    """
    Just a map to pull certain things out of the json
    
    """
    smaller_json = list(map(lambda x: 
                            dict(
                                id=x["id"],
                                acct=x["acct"],
                                url=x["url"], # so we can tell which server!
                                created_at=x["created_at"],
                                followers_count=x["followers_count"],
                                following_count=x["following_count"],
                                statuses_count=x["statuses_count"],
                                last_status_at=x["last_status_at"]
                            ), 
                            iterable
                           )
                       )
    
    # add the server
    new_list = []
    for a_dict in smaller_json:
        # add the server name
        
        # need both because there are different urls for local versus not
        a_dict["instance"] = a_dict["url"].split("@")[0].split("/users/")[0]
        
        #check that the user instance is the one we need
        if a_dict['instance'] == instance:
            new_list.append(a_dict)
   
    return new_list

In [81]:
def pull_profiles(file_path="data/profiles/", local_only=True, offset=0, stride=80, write=True):
    """
    
    """
    profiles_df = pd.DataFrame()
    chunk_df = pd.DataFrame()

    req_stem = "https://hci.social/api/v1/directory?limit=80&local={local_only}&order=new&offset={offset}"
    req_query = req_stem.format(offset=offset, local_only="true" if local_only else "false")
    r = re.get(req_query)

    try:
        profiles = r.json()
    except Exception as e:
        print(e, offset)

    ii = 0

    while len(profiles) > 0:
        # keep going until you run out!

        this_profiles_chunk = pd.DataFrame(select_relevant_fields(profiles))

        # TODO: Don't concat these because we don't want huge CSVs! 
        # the newest thing is at the top, so concat old to the new
        profiles_df = pd.concat([this_profiles_chunk, profiles_df], axis=0)

        # Dataframe for 10 interations
        chunk_df = pd.concat([this_profiles_chunk, chunk_df], axis=0)

        # find out your new min_id
        most_recent_created_time = this_profiles_chunk["created_at"].max()

        if (ii + 1) % 10 == 0:
            print("caching results") 
            if write:
                chunk_df.to_csv(file_path + "{}_profile_chunk_{}.csv".format("local" if local_only else "global", ii), index=False)
            chunk_df = pd.DataFrame()

        ii += 1
        offset += stride

        print("I am so proud of you, the last profile was created at {}".format(most_recent_created_time))

        # don't get booted off the API
        time.sleep(1)

        # make another call
        req_query = req_stem.format(offset=offset, local_only="true" if local_only else "false")
        r = re.get(req_query)

        try:
            profiles = r.json()
        except Exception as e:
            print(e, offset)


    print("DONE! You did so good!!")
            
    if write:
        profiles_df.to_csv(file_path + "hci_social_profiles_{}.csv".format("local" if local_only else "global"), index=False)
    
    return profiles_df


In [82]:
# local_profiles = pull_profiles(local_only=True)

## Trying shit out
- Get all the followers and following for the user
- Concat them

In [101]:
users_profiles = pd.read_csv("data/profiles/hci_social_profiles_local.csv")

In [102]:
users_profiles.head()

Unnamed: 0,id,acct,url,created_at,followers_count,following_count,statuses_count,last_status_at,instance
0,108286624813793740,cqz,https://hci.social/@cqz,2022-05-12T00:00:00.000Z,48,20,24,2023-11-03,https://hci.social/
1,108286142941695868,Gillian,https://hci.social/@Gillian,2022-05-12T00:00:00.000Z,368,271,36,2022-12-23,https://hci.social/
2,108286014897638184,AndreaHCI,https://hci.social/@AndreaHCI,2022-05-11T00:00:00.000Z,13,9,1,2022-12-26,https://hci.social/
3,108285815791532398,izd,https://hci.social/@izd,2022-05-11T00:00:00.000Z,39,22,37,2023-10-12,https://hci.social/
4,108285424167487793,shirinfeiz,https://hci.social/@shirinfeiz,2022-05-11T00:00:00.000Z,12,20,0,,https://hci.social/


In [103]:
def get_this_user_network_on_local_instance(user_id, instance, which="following", verbose=False):
    """
    Either get followers or following for a specific ID
    
    Params:
        which (str):
            one of followers or following
        instance (str):
            name of hci.social
    
    
    """

    this_user_followers = []
    this_batch_follower_list = []

    followers_string = "https://hci.social/api/v1/accounts/{id}/{which}?limit={limit}"
    next_query = followers_string.format(id=user_id, limit=80, which=which)
    r = re.get(next_query)

    if verbose:
        print(next_query)

    
    # dumpster fire of a loop!
    while True:
        r = re.get(next_query)

        try:
            this_batch_follower_list = r.json()
            # please don't break! but if it does, check here first!
            next_query = str(r.headers["Link"]).split('>; rel="next",')[0].strip("<")

        except Exception as e:
            print(e, id)
            break

        if len(this_batch_follower_list) == 0:
            break

        this_user_followers.extend(select_relevant_following_fields(this_batch_follower_list, instance))

        if '; rel="prev"' in next_query:
            # I'm gonna puke
            break

        if verbose:
            print(next_query)
        
    return this_user_followers

In [104]:
test_user = get_this_user_network_on_local_instance(users_profiles.iloc[0]["id"], instance='https://hci.social/', which="followers")

len(test_user)

25

In [105]:
all_users_df = users_profiles.copy()

In [89]:
this_user_df = pd.DataFrame.from_records(test_user,
                       columns=['id', 'acct', 'url', 'created_at','followers_count','following_count','statuses_count', 'last_status_at','instance'])

In [92]:
pd.concat([all_users_df, this_user_df])

Unnamed: 0,id,acct,url,created_at,followers_count,following_count,statuses_count,last_status_at,instance
0,108286624813793740,cqz,https://hci.social/@cqz,2022-05-12T00:00:00.000Z,48,20,24,2023-11-03,https://hci.social/
1,108286142941695868,Gillian,https://hci.social/@Gillian,2022-05-12T00:00:00.000Z,368,271,36,2022-12-23,https://hci.social/
2,108286014897638184,AndreaHCI,https://hci.social/@AndreaHCI,2022-05-11T00:00:00.000Z,13,9,1,2022-12-26,https://hci.social/
3,108285815791532398,izd,https://hci.social/@izd,2022-05-11T00:00:00.000Z,39,22,37,2023-10-12,https://hci.social/
4,108285424167487793,shirinfeiz,https://hci.social/@shirinfeiz,2022-05-11T00:00:00.000Z,12,20,0,,https://hci.social/
...,...,...,...,...,...,...,...,...,...
20,108236223653826827,axz,https://hci.social/@axz,2022-05-03T00:00:00.000Z,2421,1177,803,2023-11-19,https://hci.social/
21,108280547247259304,kjfeng,https://hci.social/@kjfeng,2022-05-11T00:00:00.000Z,85,30,53,2023-10-19,https://hci.social/
22,108280517522446278,jbigham,https://hci.social/@jbigham,2022-05-11T00:00:00.000Z,2895,1160,3850,2023-11-19,https://hci.social/
23,108236105999236686,msbernst,https://hci.social/@msbernst,2022-05-03T00:00:00.000Z,857,230,79,2023-10-23,https://hci.social/


## Expanding the users dataset

In [144]:
def create_network_df(user_id):
    users_followers = get_this_user_network_on_local_instance(user_id, instance='https://hci.social/', which="followers")
    users_followers = get_this_user_network_on_local_instance(user_id, instance='https://hci.social/', which="following")
    
    users_followers_df = pd.DataFrame.from_records(users_followers,
                       columns=['id', 'acct', 'url', 'created_at','followers_count','following_count','statuses_count', 'last_status_at','instance'])
    
    users_followeing_df = pd.DataFrame.from_records(users_followers,
                       columns=['id', 'acct', 'url', 'created_at','followers_count','following_count','statuses_count', 'last_status_at','instance'])
    
    concated_network = pd.concat([users_followers_df, users_followeing_df]).drop_duplicates().reset_index(drop=True)
    
    print("compeleted network for user:", user_id)
    time.sleep(np.random.rand(1)[0] * 3)
    
    return concated_network

In [None]:
for index, user in all_users_df.iterrows():
    this_user_df = create_network_df(user['id'])
    all_users_df = pd.concat([all_users_df, this_user_df])
    all_users_df = all_users_df.drop_duplicates(subset=['id'])

In [138]:
len(all_users_df)

22094

In [139]:
all_users_df_clean = all_users_df.drop_duplicates(subset=['id'])

In [140]:
all_users_df_clean.reset_index(drop=True, inplace=True)

In [141]:
all_users_df_clean.to_csv("data/profiles/extended_df.csv")

In [142]:
all_users_df_clean[all_users_df_clean['acct'] == 'Anas']

Unnamed: 0,id,acct,url,created_at,followers_count,following_count,statuses_count,last_status_at,instance
1127,109245646986992073,Anas,https://hci.social/@Anas,2022-10-28T00:00:00.000Z,46,48,24,2023-11-11,https://hci.social/


## Running another interation of the code to extend the dataset

In [150]:
for index, user in all_users_df_clean.iterrows():
    this_user_df = create_network_df(user['id'])
    all_users_df = pd.concat([all_users_df, this_user_df])
    all_users_df = all_users_df.drop_duplicates(subset=['id'])

compeleted network for user: 108286624813793740
compeleted network for user: 108286142941695868
compeleted network for user: 108286014897638184
compeleted network for user: 108285815791532398
compeleted network for user: 108285424167487793
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 108285175735137681
compeleted network for user: 108284677476769450
compeleted network for user: 108284650613162617
compeleted network for user: 108284640329969016
compeleted network for user: 108284496969846045
compeleted network for user: 108284491496238772
compeleted network for user: 108284472777287286
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 108284451199412068
compeleted network for user: 108284448676877930
compeleted network for user: 108284310168001085
compeleted network for user: 108284264938521441
compeleted network for user: 108284252022830475
compeleted network for user: 108284088264614478
compeleted netwo

'link' <built-in function id>
compeleted network for user: 109253964349901029
'link' <built-in function id>
compeleted network for user: 109253931038230175
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109253290398268251
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109253204895796949
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109252454803243622
compeleted network for user: 109252336625412468
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109252321582544851
compeleted network for user: 109252189679826823
compeleted network for user: 109251984214907175
compeleted network for user: 109251643418653166
compeleted network for user: 109251007471853832
compeleted network for user: 109250532605333138
compeleted network for user: 109249587069859571
compeleted network for user: 109249518119144366
compeleted network for user:

compeleted network for user: 109302351848093002
compeleted network for user: 109302231076401095
compeleted network for user: 109302162128969370
compeleted network for user: 109302104569352815
compeleted network for user: 109302067848012048
'link' <built-in function id>
compeleted network for user: 109301935417443270
compeleted network for user: 109301638892669439
compeleted network for user: 109301440426132885
compeleted network for user: 109301065562392335
compeleted network for user: 109300614052171791
compeleted network for user: 109300564324955535
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109300228694892432
compeleted network for user: 109300194370757836
compeleted network for user: 109299936299909395
compeleted network for user: 109299701700270487
compeleted network for user: 109299266358679169
compeleted network for user: 109299115096863503
compeleted network for user: 109299055782031159
compeleted network for user: 10929904093196207

compeleted network for user: 109303334928107154
compeleted network for user: 109303305009636798
compeleted network for user: 109303287077879344
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109303084131851411
compeleted network for user: 109302989260629774
compeleted network for user: 109366636468892302
compeleted network for user: 109366545645139795
compeleted network for user: 109366456760400672
compeleted network for user: 109366013550148841
compeleted network for user: 109365811459186116
compeleted network for user: 109365806130445647
compeleted network for user: 109365763897345965
compeleted network for user: 109365750074592831
compeleted network for user: 109365689925526195
'link' <built-in function id>
compeleted network for user: 109365640318412019
compeleted network for user: 109365307491071199
compeleted network for user: 109365123954993214
compeleted network for user: 109365050178910468
compeleted network for user: 10936492746133511

compeleted network for user: 109369940840244406
compeleted network for user: 109369505709659605
compeleted network for user: 109368940667665696
compeleted network for user: 109368562759262502
'link' <built-in function id>
compeleted network for user: 109368379392668571
compeleted network for user: 109368181787589645
compeleted network for user: 109367480955794194
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109367202126424699
'link' <built-in function id>
compeleted network for user: 109367007675028391
compeleted network for user: 109366795394162940
compeleted network for user: 109366768853363849
compeleted network for user: 109366689489304275
compeleted network for user: 111379138966500320
'link' <built-in function id>
compeleted network for user: 111098127756385669
compeleted network for user: 111072544868136604
compeleted network for user: 111052657062696195
'link' <built-in function id>
compeleted network for user: 110893937406555779
comp

'link' <built-in function id>
compeleted network for user: 109263282248659400
compeleted network for user: 108280673743424065
'link' <built-in function id>
compeleted network for user: 109301935417443270
compeleted network for user: 109366241785086382
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109243180054675546
compeleted network for user: 109308210156353996
compeleted network for user: 109246485146891623
compeleted network for user: 109286753423453633
'link' <built-in function id>
compeleted network for user: 109364387865374911
compeleted network for user: 109356585783941560
'link' <built-in function id>
compeleted network for user: 109275808009030079
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 108291693973206064
compeleted network for user: 109353448516631051
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 108326060209624978
compeleted network for user:

'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109253964349901029
compeleted network for user: 108449722370868684
compeleted network for user: 109237690835839360
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 108280677942867383
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109289174211147532
compeleted network for user: 109249027905682266
compeleted network for user: 109303305009636798
compeleted network for user: 109246925270944146
compeleted network for user: 109287853681360380
compeleted network for user: 109236492681129439
compeleted network for user: 108289175834751293
compeleted network for user: 108324117483679048
compeleted network for user: 108365058432292992
compeleted network for user: 108332265809840423
compeleted network for user: 108325214318731650
compeleted network for user: 108291129043692085
compeleted network for user: 108280987290032170
comp

'link' <built-in function id>
compeleted network for user: 109308539041800130
compeleted network for user: 108381185205431914
compeleted network for user: 108236268879552821
compeleted network for user: 109366799845269096
compeleted network for user: 109366943464906469
compeleted network for user: 109303618080632020
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109300380141553422
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 108296405684953974
compeleted network for user: 108292834480782365
'link' <built-in function id>
compeleted network for user: 109280317497461978
compeleted network for user: 108295405531541455
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 108283162462559154
compeleted network for user: 108281570708175251
compeleted network for user: 108289477786945253
compeleted network for user: 109299034501431062
'link' <built-in function id>
'link' <bu

compeleted network for user: 109243943869810816
compeleted network for user: 109253290398268251
compeleted network for user: 109248900673468722
compeleted network for user: 109252189679826823
compeleted network for user: 109246443246176928
compeleted network for user: 109249041104936542
compeleted network for user: 109246904496761512
compeleted network for user: 109246477924463986
compeleted network for user: 109245600698050521
compeleted network for user: 109240401653967860
compeleted network for user: 109244765885093703
compeleted network for user: 108440085737885433
compeleted network for user: 108238329972013885
compeleted network for user: 108281727134166790
compeleted network for user: 109422448152419358
compeleted network for user: 109523540144379772
compeleted network for user: 109246687291782999
compeleted network for user: 108295900253586397
compeleted network for user: 109340720554149337
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user:

'link' <built-in function id>
compeleted network for user: 109293158840427651
compeleted network for user: 109290386057295877
compeleted network for user: 109292193545233343
compeleted network for user: 109288378899101481
compeleted network for user: 109288484565808884
compeleted network for user: 109279922921495057
compeleted network for user: 109283683613152354
compeleted network for user: 109275110217837399
compeleted network for user: 108291978222356024
compeleted network for user: 108291284133279142
compeleted network for user: 108285558049650677
compeleted network for user: 108285395091589420
compeleted network for user: 108285424167487793
compeleted network for user: 108280834472844252
compeleted network for user: 108280802989861879
compeleted network for user: 109309230422905830
compeleted network for user: 109378992255331681
compeleted network for user: 109385834474271752
compeleted network for user: 109247203894372980
'link' <built-in function id>
'link' <built-in function id

compeleted network for user: 109671487654204383
compeleted network for user: 109428838520224778
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109269978389614405
compeleted network for user: 109683011500290031
compeleted network for user: 109279537372071601
compeleted network for user: 109602186857989529
compeleted network for user: 109668115660458384
compeleted network for user: 109349717979443119
compeleted network for user: 109292577100142579
compeleted network for user: 109458299435082430
compeleted network for user: 109546587355225186
compeleted network for user: 109540139050744843
compeleted network for user: 109333946687763016
compeleted network for user: 109441400580044767
compeleted network for user: 109292154428172935
compeleted network for user: 109464184699995457
compeleted network for user: 109315836042600371
compeleted network for user: 109360052762288188
compeleted network for user: 108357952719219784
compeleted network for user:

compeleted network for user: 109304898885789370
compeleted network for user: 109281967385946308
compeleted network for user: 109298076261385521
compeleted network for user: 109293139737082387
compeleted network for user: 109420920073675301
compeleted network for user: 109289251767450196
compeleted network for user: 109326657352603252
compeleted network for user: 109307693691798608
compeleted network for user: 109299055782031159
compeleted network for user: 109299743320185551
compeleted network for user: 109376880925044438
compeleted network for user: 109366545645139795
compeleted network for user: 109240517495272423
'link' <built-in function id>
compeleted network for user: 109301520822439452
compeleted network for user: 109380719292586060
compeleted network for user: 109308705426178443
compeleted network for user: 109387573844083265
compeleted network for user: 109296793184725587
compeleted network for user: 109375482898253442
compeleted network for user: 109238449215450884
compeleted

compeleted network for user: 109297730214569993
compeleted network for user: 109328175627964002
compeleted network for user: 109509162646675004
compeleted network for user: 109550617268554419
compeleted network for user: 109252474140785361
compeleted network for user: 109318454615323988
compeleted network for user: 109309206452625700
compeleted network for user: 109304390729221618
compeleted network for user: 109440638686143533
compeleted network for user: 109372656808843389
compeleted network for user: 109308729402553996
compeleted network for user: 109266787399025444
compeleted network for user: 109527325054251079
compeleted network for user: 109281985969092709
compeleted network for user: 109287527688377034
compeleted network for user: 109286190386113101
compeleted network for user: 108562568122196333
compeleted network for user: 109239128579221475
compeleted network for user: 110813089652467251
compeleted network for user: 110541492338186348
compeleted network for user: 11077008961

compeleted network for user: 109247247703025234
compeleted network for user: 109251984214907175
compeleted network for user: 109287902689731202
compeleted network for user: 109711036820059329
compeleted network for user: 109327467567897494
compeleted network for user: 109286944735021347
compeleted network for user: 109288569732614651
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109292221431315534
compeleted network for user: 109524473909284475
compeleted network for user: 109383554639079176
compeleted network for user: 109382512398131387
compeleted network for user: 110642684459658245
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109294106167794337
compeleted network for user: 109295030721437146
compeleted network for user: 109347113730051689
compeleted network for user: 109341170941929272
compeleted network for user: 109253441564846036
compeleted network for user: 109299040931962074
compeleted netwo

'link' <built-in function id>
compeleted network for user: 109257621270378650
compeleted network for user: 109296628541552855
compeleted network for user: 109383711092659925
compeleted network for user: 109284048723312060
compeleted network for user: 109345732909824961
compeleted network for user: 109377086429283738
compeleted network for user: 109364637871758195
compeleted network for user: 109312653929403203
compeleted network for user: 109365806130445647
compeleted network for user: 109377676582106376
compeleted network for user: 109296193037530235
'link' <built-in function id>
compeleted network for user: 109384330474736206
compeleted network for user: 109377355043143797
compeleted network for user: 109377419215492633
compeleted network for user: 109530554260821942
compeleted network for user: 109421985603907452
compeleted network for user: 110574158344377012
compeleted network for user: 110236120706656777
compeleted network for user: 109290570222750229
compeleted network for user:

In [151]:
len(all_users_df)

2165

In [152]:
all_users_df_clean_extended = all_users_df.drop_duplicates(subset=['id'])
all_users_df_clean_extended.reset_index(drop=True, inplace=True)
all_users_df_clean_extended.to_csv("data/profiles/extended_df_complete.csv")

In [153]:
all_users_df_clean_extended.head()

Unnamed: 0,id,acct,url,created_at,followers_count,following_count,statuses_count,last_status_at,instance
0,108286624813793740,cqz,https://hci.social/@cqz,2022-05-12T00:00:00.000Z,48,20,24,2023-11-03,https://hci.social/
1,108286142941695868,Gillian,https://hci.social/@Gillian,2022-05-12T00:00:00.000Z,368,271,36,2022-12-23,https://hci.social/
2,108286014897638184,AndreaHCI,https://hci.social/@AndreaHCI,2022-05-11T00:00:00.000Z,13,9,1,2022-12-26,https://hci.social/
3,108285815791532398,izd,https://hci.social/@izd,2022-05-11T00:00:00.000Z,39,22,37,2023-10-12,https://hci.social/
4,108285424167487793,shirinfeiz,https://hci.social/@shirinfeiz,2022-05-11T00:00:00.000Z,12,20,0,,https://hci.social/
