In [12]:
import pandas as pd
import numpy as np
import requests as re
import time
import os
import sqlite3
import json
import random
# from mastodon import Mastodon
import networkx as nx

import matplotlib.pyplot as plt

In [13]:
# pandas options
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'

In [80]:
def select_relevant_following_fields(iterable, instance): 
    """
    Just a map to pull certain things out of the json
    
    """
    smaller_json = list(map(lambda x: 
                            dict(
                                id=x["id"],
                                acct=x["acct"],
                                url=x["url"], # so we can tell which server!
                                created_at=x["created_at"],
                                followers_count=x["followers_count"],
                                following_count=x["following_count"],
                                statuses_count=x["statuses_count"],
                                last_status_at=x["last_status_at"]
                            ), 
                            iterable
                           )
                       )
    
    # add the server
    new_list = []
    for a_dict in smaller_json:
        # add the server name
        
        # need both because there are different urls for local versus not
        a_dict["instance"] = a_dict["url"].split("@")[0].split("/users/")[0]
        
        #check that the user instance is the one we need
        if a_dict['instance'] == instance:
            new_list.append(a_dict)
   
    return new_list

In [81]:
def pull_profiles(file_path="data/profiles/", local_only=True, offset=0, stride=80, write=True):
    """
    
    """
    profiles_df = pd.DataFrame()
    chunk_df = pd.DataFrame()

    req_stem = "https://hci.social/api/v1/directory?limit=80&local={local_only}&order=new&offset={offset}"
    req_query = req_stem.format(offset=offset, local_only="true" if local_only else "false")
    r = re.get(req_query)

    try:
        profiles = r.json()
    except Exception as e:
        print(e, offset)

    ii = 0

    while len(profiles) > 0:
        # keep going until you run out!

        this_profiles_chunk = pd.DataFrame(select_relevant_fields(profiles))

        # TODO: Don't concat these because we don't want huge CSVs! 
        # the newest thing is at the top, so concat old to the new
        profiles_df = pd.concat([this_profiles_chunk, profiles_df], axis=0)

        # Dataframe for 10 interations
        chunk_df = pd.concat([this_profiles_chunk, chunk_df], axis=0)

        # find out your new min_id
        most_recent_created_time = this_profiles_chunk["created_at"].max()

        if (ii + 1) % 10 == 0:
            print("caching results") 
            if write:
                chunk_df.to_csv(file_path + "{}_profile_chunk_{}.csv".format("local" if local_only else "global", ii), index=False)
            chunk_df = pd.DataFrame()

        ii += 1
        offset += stride

        print("I am so proud of you, the last profile was created at {}".format(most_recent_created_time))

        # don't get booted off the API
        time.sleep(1)

        # make another call
        req_query = req_stem.format(offset=offset, local_only="true" if local_only else "false")
        r = re.get(req_query)

        try:
            profiles = r.json()
        except Exception as e:
            print(e, offset)


    print("DONE! You did so good!!")
            
    if write:
        profiles_df.to_csv(file_path + "hci_social_profiles_{}.csv".format("local" if local_only else "global"), index=False)
    
    return profiles_df


In [82]:
# local_profiles = pull_profiles(local_only=True)

## Trying shit out
- Get all the followers and following for the user
- Concat them

In [101]:
users_profiles = pd.read_csv("data/profiles/hci_social_profiles_local.csv")

In [102]:
users_profiles.head()

Unnamed: 0,id,acct,url,created_at,followers_count,following_count,statuses_count,last_status_at,instance
0,108286624813793740,cqz,https://hci.social/@cqz,2022-05-12T00:00:00.000Z,48,20,24,2023-11-03,https://hci.social/
1,108286142941695868,Gillian,https://hci.social/@Gillian,2022-05-12T00:00:00.000Z,368,271,36,2022-12-23,https://hci.social/
2,108286014897638184,AndreaHCI,https://hci.social/@AndreaHCI,2022-05-11T00:00:00.000Z,13,9,1,2022-12-26,https://hci.social/
3,108285815791532398,izd,https://hci.social/@izd,2022-05-11T00:00:00.000Z,39,22,37,2023-10-12,https://hci.social/
4,108285424167487793,shirinfeiz,https://hci.social/@shirinfeiz,2022-05-11T00:00:00.000Z,12,20,0,,https://hci.social/


In [103]:
def get_this_user_network_on_local_instance(user_id, instance, which="following", verbose=False):
    """
    Either get followers or following for a specific ID
    
    Params:
        which (str):
            one of followers or following
        instance (str):
            name of hci.social
    
    
    """

    this_user_followers = []
    this_batch_follower_list = []

    followers_string = "https://hci.social/api/v1/accounts/{id}/{which}?limit={limit}"
    next_query = followers_string.format(id=user_id, limit=80, which=which)
    r = re.get(next_query)

    if verbose:
        print(next_query)

    
    # dumpster fire of a loop!
    while True:
        r = re.get(next_query)

        try:
            this_batch_follower_list = r.json()
            # please don't break! but if it does, check here first!
            next_query = str(r.headers["Link"]).split('>; rel="next",')[0].strip("<")

        except Exception as e:
            print(e, id)
            break

        if len(this_batch_follower_list) == 0:
            break

        this_user_followers.extend(select_relevant_following_fields(this_batch_follower_list, instance))

        if '; rel="prev"' in next_query:
            # I'm gonna puke
            break

        if verbose:
            print(next_query)
        
    return this_user_followers

In [104]:
test_user = get_this_user_network_on_local_instance(users_profiles.iloc[0]["id"], instance='https://hci.social/', which="followers")

len(test_user)

25

In [105]:
all_users_df = users_profiles.copy()

In [89]:
this_user_df = pd.DataFrame.from_records(test_user,
                       columns=['id', 'acct', 'url', 'created_at','followers_count','following_count','statuses_count', 'last_status_at','instance'])

In [92]:
pd.concat([all_users_df, this_user_df])

Unnamed: 0,id,acct,url,created_at,followers_count,following_count,statuses_count,last_status_at,instance
0,108286624813793740,cqz,https://hci.social/@cqz,2022-05-12T00:00:00.000Z,48,20,24,2023-11-03,https://hci.social/
1,108286142941695868,Gillian,https://hci.social/@Gillian,2022-05-12T00:00:00.000Z,368,271,36,2022-12-23,https://hci.social/
2,108286014897638184,AndreaHCI,https://hci.social/@AndreaHCI,2022-05-11T00:00:00.000Z,13,9,1,2022-12-26,https://hci.social/
3,108285815791532398,izd,https://hci.social/@izd,2022-05-11T00:00:00.000Z,39,22,37,2023-10-12,https://hci.social/
4,108285424167487793,shirinfeiz,https://hci.social/@shirinfeiz,2022-05-11T00:00:00.000Z,12,20,0,,https://hci.social/
...,...,...,...,...,...,...,...,...,...
20,108236223653826827,axz,https://hci.social/@axz,2022-05-03T00:00:00.000Z,2421,1177,803,2023-11-19,https://hci.social/
21,108280547247259304,kjfeng,https://hci.social/@kjfeng,2022-05-11T00:00:00.000Z,85,30,53,2023-10-19,https://hci.social/
22,108280517522446278,jbigham,https://hci.social/@jbigham,2022-05-11T00:00:00.000Z,2895,1160,3850,2023-11-19,https://hci.social/
23,108236105999236686,msbernst,https://hci.social/@msbernst,2022-05-03T00:00:00.000Z,857,230,79,2023-10-23,https://hci.social/


## Expanding the users dataset

In [106]:
def create_network_df(user_id):
    users_followers = get_this_user_network_on_local_instance(user_id, instance='https://hci.social/', which="followers")
    users_followers = get_this_user_network_on_local_instance(user_id, instance='https://hci.social/', which="following")
    
    users_followers_df = pd.DataFrame.from_records(users_followers,
                       columns=['id', 'acct', 'url', 'created_at','followers_count','following_count','statuses_count', 'last_status_at','instance'])
    
    users_followeing_df = pd.DataFrame.from_records(users_followers,
                       columns=['id', 'acct', 'url', 'created_at','followers_count','following_count','statuses_count', 'last_status_at','instance'])
    
    concated_network = pd.concat([users_followers_df, users_followeing_df]).drop_duplicates().reset_index(drop=True)
    
    print("compeleted network for user:", user_id)
    time.sleep(np.random.rand(1)[0] * 2)
    
    return concated_network

In [107]:
for index, user in all_users_df.iterrows():
    this_user_df = create_network_df(user['id'])
    all_users_df = pd.concat([all_users_df, this_user_df])
    all_users_df = all_users_df.drop_duplicates(subset=['id'])

compeleted network for user: 108286624813793740
compeleted network for user: 108286142941695868
compeleted network for user: 108286014897638184
compeleted network for user: 108285815791532398
compeleted network for user: 108285424167487793
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 108285175735137681
compeleted network for user: 108284677476769450
compeleted network for user: 108284650613162617
compeleted network for user: 108284640329969016
compeleted network for user: 108284496969846045
compeleted network for user: 108284491496238772
compeleted network for user: 108284472777287286
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 108284451199412068
compeleted network for user: 108284448676877930
compeleted network for user: 108284310168001085
compeleted network for user: 108284264938521441
compeleted network for user: 108284252022830475
compeleted network for user: 108284088264614478
compeleted netwo

'link' <built-in function id>
compeleted network for user: 109253964349901029
'link' <built-in function id>
compeleted network for user: 109253931038230175
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109253290398268251
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109253204895796949
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109252454803243622
compeleted network for user: 109252336625412468
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109252321582544851
compeleted network for user: 109252189679826823
compeleted network for user: 109251984214907175
compeleted network for user: 109251643418653166
compeleted network for user: 109251007471853832
compeleted network for user: 109250532605333138
compeleted network for user: 109249587069859571
compeleted network for user: 109249518119144366
compeleted network for user:

compeleted network for user: 109302768077593747
compeleted network for user: 109302351848093002
compeleted network for user: 109302231076401095
compeleted network for user: 109302162128969370
compeleted network for user: 109302104569352815
compeleted network for user: 109302067848012048
'link' <built-in function id>
compeleted network for user: 109301935417443270
compeleted network for user: 109301638892669439
compeleted network for user: 109301440426132885
compeleted network for user: 109301065562392335
compeleted network for user: 109300614052171791
compeleted network for user: 109300564324955535
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109300228694892432
compeleted network for user: 109300194370757836
compeleted network for user: 109299936299909395
compeleted network for user: 109299701700270487
compeleted network for user: 109299266358679169
compeleted network for user: 109299115096863503
compeleted network for user: 10929905578203115

'link' <built-in function id>
compeleted network for user: 109304598285199360
compeleted network for user: 109304366540725733
compeleted network for user: 109304173438084607
'link' <built-in function id>
compeleted network for user: 109304082468998713
compeleted network for user: 109304026368365371
compeleted network for user: 109303982014584857
compeleted network for user: 109303743092797103
compeleted network for user: 109303722580188467
compeleted network for user: 109303645790566894
compeleted network for user: 109303618080632020
'link' <built-in function id>
compeleted network for user: 109303545466752312
compeleted network for user: 109303334928107154
compeleted network for user: 109303305009636798
compeleted network for user: 109303287077879344
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 109303084131851411
compeleted network for user: 109302989260629774
compeleted network for user: 109366636468892302
compeleted network for user: 10936

compeleted network for user: 109377355043143797
compeleted network for user: 109376977929065344
compeleted network for user: 109376484293731354
compeleted network for user: 109375819463935760
compeleted network for user: 109375097061209702
compeleted network for user: 109374562408305834
compeleted network for user: 109374301916304620
compeleted network for user: 109372656808843389
compeleted network for user: 109372559768279556
compeleted network for user: 109371384002613227
compeleted network for user: 109370937431743023
compeleted network for user: 109370516487211713
compeleted network for user: 109370170221477636
compeleted network for user: 109369940840244406
compeleted network for user: 109369505709659605
compeleted network for user: 109368940667665696
compeleted network for user: 109368562759262502
'link' <built-in function id>
compeleted network for user: 109368379392668571
compeleted network for user: 109368181787589645
compeleted network for user: 109367480955794194
'link' <bu

In [138]:
len(all_users_df)

22094

In [139]:
all_users_df_clean = all_users_df.drop_duplicates(subset=['id'])

In [140]:
all_users_df_clean.reset_index(drop=True, inplace=True)

In [141]:
all_users_df_clean.to_csv("data/profiles/extended_df.csv")

In [142]:
all_users_df_clean[all_users_df_clean['acct'] == 'Anas']

Unnamed: 0,id,acct,url,created_at,followers_count,following_count,statuses_count,last_status_at,instance
1127,109245646986992073,Anas,https://hci.social/@Anas,2022-10-28T00:00:00.000Z,46,48,24,2023-11-11,https://hci.social/


## Running another interation of the code to extend the dataset

In [None]:
for index, user in all_users_df_clean.iterrows():
    this_user_df = create_network_df(user['id'])
    all_users_df = pd.concat([all_users_df, this_user_df])
    all_users_df = all_users_df.drop_duplicates(subset=['id'])

compeleted network for user: 108286624813793740
compeleted network for user: 108286142941695868
compeleted network for user: 108286014897638184
compeleted network for user: 108285815791532398
compeleted network for user: 108285424167487793
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 108285175735137681
compeleted network for user: 108284677476769450
compeleted network for user: 108284650613162617
compeleted network for user: 108284640329969016
compeleted network for user: 108284496969846045
compeleted network for user: 108284491496238772
compeleted network for user: 108284472777287286
'link' <built-in function id>
'link' <built-in function id>
compeleted network for user: 108284451199412068
compeleted network for user: 108284448676877930
compeleted network for user: 108284310168001085
compeleted network for user: 108284264938521441
compeleted network for user: 108284252022830475
compeleted network for user: 108284088264614478
compeleted netwo

In [None]:
all_users_df_clean_extended = all_users_df.drop_duplicates(subset=['id'])
all_users_df_clean_extended.reset_index(drop=True, inplace=True)
all_users_df_clean_extended.to_csv("data/profiles/extended_df_complete.csv")