In [None]:
import pandas as pd

# Load the following-list CSV into `df`; the selection cell further down
# overwrites this same file with the updated `fetch_followers` flags.
df = pd.read_csv("data/my_following.csv")

In [4]:
# Show the loaded table so the available usernames can be inspected before selecting any
display(df)

Unnamed: 0,fetch_followers,username,display_name,user_id,bio,location,followers_count,following_count,tweet_count,verified,blue_verified,profile_url,profile_image,created_at
0,False,systematicls,sysls,,,,0,0,0,True,False,https://x.com/systematicls,,
1,False,jump_,Jump Crypto,,,,0,0,0,True,False,https://x.com/jump_,,
2,False,tszzl,roon,,,,0,0,0,True,False,https://x.com/tszzl,,
3,False,gaby_goldberg,Gaby Goldberg,,,,0,0,0,True,False,https://x.com/gaby_goldberg,,
4,False,_Dave__White_,Dave White,,,,0,0,0,True,False,https://x.com/_Dave__White_,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,False,shreyanj98,Shreyan Jain,,,,0,0,0,False,False,https://x.com/shreyanj98,,
225,False,shayonsengupta,shayon,,,,0,0,0,True,False,https://x.com/shayonsengupta,,
226,False,eglyman,Eric Glyman,,,,0,0,0,True,False,https://x.com/eglyman,,
227,False,southpkcommons,South Park Commons,,,,0,0,0,True,False,https://x.com/southpkcommons,,


In [None]:
# Select users to fetch followers for by username
users_to_fetch = [
    'systematicls',
    'jump_',
    'tszzl',
    'gaby_goldberg',
    '_Dave__White_',
    'CryptoHayes',
    'zhusu',
    'arjunblj',
    'FEhrsam',
    'VitalikButerin',
    'brian_armstrong',
    'matthuang',
    'paradigm',
    'balajis',
    'eladgil',
    'bgurley',
    'cobie',
    'hasufl',
    # Add more usernames here as needed
]

# X/Twitter handles are case-insensitive, so compare lower-cased names.
# Otherwise a handle typed with different capitalisation than the CSV
# would be skipped without any indication.
requested_lower = {name.lower() for name in users_to_fetch}
csv_usernames_lower = df['username'].str.lower()

# Update the fetch_followers column based on the list
df['fetch_followers'] = csv_usernames_lower.isin(requested_lower)

# Show how many users are selected
print(f"Selected {df['fetch_followers'].sum()} users out of {len(df)} total")
print("\nSelected users:")
for username in df.loc[df['fetch_followers'], 'username']:
    print(f"  - @{username}")

# Report requested handles that do not appear in the CSV at all, so typos or
# accounts missing from the following list are visible instead of silently ignored.
usernames_in_csv = set(csv_usernames_lower.dropna())
not_found = [name for name in users_to_fetch if name.lower() not in usernames_in_csv]
if not_found:
    print(f"\n⚠️ {len(not_found)} requested usernames not found in the CSV (not selected):")
    for username in not_found:
        print(f"  - @{username}")

# Save back to CSV
df.to_csv("data/my_following.csv", index=False)
print("\n✅ Saved updated CSV with selections")

In [None]:
# Tabulate the accounts currently flagged for follower fetching,
# renumbered from 0 so the row labels read as a simple running count
selected_df = (
    df.loc[df['fetch_followers'], ['username', 'display_name', 'verified']]
    .reset_index(drop=True)
)
display(selected_df)

In [None]:
# Alternative: Select all verified users (or use other criteria)
# Uncomment to use:

# # Select all verified users
# df['fetch_followers'] = df['verified'] == True

# # Or select top N users (assuming they're sorted by importance)
# df['fetch_followers'] = False  # Reset all
# df.loc[:19, 'fetch_followers'] = True  # Select top 20 (.loc label slices include the end label)

# # Or select users matching a pattern
# crypto_users = df['username'].str.contains('crypto|btc|eth|defi', case=False, na=False)
# df['fetch_followers'] = crypto_users

# Save after any selection method
# df.to_csv("data/my_following.csv", index=False)
# print(f"Selected {df['fetch_followers'].sum()} users")

## Researcher Network Analysis

Analyze which accounts the researchers follow and identify the most commonly followed ones (supernodes).

In [None]:
# Load the researcher following network
import pandas as pd
import numpy as np
from pathlib import Path

# Point at the follow-network export; it is only read when it is present on disk.
# When it is absent, `network_df` is left undefined and a hint is printed instead.
network_file = Path('data/researcher_following_network.csv')
if not network_file.exists():
    print("Network file not found yet. Run fetch_researcher_following.py first!")
else:
    network_df = pd.read_csv(network_file)
    # Three-line summary: row count, distinct researchers, distinct followed accounts
    print(
        f"Loaded {len(network_df)} following relationships\n"
        f"From {network_df['researcher'].nunique()} researchers\n"
        f"Following {network_df['follows'].nunique()} unique accounts"
    )

In [None]:
# Find supernodes - accounts followed by the most researchers
if network_file.exists():
    # Count how many DISTINCT researchers follow each account.
    # 'nunique' (rather than a plain row 'count') keeps the figure correct even
    # if the CSV contains the same researcher -> account pair more than once;
    # a row count would inflate follower_count and could push the percentage
    # below past 100%, since its denominator is also a distinct-researcher count.
    follow_counts = network_df.groupby('follows').agg({
        'researcher': 'nunique',
        'follows_display_name': 'first'
    }).rename(columns={'researcher': 'follower_count'}).reset_index()

    # Sort by most followed
    follow_counts = follow_counts.sort_values('follower_count', ascending=False)

    # Calculate percentage of researchers who follow each account
    total_researchers = network_df['researcher'].nunique()
    follow_counts['percentage'] = (follow_counts['follower_count'] / total_researchers * 100).round(1)

    # Display top supernodes
    print("TOP 30 SUPERNODES (Most followed by researchers)")
    print("="*60)
    # .copy() so that relabelling the index below acts on an independent frame
    top_nodes = follow_counts.head(30)[['follows', 'follows_display_name', 'follower_count', 'percentage']].copy()
    # Re-label rows 1..N so the index doubles as the rank
    top_nodes.index = range(1, len(top_nodes) + 1)
    display(top_nodes)

In [None]:
# Visualize the distribution
if network_file.exists():
    import matplotlib.pyplot as plt

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: histogram of per-account follower counts, with the median
    # marked by a dashed red line and reported in the legend
    hist_values = follow_counts['follower_count']
    median_value = hist_values.median()
    ax1.hist(hist_values, bins=50, edgecolor='black')
    ax1.set(
        xlabel='Number of Researchers Following',
        ylabel='Number of Accounts',
        title='Distribution of Follower Counts',
    )
    ax1.axvline(x=median_value, color='red', linestyle='--', label=f'Median: {median_value:.0f}')
    ax1.legend()

    # Right panel: horizontal bars for the 20 most-followed accounts;
    # handles longer than 15 characters are truncated with an ellipsis
    top20 = follow_counts.head(20)
    bar_positions = range(len(top20))
    ax2.barh(bar_positions, top20['follower_count'])
    ax2.set_yticks(bar_positions)
    ax2.set_yticklabels(
        [f"@{u}" if len(u) <= 15 else f"@{u[:15]}..." for u in top20['follows']],
        fontsize=8,
    )
    ax2.set(
        xlabel='Number of Researchers Following',
        title='Top 20 Most Followed Accounts',
    )
    # Flip the axis so the first (most-followed) row is drawn at the top
    ax2.invert_yaxis()

    plt.tight_layout()
    plt.show()

In [None]:
# Find researchers who follow specific supernodes
if network_file.exists():
    def who_follows(username):
        """Print the researchers in `network_df` whose 'follows' value equals `username`.

        Prints a header with the match count followed by the researcher
        handles in sorted order, or a single "No researchers follow" line
        when there is no match. Returns None.
        """
        matches = network_df.loc[network_df['follows'] == username, 'researcher'].tolist()
        if not matches:
            print(f"No researchers follow @{username}")
            return
        print(f"Researchers who follow @{username}: ({len(matches)} total)")
        print("-" * 40)
        for handle in sorted(matches):
            print(f"  @{handle}")

    # Example: Check who follows the top supernode
    # (follow_counts is sorted by follower_count descending, so row 0 is the top one)
    if len(follow_counts):
        top_account = follow_counts['follows'].iloc[0]
        who_follows(top_account)

In [None]:
# Find accounts that are NOT in your following list but are popular among researchers
if network_file.exists():
    # Usernames you already follow, as a set for the membership test below
    my_following_df = pd.read_csv('data/my_following.csv')
    my_following = set(my_following_df['username'])

    # Keep the 20 highest-ranked supernodes that are absent from that set
    already_followed = follow_counts['follows'].isin(my_following)
    suggestions = follow_counts[~already_followed].head(20)

    print("SUGGESTED ACCOUNTS TO FOLLOW")
    print("(Popular among researchers but not in your following list)")
    print("="*60)

    for rec in suggestions.itertuples(index=False):
        print(f"{rec.follower_count:2d} researchers ({rec.percentage:5.1f}%) follow @{rec.follows}")
        # Only print a name line when a non-missing, non-empty display name exists
        if pd.notna(rec.follows_display_name) and rec.follows_display_name:
            print(f"   Name: {rec.follows_display_name}")

## Simple Network Analysis

In [None]:
# Load the network data
import pandas as pd

network_df = pd.read_csv('data/researcher_following_network.csv')
# Summarise the table: total rows, distinct researchers, distinct followed accounts
print(
    f"Total relationships: {len(network_df)}\n"
    f"Researchers analyzed: {network_df['researcher'].nunique()}\n"
    f"Unique accounts they follow: {network_df['follows'].nunique()}"
)

In [None]:
# Find top accounts by number of researchers following them.
# Repeated researcher -> account rows are collapsed first so that each
# researcher is counted at most once per account; counting raw rows would
# inflate researcher_count (and the percentage, whose denominator is a
# distinct-researcher count) if the CSV ever contains duplicate pairs.
top_accounts = (
    network_df.drop_duplicates(subset=['researcher', 'follows'])['follows']
    .value_counts()
    .reset_index()
)
# Assign names positionally: the columns produced by value_counts().reset_index()
# are named differently across pandas versions
top_accounts.columns = ['username', 'researcher_count']

# Add percentage
total_researchers = network_df['researcher'].nunique()
top_accounts['percentage'] = (top_accounts['researcher_count'] / total_researchers * 100).round(1)

# Show top 30
print("TOP 30 ACCOUNTS FOLLOWED BY RESEARCHERS")
print("="*60)
top_30 = top_accounts.head(30)
for rec in top_30.itertuples(index=False):
    print(f"{rec.researcher_count:2d} researchers ({rec.percentage:5.1f}%) - @{rec.username}")

# Save this as a DataFrame for further analysis
# (an alias of the same object, not a copy)
top_accounts_df = top_accounts

In [None]:
# Read the researcher list (one entry per line, blank lines skipped) so the
# top accounts can be checked against it
with open('data/researchers.txt', 'r') as f:
    my_researchers = [entry for entry in map(str.strip, f) if entry]

# Flag every top account whose username appears in the researcher list
top_accounts_df['is_researcher'] = top_accounts_df['username'].isin(my_researchers)

# Print the first 20 flagged accounts, in the table's existing order
researchers_following_researchers = top_accounts_df.loc[top_accounts_df['is_researcher']].head(20)
print("\nTOP RESEARCHERS FOLLOWED BY OTHER RESEARCHERS")
print("="*60)
for rec in researchers_following_researchers.itertuples(index=False):
    print(f"{rec.researcher_count:2d} researchers ({rec.percentage:5.1f}%) - @{rec.username}")

In [None]:
# First 20 top accounts that are not flagged as researchers (potential new follows)
non_researchers = top_accounts_df.loc[~top_accounts_df['is_researcher']].head(20)
print("\nTOP NON-RESEARCHER ACCOUNTS TO POTENTIALLY FOLLOW")
print("="*60)
for rec in non_researchers.itertuples(index=False):
    print(f"{rec.researcher_count:2d} researchers ({rec.percentage:5.1f}%) - @{rec.username}")

In [None]:
# Keep only the rows whose followed account is itself in the researcher list,
# then count rows per followed researcher, largest first
follows_a_researcher = network_df['follows'].isin(my_researchers)
researcher_network = network_df.loc[follows_a_researcher]
researcher_connections = (
    researcher_network
    .groupby('follows')
    .size()
    .sort_values(ascending=False)
)

print("\nRESEARCHER INTERCONNECTIONS")
print("="*60)
print("How many researchers follow each researcher in your list:")
print()
# Percentages are relative to `total_researchers`, which an earlier cell defines
for researcher, count in researcher_connections.head(15).items():
    percentage = count / total_researchers * 100
    print(f"{count:2d} researchers ({percentage:5.1f}%) follow @{researcher}")

In [None]:
# Load the current following list and collect its usernames into a set
my_following = pd.read_csv('data/my_following.csv')
my_following_usernames = set(my_following['username'])

# First 20 top accounts whose username is not in that set
already_following = top_accounts_df['username'].isin(my_following_usernames)
not_following = top_accounts_df.loc[~already_following].head(20)

print("\nSUGGESTED NEW ACCOUNTS TO FOLLOW")
print("(Popular among researchers but not in your following list)")
print("="*60)
for rec in not_following.itertuples(index=False):
    # Tag accounts that are flagged as researchers
    if rec.is_researcher:
        researcher_label = " [RESEARCHER]"
    else:
        researcher_label = ""
    print(f"{rec.researcher_count:2d} researchers ({rec.percentage:5.1f}%) - @{rec.username}{researcher_label}")