# Network Analysis: Researchers & Operators

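This notebook analyzes:
1. Operator network - accounts most commonly followed by operators
2. Researcher network - accounts most commonly followed by researchers
3. Cross-network analysis - accounts followed by both groups
4. Enhanced analysis with `followed_by` lists - each result table names the group members who follow each account

It then reports summary network statistics, exports the result tables to CSV, and suggests popular accounts that are not yet in your own following list.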

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style to the matplotlib figures drawn in this notebook.
sns.set_style('whitegrid')

## Load Data

In [None]:
# Researcher edge list. The cells below read its 'researcher', 'follows'
# and 'follows_display_name' columns.
researcher_network = pd.read_csv('data/researcher_following_network.csv')
print(
    f"Researcher network: {len(researcher_network)} relationships",
    f"Researchers: {researcher_network['researcher'].nunique()}",
    f"Unique accounts followed: {researcher_network['follows'].nunique()}",
    sep="\n",
)

# Operator edge list. The cells below read its 'operator', 'follows'
# and 'follows_display_name' columns.
operator_network = pd.read_csv('data/operator_following_network.csv')
print(
    f"\nOperator network: {len(operator_network)} relationships",
    f"Operators: {operator_network['operator'].nunique()}",
    f"Unique accounts followed: {operator_network['follows'].nunique()}",
    sep="\n",
)

## 1. Operator Network Analysis

Find the accounts most commonly followed by operators ("supernodes").

In [None]:
# Build the operator "supernode" table: one row per followed account, with the
# operators who follow it, how many there are, and their share of all operators.

# Collapse repeated (operator, follows) rows first. `percentage` divides by the
# number of *distinct* operators, so a duplicated edge would otherwise inflate
# `count`, repeat names in `followed_by`, and could push the share above 100%.
operator_edges = operator_network.drop_duplicates(subset=['operator', 'follows'])

operator_follow_counts = operator_edges.groupby('follows').agg({
    'operator': list,                 # every operator following this account
    'follows_display_name': 'first'   # first non-null display name in the group
}).rename(columns={'operator': 'followed_by'}).reset_index()

# Number of operators following each account
operator_follow_counts['count'] = operator_follow_counts['followed_by'].apply(len)

# Share of all distinct operators that follow each account, in percent
total_operators = operator_network['operator'].nunique()
operator_follow_counts['percentage'] = (operator_follow_counts['count'] / total_operators * 100).round(1)

# Most-followed accounts first. A stable sort keeps ties in the order groupby
# produced them (sorted by account), so the "top 30" is reproducible between runs.
operator_follow_counts = operator_follow_counts.sort_values('count', ascending=False, kind='mergesort')

# Column order: follows, follows_display_name, count, percentage, followed_by
operator_follow_counts = operator_follow_counts[['follows', 'follows_display_name', 'count', 'percentage', 'followed_by']]

print("TOP 30 ACCOUNTS FOLLOWED BY OPERATORS")
print("="*80)
display(operator_follow_counts.head(30))

In [None]:
# Visualize operator network
# Two-panel summary of the operator table: a histogram of how many operators
# follow each account (left) and the 20 most-followed accounts (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: distribution of per-account counts, with the median marked in red
ax1.hist(operator_follow_counts['count'], bins=30, edgecolor='black')
ax1.set(
    xlabel='Number of Operators Following',
    ylabel='Number of Accounts',
    title='Distribution of Follower Counts (Operators)',
)
ax1.axvline(
    x=operator_follow_counts['count'].median(),
    color='red',
    linestyle='--',
    label=f'Median: {operator_follow_counts["count"].median():.0f}',
)
ax1.legend()

# Right panel: horizontal bars for the first 20 rows of the table
top20 = operator_follow_counts.head(20)
bar_positions = range(len(top20))
ax2.barh(bar_positions, top20['count'], color='steelblue')
ax2.set_yticks(bar_positions)
# Handles are truncated to 20 characters to keep the tick labels compact
ax2.set_yticklabels([f"@{u[:20]}" for u in top20['follows']], fontsize=8)
ax2.set(
    xlabel='Number of Operators Following',
    title='Top 20 Accounts Followed by Operators',
)
ax2.invert_yaxis()  # put the first row at the top

fig.tight_layout()
plt.show()

## 2. Researcher Network Analysis (Enhanced)

The same analysis for researchers: the `followed_by` column lists the researchers who follow each account.

In [None]:
# Build the researcher "supernode" table: one row per followed account, with the
# researchers who follow it, how many there are, and their share of all researchers.

# Collapse repeated (researcher, follows) rows first. `percentage` divides by the
# number of *distinct* researchers, so a duplicated edge would otherwise inflate
# `count`, repeat names in `followed_by`, and could push the share above 100%.
researcher_edges = researcher_network.drop_duplicates(subset=['researcher', 'follows'])

researcher_follow_counts = researcher_edges.groupby('follows').agg({
    'researcher': list,               # every researcher following this account
    'follows_display_name': 'first'   # first non-null display name in the group
}).rename(columns={'researcher': 'followed_by'}).reset_index()

# Number of researchers following each account
researcher_follow_counts['count'] = researcher_follow_counts['followed_by'].apply(len)

# Share of all distinct researchers that follow each account, in percent
total_researchers = researcher_network['researcher'].nunique()
researcher_follow_counts['percentage'] = (researcher_follow_counts['count'] / total_researchers * 100).round(1)

# Most-followed accounts first. A stable sort keeps ties in the order groupby
# produced them (sorted by account), so the "top 30" is reproducible between runs.
researcher_follow_counts = researcher_follow_counts.sort_values('count', ascending=False, kind='mergesort')

# Column order: follows, follows_display_name, count, percentage, followed_by
researcher_follow_counts = researcher_follow_counts[['follows', 'follows_display_name', 'count', 'percentage', 'followed_by']]

print("TOP 30 ACCOUNTS FOLLOWED BY RESEARCHERS")
print("="*80)
display(researcher_follow_counts.head(30))

In [None]:
# Visualize researcher network
# Two-panel summary of the researcher table: a histogram of how many researchers
# follow each account (left) and the 20 most-followed accounts (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: distribution of per-account counts, with the median marked in red
ax1.hist(
    researcher_follow_counts['count'],
    bins=30,
    edgecolor='black',
    color='green',
    alpha=0.7,
)
ax1.set(
    xlabel='Number of Researchers Following',
    ylabel='Number of Accounts',
    title='Distribution of Follower Counts (Researchers)',
)
ax1.axvline(
    x=researcher_follow_counts['count'].median(),
    color='red',
    linestyle='--',
    label=f'Median: {researcher_follow_counts["count"].median():.0f}',
)
ax1.legend()

# Right panel: horizontal bars for the first 20 rows of the table
top20 = researcher_follow_counts.head(20)
bar_positions = range(len(top20))
ax2.barh(bar_positions, top20['count'], color='forestgreen')
ax2.set_yticks(bar_positions)
# Handles are truncated to 20 characters to keep the tick labels compact
ax2.set_yticklabels([f"@{u[:20]}" for u in top20['follows']], fontsize=8)
ax2.set(
    xlabel='Number of Researchers Following',
    title='Top 20 Accounts Followed by Researchers',
)
ax2.invert_yaxis()  # put the first row at the top

fig.tight_layout()
plt.show()

## 3. Cross-Network Analysis

Find accounts followed by BOTH researchers and operators

In [None]:
# Cross-network table: accounts followed by at least one researcher AND at least
# one operator, with both groups' counts, shares and follower lists side by side.

# Accounts followed by each group
researcher_follows = set(researcher_network['follows'].unique())
operator_follows = set(operator_network['follows'].unique())

# Accounts that appear in both groups' follow lists
common_follows = researcher_follows & operator_follows

print(f"Accounts followed by at least one researcher: {len(researcher_follows)}")
print(f"Accounts followed by at least one operator: {len(operator_follows)}")
print(f"Accounts followed by BOTH groups: {len(common_follows)}")

# Give each per-group summary table group-specific column names so they can be
# joined side by side on the account handle.
researcher_side = researcher_follow_counts.rename(columns={
    'follows': 'account',
    'follows_display_name': 'researcher_display_name',
    'count': 'researcher_count',
    'percentage': 'researcher_pct',
    'followed_by': 'followed_by_researchers',
})
operator_side = operator_follow_counts.rename(columns={
    'follows': 'account',
    'follows_display_name': 'operator_display_name',
    'count': 'operator_count',
    'percentage': 'operator_pct',
    'followed_by': 'followed_by_operators',
})

# A single inner join keeps exactly the accounts present in both tables. It
# replaces a per-account loop that re-scanned both tables on every iteration,
# and it still yields a frame with all columns when the groups share no accounts
# (the loop produced a column-less frame, so the sort below raised KeyError).
# Both tables hold one row per account, which `validate` asserts.
common_df = researcher_side.merge(operator_side, on='account', how='inner', validate='one_to_one')

# Prefer the researcher-side display name; fall back to the operator-side one
# when the researcher data has none.
common_df['display_name'] = common_df['researcher_display_name'].combine_first(
    common_df['operator_display_name']
)
common_df['total_count'] = common_df['researcher_count'] + common_df['operator_count']

# Final column order, most-followed accounts first. The stable sort makes the
# order among ties reproducible instead of depending on set iteration order.
common_df = (
    common_df[[
        'account',
        'display_name',
        'researcher_count',
        'researcher_pct',
        'operator_count',
        'operator_pct',
        'total_count',
        'followed_by_researchers',
        'followed_by_operators',
    ]]
    .sort_values('total_count', ascending=False, kind='mergesort')
    .reset_index(drop=True)
)

print("\nTOP 30 ACCOUNTS FOLLOWED BY BOTH RESEARCHERS AND OPERATORS")
print("="*80)
display(common_df.head(30))

In [None]:
# Grouped horizontal bars comparing researcher and operator counts for the
# first 30 rows of the cross-network table.
fig, ax = plt.subplots(figsize=(12, 8))

top30 = common_df.head(30)
bar_centers = np.arange(len(top30))
bar_height = 0.35
half_height = bar_height / 2

# Each account gets two bars, offset half a bar height either side of its tick
ax.barh(
    bar_centers - half_height,
    top30['researcher_count'],
    bar_height,
    label='Researchers',
    color='forestgreen',
    alpha=0.8,
)
ax.barh(
    bar_centers + half_height,
    top30['operator_count'],
    bar_height,
    label='Operators',
    color='steelblue',
    alpha=0.8,
)

ax.set_yticks(bar_centers)
# Handles are truncated to 20 characters to keep the tick labels compact
ax.set_yticklabels([f"@{acc[:20]}" for acc in top30['account']], fontsize=8)
ax.set(
    xlabel='Number of Followers',
    title='Top 30 Accounts Followed by Both Researchers and Operators',
)
ax.legend()
ax.invert_yaxis()  # put the first row at the top

fig.tight_layout()
plt.show()

In [None]:
# Scatter of every common account: researcher count on x, operator count on y.
fig, ax = plt.subplots(figsize=(10, 8))

ax.scatter(
    common_df['researcher_count'],
    common_df['operator_count'],
    alpha=0.5,
    s=50,
)

# Label the first 10 rows of the table, handles truncated to 15 characters
top_common = common_df.head(10)
for row in top_common.to_dict('records'):
    point = (row['researcher_count'], row['operator_count'])
    ax.annotate(f"@{row['account'][:15]}", point, fontsize=8, alpha=0.7)

ax.set(
    xlabel='Number of Researchers Following',
    ylabel='Number of Operators Following',
    title='Account Popularity: Researchers vs Operators',
)
ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 4. Network Statistics

In [None]:
# Text summary of both networks and their overlap. Relies on the totals and
# account sets computed in the earlier analysis cells.
print("NETWORK STATISTICS", "="*80, sep="\n")

# Researcher side: "relationships" is the number of rows in the edge list
print(
    f"\nResearcher Network:",
    f"  Total researchers: {total_researchers}",
    f"  Total relationships: {len(researcher_network)}",
    f"  Unique accounts followed: {len(researcher_follows)}",
    f"  Avg accounts followed per researcher: {len(researcher_network) / total_researchers:.1f}",
    sep="\n",
)

# Operator side, same layout
print(
    f"\nOperator Network:",
    f"  Total operators: {total_operators}",
    f"  Total relationships: {len(operator_network)}",
    f"  Unique accounts followed: {len(operator_follows)}",
    f"  Avg accounts followed per operator: {len(operator_network) / total_operators:.1f}",
    sep="\n",
)

# Overlap percentage = shared accounts / all accounts followed by either group
print(
    f"\nCross-Network:",
    f"  Accounts followed by both groups: {len(common_follows)}",
    f"  Accounts unique to researchers: {len(researcher_follows - operator_follows)}",
    f"  Accounts unique to operators: {len(operator_follows - researcher_follows)}",
    f"  Overlap percentage: {len(common_follows) / len(researcher_follows | operator_follows) * 100:.1f}%",
    sep="\n",
)

## 5. Export Results

In [None]:
# Write the three result tables to CSV without the index column.
# Note: the followed_by* columns hold Python lists, which to_csv writes as their
# string form; presumably they need parsing (e.g. ast.literal_eval) when read back.
csv_exports = [
    (operator_follow_counts,
     'data/operator_supernodes.csv',
     "Saved operator supernodes to data/operator_supernodes.csv"),
    (researcher_follow_counts,
     'data/researcher_supernodes.csv',
     "Saved researcher supernodes to data/researcher_supernodes.csv"),
    (common_df,
     'data/cross_network_supernodes.csv',
     "Saved cross-network supernodes to data/cross_network_supernodes.csv"),
]

for table, csv_path, confirmation in csv_exports:
    table.to_csv(csv_path, index=False)
    print(confirmation)

## 6. Who Should I Follow?

Find high-value accounts (followed by both researchers and operators) that you do not currently follow.

In [None]:
# Accounts you already follow, read from the 'username' column
my_following = pd.read_csv('data/my_following.csv')
my_following_set = set(my_following['username'].tolist())

# Keep the cross-network accounts that are missing from your following list.
# NOTE(review): the comparison is exact and case-sensitive - confirm that both
# files write handles the same way.
already_followed = common_df['account'].isin(my_following_set)
suggestions = common_df.loc[~already_followed].copy()

print(
    f"SUGGESTED ACCOUNTS TO FOLLOW",
    f"(Popular among both researchers and operators, but not in your following list)",
    "="*80,
    sep="\n",
)
display(suggestions.head(20))