In [2]:
import os
import requests
from dotenv import load_dotenv, find_dotenv
import numpy as np
import pandas as pd
import warnings

# Session-wide display setup: silence every warning, show all columns, cap rows at 10.
# NOTE(review): the blanket 'ignore' also hides pandas warnings raised by later cells.
warnings.simplefilter('ignore')
pd.options.display.max_columns = None
pd.options.display.max_rows = 10


# Local module
from pathlib import Path
import sys

# The notebook is expected to run from xapi-ex1/notebooks, so the project root is
# the parent of the working directory; its src/ folder is made importable.
project_root = Path.cwd().parent
sys.path.append(str(project_root.joinpath('src')))

from users import (
    get_recent_followings_cached, 
    get_recent_followers_cached,
    load_follow_cache,
    save_follow_cache,
    search_tweets_advanced,
    get_user_tweets, 
    get_user_tweets_cached,
    save_tweet_cache,
    load_tweet_cache,
)
from schema.tweets import (
    collapse_dicts, 
    collapse_dataframe, 
    TRUNCATED_TWEET_FIELDS
)
from parse import (
    extract_first_username,
    classify_by_engagement_quantile,
)

# Load variables from the .env file that find_dotenv() locates, then require the API key.
load_dotenv(find_dotenv())
API_KEY = os.environ.get('twitter_apiio_key')
if API_KEY is None or API_KEY == '':
    raise RuntimeError('twitter_apiio_key not set in environment')

In [3]:
import plotly.express as px
import plotly.graph_objects as go

In [4]:
# Params: the account under analysis and the limits passed to the follow-graph helpers.
USERNAME = 'apralky'  # handle without the leading '@'
FOLLOWING_LIMIT = 200  # original note: "multiples of 20-200 recommended" — NOTE(review): presumably tied to the API page size; confirm
FOLLOWERS_LIMIT = 600  # NOTE(review): not referenced by any cell visible in this notebook section

In [5]:
# Get followings for USERNAME via the caching project helper, passing limit=FOLLOWING_LIMIT.
# The recorded run below printed a cache hit for 200 followings.
followings = get_recent_followings_cached(USERNAME, limit=FOLLOWING_LIMIT, api_key=API_KEY)

✅ Cache hit: 200 followings


In [6]:
# Flatten the 'followings' list of the response into a DataFrame
# (an absent key yields an empty frame).
following_records = followings.get('followings', [])
df_followings = pd.json_normalize(following_records)
print(f"Rows: {len(df_followings)}")

df_followings.head(2)

Rows: 200


Unnamed: 0,id,name,screen_name,userName,location,url,description,email,protected,verified,followers_count,following_count,friends_count,favourites_count,statuses_count,media_tweets_count,created_at,profile_banner_url,profile_image_url_https,can_dm
0,953893733468172288,Sam,futurenomics,futurenomics,,https://t.co/APdEYafKBM,building @0xtristero @ycombinator alum @stanfo...,,False,False,12734,1538,1538,57095,21930,1626,Thu Jan 18 07:36:03 +0000 2018,https://pbs.twimg.com/profile_banners/95389373...,https://pbs.twimg.com/profile_images/187784758...,False
1,1521870690630221824,Patrick Casey,restoreorderusa,restoreorderusa,"Washington, D.C.",https://t.co/dR38x52zpT,Writer and host of Restoring Order | Moderate ...,,False,False,74697,2115,2115,45000,15759,2001,Wed May 04 15:13:58 +0000 2022,https://pbs.twimg.com/profile_banners/15218706...,https://pbs.twimg.com/profile_images/190537595...,False


In [7]:
# Show the first followed account as a field/value Series (long output is elided by display.max_rows).
df_followings.iloc[0]

id                                                        953893733468172288
name                                                                     Sam
screen_name                                                     futurenomics
userName                                                        futurenomics
location                                                                    
                                                 ...                        
media_tweets_count                                                      1626
created_at                                    Thu Jan 18 07:36:03 +0000 2018
profile_banner_url         https://pbs.twimg.com/profile_banners/95389373...
profile_image_url_https    https://pbs.twimg.com/profile_images/187784758...
can_dm                                                                 False
Name: 0, Length: 20, dtype: object

## Get tweets for all followers

Query time
| count | time | size_kb |
|-------|------|---------|
| 200   | 17   | 700     |
| 500   | 45   | 1900    |
| 1000  | 100  | 4000    |

Recommended: limit tweets to 200 per account.

In [8]:
# Tweet-fetch parameters, passed to get_user_tweets_cached in the next cell.
# TWEET_LIMIT = 200  # smaller alternative, kept for reference
TWEET_LIMIT = 1000
start_date = None  # no start date supplied
end_date = None  # no end date supplied
min_faves = 0
INCLUDE_REPLIES = True


In [9]:
# Fetch USERNAME's tweets through the caching project helper (the recorded run reports
# a cache hit for the query `from:apralky`).
resp = get_user_tweets_cached(
    api_key=API_KEY,
    username=USERNAME,
    limit=TWEET_LIMIT,
    start_date=start_date,
    end_date=end_date,
    min_faves=min_faves,
    include_replies=INCLUDE_REPLIES,
)
# The tweet records live under the "tweets" key; a missing key raises KeyError.
tweets = resp["tweets"]

✅ C-Hit: Q: `from:apralky`


In [10]:
# Field names to drop from the project's truncated tweet field list.
truncated2 = ['type', 'url', 'lang']

# Keep the remaining fields in their original order.
truncated_fields = list(
    filter(lambda name: name not in truncated2, TRUNCATED_TWEET_FIELDS)
)

In [11]:
# Flatten the raw tweet dicts; nested keys become dotted columns (e.g. author.userName).
df_tweets = pd.json_normalize(tweets)
# Project helper called with fields=truncated_fields — presumably reduces the frame to
# those fields; verify in schema.tweets.
df_tmini = collapse_dataframe(df_tweets, fields=truncated_fields)

In [12]:
# Preview the first two rows of the compact tweet frame.
df_tmini.head(2)

Unnamed: 0,id,createdAt,text,retweetCount,replyCount,likeCount,quoteCount,viewCount,bookmarkCount,isReply,inReplyToId,inReplyToUsername,author.userName,author.url,author.id,author.isBlueVerified,author.followers,author.following
0,1965463701307417023,Tue Sep 09 17:13:57 +0000 2025,@fvderop the mutual based system is ineffectiv...,0,1,13,0,608,3,True,1965462971762839948,fvderop,apralky,https://x.com/apralky,1700977169177206784,True,24872,627
1,1965462653440324010,Tue Sep 09 17:09:47 +0000 2025,@NateWitkin disagree I think the invisible han...,0,1,7,0,450,2,True,1965462326586270177,NateWitkin,apralky,https://x.com/apralky,1700977169177206784,True,24872,627


In [13]:
# Summary statistics for the numeric engagement columns.
df_tmini.describe()

Unnamed: 0,retweetCount,replyCount,likeCount,quoteCount,viewCount,bookmarkCount,author.followers,author.following
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,15.509,11.039,326.307,3.042,22322.53,87.126,24873.28,627.0
std,157.330076,70.202435,1906.42753,23.817714,111593.0,400.517458,1.866838,0.0
min,0.0,0.0,0.0,0.0,45.0,0.0,24872.0,627.0
25%,0.0,0.0,7.0,0.0,479.0,1.0,24872.0,627.0
50%,0.0,1.0,25.0,0.0,2435.5,3.0,24872.0,627.0
75%,2.0,5.0,153.25,1.0,10159.25,21.0,24876.0,627.0
max,4647.0,2083.0,48191.0,668.0,2567531.0,6777.0,24876.0,627.0


In [14]:
# Views over time for every fetched tweet, coloured by whether the tweet is a reply.
format_string = "%a %b %d %H:%M:%S %z %Y"  # e.g. "Tue Sep 09 17:13:57 +0000 2025"

df_formatted = df_tmini.copy()
df_formatted['createdAt'] = pd.to_datetime(df_formatted['createdAt'], format=format_string)

# Hover text: wrap at 50 characters, then turn the newlines into HTML breaks.
# The vectorised .str.replace leaves missing text as NaN, whereas calling
# str.replace on each value raises on NaN.
df_formatted['text'] = (
    df_formatted['text']
    .str.wrap(50)
    .str.replace('\n', '<br>', regex=False)
)

# truncate set
# df_formatted = df_formatted.head(100)

fig = px.scatter(
    df_formatted,
    y='viewCount',
    x='createdAt',
    color='isReply',
    hover_data=['text'],
    log_y=True,  # view counts in this sample range from 45 to ~2.5M
)

# Cap the number of x-axis ticks so the date labels stay readable.
fig.update_layout(
    title=f'Views for @{USERNAME}',
    height=600,
    xaxis=dict(
        tickmode='auto',
        nticks=10,
    ),
)

fig.show()


## Research
- Correlation heatmap to understand average engagements and total engagement
- Scatter plot to understand correlation BY GROUP

### Research conclusion. 
(You may run cells below to confirm the research again)

With samples of 200 and 1000 tweets:
- 200 tweet samples is good enough to understand an account
- 1000 tweet samples is more than enough. 
- AVG engagement per account is as good as total engagement
- You NEED MORE samples if an account really likes writing threads. (Threads skew the analytics: MAKE SURE you exclude replies to self from the analysis.)

To find a high elo account:
- check a user's top replied accounts >50% of cumulative replies. Group them as "T1"
- Instead of using cumulative replies, you may use cumulative "total engagement" (like + quote + bookmark)
- check any other user that generates more engagement than the average of "T1" of the tested user. This is another source of high elo accounts. (could be a lucky celebrity hit tho)

With enough runs for 100 different accounts:
- An account that appears in T1 for multiple different accounts is high elo. 


In [15]:
# Replies only. .copy() gives an independent frame, so the columns added below
# never write into df_tmini.
reply_tweets = df_tmini[df_tmini['isReply'] == True].copy()

# Overwrite the API-provided reply target with whatever the project helper extracts
# from the tweet text (presumably the first @mention — confirm in parse.py). On the
# recorded run it logged "No username found in text: ..." for texts without one.
reply_tweets['inReplyToUsername'] = reply_tweets['text'].apply(extract_first_username, custom_message=USERNAME)

# bvi = bookmarks per 1,000 views; bli = bookmarks per 100 likes.
# A zero denominator produces +/-inf, which would turn the whole group's mean into
# inf; map it to NaN so the group mean skips that tweet instead.
reply_tweets['bvi'] = (
    (reply_tweets['bookmarkCount'] / reply_tweets['viewCount'] * 1000)
    .replace([np.inf, -np.inf], np.nan)
    .round(4)
)
reply_tweets['bli'] = (
    (reply_tweets['bookmarkCount'] / reply_tweets['likeCount'] * 100)
    .replace([np.inf, -np.inf], np.nan)
    .round(4)
)

# Per-recipient engagement stats. Named aggregation ties every output column to its
# source column and function, so the names cannot drift out of order.
engagement_stats = (
    reply_tweets
    .groupby('inReplyToUsername')
    .agg(
        reply_count=('id', 'count'),                # number of replies sent to the account
        total_retweets=('retweetCount', 'sum'),
        avg_retweets=('retweetCount', 'mean'),
        total_likes=('likeCount', 'sum'),
        avg_likes=('likeCount', 'mean'),
        total_replies=('replyCount', 'sum'),
        avg_replies=('replyCount', 'mean'),
        total_quotes=('quoteCount', 'sum'),
        avg_quotes=('quoteCount', 'mean'),
        total_views=('viewCount', 'sum'),
        avg_views=('viewCount', 'mean'),
        total_bookmarks=('bookmarkCount', 'sum'),
        avg_bookmarks=('bookmarkCount', 'mean'),
        avg_bvi=('bvi', 'mean'),                    # ratios are averaged only, never summed
        avg_bli=('bli', 'mean'),
    )
    .round(2)
    .reset_index()
)

No username found in text: what will likely happen is that internet people will arrive at the revealed preference of self-organizing in an academia style model from first principles... you can help them out as an entrepreneur
No username found in text: or you can just let shoe on head run your discourse, this is the infodemocratic alternative
No username found in text: https://t.co/vT0mwjV9OY
No username found in text: there's simply no such thing as wholesome capitalism
No username found in text: populism is obviously a transition phase, it is by its nature unsustainable... you promise people low inflation low borrowing low taxes low unemployment, obviously fail to deliver it, and we move on to the next stage (state capitalism or communism)
No username found in text: I want to write more about why seemingly everyone contending with the question of where unchecked capitalism eventually ends up in the 20th century concluded that it's either hyper-authoritarian state capitalism or commun

In [16]:
# One row per replied-to account (display is truncated by display.max_rows).
engagement_stats

Unnamed: 0,inReplyToUsername,reply_count,total_retweets,avg_retweets,total_likes,avg_likes,total_replies,avg_replies,total_quotes,avg_quotes,total_views,avg_views,total_bookmarks,avg_bookmarks,avg_bvi,avg_bli
0,0x49fa98,3,0,0.0,65,21.67,2,0.67,0,0.0,2013,671.00,5,1.67,2.58,9.57
1,0xPajke,2,0,0.0,26,13.00,1,0.50,0,0.0,1747,873.50,5,2.50,3.29,24.17
2,0xUnihax0r,1,0,0.0,58,58.00,2,2.00,0,0.0,3992,3992.00,9,9.00,2.25,15.52
3,1bharadvaja,4,0,0.0,26,6.50,1,0.25,0,0.0,2257,564.25,4,1.00,3.97,23.59
4,250bpmrape,1,0,0.0,48,48.00,3,3.00,0,0.0,2264,2264.00,4,4.00,1.77,8.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,youngtroon,2,0,0.0,14,7.00,3,1.50,0,0.0,930,465.00,3,1.50,3.14,20.83
280,zaitoonx,1,0,0.0,3,3.00,1,1.00,0,0.0,252,252.00,1,1.00,3.97,33.33
281,zephyr_z9,10,1,0.1,60,6.00,5,0.50,1,0.1,3845,384.50,31,3.10,7.27,48.13
282,zeynebnkaya,1,0,0.0,29,29.00,2,2.00,0,0.0,1668,1668.00,44,44.00,26.38,151.72


In [42]:
# Order accounts from most to fewest replies, then tabulate the reply counts of the
# first ten accounts that received more than one reply.
engagement_stats = engagement_stats.sort_values('reply_count', ascending=False)
engagement_stats.loc[engagement_stats['reply_count'] > 1, 'reply_count'].head(10).value_counts()

reply_count
32     2
9      2
37     1
226    1
11     1
10     1
8      1
6      1
Name: count, dtype: int64

In [18]:
# Names of the numeric columns; in the visible cells only the commented-out
# correlation heatmaps below refer to this.
numeric_cols = engagement_stats.select_dtypes(include=[np.number]).columns

In [19]:
# px.imshow(
#     engagement_stats[numeric_cols].query('reply_count < 2').corr(),
#     color_continuous_scale='RdBu',
#     zmin=-1,
#     zmax=1
# ).update_layout(
#     height=800,
#     width=1400,
#     title='Engagement, single reply'
# )


In [20]:
# px.imshow(
#     engagement_stats.query('reply_count > 1')[numeric_cols].corr(),
#     color_continuous_scale='RdBu',
#     zmin=-1,
#     zmax=1
# ).update_layout(
#     height=800,
#     width=1400,
#     title='Engagement, >1 reply, including self'
# )

In [21]:
# px.imshow(
#     engagement_stats.query('reply_count > 1').query(f'inReplyToUsername != "{USERNAME}"')[numeric_cols].corr(),
#     color_continuous_scale='RdBu',
#     zmin=-1,
#     zmax=1
# ).update_layout(
#     height=800,
#     width=1400,
#     title='Engagement, >1 reply, excluding self'
# )

In [69]:
# Totals to plot against `target_col`; the per-reply averages are kept commented out
# so they can be switched back in.
metrics = [
    'reply_count',
    'total_retweets',
    'total_likes',
    'total_replies',
    'total_quotes',
    'total_views',
    'total_bookmarks',
    # 'avg_retweets',
    # 'avg_likes',
    # 'avg_replies',
    # 'avg_quotes',
    # 'avg_views',
    # 'avg_bookmarks',
    # 'avg_bvi',
    # 'avg_bli',
]

# Accounts replied to more than once. .copy() makes this an independent frame, so
# adding 'user_category' below is a plain column assignment rather than a write
# into a filtered result of engagement_stats.
temp = engagement_stats.query('reply_count > 1').copy()

# Hand-picked accounts to highlight (presumably mutuals of USERNAME).
moots = [
    '_AashishReddy',  # the handle appears with a leading underscore in engagement_stats
    'Abel_summation',
    'oxbquant',
    'TenreiroDaniel',
    'zephyr_z9',
    'thermo2ndlaw',
    'teortaxesTex',
    'LandsharkRides',
    'jacobrintamaki',
]

# np.select uses the first condition that matches, so a self-reply is labelled
# 'Author' even if USERNAME were also listed in moots.
conditions = [
    (temp['inReplyToUsername'] == USERNAME),
    (temp['inReplyToUsername'].isin(moots)),
    (temp['inReplyToUsername'] != USERNAME) & (~temp['inReplyToUsername'].isin(moots)),
]

values = ['Author', 'Moot', 'Other']
temp['user_category'] = np.select(conditions, values, default='Unknown')

target_col = 'total_views'

In [70]:
# for col in metrics:
#     if col == target_col:
#         continue
#     fig = px.scatter(
#         temp,
#         x=col,
#         y=target_col,
#         color='user_category',
#         hover_data=['inReplyToUsername'],
#         trendline='ols',
#         # log_x=True,
#         log_y=True,
#     )
#     fig.update_layout(
#         height=800,
#         # width=400
#     )
#     fig.show()

In [71]:
def classify_by_cumsum_auto(df, username_col, target_col, n_categories=4):
    """
    Bucket rows by their position on the cumulative-share curve of ``target_col``.

    Rows are sorted by ``target_col`` (largest first). Each row's bucket is decided
    by the share of the column total accumulated up to and including that row, so
    a row that by itself pushes the running share past a cut-off lands in the later
    bucket (the first bucket is empty when the largest row exceeds the first cut-off).

    Parameters:
    - df: DataFrame with engagement stats (not modified; a sorted copy is returned)
    - username_col: Column name for usernames. Currently unused; kept so existing
      callers keep working.
    - target_col: Column name for engagement metric to classify by
    - n_categories: Number of categories to create (>= 1). 3, 4 and 5 use preset
      cut-offs; any other value uses evenly spaced cut-offs.

    Returns:
    - A copy of ``df`` sorted by ``target_col`` descending with three added columns:
      ``cumsum``, ``cumsum_percent`` (a 0-1 fraction of the column total) and
      ``{target_col}_category``. Rows matching no bucket (for example when the
      share is NaN because the column total is 0) are labelled ``'Bottom'``.

    Raises:
    - ValueError: if ``n_categories`` is smaller than 1.
    """
    if n_categories < 1:
        # Without this guard np.select fails later with an unrelated-looking message.
        raise ValueError(f'n_categories must be >= 1, got {n_categories}')

    def _pct(fraction):
        # Whole-percent label. The epsilon stops float error from truncating one
        # percent too low (0.58 * 100 == 57.99999999999999).
        return int(fraction * 100 + 1e-9)

    # Upper cut-offs of each bucket on the cumulative-share axis.
    preset_cutoffs = {
        3: [0.5, 0.8, 1.0],              # Top 50%, 50-80%, 80-100%
        4: [0.2, 0.5, 0.8, 1.0],         # Top 20%, 20-50%, 50-80%, 80-100%
        5: [0.1, 0.3, 0.6, 0.85, 1.0],   # Top 10%, 10-30%, 30-60%, 60-85%, 85-100%
    }
    if n_categories in preset_cutoffs:
        percentiles = preset_cutoffs[n_categories]
    else:
        percentiles = [i / n_categories for i in range(1, n_categories + 1)]

    # Stable sort: rows with equal values keep their input order, so the bucket
    # assignment of ties is reproducible.
    df_sorted = df.sort_values(target_col, ascending=False, kind='mergesort').copy()

    df_sorted['cumsum'] = df_sorted[target_col].cumsum()
    total_sum = df_sorted[target_col].sum()
    df_sorted['cumsum_percent'] = df_sorted['cumsum'] / total_sum

    share = df_sorted['cumsum_percent']

    # First bucket is closed at its upper cut-off; every later bucket is the
    # half-open interval (previous cut-off, own cut-off].
    conditions = [share <= percentiles[0]]
    labels = [f'Top-{_pct(percentiles[0])}%']
    for lower, upper in zip(percentiles, percentiles[1:]):
        conditions.append((share > lower) & (share <= upper))
        labels.append(f'{_pct(lower)}%-{_pct(upper)}%')

    df_sorted[f'{target_col}_category'] = np.select(conditions, labels, default='Bottom')

    return df_sorted

In [83]:
# Column choices for the classification and plots below.
target_col = 'total_views'  # y-axis of the scatter plots; same value the earlier categories cell assigned
quantile_col = 'reply_count'  # column whose cumulative share defines the buckets

In [89]:
# Usage: bucket every account other than USERNAME that has quantile_col > 1.
not_self = engagement_stats['inReplyToUsername'] != USERNAME
above_one = engagement_stats[quantile_col] > 1
t2 = classify_by_cumsum_auto(
    engagement_stats[not_self & above_one],
    'inReplyToUsername',
    quantile_col,
    n_categories=5,
)

In [90]:
# Classified accounts, sorted by reply_count descending, with cumsum / cumsum_percent / bucket columns.
t2

Unnamed: 0,inReplyToUsername,reply_count,total_retweets,avg_retweets,total_likes,avg_likes,total_replies,avg_replies,total_quotes,avg_quotes,total_views,avg_views,total_bookmarks,avg_bookmarks,avg_bvi,avg_bli,cumsum,cumsum_percent,reply_count_category
94,_AashishReddy,37,3,0.08,501,13.54,42,1.14,3,0.08,59327,1603.43,79,2.14,4.71,25.85,37,0.0925,Top-10%
10,Abel_summation,32,0,0.00,341,10.66,25,0.78,0,0.00,36056,1126.75,78,2.44,4.40,33.07,69,0.1725,10%-30%
208,oxbquant,32,1,0.03,224,7.00,12,0.38,1,0.03,29458,920.56,61,1.91,2.99,33.57,101,0.2525,10%-30%
77,TenreiroDaniel,11,1,0.09,135,12.27,3,0.27,0,0.00,14913,1355.73,29,2.64,4.59,35.79,112,0.2800,10%-30%
281,zephyr_z9,10,1,0.10,60,6.00,5,0.50,1,0.10,3845,384.50,31,3.10,7.27,48.13,122,0.3050,30%-60%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,supersnowxbt,2,1,0.50,69,34.50,1,0.50,0,0.00,5793,2896.50,5,2.50,0.86,7.94,392,0.9800,85%-100%
279,youngtroon,2,0,0.00,14,7.00,3,1.50,0,0.00,930,465.00,3,1.50,3.14,20.83,394,0.9850,85%-100%
283,zmo_zeemo,2,2,1.00,236,118.00,11,5.50,1,0.50,8442,4221.00,8,4.00,0.97,3.58,396,0.9900,85%-100%
275,xlr8harder,2,1,0.50,147,73.50,2,1.00,0,0.00,49871,24935.50,10,5.00,0.40,6.22,398,0.9950,85%-100%


In [91]:
# One scatter per metric: the metric on x, target_col on a log-scaled y, coloured by
# the bucket column that classify_by_cumsum_auto added to t2. Each figure gets its own
# title so the otherwise near-identical charts can be told apart.
# NOTE(review): trendline='ols' requires statsmodels, and by default the fit is made on
# the raw y values even though the axis is logarithmic — confirm that is intended.
for col in metrics:
    if col == target_col:
        continue
    fig = px.scatter(
        t2,
        x=col,
        y=target_col,
        color=f'{quantile_col}_category',
        hover_data=['inReplyToUsername', quantile_col, target_col],
        trendline='ols',
        # log_x=True,
        log_y=True,
        title=f'{target_col} vs {col} for @{USERNAME} (colour: {quantile_col} bucket)',
    )
    fig.update_layout(
        height=800,
        # width=400
    )
    fig.show()