In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4'
os.environ['BOKEH_ALLOW_WS_ORIGIN'] = 'llm-interface-2-api.legit-ai.co.id'

# Tested Visualization: Bokeh + CuXFilter

In [4]:
import json
import cudf
import cugraph
import cuxfilter

## Preprocess Sample Data

In [5]:
DATAPATH = '../sample_data/sample-data-crawler-twt.json'

with open(DATAPATH, 'r') as f:
    raw_data = json.load(f)

print(f"Loaded {len(raw_data)} rows")
print(f"Sample: {raw_data[5]}")

Loaded 211 rows
Sample: {'id': '1828256366173266411', 'user_screen_name': 'PambinyaTaehyun', 'user_followers_count': 19, 'user_friends_count': 68, 'user_created_at': '2019-10-07T09:21:27+00:00', 'user_status_count': 14661, 'user_is_verified': False, 'user_description': 'genre idup: komedi tragedi', 'user_media_count': 183, 'text': '@ardisatriawan Bjir dah nyampe ranah internasional', 'post_created_at': '2024-08-27T02:20:59+00:00', 'views_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'is_retweeted': False, 'entities_hashtag': [], 'entities_mentions': ['#ardisatriawan'], 'in_quote_username': '', 'in_quote_id': '-', 'in_reply_username': 'ardisatriawan', 'in_reply_id': '1828253940003283240', 'media_urls': [], 'created_at': '2024-12-11T11:24:02.816745', 'updated_at': '2024-12-11T11:24:02.816745'}


In [6]:
# Process raw data to build graph's edges (by user)
edges_user = []
for interaction in raw_data:
    source = interaction['user_screen_name']

    # quotes interaction
    if interaction['in_quote_username'].strip('-'):
        destination = interaction['in_quote_username']
        edges_user.append([source, destination, 'quote'])

    # replies interaction
    if interaction['in_reply_username'].strip('-'):
        destination = interaction['in_reply_username']
        edges_user.append([source, destination, 'reply'])

print(f"Generated edges: {len(edges_user)}")

Generated edges: 216


In [7]:
# Process raw data to build graph's edges (by text)
def find_interaction_on_id(id):
    for d in raw_data:
        if d['id'] == id:
            return d
    
edges_text = []
for interaction in raw_data:
    source = interaction['text']

    # quotes interaction
    try:
        if interaction['in_quote_id'].strip('-'):
            destination = find_interaction_on_id(interaction['in_quote_id'])['text']
            edges_text.append([source, destination, 'quote'])
    except:
        print(f'quoted parent not found!')

    # replies interaction
    try:
        if interaction['in_reply_id'].strip('-'):
            destination = find_interaction_on_id(interaction['in_reply_id'])['text']
            edges_text.append([source, destination, 'reply'])
    except:
        print(f'replied parent not found!')        

print(f"Generated edges: {len(edges_text)}")

quoted parent not found!
quoted parent not found!
quoted parent not found!
quoted parent not found!
quoted parent not found!
quoted parent not found!
Generated edges: 210


In [8]:
data = cudf.DataFrame(raw_data)
data.head()

Unnamed: 0,id,user_screen_name,user_followers_count,user_friends_count,user_created_at,user_status_count,user_is_verified,user_description,user_media_count,text,...,is_retweeted,entities_hashtag,entities_mentions,in_quote_username,in_quote_id,in_reply_username,in_reply_id,media_urls,created_at,updated_at
0,1828253940003283240,ardisatriawan,69800,2665,2010-07-25T03:31:53+00:00,12517,False,Mulai males ngedit meme soang.\n\nResearch fel...,548,Media Singapura sudah mulai mencium gelagat an...,...,False,[],[],,-,-,-,[],2024-12-11T11:24:02.816215,2024-12-11T11:24:02.816218
1,1828351087885328471,martintjandra,364,1333,2009-07-29T06:56:10+00:00,16212,False,"In conclusion, you’re not ChatGPT, unlike me.",506,@ardisatriawan Lagi rame gini bukannya berusah...,...,False,[],[#ardisatriawan],,-,ardisatriawan,1828253940003283240,[],2024-12-11T11:24:02.816581,2024-12-11T11:24:02.816582
2,1828254349610672324,ulanbatori,3,35,2024-07-11T00:59:17+00:00,3508,False,,60,@ardisatriawan 🤣ok arya kamandanu,...,False,[],[#ardisatriawan],,-,ardisatriawan,1828253940003283240,[],2024-12-11T11:24:02.816639,2024-12-11T11:24:02.816640
3,1828276281395552310,hmadZak,257,768,2012-04-18T10:23:37+00:00,11131,False,Berjalan adalah sebuah kebutuhan,302,@ardisatriawan Makin menarik ini mulyono,...,False,[],[#ardisatriawan],,-,ardisatriawan,1828253940003283240,[],2024-12-11T11:24:02.816678,2024-12-11T11:24:02.816679
4,1828256055849296275,Abdillahh_Ali,25,176,2021-10-29T07:52:24+00:00,2277,False,"Football, politics, and film enthusiast",19,@ardisatriawan Sebenarnya masalah gratifikasi ...,...,False,[],[#ardisatriawan],,-,ardisatriawan,1828253940003283240,[],2024-12-11T11:24:02.816712,2024-12-11T11:24:02.816712


In [9]:
data.rename({'id':'user_id'}, axis=1, inplace=True)

Unnamed: 0,user_id,user_screen_name,user_followers_count,user_friends_count,user_created_at,user_status_count,user_is_verified,user_description,user_media_count,text,...,is_retweeted,entities_hashtag,entities_mentions,in_quote_username,in_quote_id,in_reply_username,in_reply_id,media_urls,created_at,updated_at
0,1828253940003283240,ardisatriawan,69800,2665,2010-07-25T03:31:53+00:00,12517,False,Mulai males ngedit meme soang.\n\nResearch fel...,548,Media Singapura sudah mulai mencium gelagat an...,...,False,[],[],,-,-,-,[],2024-12-11T11:24:02.816215,2024-12-11T11:24:02.816218
1,1828351087885328471,martintjandra,364,1333,2009-07-29T06:56:10+00:00,16212,False,"In conclusion, you’re not ChatGPT, unlike me.",506,@ardisatriawan Lagi rame gini bukannya berusah...,...,False,[],[#ardisatriawan],,-,ardisatriawan,1828253940003283240,[],2024-12-11T11:24:02.816581,2024-12-11T11:24:02.816582
2,1828254349610672324,ulanbatori,3,35,2024-07-11T00:59:17+00:00,3508,False,,60,@ardisatriawan 🤣ok arya kamandanu,...,False,[],[#ardisatriawan],,-,ardisatriawan,1828253940003283240,[],2024-12-11T11:24:02.816639,2024-12-11T11:24:02.816640
3,1828276281395552310,hmadZak,257,768,2012-04-18T10:23:37+00:00,11131,False,Berjalan adalah sebuah kebutuhan,302,@ardisatriawan Makin menarik ini mulyono,...,False,[],[#ardisatriawan],,-,ardisatriawan,1828253940003283240,[],2024-12-11T11:24:02.816678,2024-12-11T11:24:02.816679
4,1828256055849296275,Abdillahh_Ali,25,176,2021-10-29T07:52:24+00:00,2277,False,"Football, politics, and film enthusiast",19,@ardisatriawan Sebenarnya masalah gratifikasi ...,...,False,[],[#ardisatriawan],,-,ardisatriawan,1828253940003283240,[],2024-12-11T11:24:02.816712,2024-12-11T11:24:02.816712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,1828387932388167838,nandazman,154,175,2010-07-02T14:18:20+00:00,1473,False,Equilibrium,241,@ardisatriawan ieu rame din @dindariztia,...,False,[],"[#ardisatriawan, #dindariztia]",,-,ardisatriawan,1828253940003283240,[],2024-12-11T11:24:02.898939,2024-12-11T11:24:02.898939
207,1828421953704861858,Ib27189947Arief,1081,6874,2020-08-04T12:49:41+00:00,1322,False,,1,@ardisatriawan Produk luar merajalela...menyog...,...,False,[],[#ardisatriawan],,-,ardisatriawan,1828253940003283240,[],2024-12-11T11:24:02.898969,2024-12-11T11:24:02.898969
208,1828422249462034906,febul0us,575,432,2011-02-19T12:40:57+00:00,8900,False,,427,@ardisatriawan Manyala Erina ku,...,False,[],[#ardisatriawan],,-,ardisatriawan,1828253940003283240,[],2024-12-11T11:24:02.898998,2024-12-11T11:24:02.898999
209,1828425752855093687,echanthought,543,318,2012-03-15T08:58:44+00:00,25599,False,꒰ 🌻 ꒱ؘ ࿐ ࿔*:･ﾟ 𝓎𝑜𝓊𝓇 𝒻𝒶𝓋 𝓈𝓊𝓃𝒻𝓁𝑜𝓌𝑒𝓇𝓈 ๑ꕤꮺǂ ...,1069,@ardisatriawan wah ramee wkwkwk @edmirablee,...,False,[],"[#ardisatriawan, #edmirablee]",,-,ardisatriawan,1828253940003283240,[],2024-12-11T11:24:02.899028,2024-12-11T11:24:02.899028


In [10]:
edges_user[:5]

[['martintjandra', 'ardisatriawan', 'reply'],
 ['ulanbatori', 'ardisatriawan', 'reply'],
 ['hmadZak', 'ardisatriawan', 'reply'],
 ['Abdillahh_Ali', 'ardisatriawan', 'reply'],
 ['PambinyaTaehyun', 'ardisatriawan', 'reply']]

## Visualization by User Graph

In [11]:
# Build Graph
G = cugraph.Graph() 
G.from_cudf_edgelist(data, source='user_id', destination='in_reply_id')
edges = G.edges()

In [12]:
ITERATIONS=500
THETA=10.0

g1_layout = cugraph.layout.force_atlas2(G,
    max_iter=ITERATIONS,
    strong_gravity_mode=False,
    outbound_attraction_distribution=True,
    lin_log_mode=False,
    barnes_hut_optimize=True,
    barnes_hut_theta=THETA,
    verbose=False
)

In [13]:
g1_layout

Unnamed: 0,x,y,vertex
0,3.207886,161.607285,1828291290821243358
1,10.432313,198.799042,1828280282962895146
2,202.691391,74.519821,1828290174775894166
3,-89.473518,-28.572798,1828276445547733178
4,-0.031832,1.823857,1828267605850984611
...,...,...,...
207,-0.061973,2.693297,1828313339908497467
208,-0.033754,2.137071,1828263532900815172
209,-0.033486,2.165435,1828280340903010410
210,-0.029945,2.187370,1830491785069760942


In [14]:
data.sample()

Unnamed: 0,user_id,user_screen_name,user_followers_count,user_friends_count,user_created_at,user_status_count,user_is_verified,user_description,user_media_count,text,...,is_retweeted,entities_hashtag,entities_mentions,in_quote_username,in_quote_id,in_reply_username,in_reply_id,media_urls,created_at,updated_at
77,1828417483310809502,AbdulWa52303403,1785,2852,2020-05-10T05:07:14+00:00,2807,False,apa adanya,32,@ardisatriawan Gak mungkin duit hasil jual mar...,...,False,[],[#ardisatriawan],,-,ardisatriawan,1828253940003283240,[],2024-12-11T11:24:02.846934,2024-12-11T11:24:02.846934


In [15]:
final_df = g1_layout.merge(
    data[['user_id','in_reply_id','user_screen_name','in_reply_username','user_followers_count','user_friends_count']],
    left_on='vertex',
    right_on='user_id',
    suffixes=('', '_original')
)

# Check
final_df.head()

Unnamed: 0,x,y,vertex,user_id,in_reply_id,user_screen_name,in_reply_username,user_followers_count,user_friends_count
0,-0.033754,2.137071,1828263532900815172,1828263532900815172,1828253940003283240,retamialffn,ardisatriawan,84,139
1,-0.033486,2.165435,1828280340903010410,1828280340903010410,1828253940003283240,naoireisha,ardisatriawan,23,459
2,-0.029945,2.18737,1830491785069760942,1830491785069760942,1828253940003283240,dickysubhan22,ardisatriawan,135,123
3,-0.152978,3.520883,1828269539450331565,1828269539450331565,1828253940003283240,kusmawan_11,ardisatriawan,52,372
4,-0.064587,2.724308,1828287097066922217,1828287097066922217,1828253940003283240,ffinnnX,ardisatriawan,7,14


In [16]:
final_df.head()

Unnamed: 0,x,y,vertex,user_id,in_reply_id,user_screen_name,in_reply_username,user_followers_count,user_friends_count
0,-0.033754,2.137071,1828263532900815172,1828263532900815172,1828253940003283240,retamialffn,ardisatriawan,84,139
1,-0.033486,2.165435,1828280340903010410,1828280340903010410,1828253940003283240,naoireisha,ardisatriawan,23,459
2,-0.029945,2.18737,1830491785069760942,1830491785069760942,1828253940003283240,dickysubhan22,ardisatriawan,135,123
3,-0.152978,3.520883,1828269539450331565,1828269539450331565,1828253940003283240,kusmawan_11,ardisatriawan,52,372
4,-0.064587,2.724308,1828287097066922217,1828287097066922217,1828253940003283240,ffinnnX,ardisatriawan,7,14


In [17]:
edges.head()

Unnamed: 0,user_id,in_reply_id
0,1828265207438983544,1828253940003283240
1,1828265762538299889,1828253940003283240
2,1828266868957946362,1828253940003283240
3,1828431257132507408,1828253940003283240
4,1828273966009798991,1828253940003283240


In [18]:
# filter out non edges
numeric_columns = ['user_id', 'in_reply_id', 'user_followers_count', 'user_friends_count']

for col in numeric_columns:
    if col in final_df.columns:
        final_df[col] = cudf.to_numeric(final_df[col], errors='coerce')

# final_df['in_reply_id'] = cudf.to_numeric(final_df['in_reply_id'])
# final_df['in_reply_id']

In [21]:
final_df['user_id'] = cudf.to_numeric(final_df['user_id'])
final_df['in_reply_id'] = cudf.to_numeric(final_df['in_reply_id'])

edges['user_id'] = cudf.to_numeric(edges['user_id'])
edges['in_reply_id'] = cudf.to_numeric(edges['in_reply_id'])

# Check data types
print("Final_df dtypes:")
print(final_df[['user_id', 'in_reply_id']].dtypes)
print("\nEdges dtypes:")
print(edges[['user_id', 'in_reply_id']].dtypes)

cux_df = cuxfilter.DataFrame.load_graph((final_df, edges))

Final_df dtypes:
user_id          int64
in_reply_id    float64
dtype: object

Edges dtypes:
user_id        int64
in_reply_id    int64
dtype: object


In [22]:
chart1 = cuxfilter.charts.graph(
          edge_source='user_id', edge_target='in_reply_id',
          edge_color_palette=['gray', 'black'],
      #     ode_pixel_shade_type='linear',
          edge_render_type='curved', #curved, direct
          edge_transparency=0.6, #0.1 - 0.9
          title='Sample Graph 1'
      )

d = cux_df.dashboard(
    [chart1],
    theme=cuxfilter.themes.rapids,
    title="Network Graph"
)

d.show(port=8090)

Dashboard running at port 8090


AttributeError: 'NoneType' object has no attribute 'stop'