# Setup

In [20]:
!pip install pandas



## Dependencies

In [11]:
from collections import defaultdict
import os
import pandas as pd
from pathlib import Path
import plotly.express as px

# Directory holding the exported datasets, resolved relative to the notebook's
# current working directory.
data_path = Path.cwd() / 'data' / 'gravity_jobs'

# iterdir() yields entries in arbitrary, OS-dependent order. Sort them so that every
# run produces the same listing and '<topic>.<platform>.csv' always precedes
# '<topic>.<platform>.parquet' within a platform.
dataset_files = sorted(data_path.iterdir())

# Group the files by platform. File names follow '<topic>.<platform>.<ext>'
# (e.g. 'solana.x.csv'), so compare the platform token exactly: a plain substring
# test would file any name that merely contains the letter 'x' under platform 'x'.
datasets = defaultdict(list)
for platform in ['x', 'reddit', 'youtube']:
    for dataset_file in dataset_files:
        name_parts = dataset_file.name.split('.')
        if len(name_parts) >= 3 and name_parts[-2] == platform:
            datasets[platform].append(dataset_file)

# Display the file names found for each platform as the cell output.
{platform: [d.name for d in dataset_list] for platform, dataset_list in datasets.items()}

{'x': ['solana.x.csv', 'solana.x.parquet'],
 'reddit': ['solana.reddit.csv', 'solana.reddit.parquet'],
 'youtube': ['mrbeast.youtube.csv', 'mrbeast.youtube.parquet']}

# X Datasets

In [14]:
# Select the parquet export by its suffix instead of by list position: the position
# depends on the order in which the directory listing returned the files, so
# `[-1]` could just as well be the CSV file.
x_parquet = next(f for f in datasets['x'] if f.suffix == '.parquet')
df = pd.read_parquet(x_parquet)
df.head()

Unnamed: 0,uri,label,username,text,tweet_hashtags,timestamp,url,media,user_id,user_display_name,user_verified,tweet_id,is_reply,is_quote,conversation_id,in_reply_to_user_id,datetime
0,https://x.com/08Rahma_/status/1960723511577641206,#solana,08Rahma_,#Solana,[#Solana],2025-08-27T15:18:00+00:00,https://x.com/08Rahma_/status/1960723511577641206,,1577443432238551041,Rahma🌹🌹,False,1960723511577641206,True,False,1960670638323925330,1.6647338195412664e+18,2025-08-27T15:18:00+00:00
1,https://x.com/0xAlpha_10/status/19607182655812...,#solana,0xAlpha_10,#Solana leads network revenue for the 23rd wee...,[#Solana],2025-08-27T14:57:00+00:00,https://x.com/0xAlpha_10/status/19607182655812...,[https://pbs.twimg.com/media/GzXe2XmXQAAqDRr.jpg],1472171298449408002,0xAlpha Raiders,False,1960718265581212007,False,False,1960718265581212007,,2025-08-27T14:57:00+00:00
2,https://x.com/0xBlazerrr/status/19608184132423...,#solana,0xBlazerrr,#Solana,[#Solana],2025-08-27T21:35:00+00:00,https://x.com/0xBlazerrr/status/19608184132423...,,1912386455554621440,0xBlazerrr,False,1960818413242343904,True,False,1960670638323925330,1.6647338195412664e+18,2025-08-27T21:35:00+00:00
3,https://x.com/0xBot_project/status/19605236820...,#solana,0xBot_project,🚀 LOVE Gains: 29.23x 🚀\n💲 Call MC: 138.10K\n📊 ...,"[#Solana, #CryptoGains, #0xBot, #Shitcoiners, ...",2025-08-27T02:04:00+00:00,https://x.com/0xBot_project/status/19605236820...,,1816596819968729088,0xBot Project - SOON 👀,True,1960523682075361374,False,False,1960523682075361374,,2025-08-27T02:04:00+00:00
4,https://x.com/0xBot_project/status/19605248805...,#solana,0xBot_project,🚀 Nosey Gains: 32.67x 🚀\n💲 Call MC: 75.14K\n📊 ...,"[#Solana, #CryptoGains, #0xBot, #Shitcoiners, ...",2025-08-27T02:08:00+00:00,https://x.com/0xBot_project/status/19605248805...,,1816596819968729088,0xBot Project - SOON 👀,True,1960524880572502354,False,False,1960524880572502354,,2025-08-27T02:08:00+00:00


In [15]:
# Show the dtype pandas assigned to each column of the X frame loaded in the previous cell.
# NOTE(review): the recorded output lists every column as `object`, including
# timestamp/datetime and the boolean-looking flags — presumably they were exported
# as strings; confirm before doing date or boolean operations on them.
df.dtypes

uri                    object
label                  object
username               object
text                   object
tweet_hashtags         object
timestamp              object
url                    object
media                  object
user_id                object
user_display_name      object
user_verified          object
tweet_id               object
is_reply               object
is_quote               object
conversation_id        object
in_reply_to_user_id    object
datetime               object
dtype: object

# Reddit Datasets

In [16]:
# Select the parquet export by its suffix instead of by list position: the position
# depends on the order in which the directory listing returned the files, so
# `[-1]` could just as well be the CSV file.
reddit_parquet = next(f for f in datasets['reddit'] if f.suffix == '.parquet')
df = pd.read_parquet(reddit_parquet)
df.head()

Unnamed: 0,uri,label,id,username,communityName,body,title,createdAt,dataType,parentId,url,datetime
0,https://www.reddit.com/r/3commasCommunity/comm...,r/3commascommunity,t1_n8d7afw,KlutzyWord1848,r/3commasCommunity,"Hola, creo q te puedo ayudar, lo principal es ...",,2025-08-12T22:05:00+00:00,comment,t3_u0xzef,https://www.reddit.com/r/3commasCommunity/comm...,2025-08-12T22:05:00+00:00
1,https://www.reddit.com/r/AMA/comments/1mfu97t/...,r/ama,t1_n6lqf03,JewelTamexJuno,r/AMA,A bot that uses multiple Solana wallets to swa...,,2025-08-02T22:55:00+00:00,comment,t1_n6lq53v,https://www.reddit.com/r/AMA/comments/1mfu97t/...,2025-08-02T22:55:00+00:00
2,https://www.reddit.com/r/AccidentalRenaissance...,r/accidentalrenaissance,t1_n8nrtfr,QuyNguyen9995,r/AccidentalRenaissance,https://preview.redd.it/bfdmdgbazzif1.jpeg?wid...,,2025-08-14T14:45:00+00:00,comment,t3_1mpa9nx,https://www.reddit.com/r/AccidentalRenaissance...,2025-08-14T14:45:00+00:00
3,https://www.reddit.com/r/AhmedabadGoneWildddd/...,r/ahmedabadgonewildddd,t1_n6nmh8k,AutoModerator,r/AhmedabadGoneWildddd,Hey there! Thanks for your submission to r/Ahm...,,2025-08-03T06:55:00+00:00,comment,t3_1mgcor7,https://www.reddit.com/r/AhmedabadGoneWildddd/...,2025-08-03T06:55:00+00:00
4,https://www.reddit.com/r/AhmedabadGoneWildddd/...,r/ahmedabadgonewildddd,t1_n6pnphn,AutoModerator,r/AhmedabadGoneWildddd,Hey there! Thanks for your submission to r/Ahm...,,2025-08-03T15:51:00+00:00,comment,t3_1mgmocx,https://www.reddit.com/r/AhmedabadGoneWildddd/...,2025-08-03T15:51:00+00:00


In [17]:
# Show the dtype pandas assigned to each column of the Reddit frame loaded in the previous cell.
# NOTE(review): the recorded output lists every column as `object`, including
# createdAt/datetime — presumably they were exported as strings; confirm before
# doing date arithmetic on them.
df.dtypes

uri              object
label            object
id               object
username         object
communityName    object
body             object
title            object
createdAt        object
dataType         object
parentId         object
url              object
datetime         object
dtype: object

# YouTube Datasets

In [18]:
# Select the parquet export by its suffix instead of by list position: the position
# depends on the order in which the directory listing returned the files, so
# `[-1]` could just as well be the CSV file.
youtube_parquet = next(f for f in datasets['youtube'] if f.suffix == '.parquet')
df = pd.read_parquet(youtube_parquet)
df.head()

Unnamed: 0,url,label,transcript,upload_date,job_id,hotkey,rn,datetime
0,https://www.youtube.com/watch?v=50G0kIty7Cg,#ytc_c_mrbeast,"[{""start"":0.08,""end"":1.6,""text"":""Are you subsc...",2025-09-16T17:00:00+00:00,crawler-0-gzct-tsb8iupfoghtntvm,on-demand-data,1,2025-09-16T17:00:00+00:00
1,https://www.youtube.com/watch?v=XNtTO_339kU,#ytc_c_mrbeast,"[{""start"":0.08,""end"":4.759,""text"":""This is ter...",2025-08-27T16:00:00+00:00,crawler-0-gzct-tsb8iupfoghtntvm,on-demand-data,1,2025-08-27T16:00:00+00:00
2,https://www.youtube.com/watch?v=uA9Zcg3vHvo,#ytc_c_na-chan-e2f79e5b,"[{""start"":0.16,""end"":4.279,""text"":""This about ...",2025-09-01T20:45:00+00:00,crawler-0-gzct-tsb8iupfoghtntvm,on-demand-data,1,2025-09-01T20:45:00+00:00
3,https://www.youtube.com/watch?v=3ih2bPKSWsQ,#ytc_c_na-chan-e2f79e5b,"[{""start"":0.16,""end"":1.68,""text"":""Speed is liv...",2025-09-12T17:00:00+00:00,crawler-0-gzct-tsb8iupfoghtntvm,on-demand-data,1,2025-09-12T17:00:00+00:00
4,https://www.youtube.com/watch?v=4l97aNza_Zc,#ytc_c_mrbeast,"[{""start"":0,""end"":7.374,""text"":""Take off your ...",2025-09-13T16:00:00+00:00,crawler-0-gzct-tsb8iupfoghtntvm,on-demand-data,1,2025-09-13T16:00:00+00:00


In [19]:
# Show the dtype pandas assigned to each column of the YouTube frame loaded in the previous cell.
# NOTE(review): in the recorded output only `rn` is numeric (int64); upload_date and
# datetime are `object` — presumably exported as strings; confirm before doing
# date arithmetic on them.
df.dtypes

url            object
label          object
transcript     object
upload_date    object
job_id         object
hotkey         object
rn              int64
datetime       object
dtype: object