# Load and unzip files

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; the cells below read the dataset
# archive from this mount point.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Check that the TwiBot-20 archive is visible on the mounted Drive.
! ls "/content/gdrive/MyDrive/Colab Notebooks/MO436/data/TwiBot-20-Format22.zip"

'/content/gdrive/MyDrive/Colab Notebooks/MO436/data/TwiBot-20-Format22.zip'


In [None]:
# Extract the archive into the current working directory. -o overwrites any
# previously extracted files without asking, so re-running this cell never
# blocks on unzip's interactive "replace?" prompt.
! unzip -o "/content/gdrive/MyDrive/Colab Notebooks/MO436/data/TwiBot-20-Format22.zip"

Archive:  /content/gdrive/MyDrive/Colab Notebooks/MO436/data/TwiBot-20-Format22.zip
   creating: Twibot-20/
  inflating: Twibot-20/node.json     
  inflating: Twibot-20/split.csv     
  inflating: Twibot-20/user_info.pt  
  inflating: Twibot-20/test.ipynb    
  inflating: Twibot-20/edge.csv      
  inflating: Twibot-20/label.csv     


In [None]:
# Move the extracted folder next to the archive on Drive. The move is skipped
# when the destination already exists: mv would otherwise place the folder
# *inside* it (Twibot-20/Twibot-20) when the notebook is re-run.
! [ -d "./gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20" ] || mv "/content/Twibot-20" "./gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20"

In [None]:
# List the extracted dataset files in their final location on Drive.
!ls "./gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20"

edge.csv  label.csv  node.json	split.csv  test.ipynb  user_info.pt


# Make parquet files

## Twibot 20

In [None]:
!pip install ijson -q

Collecting ijson
  Downloading ijson-3.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (111 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.8/111.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ijson
Successfully installed ijson-3.2.3


In [None]:
import pandas as pd
from pathlib import Path

# Folder holding the extracted TwiBot-20 files (edge.csv, label.csv, node.json,
# split.csv, ...). The path is relative, so it resolves against the notebook's
# current working directory.
data_folder_path = Path("./gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20")
# The parquet parts parsed from node.json are written under data_folder_path/"node".

In [None]:
import ijson
from tqdm import tqdm

def _handle_mixed_tweet_profile_dataset(df):
  """Split a frame of parsed records into ``(profile_df, tweet_df)``.

  The decision is made from the columns that are present:

  * no ``"name"`` column: the whole frame is returned as tweets and the
    profile part is an empty DataFrame;
  * ``"name"`` but no ``"text"`` column: the whole frame is returned as
    profiles and the tweet part is an empty DataFrame;
  * both columns: rows whose ``"text"`` is missing are profiles (returned
    without the ``"text"`` column) and rows with a ``"text"`` value are tweets
    (only the ``"id"`` and ``"text"`` columns are kept).

  Args:
    df: DataFrame built from a chunk of parsed records.

  Returns:
    A ``(profile_df, tweet_df)`` tuple of DataFrames.
  """
  columns = df.columns
  has_name = "name" in columns
  has_text = "text" in columns

  # Without a "name" column nothing can be a profile row.
  if not has_name:
    return pd.DataFrame(), df

  # Profile columns only: nothing can be a tweet row.
  if not has_text:
    return df, pd.DataFrame()

  # Mixed chunk: a populated "text" value marks a tweet row.
  is_tweet = df["text"].notna()
  profile_rows = df[~is_tweet].drop(columns=["text"])
  tweet_rows = df[is_tweet][["id", "text"]]
  return profile_rows, tweet_rows

def _write_profile_and_tweet_data(items, saving_folder, saving_prefix, p):
  """Write one chunk of parsed records as profile and tweet parquet parts.

  The records are turned into a DataFrame, split with
  ``_handle_mixed_tweet_profile_dataset`` and each non-empty half is written to
  ``saving_folder/"profile"`` or ``saving_folder/"tweets"`` as
  ``{saving_prefix}.part{p}.parquet``. An empty half produces no file.

  Args:
    items: Records of the current chunk.
    saving_folder: Path of the output root; the ``profile`` and ``tweets``
      sub-folders are not created here.
    saving_prefix: File-name prefix of the parts.
    p: Part number used in the file name.
  """
  profile_df, tweet_df = _handle_mixed_tweet_profile_dataset(
      pd.DataFrame.from_records(items))

  # Profiles first, then tweets; each one only if it has rows.
  for subfolder, frame in (("profile", profile_df), ("tweets", tweet_df)):
    if len(frame) > 0:
      frame.to_parquet(saving_folder/subfolder/f"{saving_prefix}.part{p}.parquet")

def parse_json_by_chunks(file_path, saving_path, saving_prefix, chunksize = 2_000_000):
  """Stream a JSON file and write its records out as numbered parquet parts.

  Records are read one at a time with ``ijson.items(file, 'item')`` and
  buffered in memory. As soon as ``chunksize`` records are buffered they are
  handed to ``_write_profile_and_tweet_data`` and the buffer is emptied, so
  every flushed part holds exactly ``chunksize`` records; any remainder is
  written as a final, smaller part. Part numbers start at 0.

  Args:
    file_path: Path of the JSON file to read.
    saving_path: Output root; ``profile`` and ``tweets`` sub-folders are
      created under it if they do not exist.
    saving_prefix: File-name prefix passed through to the part writer.
    chunksize: Number of records buffered before a part is written.

  Returns:
    ``saving_path`` as a ``Path``.
  """
  saving_folder = Path(saving_path)
  (saving_folder/"profile").mkdir(parents=True, exist_ok=True)
  (saving_folder/"tweets").mkdir(parents=True, exist_ok=True)

  # Binary mode: ijson parses bytes itself, so the stream is not first decoded
  # to text with the platform's default encoding.
  with open(file_path, 'rb') as file:
    part = 0
    items = []
    # Parse the JSON objects one by one instead of loading the whole file.
    for item in tqdm(ijson.items(file, 'item')):
      items.append(item)
      # Check the buffer length directly so every part has the same size.
      if len(items) >= chunksize:
        _write_profile_and_tweet_data(
            items, saving_folder, saving_prefix, part)
        part += 1
        items = []

    # Flush whatever is left after the last full chunk.
    if items:
      _write_profile_and_tweet_data(items, saving_folder, saving_prefix, part)

  return saving_folder

In [None]:
# Convert node.json into parquet parts under <data folder>/node, buffering
# roughly 5,000,000 records in memory per part.
parse_json_by_chunks(
    file_path = data_folder_path/"node.json",
    saving_path = data_folder_path/"node",
    saving_prefix = "node",
    chunksize = 5_000_000,
    )

In [None]:
# List the profile parquet parts that were written.
list((data_folder_path/"node/profile").glob("*"))

[PosixPath('gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20/node/profile/node.part0.parquet')]

In [None]:
# List the tweet parquet parts that were written.
list((data_folder_path/"node/tweets").glob("*"))

[PosixPath('gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20/node/tweets/node.part0.parquet'),
 PosixPath('gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20/node/tweets/node.part1.parquet'),
 PosixPath('gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20/node/tweets/node.part2.parquet'),
 PosixPath('gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20/node/tweets/node.part3.parquet'),
 PosixPath('gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20/node/tweets/node.part4.parquet'),
 PosixPath('gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20/node/tweets/node.part5.parquet'),
 PosixPath('gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20/node/tweets/node.part6.parquet')]

## Join profile with other data

In [None]:
# Load the edge, label and split tables that were extracted from the archive.
edge_df = pd.read_csv(data_folder_path/"edge.csv")
label_df = pd.read_csv(data_folder_path/"label.csv")
# The split table lives in split.csv; reading label.csv here would make
# split_df a silent copy of label_df.
split_df = pd.read_csv(data_folder_path/"split.csv")

In [None]:
# Distinct values of the "relation" column in the edge table.
edge_df.relation.unique()

array(['post', 'friend', 'follow'], dtype=object)

## Twibot 22

# Explore Data

In [None]:
import pandas as pd
from pathlib import Path

/content


In [None]:


# File names of the TwiBot-20 parquet tables; each one is joined onto
# data_folder_path and loaded in the next cell.
domain_data_file = "Twi20_domain.parquet"
neighbor_data_file = "Twi20_neighbor.parquet"
label_data_file = "Twi20_label.parquet"
profile_data_file = "Twi20_profile.parquet"
tweets_data_file = "Twi20_tweets.parquet"

In [None]:
# Load each TwiBot-20 table from data_folder_path.
# NOTE(review): data_folder_path is not defined in this section; it is the
# variable set in the "Make parquet files" part of the notebook. Confirm that
# folder really contains the Twi20_*.parquet files.
# NOTE(review): label_df is also assigned from label.csv earlier in the
# notebook and is overwritten here.
label_df = pd.read_parquet(data_folder_path/label_data_file)
domain_df = pd.read_parquet(data_folder_path/domain_data_file)
neighbor_df = pd.read_parquet(data_folder_path/neighbor_data_file)
profile_df = pd.read_parquet(data_folder_path/profile_data_file)
tweet_df = pd.read_parquet(data_folder_path/tweets_data_file)

In [None]:
# Display the profile table; the rendered output is a truncated view of the frame.
# NOTE(review): the output contains account names and descriptions; consider
# clearing it before sharing the notebook.
profile_df

Unnamed: 0,ID,name,screen_name,location,profile_location,description,protected,followers_count,friends_count,listed_count,...,profile_image_url,profile_image_url_https,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,has_extended_profile,default_profile,default_profile_image
0,17461978,SHAQ,SHAQ,"Orlando, FL","{'id': '55b4f9e5c516e0b6', 'url': 'https://api...","VERY QUOTATIOUS, I PERFORM RANDOM ACTS OF SHAQ...",False,15349596,692,45568,...,http://pbs.twimg.com/profile_images/1673907275...,https://pbs.twimg.com/profile_images/167390727...,2FC2EF,181A1E,252429,666666,True,False,False,False
1,1297437077403885568,Jennifer Fishpaw,JenniferFishpaw,,,,False,0,44,0,...,http://pbs.twimg.com/profile_images/1297437406...,https://pbs.twimg.com/profile_images/129743740...,1DA1F2,C0DEED,DDEEF6,333333,True,True,True,False
2,17685258,Brad Parscale,parscale,Florida,,Owner @ Parscale Strategy. Senior Advisor Digi...,False,762839,475,3201,...,http://pbs.twimg.com/profile_images/1295453225...,https://pbs.twimg.com/profile_images/129545322...,AB2316,FFFFFF,FFFFFF,666666,False,False,False,False
3,15750898,FOX 13 Tampa Bay,FOX13News,"Tampa, FL",,Bringing you the important stuff like breaking...,False,327587,4801,1744,...,http://pbs.twimg.com/profile_images/1293193013...,https://pbs.twimg.com/profile_images/129319301...,0B2F8A,FFFFFF,E8EEF0,333333,True,False,False,False
4,1659167666,Vonte The Plug 🎤🔌,VonteThePlugNC,"Jacksonville Beach, FL","{'id': '5e281c17a74c170f', 'url': 'https://api...",MOTIVATION 3 OUT NOW 🔥 Singles: ‘Lil Shawdy’ &...,False,13324,647,44,...,http://pbs.twimg.com/profile_images/1181662400...,https://pbs.twimg.com/profile_images/118166240...,1DA1F2,C0DEED,DDEEF6,333333,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9456,452754350,Alan Reifman,AlanReifman,"Lubbock, Texas",,Texas Tech professor of human devt and family ...,False,7760,8104,106,...,http://pbs.twimg.com/profile_images/4718599734...,https://pbs.twimg.com/profile_images/471859973...,0084B4,FFFFFF,DDEEF6,333333,True,False,False,False
9457,850435801687183362,Junk Wax Investment Services ($19.99 Per Month),CardsFromAttic,JunkWaxSylvania,,Satirizing the sports card industry one tweet ...,False,8446,408,55,...,http://pbs.twimg.com/profile_images/1300649646...,https://pbs.twimg.com/profile_images/130064964...,1DA1F2,C0DEED,DDEEF6,333333,True,False,True,False
9458,2188795745,B,bkgreen09,United States,,,False,309,1961,3,...,http://pbs.twimg.com/profile_images/1168654618...,https://pbs.twimg.com/profile_images/116865461...,3B94D9,000000,000000,000000,False,True,False,False
9459,940687680,bilal koç,bilalko14,,,,False,154,1019,0,...,http://pbs.twimg.com/profile_images/2840708085...,https://pbs.twimg.com/profile_images/284070808...,1DA1F2,C0DEED,DDEEF6,333333,True,False,True,False
