# This notebook provides an alternative view at dataset exploratory stats also provided in exploratory_stats.py. 

### Requires the Twibot-22 (https://twibot22.github.io/) to be installed and placed inside the /data directory

Expected directory structure: 
- data/Twibot-22/label_new.json
- data/Twibot-22/node_new.json

#### node_new contains both tweet and user data each stored in their own object, keyed by tweet/user ID
#### label_new contains labels (human/bot) for each user, keyed by user ID

### Imports and Data Loading

In [49]:
import numpy as np
from nltk import RegexpTokenizer
import string
import json
from pathlib import Path
import unicodedata

In [50]:
# Helper function for loading and returning json data as an object
def get_file(filename):
    data = {}
    try:
        with open(filename, 'r') as f:
            data = json.load(f)
        print("JSON data loaded successfully:")
    except Exception as e:
        print(e)
    return {"data": data}

# Helper function for checking if text contains devanagari characters
# For the sake of interpretability, we will be omitting these tokens from our results for some statistics
def contains_devanagari_unicodedata(text):
    '''Check if text contains Devanagari characters'''
    for char in text:
        if 'DEVANAGARI' in unicodedata.name(char, '').upper():
            return True
    return False

In [51]:
# Get the notebook's directory, then go to project root
notebook_dir = Path().resolve()
project_root = notebook_dir.parent

# Now reference files relative to project root
node_new_path = project_root / "data" / "Twibot-22" / "node_new.json"
label_new_path = project_root / "data" / "Twibot-22" / "label_new.json"

# Load the two json files from the Twibot22 dataset 
node_new_Twibot22 = get_file(node_new_path)
label_new_Twibot22 = get_file(label_new_path)

JSON data loaded successfully:
JSON data loaded successfully:


### Dataset navigation and basic stats

In [52]:
# node_new_Twibot22 is stored as a list of tweet objects/dicts with a unique tweet ID as the key, 
# The key is "t" followed by the unique tweet ID
# author (user) ID has the same format, except it is prefaced by "u" 
point1 = node_new_Twibot22["data"]["t1498018021658431488"]
point1_author_id = point1["author_id"]

In [53]:
# Printing tweet data object
print("Tweet data object:")
print(point1)
print()

# Printing the tweet's label (human/bot)
print("Tweet label:", label_new_Twibot22["data"]["u"+str(point1_author_id)])

Tweet data object:
{'attachments': None, 'author_id': 20441260, 'context_annotations': None, 'conversation_id': 1498018021658431488, 'created_at': '2022-02-27 19:31:42+00:00', 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'rhpsia', 'name': 'Sia At The Game', 'id': 809332867, 'id_str': '809332867', 'indices': [3, 10]}], 'urls': []}, 'geo': None, 'id': 't1498018021658431488', 'in_reply_to_user_id': None, 'lang': 'en', 'possibly_sensitive': False, 'public_metrics': {'retweet_count': 10, 'reply_count': None, 'like_count': 0, 'quote_count': None}, 'referenced_tweets': None, 'reply_settings': None, 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'text': 'RT @rhpsia: BREAKING: Telecom giant Ericsson used slush funds, trips for Iraqi officials &amp; payoffs through middlemen to executives ‚Äî and po‚Ä¶', 'withheld': None}

Tweet label: human


In [54]:
# Count top-level keys in data/Twibot-22/node_new.json

# Double check that the dataset jsons were properly loaded:
try:
    # node_new_Twibot22 is expected to be a dict returned by get_file() earlier in this notebook
    data_obj = node_new_Twibot22 if not isinstance(node_new_Twibot22, dict) else node_new_Twibot22.get("data", node_new_Twibot22)
except NameError:
    # If the variable isn't defined (cell not run), load the file now
    print("node_new_Twibot22 not found in the notebook state - loading from file")
    node_new_Twibot22 = get_file("data/Twibot-22/node_new.json")
    data_obj = node_new_Twibot22.get("data", node_new_Twibot22)

# Ensure we have a mapping-like object
if not isinstance(data_obj, dict):
    print("Unexpected format: expected a JSON object/dict for node data.")
# Proceed with counting the number of top-level keys in node_new.json AKA # of tweets in dataset
else:
    user_keys = [k for k in data_obj.keys() if isinstance(k, str) and k.startswith("u")]
    user_count = len(user_keys)

    tweet_keys = [k for k in data_obj.keys() if isinstance(k, str) and k.startswith("t")]
    tweet_count = len(tweet_keys)
    
    print(f"Scanned {len(data_obj)} top-level keys in the node object (# of tweets + user data objects in dataset).")
    print(f"Found {user_count} keys starting with 'u' in data/Twibot-22/node_new.json (# of users total in the dataset)")
    print(f"Found {tweet_count} keys starting with 't' in data/Twibot-22/node_new.json (# of tweets total in the dataset)")
    
    # Show up to 10 sample keys to verify
    print()
    print("Sample tweet keys (up to 10):", tweet_keys[:10])
    print("Sample user keys (up to 10):", user_keys[:])

Scanned 34936 top-level keys in the node object (# of tweets + user data objects in dataset).
Found 3708 keys starting with 'u' in data/Twibot-22/node_new.json (# of users total in the dataset)
Found 31228 keys starting with 't' in data/Twibot-22/node_new.json (# of tweets total in the dataset)

Sample tweet keys (up to 10): ['t1498018021658431488', 't1497947823005962245', 't1497902014763786241', 't1497736000008867841', 't1497640323631222784', 't1497417782450864130', 't1497402632301854722', 't1497288192969654273', 't1497277930623361033', 't1497275497126019076']
Sample user keys (up to 10): ['u15211869', 'u1109809482656149504', 'u234059290', 'u1322906536029626368', 'u21920591', 'u933227158398386182', 'u49009937', 'u1009495181882773506', 'u73710441', 'u542721191', 'u73125667', 'u374208504', 'u224965117', 'u35708687', 'u1360768923814264832', 'u256644046', 'u1076116243265060864', 'u368644797', 'u13678962', 'u945957031323639808', 'u1863061536', 'u1425141260256301059', 'u305236970', 'u123160

In [55]:
num_bot = 0 
num_human = 0
num_unlabeled = 0

for cur_tweet_id in tweet_keys: 
    cur_tweet = node_new_Twibot22["data"][cur_tweet_id]
    cur_author_id = cur_tweet["author_id"]
    
    if "u"+str(cur_author_id) in label_new_Twibot22["data"]: 
        cur_author_label = label_new_Twibot22["data"]["u"+str(cur_author_id)]
    else:
        num_unlabeled+= 1 # ids nonexistent in labels

    if (cur_author_label == "human"):
        num_human += 1
    if (cur_author_label == "bot"):
        num_bot += 1

    node_new_Twibot22["data"][cur_tweet_id]["author_label"] = cur_author_label

print("Number of bot tweets in dataset: ", num_bot)
print("Number of human tweets in dataset: ", num_human)
print("Number of unlabeled tweets in dataset: ", num_unlabeled)

Number of bot tweets in dataset:  8171
Number of human tweets in dataset:  23057
Number of unlabeled tweets in dataset:  644


### Basic exploratory statistics of our dataset using the same regex pattern and stopwords utilized in pipeline.py

In [56]:
tweet_regex = r"(?:@[\w_]+)|(?:\#[\w_]+)|(?:(?i:https?://)\S+)|(?:(?i:www\.)\S+)|(?:\w+(?:-\w+)*(?:'\w+)?)|(?:[^\w\s])"
tknzr = RegexpTokenizer(pattern= tweet_regex)

# Use default stopwords used in StopWordsRemover() class used in pipeline.py
# https://github.com/apache/spark/blob/master/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt

pyspark_stopwords = set([
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "aren", "aren't", "as", "at", "be", "because", "been", "before",
    "being", "below", "between", "both", "but", "by", "can", "can't", "cannot",
    "could", "couldn", "couldn't", "d", "did", "didn", "didn't", "do", "does",
    "doesn", "doesn't", "doing", "don", "don't", "down", "during", "each", "few",
    "for", "from", "further", "had", "hadn", "hadn't", "has", "hasn", "hasn't",
    "have", "haven", "haven't", "having", "he", "he'd", "he'll", "he's", "her",
    "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn", "isn't",
    "it", "it's", "its", "itself", "let's", "ll", "m", "me", "more", "most",
    "mustn", "mustn't", "my", "myself", "needn", "needn't", "no", "nor", "not",
    "now", "o", "of", "off", "on", "once", "only", "or", "other", "ought", "our",
    "ours", "ourselves", "out", "over", "own", "re", "s", "same", "shan", "shan't",
    "she", "she'd", "she'll", "she's", "should", "should've", "shouldn", "shouldn't",
    "so", "some", "such", "t", "than", "that", "that's", "the", "their", "theirs",
    "them", "themselves", "then", "there", "there's", "these", "they", "they'd",
    "they'll", "they're", "they've", "this", "those", "through", "to", "too",
    "under", "until", "up", "ve", "very", "was", "wasn", "wasn't", "we", "we'd",
    "we'll", "we're", "we've", "were", "weren", "weren't", "what", "what's", "when",
    "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why",
    "why's", "will", "with", "won", "won't", "wouldn", "wouldn't", "y", "you",
    "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"
])

pyspark_stopwords.update(string.punctuation)
set_1 = {'‚Ä¶', '‚Äô', '...', '‚Äú', '‚Äù','‚Äò'}
pyspark_stopwords.update(set_1)


### Tweet Length Analysis

In [57]:
bot_lengths = np.array([])
human_lengths = np.array([])
for cur_tweet_id in tweet_keys: 
    cur_tweet = node_new_Twibot22["data"][cur_tweet_id]
    cur_author_id = cur_tweet["author_id"]
    if "u"+str(cur_author_id) not in label_new_Twibot22["data"]: 
        continue
    cur_author_label = label_new_Twibot22["data"]["u"+str(cur_author_id)]
    tokenized_tweet = tknzr.tokenize(cur_tweet["text"])
    if (cur_author_label == "human"):
        human_lengths = np.append(human_lengths, len(tokenized_tweet))
    if (cur_author_label == "bot"):
        bot_lengths = np.append(bot_lengths, len(tokenized_tweet))
def stats(arr):
    mean = np.mean(arr)
    std = np.std(arr)
    q2 = np.percentile(arr, 50) 
    minimum = np.min(arr)
    maximum = np.max(arr)

    print("Mean:", mean)
    print("Standard deviation:", std)
    print("Median (Q2):", q2)
    print("Minimum:", minimum)
    print("Maximum:", maximum)

print("Human tweet length stats:")
stats(human_lengths)
print("\nBot tweet length stats:")
stats(bot_lengths)

Human tweet length stats:
Mean: 25.66122776514645
Standard deviation: 17.97132716696504
Median (Q2): 25.0
Minimum: 1.0
Maximum: 210.0

Bot tweet length stats:
Mean: 23.397645038636085
Standard deviation: 15.191378330006938
Median (Q2): 22.0
Minimum: 1.0
Maximum: 188.0


### Frequent grams, bigrams and trigrams

In [58]:
# Most frequent grams

bot_dict = {}
human_dict = {}
for cur_tweet_id in tweet_keys: 
    cur_tweet = node_new_Twibot22["data"][cur_tweet_id]
    cur_author_id = cur_tweet["author_id"]
    if "u"+str(cur_author_id) not in label_new_Twibot22["data"]: 
        continue
    cur_author_label = label_new_Twibot22["data"]["u"+str(cur_author_id)]
    tokenized_tweet = tknzr.tokenize(cur_tweet["text"])
    filtered_tweet = [word for word in tokenized_tweet 
                      if word not in pyspark_stopwords and not contains_devanagari_unicodedata(word)]
    if (cur_author_label == "human"):
        for token in filtered_tweet:
            if human_dict.get(token, 0) == 0:
                human_dict[token] = 1
            else:
                 human_dict[token] = human_dict[token]+1
    if (cur_author_label == "bot"):
        for token in filtered_tweet:
            if bot_dict.get(token, 0) == 0:
                bot_dict[token] = 1
            else:
                 bot_dict[token] = bot_dict[token]+1
sorted_bot = dict(sorted(bot_dict.items(), key=lambda item: item[1], reverse=True))
sorted_human = dict(sorted(human_dict.items(), key=lambda item: item[1], reverse=True))
print("Top 100 bot tokens:")
print(list(sorted_bot.items())[:100])
print("\nTop 100 human tokens:")
print(list(sorted_human.items())[:100])

    


Top 100 bot tokens:
[('RT', 2693), ('I', 1012), ('The', 479), ('amp', 461), ('just', 358), ('today', 278), ('like', 265), ('3', 259), ('1', 249), ('We', 247), ('Ô∏è', 244), ('de', 232), ('new', 217), ('us', 214), ('2021', 210), ('2', 200), ('Punk', 200), ('#ZeroTrust', 195), ('üí¢', 195), ('This', 193), ('DOS', 191), ('one', 187), ('„Äê', 187), ('A', 178), ('time', 174), ('@SnaptADC', 172), ('What', 170), ('üî•', 169), ('5', 168), ('2022', 159), ('„Äë', 157), ('#cryptoart', 154), ('know', 145), ('gt', 145), ('people', 142), ('It', 141), ('New', 141), ('You', 140), ('get', 139), ('pips', 134), ('How', 132), ('Check', 127), ('0', 126), ('If', 124), ('#ZTNA', 124), ('#NFTGiveaway', 124), ('work', 122), ('first', 121), ('#NFTartist', 121), ('#NFTProject', 121), ('#NFTdrop', 121), ('In', 120), ('#NFTCommumity', 120), ('#tezos', 120), ('#nftcollector', 120), ('make', 119), ('would', 118), ('win', 116), ('\u2063', 116), ('üÜï', 115), ('Get', 114), ('More', 109), ('think', 109), ('follow', 

In [59]:
# Most frequent bigrams
from nltk import bigrams

bot_dict = {}
human_dict = {}
for cur_tweet_id in tweet_keys: 
    cur_tweet = node_new_Twibot22["data"][cur_tweet_id]
    cur_author_id = cur_tweet["author_id"]
    if "u"+str(cur_author_id) not in label_new_Twibot22["data"]: 
        continue
    cur_author_label = label_new_Twibot22["data"]["u"+str(cur_author_id)]
    tokenized_tweet = tknzr.tokenize(cur_tweet["text"])
    filtered_tweet = [word for word in tokenized_tweet 
                      if word not in pyspark_stopwords and not contains_devanagari_unicodedata(word)]
    filtered_tweet = bigrams(filtered_tweet)
    if (cur_author_label == "human"):
        for token in filtered_tweet:
            if human_dict.get(token, 0) == 0:
                human_dict[token] = 1
            else:
                 human_dict[token] = human_dict[token]+1
    if (cur_author_label == "bot"):
        for token in filtered_tweet:
            if bot_dict.get(token, 0) == 0:
                bot_dict[token] = 1
            else:
                 bot_dict[token] = bot_dict[token]+1
sorted_bot = dict(sorted(bot_dict.items(), key=lambda item: item[1], reverse=True))
sorted_human = dict(sorted(human_dict.items(), key=lambda item: item[1], reverse=True))
print("Top 100 bot tokens:")
print(list(sorted_bot.items())[:100])
print("\nTop 100 human tokens:")
print(list(sorted_human.items())[:100])

Top 100 bot tokens:
[(('DOS', 'Punk'), 188), (('RT', '@SnaptADC'), 169), (('#NFTCommumity', '#NFTGiveaway'), 120), (('#NFTdrop', '#NFTartist'), 120), (('#NFTartist', '#nftcollector'), 120), (('#nftcollector', '#NFTProject'), 120), (('#forex', '#signaltrading'), 107), (('#signaltrading', '#trading'), 107), (('#trading', '#eurusd'), 107), (('#eurusd', '#foreignexchange'), 107), (('Zero', 'Trust'), 90), (('lucky', 'winner'), 89), (('today', 'just'), 88), (('simple', 'steps'), 88), (('Get', 'iPhone'), 87), (('iPhone', 'XS'), 87), (('XS', 'today'), 87), (('just', 'follow'), 87), (('follow', 'simple'), 87), (('steps', 'win'), 87), (('win', 'just'), 87), (('just', 'click'), 87), (('click', 'submit'), 87), (('submit', 'E-mail'), 87), (('E-mail', 'lucky'), 87), (('#nftart', '#cryptoart'), 84), (('#NFTGiveaway', '#NFTs'), 71), (('gt', 'gt'), 70), (('Price', 'üòä'), 70), (('üòä', '3'), 70), (('3', 'Tezos'), 70), (('Tezos', 'üåº'), 70), (('üåº', '#NFTCommumity'), 70), (('#NFTs', '#tezos'), 70)

In [60]:
# Most frequent trigrams
from nltk import trigrams

bot_dict = {}
human_dict = {}
for cur_tweet_id in tweet_keys: 
    cur_tweet = node_new_Twibot22["data"][cur_tweet_id]
    cur_author_id = cur_tweet["author_id"]
    if "u"+str(cur_author_id) not in label_new_Twibot22["data"]: 
        continue
    cur_author_label = label_new_Twibot22["data"]["u"+str(cur_author_id)]
    tokenized_tweet = tknzr.tokenize(cur_tweet["text"])
    filtered_tweet = [word for word in tokenized_tweet 
                      if word not in pyspark_stopwords and not contains_devanagari_unicodedata(word)]
    filtered_tweet = trigrams(filtered_tweet)
    if (cur_author_label == "human"):
        for token in filtered_tweet:
            if human_dict.get(token, 0) == 0:
                human_dict[token] = 1
            else:
                 human_dict[token] = human_dict[token]+1
    if (cur_author_label == "bot"):
        for token in filtered_tweet:
            if bot_dict.get(token, 0) == 0:
                bot_dict[token] = 1
            else:
                 bot_dict[token] = bot_dict[token]+1
sorted_bot = dict(sorted(bot_dict.items(), key=lambda item: item[1], reverse=True))
sorted_human = dict(sorted(human_dict.items(), key=lambda item: item[1], reverse=True))

print("Top 100 bot trigrams:")
print(list(sorted_bot.items())[:100])
print("\nTop 100 human trigrams:")
print(list(sorted_human.items())[:100])

Top 100 bot trigrams:
[(('#NFTdrop', '#NFTartist', '#nftcollector'), 120), (('#NFTartist', '#nftcollector', '#NFTProject'), 120), (('#forex', '#signaltrading', '#trading'), 107), (('#signaltrading', '#trading', '#eurusd'), 107), (('#trading', '#eurusd', '#foreignexchange'), 107), (('Get', 'iPhone', 'XS'), 87), (('iPhone', 'XS', 'today'), 87), (('XS', 'today', 'just'), 87), (('today', 'just', 'follow'), 87), (('just', 'follow', 'simple'), 87), (('follow', 'simple', 'steps'), 87), (('simple', 'steps', 'win'), 87), (('steps', 'win', 'just'), 87), (('win', 'just', 'click'), 87), (('just', 'click', 'submit'), 87), (('click', 'submit', 'E-mail'), 87), (('submit', 'E-mail', 'lucky'), 87), (('E-mail', 'lucky', 'winner'), 87), (('Price', 'üòä', '3'), 70), (('üòä', '3', 'Tezos'), 70), (('3', 'Tezos', 'üåº'), 70), (('Tezos', 'üåº', '#NFTCommumity'), 70), (('üåº', '#NFTCommumity', '#NFTGiveaway'), 70), (('#NFTCommumity', '#NFTGiveaway', '#NFTs'), 70), (('#NFTGiveaway', '#NFTs', '#tezos'), 70)