In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install required libraries
!pip install -q spacy scipy pandas numpy nltk

# Download the spaCy English model
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m102.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import os
dir = "/content/drive/MyDrive/"
output_dir = "/content/drive/MyDrive/Nour"
data = f"{output_dir}/bluesky/exports/merged_dataset.csv"

In [None]:
import pandas as pd
df = pd.read_csv(data)

In [None]:
print(df.columns)

Index(['post.author.created_at', 'post.author.did', 'post.author.display_name',
       'post.author.handle', 'post.author.verification', 'post.cid',
       'post.indexed_at', 'post.like_count', 'post.quote_count',
       'post.record.created_at', 'post.record.labels', 'post.record.reply',
       'post.reply.parent.author.did', 'post.record.reply.parent.uri',
       'post.record.reply.parent.cid', 'post.record.tags', 'post.record.text',
       'post.reply_count', 'post.repost_count', 'post.uri', 'reason',
       'post_type', 'in_reply_to_id', 'in_reply_to_user_id', 'user_verified',
       'user_handle', 'user_did', 'user_description', 'user_followers_count',
       'user_friends_count', 'posts_count', 'user_listed_count',
       'user_follower_handles', 'user_friend_handles', 'total_posts',
       'mentions_total', 'hashtags_total', 'posts_with_mentions',
       'posts_with_hashtags', 'avg_mentions_per_post',
       'avg_hashtags_per_post'],
      dtype='object')


In [None]:
df['user_verified'].unique()

array(['unknown', 'verified', 'not_verified'], dtype=object)

In [None]:
df = df.rename(columns={"mentions_total": "total_mentions"})

In [None]:
# map values
mapping = {
    "verified": 1,
    "not_verified": 0,
    "unknown": 0
}

df["user_verified"] = df["user_verified"].map(mapping)

In [None]:
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(0)

In [None]:
df['total_mentions'].unique()

array([ 0.,  1.,  2.,  4., 33.,  3.])

# **User metadata features**

In [None]:
# Re-import after kernel reset
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.stats import entropy, skew, kurtosis

def extract_user_metadata_features(df):
    # Ensure datetime parsing
    df['post.author.created_at'] = pd.to_datetime(df['post.author.created_at'], errors='coerce').dt.tz_localize(None)
    df['post.record.created_at'] = pd.to_datetime(df['post.record.created_at'], errors='coerce').dt.tz_localize(None)

    # --- Basic Profile Info ---
    df['screen_name_length'] = df['post.author.handle'].astype(str).apply(len)
    df['screen_name_digit_count'] = df['post.author.handle'].astype(str).apply(lambda x: sum(c.isdigit() for c in x))
    df['user_name_length'] = df['post.author.handle'].astype(str).apply(len)
    df['account_age_days'] = (datetime(2025, 7, 27) - df['post.author.created_at']).dt.days
    # df['default_profile_image'] = df['user_default_profile_image'].astype(int)
    df['user_verified'] = df['user_verified'].astype(int)

    # Description features
    df['user_description'] = df['user_description'].fillna('').astype(str)
    df['user_description_length'] = df['user_description'].apply(len)
    df['has_profile_description'] = (df['user_description_length'] > 0).astype(int)

    # Unique description count
    user_desc_df = df[['post.author.did', 'user_description']].drop_duplicates()
    desc_counts = user_desc_df['user_description'].value_counts()
    user_desc_df['description_occurrence_count'] = user_desc_df['user_description'].map(desc_counts)
    user_desc_df['is_description_unique'] = (user_desc_df['description_occurrence_count'] == 1).astype(int)
    df = df.merge(user_desc_df[['post.author.did', 'description_occurrence_count', 'is_description_unique']], on='post.author.did', how='left')

    # --- Distributional Statistics for Friends, Followers, Favorites, Tweets ---
    for col in ['user_friends_count', 'user_followers_count', 'total_posts']:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    distribution_stats = {}
    for col in ['user_friends_count', 'user_followers_count', 'posts_count']:
        values = df[col].values
        distribution_stats[f'{col}_min'] = np.min(values)
        distribution_stats[f'{col}_max'] = np.max(values)
        distribution_stats[f'{col}_median'] = np.median(values)
        distribution_stats[f'{col}_mean'] = np.mean(values)
        distribution_stats[f'{col}_std'] = np.std(values)
        distribution_stats[f'{col}_skew'] = skew(values)
        distribution_stats[f'{col}_kurtosis'] = kurtosis(values)
        distribution_stats[f'{col}_entropy'] = entropy(np.histogram(values, bins=10, density=True)[0] + 1e-9)

    stats_df = pd.DataFrame([distribution_stats])

    # Aggregate tweet activity per user
    #df['is_mention'] = df['text'].astype(str).apply(lambda x: 1 if '@' in x else 0)
    user_activity = df.groupby('post.author.did').agg(
        total_posts=('post_type', lambda x: (x == 'original').sum()),
        total_reposts=('post_type', lambda x: (x == 'repost').sum()),
        total_replies=('post_type', lambda x: (x == 'reply').sum()),
        first_post_time=('post.record.created_at', 'min'),
        last_post_time=('post.record.created_at', 'max')
    ).reset_index()
    user_activity['total_mentions']=df['total_mentions']

    user_activity['active_hours'] = (user_activity['last_post_time'] - user_activity['first_post_time']).dt.total_seconds() / 3600
    user_activity['active_hours'] = user_activity['active_hours'].replace(0, np.nan)

    for col in ['posts', 'reposts', 'replies', 'mentions']:
        user_activity[f'{col}_per_hour'] = user_activity[f'total_{col}'] / user_activity['active_hours']

    df = df.merge(user_activity[['post.author.did', 'total_posts', 'posts_per_hour',
                                 'total_reposts', 'reposts_per_hour',
                                 'total_replies', 'replies_per_hour',
                                 'total_mentions', 'mentions_per_hour']], on='post.author.did', how='left')

    return df, stats_df


In [None]:
user_features, user_stats = extract_user_metadata_features(df)
print(user_features.head())
print(user_features.shape)
print(user_features.columns)

   post.author.created_at                   post.author.did  \
0 2025-02-12 17:55:18.147  did:plc:5fwnpskl6zvknaikbb5bwb3s   
1 2024-11-14 03:47:02.281  did:plc:4pon6zyd7tyyp7gpjbktus4j   
2 2024-12-11 23:42:22.843  did:plc:6dzaq2nkgwf5shgbchsnjwfw   
3 2024-12-11 23:42:22.843  did:plc:6dzaq2nkgwf5shgbchsnjwfw   
4 2025-03-05 18:50:42.345  did:plc:hdte5leshsk5zxfnovynif3o   

  post.author.display_name        post.author.handle post.author.verification  \
0   《☆Idiot Art☆》- offline  00dl3-aibfan.bsky.social                      NaN   
1           The Grand Lady        022770.bsky.social                      NaN   
2        Anonymous Patriot    0331online.bsky.social                      NaN   
3        Anonymous Patriot    0331online.bsky.social                      NaN   
4                 10bmnews      10bmnews.bsky.social                      NaN   

                                            post.cid  \
0  bafyreifqd6b4yu2hw2waujfreihycwijba4svuo5zxm5r...   
1  bafyreibydru2vwkqym

# **Timing features**

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import entropy, skew, kurtosis

def compute_time_deltas(timestamps):
    timestamps = sorted(pd.to_datetime(timestamps, errors='coerce').dropna())
    if len(timestamps) < 2:
        return []
    deltas = [(t2 - t1).total_seconds() for t1, t2 in zip(timestamps[:-1], timestamps[1:])]
    return deltas

def extract_temporal_features(df):
    df['post.record.created_at'] = pd.to_datetime(df['post.record.created_at'], errors='coerce')

    user_features = []

    for user_id, group in df.groupby('post.author.did'):
        user_dict = {'post.author.did': user_id}

        post_times = group[group['post_type'] == 'original']['post.record.created_at']
        post_deltas = compute_time_deltas(post_times)

        repost_times = group[group['post_type'] == 'repost']['post.record.created_at']
        repost_deltas = compute_time_deltas(repost_times)

        mention_filter = group['post.record.text'].astype(str).str.contains('@') | group['in_reply_to_user_id'].notnull()
        mention_times = group[mention_filter]['post.record.created_at']
        mention_deltas = compute_time_deltas(mention_times)

        def compute_stats(prefix, deltas):
            if len(deltas) == 0:
                stats = {f"{prefix}_{s}": np.nan for s in ['min', 'max', 'median', 'mean', 'std', 'skew', 'kurtosis', 'entropy']}
            else:
                hist = np.histogram(deltas, bins=10, density=True)[0] + 1e-9
                stats = {
                    f"{prefix}_min": np.min(deltas),
                    f"{prefix}_max": np.max(deltas),
                    f"{prefix}_median": np.median(deltas),
                    f"{prefix}_mean": np.mean(deltas),
                    f"{prefix}_std": np.std(deltas),
                    f"{prefix}_skew": skew(deltas),
                    f"{prefix}_kurtosis": kurtosis(deltas),
                    f"{prefix}_entropy": entropy(hist)
                }
            return stats

        user_dict.update(compute_stats("post_time", post_deltas))
        user_dict.update(compute_stats("repost_time", repost_deltas))
        user_dict.update(compute_stats("mention_time", mention_deltas))

        user_features.append(user_dict)

    temporal_df = pd.DataFrame(user_features)
    return temporal_df


In [None]:
temporal_features = extract_temporal_features(df)
print(temporal_features.head())
print(temporal_features.shape)
print(temporal_features.columns)

                    post.author.did  post_time_min  post_time_max  \
0  did:plc:2373gmka6swamb3wwcdqoefs            NaN            NaN   
1  did:plc:23wufz77rizcgyzcemtlq7q3            NaN            NaN   
2  did:plc:244k3lfd5j27cfmodmrbl5o7            NaN            NaN   
3  did:plc:255mnfwdshslqtawmd6msqwb            NaN            NaN   
4  did:plc:25smjnqpsiomhgsuajdez4qp            NaN            NaN   

   post_time_median  post_time_mean  post_time_std  post_time_skew  \
0               NaN             NaN            NaN             NaN   
1               NaN             NaN            NaN             NaN   
2               NaN             NaN            NaN             NaN   
3               NaN             NaN            NaN             NaN   
4               NaN             NaN            NaN             NaN   

   post_time_kurtosis  post_time_entropy  repost_time_min  ...  \
0                 NaN                NaN              NaN  ...   
1                 NaN           

# **Content based features**

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from scipy.stats import entropy, skew, kurtosis
import spacy

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Define POS categories as mapped in the paper
POS_TAGS = {
    "VERB": "verbs",
    "NOUN": "nouns",
    "ADJ": "adjectives",
    "AUX": "modals",           # Modal auxiliaries
    "DET": "predeterminers",  # Pre-determiners/determiners
    "INTJ": "interjections",
    "ADV": "adverbs",
    "PRON": "pronouns",
    "SCONJ": "wh_words",
    "ADP": "wh_words"  # Some wh-words
}

def get_pos_counts_and_props(doc):
    pos_counts = Counter()
    total = 0
    for token in doc:
        if token.pos_ in POS_TAGS:
            mapped = POS_TAGS[token.pos_]
            pos_counts[mapped] += 1
            total += 1
    pos_props = {tag: (pos_counts[tag] / total if total > 0 else 0) for tag in set(POS_TAGS.values())}
    return pos_counts, pos_props
def compute_distribution_stats(values):
    if len(values) == 0:
        return {stat: np.nan for stat in ['min', 'max', 'median', 'mean', 'std', 'skew', 'kurtosis', 'entropy']}
    values = np.array(values)
    hist = np.histogram(values, bins=10, density=True)[0] + 1e-9
    return {
        'min': np.min(values),
        'max': np.max(values),
        'median': np.median(values),
        'mean': np.mean(values),
        'std': np.std(values),
        'skew': skew(values),
        'kurtosis': kurtosis(values),
        'entropy': entropy(hist)
    }

def extract_content_language_features(df):
    user_features = []

    # Group once to avoid repeated calls
    grouped = df.groupby("post.author.did")

    for user_id, group in grouped:
        user_dict = {'post.author.did': user_id}
        tweet_texts = group['post.record.text'].dropna().astype(str).tolist()

        word_counts = []
        word_entropies = []
        pos_freqs_per_tweet = {tag: [] for tag in set(POS_TAGS.values())}
        pos_props_per_tweet = {tag: [] for tag in set(POS_TAGS.values())}

        # Use nlp.pipe for efficient bulk processing
        for doc in nlp.pipe(tweet_texts, disable=["ner"]):  # disabling NER for speed
            words = [token.text.lower() for token in doc if token.is_alpha]
            word_count = len(words)
            word_counts.append(word_count)

            # Word entropy
            word_freq = Counter(words)
            probs = np.array(list(word_freq.values())) / word_count if word_count > 0 else [0]
            word_entropies.append(entropy(probs) if word_count > 0 else 0)

            # POS tagging
            pos_counts, pos_props = get_pos_counts_and_props(doc)
            for tag in pos_freqs_per_tweet:
                pos_freqs_per_tweet[tag].append(pos_counts.get(tag, 0))
                pos_props_per_tweet[tag].append(pos_props.get(tag, 0))

        # Add distribution stats
        for stat_name, stat_value in compute_distribution_stats(word_counts).items():
            user_dict[f'word_count_{stat_name}'] = stat_value

        for stat_name, stat_value in compute_distribution_stats(word_entropies).items():
            user_dict[f'word_entropy_{stat_name}'] = stat_value

        for tag in pos_freqs_per_tweet:
            stats = compute_distribution_stats(pos_freqs_per_tweet[tag])
            for stat_name, stat_value in stats.items():
                user_dict[f'pos_freq_{tag}_{stat_name}'] = stat_value

        for tag in pos_props_per_tweet:
            stats = compute_distribution_stats(pos_props_per_tweet[tag])
            for stat_name, stat_value in stats.items():
                user_dict[f'pos_prop_{tag}_{stat_name}'] = stat_value

        user_features.append(user_dict)

    return pd.DataFrame(user_features)


In [None]:
content_features = extract_content_language_features(df)
print(content_features.head())
print(content_features.columns)
print(content_features.shape)

  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(v

                    post.author.did  word_count_min  word_count_max  \
0  did:plc:2373gmka6swamb3wwcdqoefs               3              11   
1  did:plc:23wufz77rizcgyzcemtlq7q3              17              17   
2  did:plc:244k3lfd5j27cfmodmrbl5o7              43              43   
3  did:plc:255mnfwdshslqtawmd6msqwb              14              14   
4  did:plc:25smjnqpsiomhgsuajdez4qp               7              54   

   word_count_median  word_count_mean  word_count_std  word_count_skew  \
0                7.0              7.0        3.265986              0.0   
1               17.0             17.0        0.000000              NaN   
2               43.0             43.0        0.000000              NaN   
3               14.0             14.0        0.000000              NaN   
4               30.5             30.5       23.500000              0.0   

   word_count_kurtosis  word_count_entropy  word_entropy_min  ...  \
0                 -1.5        1.098612e+00          1.09861

In [None]:
content_features.isna().mean().sort_values(ascending=False).head(10)  # % of missing per column
content_features.describe().T[['mean', 'std']]                        # variability per feature

Unnamed: 0,mean,std
word_count_min,15.106562,13.429426
word_count_max,19.686334,15.135524
word_count_median,17.254064,13.663452
word_count_mean,17.287271,13.415985
word_count_std,1.903164,4.235566
...,...,...
pos_prop_nouns_mean,0.296733,0.165942
pos_prop_nouns_std,0.028882,0.063988
pos_prop_nouns_skew,0.043766,0.525363
pos_prop_nouns_kurtosis,-1.386336,1.006501


In [None]:
content_features.isna().sum().sort_values(ascending=False).head(10)

Unnamed: 0,0
pos_freq_interjections_kurtosis,1584
pos_prop_interjections_kurtosis,1584
pos_prop_interjections_skew,1584
pos_freq_interjections_skew,1584
pos_freq_adverbs_kurtosis,1330
pos_freq_adverbs_skew,1330
pos_prop_adverbs_kurtosis,1303
pos_prop_adverbs_skew,1303
pos_freq_adjectives_kurtosis,1283
pos_freq_adjectives_skew,1283


# **sentiment based features**

In [None]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
vad_df = pd.read_csv(f"{dir}/Nour/lexicons/NRC-VAD-Lexicon-v2.1.txt", sep="\t", header=None, skiprows=1,names=["word", "valence", "arousal", "dominance"])

In [None]:
vad_df.head()

Unnamed: 0,word,valence,arousal,dominance
0,a battery,0.134,-0.298,-0.096
1,a bit,-0.096,-0.264,-0.214
2,a bunch,0.088,-0.35,-0.068
3,a cappella,0.134,-0.116,-0.2
4,a couple,0.266,-0.11,0.09


In [None]:
vad_df = vad_df.drop_duplicates(subset="word")
for col in ["valence", "arousal", "dominance"]:
    vad_df[col] = vad_df[col].astype(float)
vad_lexicon = vad_df.set_index("word")[["valence", "arousal", "dominance"]].to_dict("index")

In [None]:
# Load Hedonometer lexicon
hedo_df = pd.read_csv(f"{dir}/Nour/lexicons/Hedonometer.csv")
hedo_df = hedo_df.dropna(subset=["Happiness Score"])
hedo_df["Happiness Score"] = hedo_df["Happiness Score"].astype(float)
hedo_lexicon = hedo_df.set_index("Word")["Happiness Score"].to_dict()

In [None]:
import re
import pandas as pd
import numpy as np
from collections import Counter
from scipy.stats import entropy, skew, kurtosis
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Pre-compiled emoticon regex
POS_EMOS_RE = re.compile(r"(:\)|:D|:-\)|😊|😁|😄|😃|😆|☺️|😍|😘|🤩)")
NEG_EMOS_RE = re.compile(r"(:\(|:'\(|:-\(|😢|😭|😞|😔|😡|😠|🙁|☹️)")

# Initialize VADER
vader = SentimentIntensityAnalyzer()

def compute_distribution_stats(values):
    if len(values) == 0:
        return {stat: np.nan for stat in ['min', 'max', 'median', 'mean', 'std', 'skew', 'kurtosis', 'entropy']}
    values = np.array(values)
    hist = np.histogram(values, bins=10, density=True)[0] + 1e-9
    return {
        'min': np.min(values),
        'max': np.max(values),
        'median': np.median(values),
        'mean': np.mean(values),
        'std': np.std(values),
        'skew': skew(values),
        'kurtosis': kurtosis(values),
        'entropy': entropy(hist)
    }

def extract_sentiment_features(df):
    user_features = []

    for user_id, group in df.groupby("post.author.did"):
        user_dict = {"post.author.did": user_id}
        texts = group["post.record.text"].dropna().astype(str).tolist()

        happiness, valence, arousal, dominance = [], [], [], []
        polarization, pos_emo, neg_emo = [], [], []

        for text in texts:
            words = re.findall(r'\w+', text.lower())

            h_vals = [hedo_lexicon[w] for w in words if w in hedo_lexicon]
            v_vals = [vad_lexicon[w]["valence"] for w in words if w in vad_lexicon]
            a_vals = [vad_lexicon[w]["arousal"] for w in words if w in vad_lexicon]
            d_vals = [vad_lexicon[w]["dominance"] for w in words if w in vad_lexicon]

            happiness.append(np.mean(h_vals) if h_vals else 0)
            valence.append(np.mean(v_vals) if v_vals else 0)
            arousal.append(np.mean(a_vals) if a_vals else 0)
            dominance.append(np.mean(d_vals) if d_vals else 0)

            vader_score = vader.polarity_scores(text)
            polarization.append(abs(vader_score['pos'] - vader_score['neg']))

            pos_count = len(POS_EMOS_RE.findall(text))
            neg_count = len(NEG_EMOS_RE.findall(text))

            pos_emo.append(pos_count)
            neg_emo.append(neg_count)

        total_emo = np.array(pos_emo) + np.array(neg_emo)

        # Sentiment scores
        score_sets = {
            "happiness": happiness,
            "valence": valence,
            "arousal": arousal,
            "dominance": dominance,
            "polarization": polarization
        }

        for name, values in score_sets.items():
            stats = compute_distribution_stats(values)
            for stat, val in stats.items():
                user_dict[f"{name}_{stat}"] = val

        # Emoticon stats
        for label, values in {
            "pos_emoticons": pos_emo,
            "neg_emoticons": neg_emo,
            "total_emoticons": total_emo
        }.items():
            stats = compute_distribution_stats(values)
            for stat, val in stats.items():
                user_dict[f"{label}_{stat}"] = val

        # Ratios
        total_neg = sum(neg_emo)
        total_pos = sum(pos_emo)
        user_dict["pos_neg_emo_ratio"] = (total_pos / (total_neg + 1e-5)) if total_neg > 0 else np.nan
        user_dict["posts_with_emoticons_ratio"] = (np.count_nonzero(total_emo) / len(texts)) if texts else 0

        user_features.append(user_dict)

    return pd.DataFrame(user_features)


In [None]:
sentiment_features=extract_sentiment_features(df)
print(sentiment_features.head())
print(sentiment_features.columns)
print(sentiment_features.shape)

  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(values),
  'skew': skew(values),
  'kurtosis': kurtosis(v

                    post.author.did  happiness_min  happiness_max  \
0  did:plc:2373gmka6swamb3wwcdqoefs       4.980000       5.483333   
1  did:plc:23wufz77rizcgyzcemtlq7q3       5.002500       5.002500   
2  did:plc:244k3lfd5j27cfmodmrbl5o7       5.189189       5.189189   
3  did:plc:255mnfwdshslqtawmd6msqwb       5.356923       5.356923   
4  did:plc:25smjnqpsiomhgsuajdez4qp       5.219400       5.732000   

   happiness_median  happiness_mean  happiness_std  happiness_skew  \
0          5.180000        5.214444       0.206923        0.245077   
1          5.002500        5.002500       0.000000             NaN   
2          5.189189        5.189189       0.000000             NaN   
3          5.356923        5.356923       0.000000             NaN   
4          5.475700        5.475700       0.256300        0.000000   

   happiness_kurtosis  happiness_entropy  valence_min  ...  \
0                -1.5       1.098612e+00    -0.079600  ...   
1                 NaN       2.162327e-08

  'skew': skew(values),
  'kurtosis': kurtosis(values),


# **network based features**

In [None]:
!pip install networkx



In [None]:
# Re-import necessary dependencies in case of reset
import pandas as pd
import numpy as np
import re
import networkx as nx
from collections import defaultdict
from scipy.stats import entropy, skew, kurtosis

MENTION_REGEX = r"@(\w{1,15})"

def build_user_interaction_graph(df):
    user_edges = defaultdict(list)
    for _, row in df.iterrows():
        author = row['post.author.display_name']
        mentions = re.findall(MENTION_REGEX, str(row['post.record.text']))
        for mentioned_user in mentions:
            if mentioned_user != author:
                user_edges[author].append(mentioned_user)

        replied_to = row.get('in_reply_to_user_id')
        if pd.notna(replied_to) and replied_to != author:
            user_edges[author].append(replied_to)

    return user_edges

def compute_distribution_stats(values):
    if len(values) == 0:
        return {stat: np.nan for stat in ['min', 'max', 'median', 'mean', 'std', 'skew', 'kurtosis', 'entropy']}
    values = np.array(values)
    hist = np.histogram(values, bins=10, density=True)[0] + 1e-9
    return {
        'min': np.min(values),
        'max': np.max(values),
        'median': np.median(values),
        'mean': np.mean(values),
        'std': np.std(values),
        'skew': skew(values),
        'kurtosis': kurtosis(values),
        'entropy': entropy(hist)
    }

def compute_network_features(user_edges):
    features = []
    for user, targets in user_edges.items():
        G = nx.DiGraph()
        G.add_edges_from((user, tgt) for tgt in targets)

        node_count = G.number_of_nodes()
        edge_count = G.number_of_edges()
        out_strength = sum(dict(G.out_degree()).values())
        in_strength = sum(dict(G.in_degree()).values())
        density = nx.density(G)
        clustering = nx.clustering(G.to_undirected(), user) if user in G else 0.0

        out_degrees = [d for n, d in G.out_degree() if n != user]
        in_degrees = [d for n, d in G.in_degree() if n != user]
        strengths = [G.degree(n) for n in G.nodes() if n != user]

        out_stats = compute_distribution_stats(out_degrees)
        in_stats = compute_distribution_stats(in_degrees)
        strength_stats = compute_distribution_stats(strengths)

        feature_row = {
            'post.author.display_name': user,
            'network_node_count': node_count,
            'network_edge_count': edge_count,
            'network_out_strength': out_strength,
            'network_in_strength': in_strength,
            'network_density': density,
            'network_clustering_coeff': clustering
        }

        for stat_name, stat_value in out_stats.items():
            feature_row[f'out_strength_{stat_name}'] = stat_value
        for stat_name, stat_value in in_stats.items():
            feature_row[f'in_strength_{stat_name}'] = stat_value
        for stat_name, stat_value in strength_stats.items():
            feature_row[f'strength_{stat_name}'] = stat_value

        features.append(feature_row)

    return pd.DataFrame(features)

def extract_network_features(df):
    user_edges = build_user_interaction_graph(df)
    network_df = compute_network_features(user_edges)
    return network_df


In [None]:
network_features = extract_network_features(df)

print(network_features.head())
print(network_features.columns)
print(network_features.shape)

  'skew': skew(values),
  'kurtosis': kurtosis(values),


  post.author.display_name  network_node_count  network_edge_count  \
0           The Grand Lady                   2                   1   
1                      NaN                 154                 153   
2                    crash                   2                   1   
3                  ShanteJ                   2                   1   
4                    670rv                   4                   3   

   network_out_strength  network_in_strength  network_density  \
0                     1                    1         0.500000   
1                   153                  153         0.006494   
2                     1                    1         0.500000   
3                     1                    1         0.500000   
4                     3                    3         0.250000   

   network_clustering_coeff  out_strength_min  out_strength_max  \
0                         0                 0                 0   
1                         0                 0         

# **Merge all the features**

In [None]:
network_features.describe().T[['min', 'max', 'mean', 'std']]

Unnamed: 0,min,max,mean,std
network_node_count,2.0,154.0,2.942051,6.334752
network_edge_count,1.0,153.0,1.942051,6.334752
network_out_strength,1.0,153.0,1.942051,6.334752
network_in_strength,1.0,153.0,1.942051,6.334752
network_density,0.006493506,0.5,0.438229,0.110694
network_clustering_coeff,0.0,0.0,0.0,0.0
out_strength_min,0.0,0.0,0.0,0.0
out_strength_max,0.0,153.0,0.22734,5.897717
out_strength_median,0.0,0.0,0.0,0.0
out_strength_mean,0.0,0.993506,0.001476,0.038297


In [None]:
unique_authors = df['post.author.display_name'].nunique()
print("Unique users (authors):", unique_authors)

user_edges = build_user_interaction_graph(df)
nodes_in_graph = set()
for user, targets in user_edges.items():
    nodes_in_graph.add(user)
    nodes_in_graph.update(targets)

print("Total unique nodes in interaction graph:", len(nodes_in_graph))

Unique users (authors): 1501
Total unique nodes in interaction graph: 1725


In [None]:
merged_df = user_features.copy()

In [None]:
merged_df = merged_df.merge(temporal_features, on='post.author.did', how='left')

In [None]:
merged_df = merged_df.merge(content_features, on='post.author.did', how='left')

In [None]:
merged_df = merged_df.merge(sentiment_features, on='post.author.did', how='left')

In [None]:
id_screen_map = df[['post.author.did', 'post.author.display_name']].drop_duplicates()

In [None]:
network_features_df = network_features.merge(id_screen_map, on='post.author.display_name', how='left')

In [None]:
merged_df = merged_df.merge(network_features_df.drop(columns='post.author.display_name'), on='post.author.did', how='left')

In [None]:
merged_df = merged_df.drop_duplicates(subset='post.author.did')

In [None]:
print("Merged shape:", merged_df.shape)
print("Columns:", merged_df.columns)
print("Any missing values?", merged_df.isna().any().sum())

Merged shape: (1661, 337)
Columns: Index(['post.author.created_at', 'post.author.did', 'post.author.display_name',
       'post.author.handle', 'post.author.verification', 'post.cid',
       'post.indexed_at', 'post.like_count', 'post.quote_count',
       'post.record.created_at',
       ...
       'in_strength_kurtosis', 'in_strength_entropy', 'strength_min',
       'strength_max', 'strength_median', 'strength_mean', 'strength_std',
       'strength_skew', 'strength_kurtosis', 'strength_entropy'],
      dtype='object', length=337)
Any missing values? 130


In [None]:
len(df['post.author.did'].unique())

1661

In [None]:
len(merged_df['post.author.did'])

1661

In [None]:
grouped_df = merged_df.sort_values('post.author.did')

In [None]:
merged_df

Unnamed: 0,post.author.created_at,post.author.did,post.author.display_name,post.author.handle,post.author.verification,post.cid,post.indexed_at,post.like_count,post.quote_count,post.record.created_at,...,in_strength_kurtosis,in_strength_entropy,strength_min,strength_max,strength_median,strength_mean,strength_std,strength_skew,strength_kurtosis,strength_entropy
0,2025-02-12 17:55:18.147,did:plc:5fwnpskl6zvknaikbb5bwb3s,《☆Idiot Art☆》- offline,00dl3-aibfan.bsky.social,,bafyreifqd6b4yu2hw2waujfreihycwijba4svuo5zxm5r...,2025-03-26T03:14:52.500Z,1,0,2025-03-26 03:14:51.096,...,,,,,,,,,,
1,2024-11-14 03:47:02.281,did:plc:4pon6zyd7tyyp7gpjbktus4j,The Grand Lady,022770.bsky.social,,bafyreibydru2vwkqymtfifoy2ck3toimry6eyvysxdarf...,2025-05-29T10:00:04.249Z,0,0,2025-05-29 10:00:02.309,...,,2.162327e-08,1.0,1.0,1.0,1.000000,0.000000,,,2.162327e-08
2,2024-12-11 23:42:22.843,did:plc:6dzaq2nkgwf5shgbchsnjwfw,Anonymous Patriot,0331online.bsky.social,,bafyreiavsvrmm2cgj7ia4ke7taetp2g5odpvcwcuazf7z...,2025-04-01T09:40:59.100Z,0,0,2025-04-01 09:40:55.725,...,,,,,,,,,,
4,2025-03-05 18:50:42.345,did:plc:hdte5leshsk5zxfnovynif3o,10bmnews,10bmnews.bsky.social,,bafyreihscptirmjxdc5gu2pt23lgtirqlbx2kymnw2qjc...,2025-06-27T17:53:31.941Z,0,0,NaT,...,,,,,,,,,,
5,2024-11-13 18:49:45.880,did:plc:jqs2i2olt22acffnaqtggcb4,PNWWitchywoman,1692witchywoman.bsky.social,,bafyreibntwqts3efy7ytastyp55bqhbyesg6u6neregcp...,2025-03-06T18:52:05.460Z,3,0,2025-03-06 18:52:04.945,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3958,2024-11-11 11:24:52.080,did:plc:6kdjzb5x7zmmsmdywcjsdet3,ZeroGen,zerogenamerican.bsky.social,,bafyreiepbib3kgxgnslsmrrntys5lgqziuww4gxnjber4...,2025-04-21T07:28:57.080Z,0,0,2025-04-21 07:28:56.531,...,,,,,,,,,,
3959,2024-11-22 19:42:42.755,did:plc:rzevoar7dyrcm57k2xivayil,,ziggystardust1276.bsky.social,,bafyreih7y3hn733yo524yet4hjohjjbdaa25nybduf4c3...,2025-06-23T18:29:36.336Z,1,0,2025-06-23 18:29:34.490,...,149.006536,3.917988e-02,1.0,153.0,1.0,1.987013,12.208676,12.288472,149.006536,3.918224e-02
3961,2024-11-12 22:53:20.908,did:plc:e6nnyq6cx4yvtwzfutayed5g,Liz Furler,zilenaj.bsky.social,,bafyreig5ynk6r5oyi4qyfidwvihc3iudqly4frueagas2...,2025-05-20T01:37:30.452Z,1,0,2025-05-20 01:37:27.824,...,,,,,,,,,,
3962,2024-11-17 21:15:11.527,did:plc:6j277n6u3p7ovagzxgwxar5h,,zinemaniac.bsky.social,,bafyreif3pmg4ugmolgp5hcoktbertglxilbiqe2ilhb6s...,2025-05-10T19:54:41.646Z,0,0,2025-05-10 19:54:39.622,...,149.006536,3.917988e-02,1.0,153.0,1.0,1.987013,12.208676,12.288472,149.006536,3.918224e-02


In [None]:
cols_to_drop=['post.cid',
       'post.indexed_at', 'post.like_count', 'post.quote_count',
       'post.record.created_at', 'post.record.labels', 'post.record.reply',
       'post.reply.parent.author.did', 'post.record.reply.parent.uri',
       'post.record.reply.parent.cid', 'post.record.tags', 'post.record.text',
       'post.reply_count', 'post.repost_count', 'post.uri', 'reason',
       'post_type', 'in_reply_to_id', 'in_reply_to_user_id',
       'user_listed_count',
       'user_follower_handles', 'user_friend_handles',
       'hashtags_total', 'posts_with_mentions',
       'posts_with_hashtags', 'avg_mentions_per_post',
       'avg_hashtags_per_post', 'total_posts']

cleaned_df = grouped_df.drop(columns=[col for col in cols_to_drop if col in grouped_df.columns])


In [None]:
cleaned_df.head(5)

Unnamed: 0,post.author.created_at,post.author.did,post.author.display_name,post.author.handle,post.author.verification,user_verified,user_handle,user_did,user_description,user_followers_count,...,in_strength_kurtosis,in_strength_entropy,strength_min,strength_max,strength_median,strength_mean,strength_std,strength_skew,strength_kurtosis,strength_entropy
3623,2024-11-17 15:53:21.938,did:plc:2373gmka6swamb3wwcdqoefs,Steve Postman 🇨🇦,steve1970peng.bsky.social,,0,steve1970peng.bsky.social,did:plc:2373gmka6swamb3wwcdqoefs,Canadian #elbowsup\nIndigenous Canada 🇨🇦 Mi’km...,5638.0,...,,2.162327e-08,1.0,1.0,1.0,1.0,0.0,,,2.162327e-08
897,2024-11-11 20:08:40.605,did:plc:23wufz77rizcgyzcemtlq7q3,Heartmyblumini🌻🇺🇦🇨🇦🏳️‍🌈🔱,plateaudweller.bsky.social,,0,,,,0.0,...,,2.162327e-08,1.0,1.0,1.0,1.0,0.0,,,2.162327e-08
2818,2023-10-22 18:28:43.313,did:plc:244k3lfd5j27cfmodmrbl5o7,cathyfl7,cathyfl7.bsky.social,,0,,,,0.0,...,,,,,,,,,,
1594,2024-11-18 15:49:31.046,did:plc:255mnfwdshslqtawmd6msqwb,MyLondon,itsmylondon.bsky.social,,0,itsmylondon.bsky.social,did:plc:255mnfwdshslqtawmd6msqwb,"The latest news, features and events from peop...",72.0,...,,,,,,,,,,
3200,2023-08-21 23:30:40.668,did:plc:25smjnqpsiomhgsuajdez4qp,Sarah W,sandrayln.bsky.social,,0,sandrayln.bsky.social,did:plc:25smjnqpsiomhgsuajdez4qp,"Reader, programmer, part-time editor, \nHorse ...",143.0,...,,2.162327e-08,1.0,1.0,1.0,1.0,0.0,,,2.162327e-08


In [None]:
filled_df = cleaned_df.fillna(0)

In [None]:
filled_df['strength_min'].unique()

array([1., 0.])

In [None]:
output_path = f"{output_dir}/bluesky/exports/featured_dataset1.csv"

In [None]:
filled_df.to_csv(output_path, index=False)

In [None]:
filled_df.columns

Index(['post.author.created_at', 'post.author.did', 'post.author.display_name',
       'post.author.handle', 'post.author.verification', 'user_verified',
       'user_handle', 'user_did', 'user_description', 'user_followers_count',
       ...
       'in_strength_kurtosis', 'in_strength_entropy', 'strength_min',
       'strength_max', 'strength_median', 'strength_mean', 'strength_std',
       'strength_skew', 'strength_kurtosis', 'strength_entropy'],
      dtype='object', length=310)

# **twitter data preparation**

In [None]:
twitter_data = f"{output_dir}/twitter/user_level_features_cleaned.csv"

In [None]:
import pandas as pd
twitter_df=pd.read_csv(twitter_data)

In [None]:
twitter_df.shape

(60028, 315)

In [None]:
cols = twitter_df.columns.tolist()
for c in cols:
    print(c)

user_screen_name
lang
user_id
user_created_at
user_default_profile_image
user_description
user_favourites_count
user_followers_count
user_friends_count
user_listed_count
user_location
user_name
user_statuses_count
user_time_zone
user_urls
user_verified
user_type
screen_name_length
screen_name_digit_count
user_name_length
account_age_days
default_profile_image
user_description_length
has_profile_description
description_occurrence_count
is_description_unique
is_mention
total_tweets
tweets_per_hour
total_retweets
retweets_per_hour
total_replies
replies_per_hour
total_mentions
mentions_per_hour
tweet_time_min
tweet_time_max
tweet_time_median
tweet_time_mean
tweet_time_std
tweet_time_skew
tweet_time_kurtosis
tweet_time_entropy
retweet_time_min
retweet_time_max
retweet_time_median
retweet_time_mean
retweet_time_std
retweet_time_skew
retweet_time_kurtosis
retweet_time_entropy
mention_time_min
mention_time_max
mention_time_median
mention_time_mean
mention_time_std
mention_time_skew
mention_tim

In [None]:
# Drop unwanted columns from twitter_df
cols_to_drop = [
    "user_default_profile_image",
    "user_favourites_count",
    "user_listed_count",
    "user_time_zone",
    "default_profile_image",
    "is_mention"
]

twitter_df = twitter_df.drop(columns=cols_to_drop, errors="ignore")


In [None]:
# 2) Explicit one-to-one renames (Twitter -> Bluesky)
explicit_map = {
    "user_screen_name": "post.author.display_name",
    "user_id": "post.author.did",
    "user_created_at": "post.author.created_at",
    "user_name" : "post.author.handle",
    "user_statuses_count": "posts_count"
}

# only keep keys that actually exist to avoid noise
explicit_present = {k: v for k, v in explicit_map.items() if k in twitter_df.columns}
twitter_df = twitter_df.rename(columns=explicit_present)

# 3) Programmatic renames for the specific tweet/retweet columns you listed
tweet_cols = [
    "total_tweets",
    "tweets_per_hour",
    "total_retweets",
    "retweets_per_hour",
    "tweet_time_min",
    "tweet_time_max",
    "tweet_time_median",
    "tweet_time_mean",
    "tweet_time_std",
    "tweet_time_skew",
    "tweet_time_kurtosis",
    "tweet_time_entropy",
    "retweet_time_min",
    "retweet_time_max",
    "retweet_time_median",
    "retweet_time_mean",
    "retweet_time_std",
    "retweet_time_skew",
    "retweet_time_kurtosis",
    "retweet_time_entropy",
    "tweets_with_emoticons_ratio",
]

def to_bluesky_name(col: str) -> str:
    # Order matters: handle plurals before singulars to avoid partial overlaps.
    col = re.sub(r"retweets", "reposts", col)
    col = re.sub(r"retweet",  "repost",  col)
    col = re.sub(r"tweets",   "posts",   col)
    col = re.sub(r"tweet",    "post",    col)
    return col

present_tweet_cols = [c for c in tweet_cols if c in twitter_df.columns]
tweet_rename_map = {c: to_bluesky_name(c) for c in present_tweet_cols}

twitter_df = twitter_df.rename(columns=tweet_rename_map)


In [None]:
cols = twitter_df.columns.tolist()
for c in cols:
    print(c)

post.author.display_name
lang
post.author.did
post.author.created_at
user_description
user_followers_count
user_friends_count
user_location
post.author.handle
posts_count
user_urls
user_verified
user_type
screen_name_length
screen_name_digit_count
user_name_length
account_age_days
user_description_length
has_profile_description
description_occurrence_count
is_description_unique
is_mention
total_posts
posts_per_hour
total_reposts
reposts_per_hour
total_replies
replies_per_hour
total_mentions
mentions_per_hour
post_time_min
post_time_max
post_time_median
post_time_mean
post_time_std
post_time_skew
post_time_kurtosis
post_time_entropy
repost_time_min
repost_time_max
repost_time_median
repost_time_mean
repost_time_std
repost_time_skew
repost_time_kurtosis
repost_time_entropy
mention_time_min
mention_time_max
mention_time_median
mention_time_mean
mention_time_std
mention_time_skew
mention_time_kurtosis
mention_time_entropy
word_count_min
word_count_max
word_count_median
word_count_mean
word

In [None]:
twitter_df.shape

(60028, 309)

In [None]:
filled_df.shape

(1661, 310)

In [None]:
cols = filled_df.columns.tolist()
for c in cols:
    print(c)

post.author.created_at
post.author.did
post.author.display_name
post.author.handle
post.author.verification
user_verified
user_handle
user_did
user_description
user_followers_count
user_friends_count
posts_count
total_posts_x
total_mentions_x
screen_name_length
screen_name_digit_count
user_name_length
account_age_days
user_description_length
has_profile_description
description_occurrence_count
is_description_unique
total_posts_y
posts_per_hour
total_reposts
reposts_per_hour
total_replies
replies_per_hour
total_mentions_y
mentions_per_hour
post_time_min
post_time_max
post_time_median
post_time_mean
post_time_std
post_time_skew
post_time_kurtosis
post_time_entropy
repost_time_min
repost_time_max
repost_time_median
repost_time_mean
repost_time_std
repost_time_skew
repost_time_kurtosis
repost_time_entropy
mention_time_min
mention_time_max
mention_time_median
mention_time_mean
mention_time_std
mention_time_skew
mention_time_kurtosis
mention_time_entropy
word_count_min
word_count_max
word_co

In [None]:
# Drop unwanted columns from filled_df
cols_to_drop = [
    "total_posts_y",
    "total_mentions_y",
    "post.author.verification"
]

filled_df = filled_df.drop(columns=cols_to_drop, errors="ignore")

In [None]:
explicit_map = {
    "total_posts_x" : "total_posts",
    "total_mentions_x" : "total_mentions"
}

explicit_present = {k: v for k, v in explicit_map.items() if k in filled_df.columns}
filled_df = filled_df.rename(columns=explicit_present)

In [None]:
cols = filled_df.columns.tolist()
for c in cols:
    print(c)

post.author.created_at
post.author.did
post.author.display_name
post.author.handle
user_verified
user_handle
user_did
user_description
user_followers_count
user_friends_count
posts_count
total_posts
total_mentions
screen_name_length
screen_name_digit_count
user_name_length
account_age_days
user_description_length
has_profile_description
description_occurrence_count
is_description_unique
posts_per_hour
total_reposts
reposts_per_hour
total_replies
replies_per_hour
mentions_per_hour
post_time_min
post_time_max
post_time_median
post_time_mean
post_time_std
post_time_skew
post_time_kurtosis
post_time_entropy
repost_time_min
repost_time_max
repost_time_median
repost_time_mean
repost_time_std
repost_time_skew
repost_time_kurtosis
repost_time_entropy
mention_time_min
mention_time_max
mention_time_median
mention_time_mean
mention_time_std
mention_time_skew
mention_time_kurtosis
mention_time_entropy
word_count_min
word_count_max
word_count_median
word_count_mean
word_count_std
word_count_skew
wo

# **training, testing and predicting**

In [None]:
import pandas as pd
import numpy as np

twitter_df = twitter_df[~twitter_df['user_type'].isin(['suspended', 'deleted'])]

bool_cols = twitter_df.select_dtypes(include='bool').columns.tolist()

twitter_df[bool_cols] = twitter_df[bool_cols].astype(int)

for col in twitter_df.columns:
    if twitter_df[col].dtype == 'object' and twitter_df[col].dropna().isin(['True', 'False']).all():
        twitter_df[col] = twitter_df[col].map({'True': 1, 'False': 0})

X = twitter_df.select_dtypes(include=[np.number]).drop(columns=['user_type'], errors='ignore')
y = twitter_df['user_type'].map({'other': 0, 'bot': 1})

In [None]:
X.shape

(53782, 300)

In [None]:
import pandas as pd
import numpy as np

bool_cols = filled_df.select_dtypes(include='bool').columns.tolist()

filled_df[bool_cols] = filled_df[bool_cols].astype(int)

for col in filled_df.columns:
    if filled_df[col].dtype == 'object' and filled_df[col].dropna().isin(['True', 'False']).all():
        filled_df[col] = filled_df[col].map({'True': 1, 'False': 0})
Xb = filled_df.select_dtypes(include=[np.number])

In [None]:
Xb.shape

(1661, 300)

In [None]:
cols = X.columns.tolist()
for c in cols:
    print(c)

user_followers_count
user_friends_count
posts_count
user_verified
screen_name_length
screen_name_digit_count
user_name_length
account_age_days
user_description_length
has_profile_description
description_occurrence_count
is_description_unique
total_posts
posts_per_hour
total_reposts
reposts_per_hour
total_replies
replies_per_hour
total_mentions
mentions_per_hour
post_time_min
post_time_max
post_time_median
post_time_mean
post_time_std
post_time_skew
post_time_kurtosis
post_time_entropy
repost_time_min
repost_time_max
repost_time_median
repost_time_mean
repost_time_std
repost_time_skew
repost_time_kurtosis
repost_time_entropy
mention_time_min
mention_time_max
mention_time_median
mention_time_mean
mention_time_std
mention_time_skew
mention_time_kurtosis
mention_time_entropy
word_count_min
word_count_max
word_count_median
word_count_mean
word_count_std
word_count_skew
word_count_kurtosis
word_count_entropy
word_entropy_min
word_entropy_max
word_entropy_median
word_entropy_mean
word_entropy

In [None]:
cols = Xb.columns.tolist()
for c in cols:
    print(c)

user_verified
user_followers_count
user_friends_count
posts_count
total_posts
total_mentions
screen_name_length
screen_name_digit_count
user_name_length
account_age_days
user_description_length
has_profile_description
description_occurrence_count
is_description_unique
posts_per_hour
total_reposts
reposts_per_hour
total_replies
replies_per_hour
mentions_per_hour
post_time_min
post_time_max
post_time_median
post_time_mean
post_time_std
post_time_skew
post_time_kurtosis
post_time_entropy
repost_time_min
repost_time_max
repost_time_median
repost_time_mean
repost_time_std
repost_time_skew
repost_time_kurtosis
repost_time_entropy
mention_time_min
mention_time_max
mention_time_median
mention_time_mean
mention_time_std
mention_time_skew
mention_time_kurtosis
mention_time_entropy
word_count_min
word_count_max
word_count_median
word_count_mean
word_count_std
word_count_skew
word_count_kurtosis
word_count_entropy
word_entropy_min
word_entropy_max
word_entropy_median
word_entropy_mean
word_entropy

In [None]:
filled_df.to_csv(f"{output_dir}/bluesky/random_forest_blueskyb", index=False)
twitter_df.to_csv(f"{output_dir}/bluesky/random_forest_twitterb", index=False)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier( n_estimators=100,
    criterion='gini',
    random_state=42,
    n_jobs=-1)
clf.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]


print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


Accuracy: 0.9478479129868923
Precision: 0.935064935064935
ROC AUC: 0.8695550178819378


In [None]:
# Make Xb have the exact same column order as X
Xb = Xb.reindex(columns=X.columns)

In [None]:
y_pred = clf.predict(Xb)

In [None]:
import numpy as np
# uniques only
np.unique(y_pred)
# uniques + counts (and shares)
vals, counts = np.unique(y_pred, return_counts=True)
for v, c in zip(vals, counts):
    print(f"{v}: {c} ({c/len(y_pred):.2%})")


0: 1622 (97.65%)
1: 39 (2.35%)


# **training, testing & predicting without considering retweets**

In [None]:
twitter_data = f"{output_dir}/bluesky/random_forest_T_noRetweets"

In [None]:
import pandas as pd
twitter_df=pd.read_csv(twitter_data)

In [None]:
# Drop unwanted columns from twitter_df
cols_to_drop = [
    "user_default_profile_image",
    "user_favourites_count",
    "user_listed_count",
    "user_time_zone",
    "default_profile_image",
    "is_mention"
]

twitter_df = twitter_df.drop(columns=cols_to_drop, errors="ignore")


In [None]:
import re
# 2) Explicit one-to-one renames (Twitter -> Bluesky)
explicit_map = {
    "user_screen_name": "post.author.display_name",
    "user_id": "post.author.did",
    "user_created_at": "post.author.created_at",
    "user_name" : "post.author.handle",
    "user_statuses_count": "posts_count"
}

# only keep keys that actually exist to avoid noise
explicit_present = {k: v for k, v in explicit_map.items() if k in twitter_df.columns}
twitter_df = twitter_df.rename(columns=explicit_present)

# 3) Programmatic renames for the specific tweet/retweet columns you listed
tweet_cols = [
    "total_tweets",
    "tweets_per_hour",
    "total_retweets",
    "retweets_per_hour",
    "tweet_time_min",
    "tweet_time_max",
    "tweet_time_median",
    "tweet_time_mean",
    "tweet_time_std",
    "tweet_time_skew",
    "tweet_time_kurtosis",
    "tweet_time_entropy",
    "retweet_time_min",
    "retweet_time_max",
    "retweet_time_median",
    "retweet_time_mean",
    "retweet_time_std",
    "retweet_time_skew",
    "retweet_time_kurtosis",
    "retweet_time_entropy",
    "tweets_with_emoticons_ratio",
]

def to_bluesky_name(col: str) -> str:
    # Order matters: handle plurals before singulars to avoid partial overlaps.
    col = re.sub(r"retweets", "reposts", col)
    col = re.sub(r"retweet",  "repost",  col)
    col = re.sub(r"tweets",   "posts",   col)
    col = re.sub(r"tweet",    "post",    col)
    return col

present_tweet_cols = [c for c in tweet_cols if c in twitter_df.columns]
tweet_rename_map = {c: to_bluesky_name(c) for c in present_tweet_cols}

twitter_df = twitter_df.rename(columns=tweet_rename_map)


In [None]:
# Drop unwanted columns from filled_df
cols_to_drop = [
    "total_posts_y",
    "total_mentions_y",
    "post.author.verification"
]

filled_df = filled_df.drop(columns=cols_to_drop, errors="ignore")

In [None]:
explicit_map = {
    "total_posts_x" : "total_posts",
    "total_mentions_x" : "total_mentions"
}

explicit_present = {k: v for k, v in explicit_map.items() if k in filled_df.columns}
filled_df = filled_df.rename(columns=explicit_present)

In [None]:
import pandas as pd
import numpy as np

twitter_df = twitter_df[~twitter_df['user_type'].isin(['suspended', 'deleted'])]

bool_cols = twitter_df.select_dtypes(include='bool').columns.tolist()

twitter_df[bool_cols] = twitter_df[bool_cols].astype(int)

for col in twitter_df.columns:
    if twitter_df[col].dtype == 'object' and twitter_df[col].dropna().isin(['True', 'False']).all():
        twitter_df[col] = twitter_df[col].map({'True': 1, 'False': 0})

X = twitter_df.select_dtypes(include=[np.number]).drop(columns=['user_type'], errors='ignore')
y = twitter_df['user_type'].map({'other': 0, 'bot': 1})

In [None]:
import pandas as pd
import numpy as np

filled_df= pd.read_csv(f"{output_dir}/bluesky/random_forest_blueskyb.csv")
Xb = filled_df.select_dtypes(include=[np.number])

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier( n_estimators=100,
    criterion='gini',
    random_state=42,
    n_jobs=-1)
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]


print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


Accuracy: 0.9434786650553129
Precision: 0.9142857142857143
ROC AUC: 0.8528463097143038


In [None]:
# Make Xb have the exact same column order as X
Xb = Xb.reindex(columns=X.columns)

In [None]:
y_pred = clf.predict(Xb)

In [None]:
import numpy as np
# uniques only
np.unique(y_pred)
# uniques + counts (and shares)
vals, counts = np.unique(y_pred, return_counts=True)
for v, c in zip(vals, counts):
    print(f"{v}: {c} ({c/len(y_pred):.2%})")

0: 1636 (98.49%)
1: 25 (1.51%)
