In [None]:
import sys
sys.path.append("../processing/")

In [None]:
import gzip
import pickle
import random
import treelib
import ujson as json
import numpy as np
from collections import defaultdict
from tqdm import tqdm

from _config import Config
from utils import json_paths_iter
from tree_conversions import get_tree_tweet_edges

In [None]:
dataset = "news"
limit = None

# Toxicity-score cut-offs: a tweet only receives a label when its score lies
# outside the band between these two values (the "p75_m25" output file name
# used further down encodes the same thresholds).
TOX_NEG_MAX = 0.25
TOX_POS_MIN = 0.75

conf = Config(dataset)
json_fpaths = json_paths_iter(conf.conversations_no_embs_jsons_dir, limit=limit)

# counts
# (json_fname, root_t_id) => {"pos": [tweet ids], "neg": [tweet ids]}
root_tox_counts = {}
# dataset-wide number of labelled tweets per class
tox_counts = {"pos": 0, "neg": 0}

for json_fpath in tqdm(json_fpaths):

    # Use a context manager so the gzip handle is closed right after parsing;
    # otherwise one file descriptor leaks per conversation file.
    with gzip.open(json_fpath) as fin:
        conversation = json.load(fin)

    tweet_info = conversation["tweets"]
    net_info = conversation["network_features"]
    toxicity_scores = conversation["toxicity_scores"]

    # root info
    root_tweet_id = conversation["reply_tree"]["tweet"]
    root_user_id = tweet_info[root_tweet_id]["user_id"]

    # tweets in chronological order, so a parent is always inserted into the
    # tree before any of its replies
    tweets = sorted(tweet_info.values(), key=lambda x: x["time"])

    # get inReplyTo links (child_tweet_id => parent_tweet_id)
    tweet_replyto_id = {
        c_t_id: p_t_id
        for p_t_id, c_t_id in get_tree_tweet_edges(conversation["reply_tree"])
    }
    tweet_replyto_id[root_tweet_id] = None

    # user id (str) => idx; only used below to test whether a user is listed
    # in the network features
    user_id_to_idx = {
        u_id: u_idx for u_idx, u_id in enumerate(net_info["user_ids"])
    }

    # init tree
    tree = treelib.Tree()

    # tweet ids of this conversation that passed the filter, per label
    conv_tox_counts = {"pos": [], "neg": []}

    # LOOP
    for tweet in tweets:
        # tweet / user
        tweet_id = tweet["id"]
        user_id = tweet["user_id"]

        # a tweet is usable only if it has a score outside the ambiguous band
        tweet_tox_score = toxicity_scores.get(tweet_id, None)
        tweet_tox_ok = tweet_tox_score is not None and (
            tweet_tox_score < TOX_NEG_MAX or tweet_tox_score > TOX_POS_MIN
        )

        # parent
        parent_tweet_id = tweet_replyto_id[tweet_id]
        parent_user_id = None
        if parent_tweet_id is not None:
            parent_user_id = tweet_info[parent_tweet_id]["user_id"]

        # update reply tree (every tweet is added, even if filtered out below,
        # so that depths of later tweets are correct)
        tree.create_node(
            identifier=tweet_id,
            parent=parent_tweet_id
        )

        # FILTER
        SKIP = (
            user_id == root_user_id or          # tweets by the root
            user_id == parent_user_id or        # self-replies
            user_id not in user_id_to_idx or    # users w/o follow net info
            not tweet_tox_ok or                 # tweets w/o toxicity label
            tree.depth(tweet_id) < 2            # direct replies to the root
        )

        # SAMPLE
        if not SKIP:
            # past the filter the score is either < TOX_NEG_MAX or > TOX_POS_MIN,
            # so the 0.5 midpoint cleanly separates the two classes
            tweet_tox_label = "pos" if tweet_tox_score > 0.5 else "neg"

            conv_tox_counts[tweet_tox_label].append(tweet_id)
            tox_counts[tweet_tox_label] += 1

    # save counts, keyed by (file name, root tweet id)
    json_fname = json_fpath.split("/")[-1]
    root_tox_counts[(json_fname, root_tweet_id)] = conv_tox_counts
    

In [None]:
# per-label totals over the whole dataset, followed by their grand total
n_labeled = sum(tox_counts.values())
print(tox_counts, n_labeled)

In [None]:
# one entry per conversation that holds at least one "pos" AND one "neg" tweet
cc = [
    True
    for conv_counts in root_tox_counts.values()
    if conv_counts["pos"] and conv_counts["neg"]
]
print(len(cc))

In [None]:
# Persist both count structures together in a single gzipped pickle.
out_fpath = f"{conf.data_root}/next_reply_metrics/{dataset}_tweets_tox_p75_m25.pkl.gz"

# dataset-wide totals plus the per-conversation tweet-id lists
out = {"tox_counts": tox_counts, "root_tox_counts": root_tox_counts}

with gzip.open(out_fpath, mode="wb") as fout:
    pickle.dump(out, fout, protocol=4)

print("Done")

In [None]:
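# Recorded outputs of the summary cells above, per dataset: the label totals
# with their sum, then the number of conversations that contain at least one
# "pos" and one "neg" tweet.

# news
# tox_counts: {'pos': 879,165, 'neg': 4,927,127}, total: 5,806,292
# conversations with both labels: 96,520

# midterms
# tox_counts: {'pos': 641,494, 'neg': 4,362,548}, total: 5,004,042
# conversations with both labels: 50,143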

### Paired Tweets Sampling

In [None]:
# load data
dataset = "news"

conf = Config(dataset)
fpath = f"{conf.data_root}/next_reply_metrics/{dataset}_tweets_tox_p75_m25.pkl.gz"

# NOTE: unpickling can execute arbitrary code; only load files written by
# this notebook's own counting step (same path as its pickle output).
# The context manager makes sure the gzip handle is closed after reading.
with gzip.open(fpath, "rb") as fin:
    tox_stats = pickle.load(fin)

# dataset-wide label totals: {"pos": n, "neg": n}
ds_tox_counts = tox_stats["tox_counts"]
# (fname, root tweet id) => {"pos": [tweet ids], "neg": [tweet ids]}
conv_tox_tweets = tox_stats["root_tox_counts"]

In [None]:
# display the dataset-level label totals as the cell output
ds_tox_counts

In [None]:
# Draw one toxic ("pos") and one non-toxic ("neg") tweet from every
# conversation that has both kinds.
paired_tweets = []  # entries: (fname, root_id, [pos_tweet_id, neg_tweet_id])
RNG = random.Random(0)  # dedicated generator with a fixed seed

for (fname, root_id), tweet_tox in conv_tox_tweets.items():
    pos_t_ids, neg_t_ids = tweet_tox["pos"], tweet_tox["neg"]

    # a pair needs at least one candidate of each label
    if pos_t_ids and neg_t_ids:
        # keep the draw order (pos first, then neg) so the RNG stream is stable
        pos_pick = RNG.choice(pos_t_ids)
        neg_pick = RNG.choice(neg_t_ids)
        paired_tweets.append((fname, root_id, [pos_pick, neg_pick]))

print(len(paired_tweets))

In [None]:
# sanity checks: each sampled pair must come from its own file and root,
# and no tweet id may appear more than once across all pairs
fnames_unq = {fname for fname, _, _ in paired_tweets}
r_ids_unq = {r_id for _, r_id, _ in paired_tweets}
t_ids_unq = {t_id for _, _, t_ids in paired_tweets for t_id in t_ids}

n_pairs = len(paired_tweets)
assert n_pairs == len(fnames_unq) == len(r_ids_unq)
assert len(t_ids_unq) == n_pairs * 2

In [None]:
# write the sampled pairs to disk as gzipped, indented JSON
out_path = f"{conf.data_root}/next_reply_metrics/{dataset}_paired_sample_tweet_ids.json.gz"

with gzip.open(out_path, mode="wt") as fout:
    json.dump(paired_tweets, fout, indent=2)

print("Done!")