In [1]:
import sys
sys.path.append("../processing/")

In [2]:
import gzip
import json
import pickle
import numpy as np
from tqdm import tqdm
from collections import defaultdict

from _config import Config

# NB: the stdlib json module is used on purpose; ujson fails to load these JSON files (cause unknown)

In [3]:
# datasets to process, and the prefix sizes 10, 20, ..., 100 as string keys
ds_names = ["midterms", "news"]
prefixes = list(map(str, range(10, 110, 10)))

In [4]:
def rearrange_feature_groups(conv):
    """Drop redundant features and regroup the polarization features, in place.

    ``conv`` maps feature-group names (``"tree"``, ``"follow_graph"``,
    ``"reply_graph"``, ``"polarization"``, ``"embeddedness"``, ``"subgraph"``,
    ...) to either a dict ``feature name -> value`` or ``None`` when the whole
    group is missing.

    The function is idempotent: a feature that is already absent is skipped
    instead of raising ``KeyError``, so calling it again on an
    already-rearranged ``conv`` leaves it unchanged.

    Args:
        conv: dict of feature groups as described above. It is modified in
            place.

    Returns:
        The same ``conv`` object, after the removals and moves.

    Raises:
        KeyError: if ``conv`` lacks one of the group keys that is looked up
            (a group that is present but ``None`` is fine).
    """
    # remove redundant features
    rm_group_fname_tuples = [
        ('tree', 'root_tweet_id'),
        ('follow_graph', 'root_tweet_id'),
        ('reply_graph', 'root_tweet_id'),
        ('polarization', 'root_tweet_id'),
        ('embeddedness', 'root_tweet_id'),
        ('subgraph', 'root_tweet_id'),
        ("polarization", "follow_di_n_nodes"),
        ("polarization", "follow_ud_n_nodes"),
        ("polarization", "reply_di_n_nodes"),
        ("polarization", "reply_ud_n_nodes")
    ]

    for group, fname in rm_group_fname_tuples:
        features = conv[group]
        if features is not None:
            # pop with a default: tolerate features that are already gone
            features.pop(fname, None)

    # NB: rearrange a few features
    # (Makes more sense to group them differently while computing them)
    mv_group_from_to_tuples = [
        # polarization -> follow_graph
        ("follow_di_alg_cat_corr", "polarization", "follow_graph"),
        ("follow_di_alg_num_corr", "polarization", "follow_graph"),
        ("follow_ud_alg_cat_corr", "polarization", "follow_graph"),
        ("follow_ud_alg_modularity", "polarization", "follow_graph"),
        ("follow_ud_alg_num_corr", "polarization", "follow_graph"),
        # polarization -> reply_graph
        ("reply_di_alg_cat_corr", "polarization", "reply_graph"),
        ("reply_di_alg_num_corr", "polarization", "reply_graph"),
        ("reply_ud_alg_cat_corr", "polarization", "reply_graph"),
        ("reply_ud_alg_modularity", "polarization", "reply_graph"),
        ("reply_ud_alg_num_corr", "polarization", "reply_graph"),
        # polarization -> tree
        ("tree_alg_cat_corr", "polarization", "tree"),
        ("tree_alg_num_corr", "polarization", "tree"),
    ]

    for fname, g_from, g_to in mv_group_from_to_tuples:
        source = conv[g_from]
        if source is None or fname not in source:
            # nothing to move (group missing, or feature already moved)
            continue

        # the feature always leaves the source group ...
        val = source.pop(fname)

        # ... and is dropped altogether when the destination group is missing
        target = conv[g_to]
        if target is not None:
            target[fname] = val

    return conv

### Compile feature sets

In [5]:
# Compile, per feature group, the set of feature names seen anywhere in the
# data, plus the Python types observed for every (group, feature) pair.
feature_sets = defaultdict(set)
feature_types = defaultdict(set)

for ds_name in ds_names:
    print(f">> {ds_name}")
    conf = Config(ds_name)
    ds_path = f"{conf.data_root}/prefix_metrics/{ds_name}.json.gz"

    with gzip.open(ds_path) as fin:
        ds = json.load(fin)

    # Wrap the list itself rather than enumerate(...): tqdm can then read the
    # length and show a total / ETA (the index was never used anyway).
    for conv in tqdm(ds):

        for ps in prefixes:
            # not every conversation has an entry for every prefix size
            if ps not in conv:
                continue

            # NB: rearranges conv[ps] in place
            conv_ps = rearrange_feature_groups(conv[ps])

            for f_set, fs in conv_ps.items():
                # a whole feature group can be missing (None)
                if fs is None:
                    continue

                for f_name, f_val in fs.items():
                    feature_sets[f_set].add(f_name)
                    feature_types[(f_set, f_name)].add(str(type(f_val)))
                    

>> midterms


130931it [07:53, 276.61it/s]


>> news


182365it [17:55, 169.60it/s]


In [8]:
# write the compiled feature sets to disk: one sorted list of names per set
conf = Config()
feature_set_fpath = f"{conf.modeling_dir}/prefix/feature_sets.json"

feature_sets_lst = {
    group_name: sorted(names) for group_name, names in feature_sets.items()
}

with open(feature_set_fpath, "w") as fout:
    json.dump(feature_sets_lst, fout, indent=2)

In [None]:
# feature_sets

In [None]:
# feature_types

### Make dataset matrices

In [5]:
def make_dataset_matrix(dataset, prefix, feature_set_name_pairs):
    """Build the float feature matrix of all conversations that have ``prefix``.

    Args:
        dataset: iterable of conversation records. Each record is a dict
            holding ``"root_tweet_id"`` and, for every prefix size available
            for that conversation, an entry ``prefix -> {group: features}``
            where ``features`` is a dict or ``None``.
        prefix: key of the prefix size to extract; records without this key
            are skipped.
        feature_set_name_pairs: sequence of ``(group, feature name)`` pairs;
            defines the column order of the matrix.

    Returns:
        dict with
          - ``"X"``: float64 array of shape
            ``(n_conversations, len(feature_set_name_pairs))``; missing
            groups / features are encoded as ``np.nan``. The shape is
            ``(0, n_features)`` when no record has ``prefix``.
          - ``"root_tweet_ids"``: list of root tweet ids, aligned with the
            rows of ``X``.
          - ``"feature_set_name_pairs"``: the argument, passed through.
    """
    n_features = len(feature_set_name_pairs)
    root_tweet_ids = []
    X = []

    for record in dataset:

        if prefix not in record:
            continue

        # Rearrange a per-group shallow copy so that the caller's dataset is
        # left untouched and the function can be called repeatedly on it.
        conv = {
            group: dict(features) if isinstance(features, dict) else features
            for group, features in record[prefix].items()
        }
        conv = rearrange_feature_groups(conv)
        root_tweet_id = record["root_tweet_id"]
        x_i = []

        for f_set, f_name in feature_set_name_pairs:
            # NB: feature subset might be missing
            f_set_dict = conv[f_set] if conv[f_set] is not None else {}

            # feature might be missing
            f_val = f_set_dict.get(f_name)

            # encode missing values as np.nans => np array is of type float
            x_i.append(np.nan if f_val is None else float(f_val))

        X.append(x_i)
        root_tweet_ids.append(root_tweet_id)

    # Explicit dtype + reshape keep the result 2-D float64 even when no
    # conversation matched (np.array([]) alone would have shape (0,)).
    X_arr = np.array(X, dtype=np.float64).reshape(len(X), n_features)

    out = {
        "X": X_arr,
        "root_tweet_ids": root_tweet_ids,
        "feature_set_name_pairs": feature_set_name_pairs
    }

    return out

In [6]:
# load feature sets
fsets_fpath = f"{Config().modeling_dir}/prefix/feature_sets.json"

# context manager: the file handle is closed as soon as the JSON is parsed
with open(fsets_fpath) as fin:
    feature_sets = json.load(fin)

# make feature set, name pairs (their order defines the matrix column order)
feature_set_name_pairs = []

for f_set_name, f_set_features in feature_sets.items():
    for f_name in f_set_features:
        feature_set_name_pairs.append((f_set_name, f_name))

    print("-", f_set_name, len(f_set_features))

print(f"[Total num of features: {len(feature_set_name_pairs)}]")

- tree 46
- follow_graph 115
- reply_graph 115
- embeddedness 56
- polarization 12
- subgraph 60
- arrival_seq 198
- rate 200
- toxicity 7
[Total num of features: 809]


In [7]:
# Build one feature matrix per (dataset, prefix size) and pickle each to disk.

ds_names = ["midterms", "news"]
prefixes = list(map(str, range(10, 110, 10)))

for ds_name in ds_names:
    print(f">> {ds_name}")

    # read this dataset's gzipped JSON of prefix metrics
    conf = Config(ds_name)
    ds_path = f"{conf.data_root}/prefix_metrics/{ds_name}.json.gz"
    with gzip.open(ds_path) as fin:
        ds = json.load(fin)

    # one matrix, and one output file, per prefix size
    for prefix in prefixes:
        print(f"prefix: {prefix}")

        ds_mat = make_dataset_matrix(ds, prefix, feature_set_name_pairs)
        print(ds_mat.keys(), ds_mat["X"].shape)

        # gzip-compressed pickle, named <dataset>_p<prefix>.pkl.gz
        out_path = f"{conf.modeling_dir}/prefix/datasets/{ds_name}_p{prefix}.pkl.gz"
        with gzip.open(out_path, "wb") as fout:
            pickle.dump(ds_mat, fout, protocol=4)

    print("-----")

>> midterms
prefix: 10
dict_keys(['X', 'root_tweet_ids', 'feature_set_name_pairs']) (130931, 809)
prefix: 20
dict_keys(['X', 'root_tweet_ids', 'feature_set_name_pairs']) (81970, 809)
prefix: 30
dict_keys(['X', 'root_tweet_ids', 'feature_set_name_pairs']) (61900, 809)
prefix: 40
dict_keys(['X', 'root_tweet_ids', 'feature_set_name_pairs']) (50840, 809)
prefix: 50
dict_keys(['X', 'root_tweet_ids', 'feature_set_name_pairs']) (43627, 809)
prefix: 60
dict_keys(['X', 'root_tweet_ids', 'feature_set_name_pairs']) (38440, 809)
prefix: 70
dict_keys(['X', 'root_tweet_ids', 'feature_set_name_pairs']) (34598, 809)
prefix: 80
dict_keys(['X', 'root_tweet_ids', 'feature_set_name_pairs']) (31516, 809)
prefix: 90
dict_keys(['X', 'root_tweet_ids', 'feature_set_name_pairs']) (28966, 809)
prefix: 100
dict_keys(['X', 'root_tweet_ids', 'feature_set_name_pairs']) (26896, 809)
-----
>> news
prefix: 10
dict_keys(['X', 'root_tweet_ids', 'feature_set_name_pairs']) (182365, 809)
prefix: 20
dict_keys(['X', 'root_twe