In [1]:
import sys
sys.path.append("../processing/")

In [2]:
import gzip
import pickle
import ujson as json
import numpy as np
from tqdm import tqdm
from collections import defaultdict

from _config import Config

### Compile feature sets

In [3]:
ds_names = ["news", "midterms"]

# accumulators shared across all datasets
feature_set = set()               # every feature name encountered
feature_types = defaultdict(set)  # feature name -> type strings observed for it
all_types = set()                 # every type string observed for any feature

for ds_name in ds_names:
    print(f">> {ds_name}")

    # read the gzipped JSON sample for this dataset
    conf = Config(ds_name)
    ds_path = f"{conf.data_root}/next_reply_metrics/{ds_name}_paired_sample.json.gz"
    with gzip.open(ds_path) as fin:
        ds = json.load(fin)

    print(f"|{ds_name}| = {len(ds)}")

    # record the name and the Python type of every feature value
    for conv in ds:
        for f_name, f_val in conv.items():
            f_type = str(type(f_val))
            feature_set.add(f_name)
            feature_types[f_name].add(f_type)
            all_types.add(f_type)

>> news
|news| = 193040
>> midterms
|midterms| = 100286


In [4]:
# number of distinct feature names found across all datasets
print(f"len(feature_set) = {len(feature_set)}")

len(feature_set) = 270


In [5]:
# distinct Python value types (as strings) seen across all features of all conversations
all_types

{"<class 'NoneType'>",
 "<class 'bool'>",
 "<class 'float'>",
 "<class 'int'>",
 "<class 'str'>"}

In [6]:
# SUMMARY
# meta:
# tweet_id, root_tweet_id, root_tweet_type
#
# categorical features:
# dyad_up_follow_edge_type, dyad_ur_follow_edge_type

# show only the features for which at least one string value was observed
for f_fname in feature_types:
    f_types = feature_types[f_fname]
    if "<class 'str'>" not in f_types:
        continue
    print(f_fname, f_types)

tweet_id {"<class 'str'>"}
root_tweet_id {"<class 'str'>"}
root_tweet_type {"<class 'str'>"}
dyad_up_follow_edge_type {"<class 'str'>", "<class 'NoneType'>"}
dyad_ur_follow_edge_type {"<class 'str'>", "<class 'NoneType'>"}


### `feature_sets.json`
- manually created based on the output here
- includes the hierarchy of features
- includes the values of the categorical features (i.e., dyad edge types)

### Make dataset matrices

In [7]:
def categorical_to_one_hot(features):
    """Return a copy of `features` with the dyad edge-type features one-hot encoded.

    Each of the two categorical features `dyad_up_follow_edge_type` and
    `dyad_ur_follow_edge_type` is replaced by five 0/1 integer indicators named
    `<feature>_<suffix>`, where the suffix is "na" (value is None), "O==O",
    "O->O", "O<-O" or "O__O" (value "O  O"). The original string feature is
    removed from the returned dict.

    Args:
        features: dict of features for one conversation; must contain both
            dyad edge-type keys.

    Returns:
        A new dict (shallow copy); the input dict is left unmodified.

    Raises:
        KeyError: if one of the dyad edge-type features is missing.
        AssertionError: if an edge-type value is not one of the known values.
    """
    # make a copy, instead of changing the features in-place
    features_ = features.copy()

    # Only one categorical feature: dyad-edge-type
    # (raw value, suffix used in the one-hot feature name)
    dyad_etypes_map = (
        (None, "na"),
        ("O==O", "O==O"),
        ("O->O", "O->O"),
        ("O<-O", "O<-O"),
        ("O  O", "O__O"),
    )

    # user-parent-dyad & user-root-dyad
    dyad_fnames = (
        "dyad_up_follow_edge_type",
        "dyad_ur_follow_edge_type",
    )

    for f_name in dyad_fnames:
        raw_val = features_[f_name]
        n_on = 0

        for dyad_etype_val, dyad_etype_str in dyad_etypes_map:
            is_on = int(raw_val == dyad_etype_val)
            features_[f"{f_name}_{dyad_etype_str}"] = is_on
            n_on += is_on

        # sanity check: exactly one value is on; name the offender so a bad
        # record can be tracked down from the error alone
        assert n_on == 1, (
            f"unexpected value {raw_val!r} for categorical feature {f_name!r}"
        )

        # remove the string feature
        del features_[f_name]

    return features_

In [8]:
def make_dataset_matrix(dataset, feature_set_name_pairs):
    """Build the feature matrix, outcome vector and metadata for a dataset.

    Args:
        dataset: sequence of conversation dicts. Each is passed through
            `categorical_to_one_hot` and must provide "tweet_tox",
            "tweet_id", "root_tweet_id", "root_tweet_type", "conv_n_replies"
            and "tweet_tox_score".
        feature_set_name_pairs: list of (feature_set_name, feature_name)
            pairs; the feature names define the columns of X, in order.

    Returns:
        dict with keys
            "X": float64 array of shape (len(dataset), len(feature_set_name_pairs)),
                 with missing / None feature values encoded as NaN,
            "y": float64 array of shape (len(dataset),) built from "tweet_tox",
            "meta": list with one metadata dict per conversation,
            "feature_set_name_pairs": the pairs passed in.
        An empty dataset yields an X of shape (0, len(feature_set_name_pairs)).
    """
    n_features = len(feature_set_name_pairs)
    X, y, meta = [], [], []

    for raw_conv in dataset:
        # make categorical features 1-hot (works on a copy of the conversation)
        conv = categorical_to_one_hot(raw_conv)

        # features
        x_i = []
        for _, f_name in feature_set_name_pairs:
            # NB: only dyad subgraph metrics are missing
            f_val = conv.get(f_name)

            # encode missing values as np.nans => np array is of type float
            x_i.append(np.nan if f_val is None else float(f_val))
        X.append(x_i)

        # outcome
        y.append(float(conv["tweet_tox"]))

        # meta
        meta.append({
            "tweet_id": conv["tweet_id"],
            "root_tweet_id": conv["root_tweet_id"],
            "root_tweet_type": conv["root_tweet_type"],
            "n_replies": conv["conv_n_replies"],
            "tox_score": conv["tweet_tox_score"]
        })

    # explicit dtype + reshape keep X two-dimensional even when `dataset` is
    # empty (np.array([]) alone would be 1-D and break the shape checks below)
    X_arr = np.array(X, dtype=np.float64).reshape(len(X), n_features)
    y_arr = np.array(y, dtype=np.float64)

    # sanity checks
    assert X_arr.shape[0] == y_arr.shape[0] == len(meta)
    assert X_arr.shape[1] == n_features
    assert str(X_arr.dtype) == "float64"
    assert str(y_arr.dtype) == "float64"

    return {
        "X": X_arr,
        "y": y_arr,
        "meta": meta,
        "feature_set_name_pairs": feature_set_name_pairs
    }

In [9]:
# load feature sets
conf = Config()
feature_sets_fpath = f"{conf.modeling_dir}/next_reply/feature_sets.json"

# open via a context manager so the file handle is closed once parsed
with open(feature_sets_fpath) as fin:
    feature_sets = json.load(fin)

In [10]:
# one line per feature set: its name and the number of features it lists
for fset_name in feature_sets:
    features = feature_sets[fset_name]
    print("-", fset_name, len(features))

- outcome 1
- meta 4
- conversation_state 9
- user_info 3
- alignments 2
- follow_di 15
- follow_ud 12
- reply_di 15
- reply_ud 12
- dyad_up 50
- dyad_ur 20
- embeddedness_all 10
- embeddedness_toxicity 20
- embeddedness_follow 50
- embeddedness_reply 50
- tree 5


In [11]:
# make feature set, name pairs
# flatten {set name: [feature names]} into ordered (set name, feature name)
# tuples, leaving out the "meta" and "outcome" sets
feature_set_name_pairs = [
    (f_set_name, f_name)
    for f_set_name, f_set_features in feature_sets.items()
    if f_set_name not in ("meta", "outcome")
    for f_name in f_set_features
]

print(f"Total num of features: {len(feature_set_name_pairs)}")

Total num of features: 273


In [12]:
# create & output all data matrices
ds_names = ["news", "midterms"]

for ds_name in ds_names:
    print(f">> {ds_name}")

    conf = Config(ds_name)

    # load dataset (context manager closes the gzip handle after parsing)
    ds_path = f"{conf.data_root}/next_reply_metrics/{ds_name}_paired_sample.json.gz"
    with gzip.open(ds_path) as fin:
        ds = json.load(fin)
    print(f"|{ds_name}| = {len(ds)}")

    # make dataset matrix
    ds_mat = make_dataset_matrix(ds, feature_set_name_pairs)
    print(ds_mat.keys(), ds_mat["X"].shape)

    # output ds matrix as a gzipped pickle
    out_path = f"{conf.modeling_dir}/next_reply/datasets/{ds_name}_paired.pkl.gz"

    with gzip.open(out_path, "wb") as fout:
        pickle.dump(ds_mat, fout, protocol=4)

print("Done!")

>> news
|news| = 193040
dict_keys(['X', 'y', 'meta', 'feature_set_name_pairs']) (193040, 273)
>> midterms
|midterms| = 100286
dict_keys(['X', 'y', 'meta', 'feature_set_name_pairs']) (100286, 273)
Done!
