In [1]:
import pickle
from features import prepare_entry, merge_entries
from utills import chunker, cartesian_product
from tqdm.notebook import trange, tqdm
import numpy as np

In [2]:
DATA_DIR = 'data/'
TEMP_DIR = 'temp_data/'

MULTIDOC_MODEL_PATH = TEMP_DIR + 'reddit/multidoc_10/model_10.p'
SINGLEDOC_MODEL_PATH = TEMP_DIR + 'reddit/unchunked/model.p'
chunk_sz = 10 # 10 Reddit comments per chunk, a comment is on avg 33 tokens

Load Models
===

In [3]:
with open(SINGLEDOC_MODEL_PATH, 'rb') as f:
    (clf, transformer, scaler, secondary_scaler, _) = pickle.load(f)
    
with open(MULTIDOC_MODEL_PATH, 'rb') as f:
    (clf_multi, transformer_multi, scaler_multi, secondary_scaler_multi, _) = pickle.load(f)

Load Data
===

In [4]:
doc_sets_A = [
    ['This is some random text!!!'] * 20,
    ['Another set of random comments! :) :)'] * 25,
    ['More Reddit comments.'] * 27,
    ['Moderation is also conducted by community-specific moderators, who are not Reddit employees'] * 30
]
usernames_A = ['user_A', 'user_B', 'user_C', 'user_D']
doc_sets_B = [
    ['Reddit is a network of communities where people can dive into their interests.'] * 20,
    ['Posts are organized by subject into user-created boards called "communities" or "subreddits".'] * 23,
    ['More Reddit comments.'] * 30,
]
usernames_B = ['user_L', 'user_M', 'user_N']

Preprocess
===

In [5]:
def chunk_and_preprocess(texts, texts_per_chunk):
    """
        Takes a list of strings (Reddit comments), groups them into chunks of
        size `texts_per_chunk`, and preprocesses them
    """
    return [
        prepare_entry('\n'.join(d), mode='accurate', tokenizer='casual') 
        for d in chunker(texts, texts_per_chunk)
    ]

In [6]:
preprocessed_A = [chunk_and_preprocess(docs, chunk_sz) for docs in tqdm(doc_sets_A)]
preprocessed_B = [chunk_and_preprocess(docs, chunk_sz) for docs in tqdm(doc_sets_B)]

Widget Javascript not detected.  It may not be installed or enabled properly.





Widget Javascript not detected.  It may not be installed or enabled properly.





Vectorize
===

In [7]:
def vectorize(usernames, preprocessed, vector_path, vector_path_multi,
             transformer, scaler, transformer_multi, scaler_multi):
    total_chunks = 0
    user_bounds = {}
    user_to_idx = {}
    for i, user in enumerate(usernames):
        chunks = preprocessed[i]
        user_bounds[user] = (total_chunks, total_chunks + len(chunks))
        user_to_idx[user] = i
        total_chunks += len(chunks)
    
    x_shape_multi = (total_chunks, len(transformer_multi.get_feature_names()))
    x_shape = (len(usernames), len(transformer.get_feature_names()))
    
    XX_multi = np.memmap(vector_path_multi, dtype='float32', mode='w+', shape=x_shape_multi)
    XX = np.memmap(vector_path, dtype='float32', mode='w+', shape=x_shape)
    

    for i, user in enumerate(tqdm(usernames)):
        chunks = preprocessed[i]
        s, e = user_bounds[user]
        XX_multi[np.arange(s, e), :] = scaler_multi.transform(transformer_multi.transform(chunks).todense())
        
        XX[i, :] = scaler.transform(transformer.transform([merge_entries(chunks)]).todense())[0, :]
        i += 1
    return XX, XX_multi, user_to_idx, user_bounds, x_shape, x_shape_multi

In [8]:
XX_A, XX_multi_A, user_to_idx_A, user_bounds_A, x_shape_A, x_shape_multi_A = vectorize(
    usernames_A,
    preprocessed_A,
    'temp_data/reddit_example/XX_A.npy',
    'temp_data/reddit_example/XX_A_multi.npy',
    transformer,
    scaler,
    transformer_multi,
    scaler_multi
)

Widget Javascript not detected.  It may not be installed or enabled properly.





In [9]:
XX_B, XX_multi_B, user_to_idx_B, user_bounds_B, x_shape_B, x_shape_multi_B = vectorize(
    usernames_B,
    preprocessed_B,
    'temp_data/reddit_example/XX_B.npy',
    'temp_data/reddit_example/XX_B_multi.npy',
    transformer,
    scaler,
    transformer_multi,
    scaler_multi
)

Widget Javascript not detected.  It may not be installed or enabled properly.





Predict
===

In [10]:
# create a list of users we are going to compare
jobs = []
for a in usernames_A:
    
    for b in usernames_B:
        
        jobs.append((a,b))

In [11]:
probs_single_doc = []

    
inter_probs_mean = []
inter_probs_std = []

intraA_probs_mean = []
intraA_probs_std = []

intraB_probs_mean = []
intraB_probs_std = []
pred_lengths = []
    
    
for user_a, user_b in tqdm(jobs):

    start_a, end_a = user_bounds_A[user_a]
    start_b, end_b = user_bounds_B[user_b]
    
    #Inter A - B
    l = []
    idxs = cartesian_product(range(start_a, end_a), range(start_b, end_b))
    x_diff = secondary_scaler_multi.transform(np.abs(XX_multi_A[idxs[:, 0]] - XX_multi_B[idxs[:, 1]]))
    x_diff[np.isnan(x_diff)]=0
    p = clf_multi.predict_proba(x_diff)[:, 1]
    inter_probs_mean.append(p.mean())
    inter_probs_std.append(p.std())
    l.append(len(p))
    
    # Intra A
    idxs = cartesian_product(range(start_a, end_a), range(start_a, end_a))
    idxs = np.array([(i, j) for i, j in idxs if i != j])
    x_diff = secondary_scaler_multi.transform(np.abs(XX_multi_A[idxs[:, 0]] - XX_multi_A[idxs[:, 1]]))
    x_diff[np.isnan(x_diff)]=0
    p = clf_multi.predict_proba(x_diff)[:, 1]
    intraA_probs_mean.append(p.mean())
    intraA_probs_std.append(p.std())
    l.append(len(p))
    
    # Intra B
    idxs = cartesian_product(range(start_b, end_b), range(start_b, end_b))
    idxs = np.array([(i, j) for i, j in idxs if i != j])
    x_diff = secondary_scaler_multi.transform(np.abs(XX_multi_B[idxs[:, 0]] - XX_multi_B[idxs[:, 1]]))
    x_diff[np.isnan(x_diff)]=0
    p = clf_multi.predict_proba(x_diff)[:, 1]
    intraB_probs_mean.append(p.mean())
    intraB_probs_std.append(p.std())
    l.append(len(p))
    
    pred_lengths.append(l)
    
    p = clf.predict_proba(secondary_scaler.transform(np.abs(XX_A[[user_to_idx_A[user_a]], :] - XX_B[[user_to_idx_B[user_b]], :])))[0, 1]
    probs_single_doc.append(p)

Widget Javascript not detected.  It may not be installed or enabled properly.





In [12]:

inter_probs_mean = np.array(inter_probs_mean)
intraA_probs_mean = np.array(intraA_probs_mean)
intraB_probs_mean = np.array(intraB_probs_mean)
inter_probs_std = np.array(inter_probs_std)
intraA_probs_std = np.array(intraA_probs_std)
intraB_probs_std = np.array(intraB_probs_std)
pred_lengths = np.array(pred_lengths)


probs_single_doc = np.array(probs_single_doc)

n_a = pred_lengths[:, 0]
n_b = pred_lengths[:, 1]
n_ab = pred_lengths[:, 2]

intra_probs_mean = (intraA_probs_mean * n_a + intraB_probs_mean * n_b)/ (n_a + n_b)
intra_probs_std = (
        n_a * (intraA_probs_std ** 2 + (intraA_probs_mean - intra_probs_mean)**2) + 
        n_b * (intraB_probs_std ** 2 + (intraB_probs_mean - intra_probs_mean)**2)
    ) / (n_a + n_b)


pooled_mean = (intra_probs_mean * (n_a + n_b) + inter_probs_mean * n_ab)/ (n_a + n_b + n_ab)
pooled_std = (
        (n_a + n_b) * (intra_probs_mean ** 2 + (intra_probs_mean - pooled_mean)**2) + 
        n_ab * (inter_probs_mean ** 2 + (inter_probs_mean - pooled_mean)**2)
    ) / (n_a + n_b + n_ab)

intra_inter_diff = (1 - np.abs(inter_probs_mean - intra_probs_mean))
aggr_score = probs_single_doc * intra_inter_diff

In [15]:
for i in np.argsort(-aggr_score):
    user_a, user_b = jobs[i]
    print(user_a, '- ', user_b, ': ', aggr_score[i])

user_C -  user_N :  0.9990441523225682
user_C -  user_L :  0.9211873959822429
user_A -  user_L :  0.528932056260764
user_A -  user_N :  0.0748376178937503
user_B -  user_L :  0.024674413381435834
user_B -  user_N :  0.01184139182919703
user_B -  user_M :  5.533341574170584e-05
user_A -  user_M :  8.172728688916886e-06
user_D -  user_N :  1.038952670412488e-06
user_C -  user_M :  4.0535912579965615e-07
user_D -  user_L :  1.6730644046796573e-07
user_D -  user_M :  9.598529600890168e-13
