In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../")


In [2]:
import pickle
import glob
from tqdm.notebook import trange, tqdm
import json
import numpy as np
import re
from collections import defaultdict
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc, precision_score, recall_score, average_precision_score
import random
from train_utils import generate_doc_pairs_no_chunking
from utills import chunker, cartesian_product, ReservoirSample
from sklearn.linear_model import SGDClassifier

In [3]:
from matplotlib import rcParams
import matplotlib.pyplot as plt
import matplotlib.style as style

In [4]:
from plotly.offline import init_notebook_mode
import plotly.offline as py
import plotly.graph_objs as go
init_notebook_mode(connected=True)

In [5]:
# BASE_PATH = '../data/reddit_2/'
COMPUTED_DATA_PATH = '../temp_data/reddit_old_dec/preprocessed/'
chunk_sz=20

In [5]:
 
with open('../temp_data/reddit_old_dec/unchunked/model.p', 'rb') as f:
    (clf_nc, transformer_nc, scaler_nc, secondary_scaler_nc, _) = pickle.load(f)
    
with open('../temp_data/reddit_old_dec/multidoc_20/model_20.p', 'rb') as f:
    (clf, transformer, scaler, secondary_scaler, _) = pickle.load(f)

In [7]:
with open('../temp_data/reddit_old_dec/unchunked/experiment_data.p', 'rb') as f:
    (_, author_to_doc_idx_nc,  _, author_subreddit_nc, _, x_shape_nc, _, _, _, _) = pickle.load(f)
with open('../temp_data/reddit_old_dec/multidoc_20/experiment_data20.p', 'rb') as f:
    (_, author_bounds_c, _, author_subreddit_c, _, x_shape_c, _, _, _, _) = pickle.load(f)

In [8]:
XX_nc = np.memmap('../temp_data/reddit_old_dec/unchunked/XX_test.npy', dtype='float32', mode='r', shape=x_shape_nc)
XX_c = np.memmap('../temp_data/reddit_old_dec/multidoc_20/XX_test_20.npy', dtype='float32', mode='r', shape=x_shape_c)

Generate Test Pairs
===

In [9]:
doc_idx_to_author = {v:k for k, v in author_to_doc_idx_nc.items()}
author_mapping = defaultdict(set)
author_to_root = {}
for y in author_to_doc_idx_nc.keys():
    u = re.search(r'(.*)_[A-Z]+$', y).group(1)
    author_mapping[u].add(y)
    author_to_root[y] = u

subreddit_to_author = defaultdict(list)
for k, v in author_subreddit_nc.items():
    subreddit_to_author[v].append(k)

In [12]:
X_idxs_pos, X_idxs_neg_diff_topic, X_idxs_neg_same_topic = generate_doc_pairs_no_chunking(author_mapping, subreddit_to_author, author_to_root, author_to_doc_idx_nc, author_subreddit_c, return_all=True)

new2









45652 42961 36920


Make Predictions
===

In [16]:
def predict(X_idxs):
    probs_nc = []


    inter_probs_mean = []
    inter_probs_std = []

    intraA_probs_mean = []
    intraA_probs_std = []

    intraB_probs_mean = []
    intraB_probs_std = []
    pred_lengths = []


    for i, j in tqdm(X_idxs):
        user_a = doc_idx_to_author[i]
        user_b = doc_idx_to_author[j]

        start_a, end_a = author_bounds_c[user_a]
        start_b, end_b = author_bounds_c[user_b]

        l = []
        idxs = cartesian_product(range(start_a, end_a), range(start_b, end_b))
        x_diff = secondary_scaler.transform(np.abs(XX_c[idxs[:, 0]] - XX_c[idxs[:, 1]]))
        x_diff[np.isnan(x_diff)]=0
        p = clf.predict_proba(x_diff)[:, 1]
        inter_probs_mean.append(p.mean())
        inter_probs_std.append(p.std())
        l.append(len(p))

        idxs = cartesian_product(range(start_a, end_a), range(start_a, end_a))
        idxs = np.array([(i, j) for i, j in idxs if i != j])
        x_diff = secondary_scaler.transform(np.abs(XX_c[idxs[:, 0]] - XX_c[idxs[:, 1]]))
        x_diff[np.isnan(x_diff)]=0
        p = clf.predict_proba(x_diff)[:, 1]
        intraA_probs_mean.append(p.mean())
        intraA_probs_std.append(p.std())
        l.append(len(p))

        idxs = cartesian_product(range(start_b, end_b), range(start_b, end_b))
        idxs = np.array([(i, j) for i, j in idxs if i != j])
        x_diff = secondary_scaler.transform(np.abs(XX_c[idxs[:, 0]] - XX_c[idxs[:, 1]]))
        x_diff[np.isnan(x_diff)]=0
        p = clf.predict_proba(x_diff)[:, 1]
        intraB_probs_mean.append(p.mean())
        intraB_probs_std.append(p.std())
        l.append(len(p))

        pred_lengths.append(l)

        p = clf_nc.predict_proba(secondary_scaler_nc.transform(np.abs(XX_nc[[i], :] - XX_nc[[j], :])))[0, 1]
        probs_nc.append(p)
        

    inter_probs_mean = np.array(inter_probs_mean)
    intraA_probs_mean = np.array(intraA_probs_mean)
    intraB_probs_mean = np.array(intraB_probs_mean)
    inter_probs_std = np.array(inter_probs_std)
    intraA_probs_std = np.array(intraA_probs_std)
    intraB_probs_std = np.array(intraB_probs_std)
    pred_lengths = np.array(pred_lengths)


    probs_nc = np.array(probs_nc)

    n_a = pred_lengths[:, 0]
    n_b = pred_lengths[:, 1]
    n_ab = pred_lengths[:, 2]

    intra_probs_mean = (intraA_probs_mean * n_a + intraB_probs_mean * n_b)/ (n_a + n_b)
    intra_probs_std = (
            n_a * (intraA_probs_std ** 2 + (intraA_probs_mean - intra_probs_mean)**2) + 
            n_b * (intraB_probs_std ** 2 + (intraB_probs_mean - intra_probs_mean)**2)
        ) / (n_a + n_b)


    pooled_mean = (intra_probs_mean * (n_a + n_b) + inter_probs_mean * n_ab)/ (n_a + n_b + n_ab)
    pooled_std = (
            (n_a + n_b) * (intra_probs_mean ** 2 + (intra_probs_mean - pooled_mean)**2) + 
            n_ab * (inter_probs_mean ** 2 + (inter_probs_mean - pooled_mean)**2)
        ) / (n_a + n_b + n_ab)

    aggr_score = (probs_nc * (1 - np.abs(inter_probs_mean - intra_probs_mean)))
    return aggr_score

In [18]:
pos_preds = predict(X_idxs_pos)
neg_diff_topic_preds = predict(X_idxs_neg_diff_topic)
neg_same_topic_preds = predict(X_idxs_neg_same_topic)










In [19]:
with open('../temp_data/reddit_old_dec/class_imbalance/experiment_data.p', 'wb') as f:
    pickle.dump((
        X_idxs_pos,
        X_idxs_neg_diff_topic,
        X_idxs_neg_same_topic,
        pos_preds,
        neg_diff_topic_preds,
        neg_same_topic_preds
    ), f)

In [55]:
p = np.random.choice(np.arange(len(X_idxs_neg_diff_topic)), size=len(X_idxs_neg_diff_topic), replace=False)
neg_diff_topic_preds_sampled = neg_diff_topic_preds[p]

p = np.random.choice(np.arange(len(X_idxs_neg_same_topic)), size=len(X_idxs_neg_same_topic), replace=False)
neg_same_topic_preds_sampled = neg_same_topic_preds[p]

neg_preds = np.concatenate([neg_diff_topic_preds_sampled, neg_same_topic_preds_sampled])

# preds = np.concatenate([pos_preds, neg_diff_topic_preds_sampled, neg_same_topic_preds_sampled])
# labels = np.array([1] * len(pos_preds) + [0] * len(neg_diff_topic_preds_sampled) + [0] * len(neg_same_topic_preds_sampled))

In [56]:
len(X_idxs_neg_diff_topic), len(X_idxs_neg_same_topic), len(neg_preds)

(42961, 36920, 79881)

In [191]:
total_recs = 79000
pos_fracs = [0.5, 0.25, 0.1, 0.05, 0.01, 0.005]
# fig = plt.figure()

for frac in pos_fracs:
    roc_aucs = []
    pr_aucs = []
    r_at_ps = []
    precision_scores = []
    recall_scores = []
    for _ in range(10):
        pos_preds_sampled = np.random.choice(pos_preds, int(total_recs * frac), replace=False)
        neg_preds_sampled = np.random.choice(neg_preds, int(total_recs * (1-frac)), replace=False)

        preds = np.concatenate([pos_preds_sampled, neg_preds_sampled])
        labels = np.array([1] * len(pos_preds_sampled) + [0] * len(neg_preds_sampled))
#         print(len(labels), labels.sum())
        precision, recall, thresh = precision_recall_curve(labels, preds)
    #         plt.plot(precision, recall, label=str(frac))
        _, r_at_p = recall_at_precision(precision, recall, precision_value=0.90)
        # plt.plot([p_at_r_x], [p_at_r_y], marker='o')
        pr_auc = average_precision_score(labels, preds)

        fpr, tpr, thresh = roc_curve(labels, preds)
        roc_auc = auc(fpr, tpr)

        precision = precision_score(labels, preds > 0.5)
        recall = recall_score(labels, preds > 0.5)

        roc_aucs.append(roc_auc)
        pr_aucs.append(pr_auc)
        r_at_ps.append(r_at_p)
        precision_scores.append(precision)
        recall_scores.append(recall)
        
#     print(frac, 
#           'AUC:', round(np.mean(roc_aucs), 4), 
#           'PR_AUC:', round(np.mean(pr_aucs), 4), 
#           'R@P90:', np.mean(r_at_ps),
#           'P:', np.mean(precision_scores), 
#           'R:', np.mean(recall_scores)
#          )
    print(frac, 
          round(np.mean(roc_aucs), 3), '  &  ',
          round(np.mean(pr_aucs), 3), '  &  ',
          round(np.mean(r_at_ps), 3),'  &  ',
          round(np.mean(precision_scores), 3), '  &  ',
          round(np.mean(recall_scores), 3), '  &  '
         )
# fig.legend(loc='lower left', bbox_to_anchor=(0.1, 0.15))

# plt.tight_layout()
# plt.show()

0.5 0.984   &   0.982   &   0.973   &   0.947   &   0.933   &  
0.25 0.984   &   0.954   &   0.89   &   0.857   &   0.932   &  
0.1 0.984   &   0.89   &   0.675   &   0.667   &   0.933   &  
0.05 0.984   &   0.815   &   0.412   &   0.487   &   0.932   &  
0.01 0.984   &   0.541   &   0.011   &   0.155   &   0.934   &  
0.005 0.984   &   0.409   &   0.001   &   0.083   &   0.935   &  


In [178]:
precision, recall, thresh = precision_recall_curve(labels, preds)

In [41]:
def recall_at_precision(precisions, recalls, precision_value=0.9):
    res = np.argwhere(precisions > precision_value)
    idx = 0
    if len(res) > 0:
        idx = res[0][0]
    return precisions[idx], recalls[idx]

Retrain Classifier with Different Class Weights
===

In [87]:
with open('../temp_data/reddit_old_dec/unchunked/experiment_data.p', 'rb') as f:
    (
        author_to_doc_idx, 
        author_to_doc_idx_test, 
        author_subreddit, 
        author_subreddit_test, 
        x_shape, 
        x_shape_test,
        X_idxs_train,
        Y_train,
        X_idxs_test,
        Y_test
    ) = pickle.load(f)
    
XX_train = np.memmap('../temp_data/reddit_old_dec/unchunked/XX_train.npy', dtype='float32', mode='r', shape=x_shape)


In [91]:
total_recs = 10000
frac = 0.01
X_idxs_neg = np.concatenate([X_idxs_neg_diff_topic, X_idxs_neg_same_topic])
p = np.random.choice(range(len(X_idxs_neg)), int(total_recs * (1 - frac)), replace=False)
X_idxs_neg_sampled = X_idxs_neg[p]
p = np.random.choice(range(len(X_idxs_pos)), int(total_recs * frac), replace=False)
X_idxs_pos_sampled = X_idxs_pos[p]

X_idxs_test_sample = np.concatenate([X_idxs_pos_sampled, X_idxs_neg_sampled])
y_test_sample = np.array([1] * len(X_idxs_pos_sampled) + [0] * len(X_idxs_neg_sampled))

x_test_diff_sample = secondary_scaler_nc.transform(np.abs(XX_nc[X_idxs_test_sample[:, 0]] - XX_nc[X_idxs_test_sample[:, 1]]))

In [113]:
batch_sz = 10000
classifiers = {}
weights = [
    {0: 0.5, 1:0.5},
    {0: 0.9, 1: 0.1},
    {0: 0.91, 1: 0.09},
    {0 :0.1, 1: 0.9}
]
curves = {}
for w in weights:
    print('WEIGHT:', w)
    clf_new = SGDClassifier(loss='log', alpha=0.01, class_weight=w)
    aucs = []
    pr_aucs = []
    for i in range(20):
        for idxs in chunker(np.arange(len(X_idxs_train)), batch_sz):
            x_diff = secondary_scaler_nc.transform(np.abs(XX_train[X_idxs_train[idxs, 0]] - XX_train[X_idxs_train[idxs, 1]]))
            x_diff[np.isnan(x_diff)]=0
            y = Y_train[idxs]
            clf_new.partial_fit(x_diff, y, classes=[0, 1])

            probs = clf_new.predict_proba(x_test_diff_sample)[:, 1]

            fpr, tpr, thresh = roc_curve(y_test_sample, probs)
            roc_auc = auc(fpr, tpr)
            pr_auc = average_precision_score(y_test_sample, probs)

            print('AUC:', roc_auc, 'PR_AUC:', pr_auc)
        print('~'*20, 'Epoch: ', i)
        aucs.append(roc_auc)
        pr_aucs.append(pr_auc)
    classifiers[str(w)] = clf_new
    curves[str(w)] = (aucs, pr_aucs)

WEIGHT: {0: 0.1, 1: 0.9}
AUC: 0.8862848484848485 PR_AUC: 0.045808835047057454
AUC: 0.9205318181818181 PR_AUC: 0.06615292727442247
AUC: 0.9243131313131312 PR_AUC: 0.09388921595046973
AUC: 0.9442601010101009 PR_AUC: 0.11368438350198225
AUC: 0.9385722222222223 PR_AUC: 0.10229066896897222
AUC: 0.9469358585858586 PR_AUC: 0.11150689720197171
AUC: 0.9429055555555556 PR_AUC: 0.13671573280731833
AUC: 0.9413338383838384 PR_AUC: 0.13748901929694618
AUC: 0.9497085858585859 PR_AUC: 0.12749633665593127
AUC: 0.9522565656565657 PR_AUC: 0.14072232510675206
AUC: 0.9519424242424241 PR_AUC: 0.1405421670456858
AUC: 0.9496277777777778 PR_AUC: 0.13565259579355407
AUC: 0.9538808080808081 PR_AUC: 0.15265196323097635
AUC: 0.952189898989899 PR_AUC: 0.14524652949725586
AUC: 0.9509636363636363 PR_AUC: 0.1710925049594791
AUC: 0.9529515151515152 PR_AUC: 0.1601118456095976
AUC: 0.9480328282828283 PR_AUC: 0.16065370148195854
AUC: 0.9499065656565657 PR_AUC: 0.15559226417849717
AUC: 0.9489449494949495 PR_AUC: 0.14984205

AUC: 0.9463797979797979 PR_AUC: 0.1574988095552654
AUC: 0.9469853535353534 PR_AUC: 0.1605731078709888
AUC: 0.9469161616161617 PR_AUC: 0.1721381791897677
AUC: 0.9469489898989899 PR_AUC: 0.17369382319716484
AUC: 0.9457141414141415 PR_AUC: 0.16476549172820268
AUC: 0.9461575757575758 PR_AUC: 0.16234073231926643
AUC: 0.9474292929292929 PR_AUC: 0.15894338269071653
AUC: 0.9471848484848485 PR_AUC: 0.15782597461906625
AUC: 0.9473939393939393 PR_AUC: 0.15856657194061544
AUC: 0.9471959595959597 PR_AUC: 0.1537350182913365
AUC: 0.9469580808080807 PR_AUC: 0.15539040094919082
AUC: 0.9469500000000001 PR_AUC: 0.15290977611692189
AUC: 0.946679797979798 PR_AUC: 0.16039227082592789
AUC: 0.9473515151515152 PR_AUC: 0.16493226911226058
AUC: 0.9469272727272727 PR_AUC: 0.16758919804051084
AUC: 0.9466878787878789 PR_AUC: 0.16874889162528722
AUC: 0.9459040404040403 PR_AUC: 0.1603416613630527
~~~~~~~~~~~~~~~~~~~~ Epoch:  8
AUC: 0.9477318181818181 PR_AUC: 0.1697468968477824
AUC: 0.9480247474747474 PR_AUC: 0.167771

AUC: 0.9469909090909091 PR_AUC: 0.168396497609875
AUC: 0.9471252525252526 PR_AUC: 0.1702400081334907
AUC: 0.947169191919192 PR_AUC: 0.17421485621453195
AUC: 0.9472787878787878 PR_AUC: 0.17006458761939558
AUC: 0.9479621212121211 PR_AUC: 0.17028030014124257
AUC: 0.9471737373737374 PR_AUC: 0.16155636389357195
AUC: 0.9472560606060606 PR_AUC: 0.1622548893773363
AUC: 0.9471813131313132 PR_AUC: 0.1588350852567743
AUC: 0.9472272727272727 PR_AUC: 0.16234055980288367
AUC: 0.9475929292929293 PR_AUC: 0.16353896449135708
AUC: 0.9471313131313132 PR_AUC: 0.16429058474027014
AUC: 0.9473247474747474 PR_AUC: 0.1665725120541071
AUC: 0.947109090909091 PR_AUC: 0.1687240630145314
AUC: 0.9471020202020203 PR_AUC: 0.17008034844722994
AUC: 0.9463686868686869 PR_AUC: 0.16201915593285535
~~~~~~~~~~~~~~~~~~~~ Epoch:  16
AUC: 0.9475323232323232 PR_AUC: 0.17207984749135954
AUC: 0.9478232323232324 PR_AUC: 0.16943277338613452
AUC: 0.9468550505050505 PR_AUC: 0.16192731653189993
AUC: 0.9471020202020202 PR_AUC: 0.1638770

In [108]:

total_recs = 75000
frac = 0.01
X_idxs_neg = np.concatenate([X_idxs_neg_diff_topic, X_idxs_neg_same_topic])
p = np.random.choice(range(len(X_idxs_neg)), int(total_recs * (1 - frac)), replace=False)
X_idxs_neg_sampled = X_idxs_neg[p]
p = np.random.choice(range(len(X_idxs_pos)), int(total_recs * frac), replace=False)
X_idxs_pos_sampled = X_idxs_pos[p]

X_idxs_test = np.concatenate([X_idxs_pos_sampled, X_idxs_neg_sampled])
Y_test = np.array([1] * len(X_idxs_pos_sampled) + [0] * len(X_idxs_neg_sampled))


In [114]:
for w, clf in classifiers.items():
    probs = []
    for idxs in chunker(X_idxs_test, batch_sz):
        x_diff = secondary_scaler_nc.transform(np.abs(XX_nc[idxs[:, 0]] - XX_nc[idxs[:, 1]]))
        p = clf.predict_proba(x_diff)[:, 1]
        probs.extend(p)
    fpr, tpr, thresh = roc_curve(Y_test, probs)
    roc_auc = auc(fpr, tpr)
    pr_auc = average_precision_score(Y_test, probs)
    print('W:', w, 'AUC:', round(roc_auc, 3), round(pr_auc, 3))

W: {0: 0.5, 1: 0.5} AUC: 0.981 0.375
W: {0: 0.9, 1: 0.1} AUC: 0.961 0.264
W: {0: 0.91, 1: 0.09} AUC: 0.957 0.26
W: {0: 0.1, 1: 0.9} AUC: 0.95 0.167
