In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Add the parent directory to the import path. Presumably this is where the
# project-local modules imported below (features, utills, train_utils) live —
# confirm against the repository layout.
sys.path.append("../")
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
import pickle
import glob
from tqdm.notebook import trange, tqdm
import json
import re
import pandas as pd
import numpy as np
from collections import defaultdict
from features import merge_entries
from sklearn.preprocessing import StandardScaler
from features import merge_entries, get_transformer


from utills import cartesian_product, chunker
from train_utils import  generate_doc_pairs_no_chunking, fit_transformers_no_chunking, vectorize_no_chunking


from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc

In [3]:
import matplotlib.pyplot as plt
import matplotlib.style as style
import tikzplotlib
%matplotlib inline

In [4]:

from plotly.offline import init_notebook_mode
import plotly.offline as py
import plotly.graph_objs as go
# Set up plotly's offline notebook rendering. With connected=True, plotly.js is
# loaded from the CDN rather than embedded in the notebook file.
init_notebook_mode(connected=True)

In [5]:
# Directory constants (all relative to the notebook's working directory).
# BASE_PATH and FIGURES_PATH are not referenced by any cell visible in this notebook.
BASE_PATH = '../data/reddit/'
# Source of metadata.p, train.jsonl and test.jsonl read by the cells below.
COMPUTED_DATA_PATH = '../temp_data/reddit/preprocessed/'
# Location of the XX_train.npy / XX_test.npy paths handed to vectorize_no_chunking
# and of the model.p / experiment_data.p pickles written at the end.
TEMP_DATA = '../temp_data/reddit/unchunked_limitted_data_40_capped/'
FIGURES_PATH = '../figures/'

In [6]:
# Passed as max_comments to fit_transformers_no_chunking and vectorize_no_chunking.
# Presumably caps the number of comments used per author — confirm in train_utils.
MAX_COMMENTS = 40

In [7]:
# Restore the preprocessing metadata: a 4-tuple unpacked into the train/test
# file lists, min_count and the author mapping used by the cells below.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# metadata.p produced by this project's own preprocessing step.
with open(COMPUTED_DATA_PATH + 'metadata.p', 'rb') as metadata_file:
    train_files, test_files, min_count, author_mapping_all = pickle.load(metadata_file)

In [8]:
# Display how many entries the loaded train_files collection holds.
len(train_files)

21383

Vectorized Data
===

In [8]:
# Hold out a random half of the training files. The draw uses a dedicated,
# seeded generator so the excluded set (and everything fitted on the remainder)
# is the same after every kernel restart; the global NumPy RNG is left untouched.
exclude_rng = np.random.RandomState(0)
exclude_users_root = exclude_rng.choice(train_files, size=int(len(train_files) * 0.5), replace=False)

# Rewrite each sampled file path into the key format used to index
# author_mapping_all: swap the data-directory prefix and drop the '.jsonl' suffix.
exclude_users_root = [
    u.replace('../data/reddit_2/', '/scratch/jnw301/av/data/reddit_2/reddit_2/').replace('.jsonl', '')
    for u in exclude_users_root
]

# Flatten the mapping: every id listed under an excluded key is excluded too.
exclude_users = [
    author
    for root in exclude_users_root
    for author in author_mapping_all[root]
]

In [9]:
# Hold out a random half of the test files. The draw uses a dedicated, seeded
# generator so the excluded set is the same after every kernel restart; the
# global NumPy RNG is left untouched.
exclude_test_rng = np.random.RandomState(0)
exclude_users_test_root = exclude_test_rng.choice(test_files, size=int(len(test_files) * 0.5), replace=False)

# Rewrite each sampled file path into the key format used to index
# author_mapping_all: swap the data-directory prefix and drop the '.jsonl' suffix.
exclude_users_test_root = [
    u.replace('../data/reddit_2/', '/scratch/jnw301/av/data/reddit_2/reddit_2/').replace('.jsonl', '')
    for u in exclude_users_test_root
]

# Flatten the mapping: every id listed under an excluded key is excluded too.
exclude_users_test = [
    author
    for root in exclude_users_test_root
    for author in author_mapping_all[root]
]

In [10]:
# Fit the feature transformer and the two scalers on the preprocessed training
# file (sample_fraction=0.1), leaving out the held-out users.
transformer, scaler, secondary_scaler = fit_transformers_no_chunking(
    COMPUTED_DATA_PATH + 'train.jsonl',
    author_mapping_all,
    exclude_users=exclude_users,
    max_comments=MAX_COMMENTS,
    sample_fraction=0.1,
)

Sampled: 7339
Reading preprocessed data...



Fitting transformer
Generating pairs
new2









6163 5543 3210
6163 1848 3210


In [11]:
# Vectorize the training split with the fitted transformer and scaler.
# The last two returned values (e, t) are reported in the next cell as the
# excluded entries and the total count.
XX_train, author_to_doc_idx, author_subreddit, x_shape, e, t = vectorize_no_chunking(
    transformer=transformer,
    scaler=scaler,
    preprocessed_path=COMPUTED_DATA_PATH + 'train.jsonl',
    vectorized_x_path=TEMP_DATA + 'XX_train.npy',
    exclude_users=exclude_users,
    max_comments=MAX_COMMENTS,
)

Precomputing record size...








In [12]:
# Report the size of e and the value of t as returned by the most recent
# vectorize_no_chunking call.
print('Excluded: ', len(e), 'Total:', t)

Excluded:  29246 Total: 83295


In [13]:
# Vectorize the test split with the transformer and scaler fitted on training data.
# e and t are overwritten here, so the following report refers to the test split.
XX_test, author_to_doc_idx_test, author_subreddit_test, x_shape_test, e, t = vectorize_no_chunking(
    transformer=transformer,
    scaler=scaler,
    preprocessed_path=COMPUTED_DATA_PATH + 'test.jsonl',
    vectorized_x_path=TEMP_DATA + 'XX_test.npy',
    exclude_users=exclude_users_test,
    max_comments=MAX_COMMENTS,
)

Precomputing record size...








In [15]:
# Report the size of e and the value of t as returned by the most recent
# vectorize_no_chunking call.
print('Excluded: ', len(e), 'Total:', t)

Excluded:  21339 Total: 42961


In [16]:
def _index_authors(doc_idx_by_author, subreddit_by_author):
    """Build the lookup tables needed for pair generation from one data split.

    Parameters
    ----------
    doc_idx_by_author : mapping
        Only its keys are read. Each key is an author id that must end in an
        underscore followed by one or more uppercase letters; the part before
        that suffix is treated as the root id.
    subreddit_by_author : mapping
        Maps an author id to the value it is grouped under (a subreddit, going
        by the variable names).

    Returns
    -------
    tuple
        ``(root_to_authors, author_to_root, group_to_authors)`` where
        ``root_to_authors`` is a ``defaultdict(set)`` of root id -> author ids,
        ``author_to_root`` is a ``dict`` of author id -> root id, and
        ``group_to_authors`` is a ``defaultdict(list)`` of group -> author ids.

    Raises
    ------
    ValueError
        If an author id does not carry the expected uppercase suffix.
    """
    suffix_pattern = re.compile(r'(.*)_[A-Z]+$')

    root_to_authors = defaultdict(set)
    author_to_root = {}
    for author in doc_idx_by_author.keys():
        match = suffix_pattern.search(author)
        if match is None:
            # Fail with the offending id instead of an opaque AttributeError on None.
            raise ValueError('Author id without an uppercase suffix: %r' % (author,))
        root = match.group(1)
        root_to_authors[root].add(author)
        author_to_root[author] = root

    group_to_authors = defaultdict(list)
    for author, group in subreddit_by_author.items():
        group_to_authors[group].append(author)

    return root_to_authors, author_to_root, group_to_authors


# Same indexing for both splits; the train and test tables stay separate.
author_mapping, author_to_root, subreddit_to_author = _index_authors(
    author_to_doc_idx, author_subreddit
)
author_mapping_test, author_to_root_test, subreddit_to_author_test = _index_authors(
    author_to_doc_idx_test, author_subreddit_test
)

In [17]:
# Distinct root ids per split (np.unique also sorts them).
train_authors = np.unique([root for root in author_to_root.values()])
test_authors = np.unique([root for root in author_to_root_test.values()])

# The total is the plain sum of the two counts, i.e. it treats the splits as disjoint.
print('Total number of authors:', train_authors.size + test_authors.size)
print('Train authors:', train_authors.size)
print('Test authors:', test_authors.size)

Total number of authors: 27781
Train authors: 19786
Test authors: 7995


In [18]:
# Generate the pair index array and label vector for each split. The two
# columns of X_idxs_* are later used as row indices into XX_*, and Y_* as the
# ground truth for the ROC computation.
X_idxs_train, Y_train = generate_doc_pairs_no_chunking(
    author_mapping,
    subreddit_to_author,
    author_to_root,
    author_to_doc_idx,
    author_subreddit,
)
X_idxs_test, Y_test = generate_doc_pairs_no_chunking(
    author_mapping_test,
    subreddit_to_author_test,
    author_to_root_test,
    author_to_doc_idx_test,
    author_subreddit_test,
)

new2









59611 54049 47631
59611 17883 41727
new2









23414 21622 16925
23414 7024 16389


Train the classifier
===

In [19]:
batch_sz = 10000
# Fixed monitoring sample: scaled absolute feature differences for the first
# batch_sz test pairs, evaluated after every training batch.
x_test_diff_sample = secondary_scaler.transform(np.abs(XX_test[X_idxs_test[:batch_sz, 0]] - XX_test[X_idxs_test[:batch_sz, 1]]))
# Zero out NaNs, as is already done for the training batches and for the final
# full-test evaluation, so all three feature matrices are prepared the same way
# and predict_proba never receives NaN input.
x_test_diff_sample[np.isnan(x_test_diff_sample)] = 0
y_test_sample = Y_test[:batch_sz]

In [20]:
# Display the number of generated training pairs.
len(X_idxs_train)

119221

In [21]:
batch_sz = 10000
N_EPOCHS = 100
# Training uses only the first 40000 pairs. The count is clamped to the number
# of available pairs so a smaller pair set cannot index out of range.
N_TRAIN_PAIRS = min(40000, len(X_idxs_train))

# random_state pins the per-batch shuffling done by SGD so runs are reproducible.
# NOTE(review): loss='log' is the spelling accepted by older scikit-learn;
# releases from 1.1 on call it 'log_loss' and later ones drop 'log' entirely.
# Switch the string when upgrading.
clf = SGDClassifier(loss='log', alpha=0.01, random_state=0)
aucs = []  # AUC on the monitoring sample after the last batch of each epoch
for i in range(N_EPOCHS):
    for idxs in chunker(np.arange(N_TRAIN_PAIRS), batch_sz):
        # Features of a pair = scaled absolute difference of the two document vectors.
        x_diff = secondary_scaler.transform(np.abs(XX_train[X_idxs_train[idxs, 0]] - XX_train[X_idxs_train[idxs, 1]]))
        x_diff[np.isnan(x_diff)] = 0
        y = Y_train[idxs]
        # classes must be supplied because a single batch may not contain both labels.
        clf.partial_fit(x_diff, y, classes=[0, 1])

        # Monitor progress on the fixed test sample after every batch.
        probs = clf.predict_proba(x_test_diff_sample)[:, 1]

        fpr, tpr, thresh = roc_curve(y_test_sample, probs)
        roc_auc = auc(fpr, tpr)
        print('AUC:', roc_auc)
    print('~'*20, 'Epoch: ', i)
    aucs.append(roc_auc)

AUC: 0.874771538684063
AUC: 0.8948091341006124
AUC: 0.9065100214977239
AUC: 0.9095398389494723
~~~~~~~~~~~~~~~~~~~~ Epoch:  0
AUC: 0.9104860043993854
AUC: 0.9130368190920781
AUC: 0.9187625520723
AUC: 0.9151866714752277
~~~~~~~~~~~~~~~~~~~~ Epoch:  1
AUC: 0.9195409365557945
AUC: 0.9210589652996404
AUC: 0.9235585996975343
AUC: 0.9199230587568185
~~~~~~~~~~~~~~~~~~~~ Epoch:  2
AUC: 0.9257221321594813
AUC: 0.9236948004820508
AUC: 0.9242083634401734
AUC: 0.9244043045687942
~~~~~~~~~~~~~~~~~~~~ Epoch:  3
AUC: 0.9251002085772015
AUC: 0.9226481544533697
AUC: 0.9257160521244603
AUC: 0.9262457351754346
~~~~~~~~~~~~~~~~~~~~ Epoch:  4
AUC: 0.9244795650022944
AUC: 0.9258404928412388
AUC: 0.9287907898349494
AUC: 0.9264398962938026
~~~~~~~~~~~~~~~~~~~~ Epoch:  5
AUC: 0.9284207077032762
AUC: 0.9292430724400973
AUC: 0.9327363325612755
AUC: 0.9312553640308968
~~~~~~~~~~~~~~~~~~~~ Epoch:  6
AUC: 0.9300559171220827
AUC: 0.9303998791033037
AUC: 0.9329972140639531
AUC: 0.9315968059976025
~~~~~~~~~~~~~~~~~~~

AUC: 0.941257261641827
AUC: 0.9411893012503754
~~~~~~~~~~~~~~~~~~~~ Epoch:  64
AUC: 0.9414377426813979
AUC: 0.9409428198306422
AUC: 0.9414103025233425
AUC: 0.9414356226691866
~~~~~~~~~~~~~~~~~~~~ Epoch:  65
AUC: 0.9414371826781722
AUC: 0.940951779882252
AUC: 0.9411729011559106
AUC: 0.9410431804087191
~~~~~~~~~~~~~~~~~~~~ Epoch:  66
AUC: 0.9413825823636744
AUC: 0.9409525798868601
AUC: 0.9409013395917161
AUC: 0.9411359809432501
~~~~~~~~~~~~~~~~~~~~ Epoch:  67
AUC: 0.9412288614782423
AUC: 0.9411499810238906
AUC: 0.9412976218743021
AUC: 0.9411743411642051
~~~~~~~~~~~~~~~~~~~~ Epoch:  68
AUC: 0.9413416621279739
AUC: 0.9410975407218345
AUC: 0.9413473021604605
AUC: 0.941231141491375
~~~~~~~~~~~~~~~~~~~~ Epoch:  69
AUC: 0.9411879012423111
AUC: 0.9411322609218229
AUC: 0.9411141808176814
AUC: 0.9414376226807066
~~~~~~~~~~~~~~~~~~~~ Epoch:  70
AUC: 0.9413866223869449
AUC: 0.9414099425212689
AUC: 0.9412922218431979
AUC: 0.9413851823786504
~~~~~~~~~~~~~~~~~~~~ Epoch:  71
AUC: 0.9414019024749583
AUC

In [22]:
# Plot the per-epoch AUC history. The title and axis labels let the figure
# stand on its own when the notebook is skimmed.
fig = go.Figure()
fig.add_trace(go.Scatter(y=aucs))
# update_layout returns the figure, so as the last expression it is displayed.
fig.update_layout(
    title='AUC on the test sample after each epoch',
    xaxis_title='Epoch',
    yaxis_title='AUC',
)


In [23]:
# Display the output directory the following cells write their pickles to.
TEMP_DATA

'../temp_data/reddit/unchunked_limitted_data_40_capped/'

In [24]:
# Persist the trained classifier, the fitted preprocessing objects and the
# per-epoch AUC history as a single pickled tuple.
model_bundle = (clf, transformer, scaler, secondary_scaler, aucs)
with open(TEMP_DATA + 'model.p', 'wb') as model_file:
    pickle.dump(model_bundle, model_file)

In [25]:
# Persist the lookup tables, matrix shapes and generated pairs/labels of both
# splits as one pickled tuple. The element order is the file's format.
experiment_data = (
    author_to_doc_idx,
    author_to_doc_idx_test,
    author_subreddit,
    author_subreddit_test,
    x_shape,
    x_shape_test,
    X_idxs_train,
    Y_train,
    X_idxs_test,
    Y_test,
)
with open(TEMP_DATA + 'experiment_data.p', 'wb') as experiment_file:
    pickle.dump(experiment_data, experiment_file)

In [None]:
# Score every test pair (not just the monitoring sample) and report the AUC.
first_doc, second_doc = X_idxs_test[:, 0], X_idxs_test[:, 1]
x_diff = secondary_scaler.transform(np.abs(XX_test[first_doc] - XX_test[second_doc]))
x_diff[np.isnan(x_diff)] = 0

# Column 1 of predict_proba is the probability of the positive class (label 1).
probs = clf.predict_proba(x_diff)[:, 1]
fpr, tpr, thresh = roc_curve(Y_test, probs)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

In [None]:
# Display the AUC recorded at the end of the final training epoch.
aucs[-1]