In [1]:
%matplotlib inline
import pickle
import urllib
import time
import feedparser
import itertools
from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import scipy.misc
from tqdm import tqdm

	"backend      : $TEMPLATE_BACKEND
"
	in file "/Users/michaelfarrell/.matplotlib/matplotlibrc"
	Key backend: Unrecognized backend string "$template_backend": valid strings are [u'pgf', u'cairo', u'MacOSX', u'CocoaAgg', u'gdk', u'ps', u'GTKAgg', u'nbAgg', u'GTK', u'Qt5Agg', u'template', u'emf', u'GTK3Cairo', u'GTK3Agg', u'WX', u'Qt4Agg', u'TkAgg', u'agg', u'svg', u'GTKCairo', u'WXAgg', u'WebAgg', u'pdf']
  (val, error_details, msg))


## Load in data

In [2]:
category = 'astro-ph'


def _load_pickle(suffix):
    """Unpickle the file named ``category + suffix`` and return its contents.

    The file is opened in binary mode and closed as soon as it has been read.
    """
    # NOTE(review): pickle.load can execute arbitrary code -- only load files
    # that were produced locally by a trusted pipeline.
    with open(category + suffix, 'rb') as handle:
        return pickle.load(handle)


entries = _load_pickle('_entries.pkl')
author_ind = _load_pickle('_author_ind.pkl')
train_adj_list = _load_pickle('_train_adj_list.pkl')
test_adj_list = _load_pickle('_test_adj_list.pkl')
train_adj_list_w_year = _load_pickle('_train_adj_list_with_year.pkl')
test_adj_list_w_year = _load_pickle('_test_adj_list_with_year.pkl')

num_authors = len(author_ind)
authors = range(num_authors)

# Every unordered author pair.  combinations() over an ascending range already
# yields (a1, a2) with a1 < a2, so no min/max normalisation is needed.
possible_edges = list(itertools.combinations(authors, 2))

# Pairs not linked in the training adjacency list, and the test links that are
# absent from the training adjacency list.
pos_edges = set(possible_edges) - set(train_adj_list)
pred_edges = set(test_adj_list) - set(train_adj_list)

# Distinct values found in position 2 of each record, in ascending order.
train_years = sorted(set(record[2] for record in train_adj_list_w_year))
test_years = sorted(set(record[2] for record in test_adj_list_w_year))

## Split the edges up by year

In [3]:
# Group the records of test_adj_list_w_year by the year stored in position 2,
# in a single pass.  Each value is a list holding the first two fields of every
# record for that year, in their original order.  test_years was derived from
# the same records, so every year encountered here already has a bucket.
edges_by_year = dict((year, []) for year in test_years)
for record in test_adj_list_w_year:
    edges_by_year[record[2]].append(record[:2])

In [4]:
# Show how many edges fall in each year.  A bare expression inside a loop is
# only echoed when IPython's ast_node_interactivity is set to 'all', so print
# the (year, count) tuple explicitly.
for year in test_years:
    print((year, len(edges_by_year[year])))

(1996, 1358)

(1997, 2250)

(1998, 5716)

(1999, 4751)

(2000, 4889)

(2001, 2948)

## Train and validation years

In [5]:
# The first three entries of train_years are used for fitting and the fourth is
# held out for validation.
# NOTE(review): edges_by_year is keyed by test_years, so these years must also
# appear there -- confirm the two year lists overlap as intended.
years_for_train, valid_year = train_years[:3], train_years[3]
years_for_train

[1996, 1997, 1998]

## Build the set of edges in the training set

In [6]:
# Union of the edge lists of every training year, with duplicates collapsed.
train = set().union(*[edges_by_year[year] for year in years_for_train])
n_train = len(train)
n_train

9324

## Label every potential edge (1 if it is in the training set, 0 otherwise)

In [7]:
# One 0/1 label per candidate pair: 1 when the pair is a training edge.
targets = list(int(pair in train) for pair in possible_edges)
# List of ((i, j), label) items; kept as a list because it is shuffled in place later.
zipped_train_input = list(zip(possible_edges, targets))

## Validation set: edges that first appear in the validation year

In [8]:
# Edges of the validation year that are not already in the training set.
valid = set(edges_by_year[valid_year]) - train
n_valid = len(valid)
n_valid

4751

## The set of edges to be predicted

In [9]:
# Candidate pairs that carry label 0 in the training input.
potential_new_edges = set(pair for pair, label in zipped_train_input if label == 0)
# Re-label each candidate: 1 when it is in the validation set, 0 otherwise.
zipped_valid_input = list((pair, 1) if pair in valid else (pair, 0)
                          for pair in potential_new_edges)
# Split the validation input by label.
valid_no_edges = list(item for item in zipped_valid_input if item[1] == 0)
valid_yes_edges = list(item for item in zipped_valid_input if item[1] == 1)

## Matrix Factorization

In [50]:
k = 10          # Number of latent features per author
gammaU = 1e-3   # Step-size multiplier for the factor (U) updates
gammaB = 1e-3   # Step-size multiplier for the bias (B) updates
lambdaU = 1e-1  # Weight of the penalty on U
# Rank cut-off used by top_acc: as many predictions as there are validation
# edges.  Derived from n_valid (4751 for the run shown here) so that it follows
# the data instead of going stale when the inputs change.
TOP_K = n_valid
# Seed the global NumPy generator so the initialisation below (and the later
# shuffling of the training pairs) is reproducible across runs.
SEED = 0
np.random.seed(SEED)
U = np.random.rand(num_authors, k)  # Latent factors, one row of length k per author
B = np.random.rand(num_authors)     # One scalar bias per author

In [51]:
def score(vals):
    """Return the mean squared prediction error over `vals` plus the penalty on U.

    Args:
        vals: sequence (it must support len()) of ((i, j), target) items, where
            i and j index rows of the module-level U and entries of the
            module-level B.

    Returns:
        mean((U[i].U[j] + B[i] + B[j] - target)**2) + lambdaU * ||U||_F
        The result is nan (with a NumPy warning) when `vals` is empty.
    """
    # Explicit unpacking replaces the tuple-parameter lambda, which is
    # Python 2-only syntax.
    squared_errors = [(np.dot(U[i], U[j]) + B[i] + B[j] - target) ** 2
                      for (i, j), target in vals]
    # NOTE(review): the penalty here is the un-squared Frobenius norm, while the
    # training loop's gradient term lambdaU*U[i] corresponds to a squared-norm
    # penalty -- confirm which objective is meant to be reported.
    penalty = lambdaU * np.linalg.norm(U, ord='fro')
    return np.sum(squared_errors) / len(vals) + penalty
def predict(vals):
    """Return the model's raw score for every pair in `vals`.

    Args:
        vals: iterable of ((i, j), target) items; the target is ignored.

    Returns:
        A list with U[i].U[j] + B[i] + B[j] for each item, in input order,
        using the module-level U and B.
    """
    # Explicit unpacking replaces the Python 2-only tuple-parameter lambda, and
    # the comprehension always yields a list rather than a lazy map object.
    return [np.dot(U[i], U[j]) + B[i] + B[j] for (i, j), _ in vals]
def top_acc(vals, top_k=None):
    """Return the fraction of the `top_k` highest-scored pairs whose label is 1.

    Args:
        vals: sequence of ((i, j), label) items, scored with predict().
        top_k: number of top-ranked predictions to inspect; when omitted, the
            module-level TOP_K is used, as before.

    Returns:
        (number of label-1 items among the top_k best-scored) / top_k, as a
        float.  The denominator is top_k even when `vals` has fewer items.
    """
    if top_k is None:
        top_k = TOP_K
    predictions = predict(vals)
    labels = [label for _, label in vals]
    # Highest score first; the stable sort keeps ties in their input order.
    ranked = sorted(zip(predictions, labels), key=lambda item: item[0], reverse=True)
    hits = sum(1 for _, label in ranked[:top_k] if label == 1)
    return hits / float(top_k)

In [52]:
# AdaGrad-style accumulators, one k x k matrix per author for U and one scalar
# per author for B.  The training loop reads only the diagonal of each G_U[i].
# Starting at 0.01 rather than 0 keeps the inverse square root finite even when
# a gradient component is exactly zero.
G_U = np.full((num_authors, k, k), .01)
G_B = np.full(num_authors, .01)

In [53]:
# Metric histories.  This cell fills validation_scores (one entry per epoch,
# measured before that epoch's updates); the other lists are created here but
# are not filled by this cell.
validation_scores = []
train_scores = []
no_scores = []
yes_scores = []
top_k_acc = []

N_EPOCH = 25
for epoch in range(N_EPOCH):
    # Optional extras kept from earlier experiments:
    #     gammaU *= .1
    #     gammaB *= .1
    #     print('train score: %s' % score(zipped_train_input))
    #     print('no avg score: %s yes avg score: %s' % (
    #         np.mean(predict(valid_no_edges)), np.mean(predict(valid_yes_edges))))
    #     print('Acc of TOP %s : %s' % (TOP_K, top_acc(zipped_valid_input)))
    print('EPOCH %d' % epoch)
    valid_score = score(zipped_valid_input)
    validation_scores.append(valid_score)
    print('valid score: %s' % valid_score)

    # Visit the labelled pairs in a fresh random order each epoch (in place).
    np.random.shuffle(zipped_train_input)
    for (i, j), target in tqdm(zipped_train_input):
        # Difference in prediction vs target
        eij = np.dot(U[i], U[j]) + B[i] + B[j] - target

        # Compute both factor gradients from the same, pre-update U[i] and U[j];
        # otherwise the gradient for U[j] would be taken at the already-moved U[i].
        gi = eij * U[j] + lambdaU * U[i]
        gj = eij * U[i] + lambdaU * U[j]

        # AdaGrad step: accumulate the gradient outer product and scale each
        # coordinate by the inverse square root of its accumulated diagonal.
        G_U[i] += np.outer(gi, gi)
        U[i] -= gammaU * np.multiply(np.diag(G_U[i]) ** (-.5), gi)

        G_U[j] += np.outer(gj, gj)
        U[j] -= gammaU * np.multiply(np.diag(G_U[j]) ** (-.5), gj)

        # The bias gradient is the error itself, so its square feeds the accumulator.
        sq_eij = eij ** 2

        G_B[i] += sq_eij
        B[i] -= gammaB * eij / np.sqrt(G_B[i])

        G_B[j] += sq_eij
        B[j] -= gammaB * eij / np.sqrt(G_B[j])

EPOCH 0
valid score: 21.956335614




EPOCH 1
valid score: 15.3456495715




EPOCH 2
valid score: 13.4708512796


  2%|▏         | 49431/3272961 [00:02<02:47, 19222.73it/s]

KeyboardInterrupt: 

In [42]:
# Display the factor matrix U to inspect its current values.
# NOTE(review): the saved output below is all-NaN and carries an execution count
# lower than the training cells above, so it presumably comes from an earlier
# run -- re-run after training to confirm U stays finite.
U

array([[ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       ..., 
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan]])

In [36]:
# Shape check: the outer product of one row of U with itself is k x k
# (10 x 10 in the saved output), matching each per-author slice G_U[i].
np.outer(U[1], U[1]).shape

(10, 10)