In [None]:
# """
# Key Ideas:
# -> Load the train, test data frame.
# -> Compute csr - train_csr.
# -> Perform SVD on train_csr.
# -> Perform NMF on train_csr.
# -> Select transactions to be used for Training.
# -> Collect the following features for the transactions:
#     * SVD val for the pair
#     * NMF val for the pair
#     * In-degree for sender
#     * Out-degree for sender
#     * In-degree for receiver
#     * Out-degree for receiver
#     * PageRank for sender
#     * PageRank for receiver
#     * Part of same connected component?
#     * Closeness Centrality for sender
#     * Betweenness Centrality for sender
#     * Closeness Centrality for receiver
#     * Betweenness Centrality for receiver
#     * Jaccard coefficient for the pair
#     * Adamic-Adar index for the pair
# -> Train Classifier
# -> Collect same metrics from test transactions
# -> Generate Predictions
# -> Metrics: ROC, Accuracy, Confusion Matrix
# """

In [1]:
import numpy as np

from sklearn.decomposition import NMF
from scipy.sparse import csr_matrix, coo_matrix, linalg
import matplotlib.pyplot as plt
import itertools
import matplotlib

from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

import random

import pandas as pd
from sklearn.metrics import roc_curve, auc

import networkx as nx

In [2]:
def load_data(convert_to_bin_trans,
              train_path='data/txTripletsCounts.txt',
              test_path='data/testTriplets.txt'):
    """Read the train/test triplet files and build the sparse training matrix.

    Both files are space-separated, header-less text with three columns that
    are loaded as ``sender``, ``receiver`` and ``transaction``.

    Parameters
    ----------
    convert_to_bin_trans : bool
        When true, every non-zero training ``transaction`` value is replaced
        by 1 (and zero stays 0), both in the returned frame and in the matrix.
    train_path, test_path : str, optional
        Locations of the training and test files. The defaults are the paths
        the notebook has always used.

    Returns
    -------
    (dftrain, dftest, train_csr)
        The two data frames and a square ``float`` CSR matrix whose side is
        one more than the largest sender/receiver id seen in either file,
        with ``train_csr[sender, receiver]`` holding the transaction value.
    """
    columns = ['sender', 'receiver', 'transaction']
    dftrain = pd.read_csv(train_path, header=None, index_col=None, sep=' ',
                          names=columns)
    dftest = pd.read_csv(test_path, header=None, index_col=None, sep=' ',
                         names=columns)

    # Ids are used directly as row/column indices, so the matrix must be big
    # enough for the largest id in either file.
    dim = max((dftrain['sender'].max(), dftrain['receiver'].max(),
               dftest['sender'].max(), dftest['receiver'].max()))
    dim += 1

    if convert_to_bin_trans:
        dftrain['transaction'] = dftrain['transaction'].values.astype('bool').astype('int')

    train_csr = csr_matrix((dftrain['transaction'].values,
                            (dftrain['sender'].values, dftrain['receiver'].values)),
                           shape=(dim, dim), dtype=float)

    if convert_to_bin_trans:
        # The sparse constructor sums entries that share (row, col); clamp
        # again so repeated pairs in the file cannot produce values above 1.
        train_csr.data = (train_csr.data != 0).astype(float)

    return dftrain, dftest, train_csr


def gen_nmf(A, n_components, save=False):
    """Factorise ``A`` with sklearn's NMF and return the two factors.

    Parameters
    ----------
    A : matrix accepted by ``NMF.fit_transform``
    n_components : int
        Number of NMF components to fit.
    save : bool, optional
        When true, both factors are converted to CSR and handed to
        ``save_sparse_csr`` under ``data/nmf_W_<n>`` and ``data/nmf_H_<n>``.
        NOTE(review): ``save_sparse_csr`` is not defined in the visible part
        of the notebook; it must be in scope before calling with save=True.

    Returns
    -------
    (W, H)
        ``W`` is the output of ``fit_transform(A)`` and ``H`` is the fitted
        model's ``components_``.
    """
    nmf_model = NMF(n_components=n_components)
    W = nmf_model.fit_transform(A)
    H = nmf_model.components_
    if save:
        # Build the suffix with str(): concatenating a str with an int
        # raises TypeError.
        suffix = str(n_components)
        save_sparse_csr("data/nmf_W_" + suffix, csr_matrix(W))
        save_sparse_csr("data/nmf_H_" + suffix, csr_matrix(H))
    return W, H


def get_predictions_svd(U, sigma, VT, df):
    """Score every (sender, receiver) row of ``df`` from SVD factors.

    For each row the score is ``sum(U[sender, :] * sigma * VT[:, receiver])``.
    Returns a float numpy array with one score per row, in row order.
    """
    scores = []
    for _, pair in df[['sender', 'receiver']].iterrows():
        sender_vec = U[pair['sender'], :]
        receiver_vec = VT[:, pair['receiver']]
        scores.append(np.sum(sender_vec * sigma * receiver_vec))
    return np.array(scores).astype(float)


def get_predictions_nmf(W, H, df):
    """Score every (sender, receiver) row of ``df`` from NMF factors.

    For each row the score is ``sum(W[sender, :] * H[:, receiver])``.
    Returns a float numpy array with one score per row, in row order.
    """
    scores = []
    for _, pair in df[['sender', 'receiver']].iterrows():
        sender_vec = W[pair['sender'], :]
        receiver_vec = H[:, pair['receiver']]
        scores.append(np.sum(sender_vec * receiver_vec))
    return np.array(scores).astype(float)


def are_in_same_component(node1, node2, component_list):
    """Tell whether two nodes belong to the same component.

    ``component_list`` is an iterable of containers of nodes. The first
    component that contains either node decides the answer: True when it
    contains both, False otherwise.

    Returns False (rather than falling through with None) when neither node
    appears in any component, so callers can always cast the result to int.
    """
    for component in component_list:
        has_first = node1 in component
        has_second = node2 in component
        if has_first or has_second:
            return has_first and has_second
    return False

        
def compute_graph_metrics(g, un_dir_g, betweenness_k=None):
    """Compute node-level centralities and component lists for the graph.

    Parameters
    ----------
    g : directed networkx graph
    un_dir_g : undirected networkx graph
    betweenness_k : int or None, optional
        Forwarded as ``k`` to ``betweenness_centrality``. ``None`` keeps the
        exact computation; see the networkx documentation for the sampled
        estimate used when ``k`` is given.

    Returns
    -------
    tuple
        ``(in_d, out_d, pagerank, closeness, betweenness, cc, scc, wcc)``:
        the five per-node results as returned by networkx, followed by lists
        of connected components of ``un_dir_g`` and of strongly / weakly
        connected components of ``g``.

    Notes
    -----
    ``closeness`` and ``betweenness`` previously repeated the in-/out-degree
    centrality calls, so the features built from them duplicated the degree
    features. They now call the real measures.
    NOTE(review): both are much more expensive than degree centrality;
    confirm the runtime is acceptable for the full graph, or pass
    ``betweenness_k``.
    """
    in_d = nx.centrality.in_degree_centrality(g)
    out_d = nx.centrality.out_degree_centrality(g)

    pagerank = nx.pagerank(g)

    closeness = nx.centrality.closeness_centrality(g)
    betweenness = nx.centrality.betweenness_centrality(g, k=betweenness_k)

    # Materialise the component generators so they can be scanned repeatedly.
    cc = list(nx.connected_components(un_dir_g))
    scc = list(nx.strongly_connected_components(g))
    wcc = list(nx.weakly_connected_components(g))
    return in_d, out_d, pagerank, closeness, betweenness, cc, scc, wcc

        
def add_lin_alg_features(df, U, sigma, VT, W, H):
    """Attach the factorisation scores to ``df`` in place.

    Adds column ``svd_vals`` (from ``get_predictions_svd``) and then column
    ``nmf_vals`` (from ``get_predictions_nmf``). Returns nothing.
    """
    svd_scores = get_predictions_svd(U, sigma, VT, df)
    df['svd_vals'] = svd_scores

    nmf_scores = get_predictions_nmf(W, H, df)
    df['nmf_vals'] = nmf_scores


def add_centrality_features(df, in_d, out_d, closeness, betweenness, pagerank):
    """Attach per-node metric columns for sender and receiver to ``df`` in place.

    Each metric argument is indexed by node id. For every metric a ``snd_*``
    column (looked up by ``df['sender']``) and an ``rcv_*`` column (looked up
    by ``df['receiver']``) is written. Returns nothing.
    """
    # (target column, endpoint column, lookup table), in the order the
    # columns are appended to the frame.
    column_plan = (
        ('snd_in_degree', 'sender', in_d),
        ('rcv_in_degree', 'receiver', in_d),
        ('snd_out_degree', 'sender', out_d),
        ('rcv_out_degree', 'receiver', out_d),
        ('snd_closeness', 'sender', closeness),
        ('rcv_closeness', 'receiver', closeness),
        ('snd_betweenness', 'sender', betweenness),
        ('rcv_betweenness', 'receiver', betweenness),
        ('snd_pagerank', 'sender', pagerank),
        ('rcv_pagerank', 'receiver', pagerank),
    )
    for target, endpoint, table in column_plan:
        nodes = df[endpoint].tolist()
        df[target] = [table[node] for node in nodes]


def add_pair_features(df, pairs, un_dir_g, cc, scc, wcc):
    """Attach pair-level graph features to ``df`` in place.

    Parameters
    ----------
    df : DataFrame to extend; row i must correspond to ``pairs[i]``.
    pairs : iterable of (sender, receiver) 2-tuples, one per row of ``df``.
    un_dir_g : undirected networkx graph used for the link-prediction scores.
    cc, scc, wcc : component lists passed to ``are_in_same_component``.

    Adds columns ``adamic``, ``jaccard``, ``connected``, ``strng_connected``
    and ``wk_connected`` (the last three as 0/1 integers). Returns nothing.

    Raises
    ------
    ValueError
        If the number of pairs differs from the number of rows in ``df``.
    """
    # ``pairs`` is walked five times below; materialise it so a one-shot
    # iterator (e.g. zip on Python 3) is not exhausted after the first pass.
    pairs = list(pairs)
    if len(pairs) != len(df):
        raise ValueError("pairs has %d entries but df has %d rows"
                         % (len(pairs), len(df)))

    df['adamic'] = [score for _, _, score in nx.adamic_adar_index(un_dir_g, pairs)]
    df['jaccard'] = [score for _, _, score in nx.jaccard_coefficient(un_dir_g, pairs)]

    component_columns = (
        ('connected', cc),
        ('strng_connected', scc),
        ('wk_connected', wcc),
    )
    for column, components in component_columns:
        # bool() keeps the int cast valid even if the helper returns None.
        flags = [bool(are_in_same_component(first, second, components))
                 for first, second in pairs]
        df[column] = np.array(flags).astype('int')

In [3]:
# Configs
# Passed to load_data: collapse transaction values to 0/1 when True.
convert_to_bin_trans = True

# Factorisation sizes: k for svds and the number of NMF components.
svd_k, nmf_n = 50, 12

# Training-row sampling: column to draw values from, and how many to draw.
selection_column, selection_count = 'sender', 250

In [4]:
# Read both triplet files and build the sparse training matrix.
dftrain, dftest, train_csr = load_data(convert_to_bin_trans=convert_to_bin_trans)

In [None]:
# Common computations for both train & test data.
# Progress messages use the parenthesised print form so the cell runs
# unchanged on Python 2 and Python 3.

# Perform SVD on train_csr.
U, sigma, VT = svds(train_csr, k=svd_k, tol=1e-10, which = 'LM')
print("SVD Done!")

# Perform NMF on train_csr.
W, H = gen_nmf(train_csr, nmf_n)
print("NMF Done!")

# Graph based Computations: one directed and one undirected graph built from
# the same sparse matrix.
g = nx.from_scipy_sparse_matrix(train_csr, create_using=nx.DiGraph())
un_dir_g = nx.from_scipy_sparse_matrix(train_csr)
print("G represent Done!")

in_d, out_d, pagerank, closeness, betweenness, cc, scc, wcc = compute_graph_metrics(g, un_dir_g)

SVD Done!
NMF Done!
G represent Done!


In [None]:
# Draw the values of `selection_column` whose transactions form the training set.
# Sample without replacement: the default (with replacement) can repeat a
# value, which silently yields fewer than `selection_count` distinct values.
candidate_values = dftrain[selection_column].unique()
random_senders = np.random.choice(candidate_values,
                                  min(selection_count, len(candidate_values)),
                                  replace=False)
train_txns = dftrain.loc[dftrain[selection_column].isin(random_senders)].copy(deep=True)
# Keep the pairs as a concrete list; they are iterated several times downstream.
pairs = list(zip(train_txns['sender'].tolist(), train_txns['receiver'].tolist()))

In [None]:
# Extend the sampled training transactions in place with all three feature groups.
add_lin_alg_features(df=train_txns, U=U, sigma=sigma, VT=VT, W=W, H=H)
add_centrality_features(df=train_txns, in_d=in_d, out_d=out_d,
                        closeness=closeness, betweenness=betweenness,
                        pagerank=pagerank)
add_pair_features(df=train_txns, pairs=pairs, un_dir_g=un_dir_g,
                  cc=cc, scc=scc, wcc=wcc)

In [None]:
# Extend the test transactions in place with the same feature groups.
# The pair features must be computed from the test frame's own
# (sender, receiver) pairs, not from the training `pairs`.
test_pairs = list(zip(dftest['sender'].tolist(), dftest['receiver'].tolist()))

add_lin_alg_features(dftest, U, sigma, VT, W, H)
add_centrality_features(dftest, in_d, out_d, closeness, betweenness, pagerank)
add_pair_features(dftest, test_pairs, un_dir_g, cc, scc, wcc)