In [None]:
# """
# Key Ideas:
# -> Load the train, test data frame.
# -> Compute csr - train_csr.
# -> Perform SVD on train_csr.
# -> Perform NMF on train_csr.
# -> Select transactions to be used for Training.
# -> Collect the following features for the transactions:
#     * SVD value for the pair
#     * NMF value for the pair
#     * In-degree for sender
#     * Out-degree for sender
#     * In-degree for receiver
#     * Out-degree for receiver
#     * PageRank for sender
#     * PageRank for receiver
#     * Part of the same connected component?
#     * Closeness centrality for sender
#     * Betweenness centrality for sender
#     * Closeness centrality for receiver
#     * Betweenness centrality for receiver
#     * Jaccard coefficient for the pair
#     * Adamic-Adar index for the pair
# -> Train Classifier
# -> Collect same metrics from test transactions
# -> Generate Predictions
# -> Metrics: ROC curve, accuracy, confusion matrix
# """

In [1]:
import numpy as np

from sklearn.decomposition import NMF
from scipy.sparse import csr_matrix, coo_matrix, linalg
import matplotlib.pyplot as plt
import itertools
import matplotlib

from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

import random

import pandas as pd
from sklearn.metrics import roc_curve, auc

import networkx as nx

In [2]:
def load_data(convert_to_bin_trans,
              train_path='data/txTripletsCounts.txt',
              test_path='data/testTriplets.txt'):
    """Read the train/test triplet files and build the sparse training matrix.

    Both files are space-separated, header-less triplets of
    ``sender receiver transaction``.

    Parameters
    ----------
    convert_to_bin_trans : bool
        When true, the training ``transaction`` column is replaced by a 0/1
        indicator (non-zero -> 1) before the matrix is built.
    train_path, test_path : str, optional
        Locations of the two triplet files. The defaults are the paths the
        notebook has always used.

    Returns
    -------
    (dftrain, dftest, train_csr)
        The two data frames and a square float ``csr_matrix`` whose entry
        ``[sender, receiver]`` holds the training transaction value. The
        matrix side is one more than the largest id seen in either file, so
        test-only ids are valid indices as well.
    """
    columns = ['sender', 'receiver', 'transaction']
    dftrain = pd.read_csv(train_path, header=None, index_col=None, sep=' ',
                          names=columns)
    dftest = pd.read_csv(test_path, header=None, index_col=None, sep=' ',
                         names=columns)

    # Ids are used directly as row/column indices, hence the +1.
    dim = 1 + max(dftrain['sender'].max(), dftrain['receiver'].max(),
                  dftest['sender'].max(), dftest['receiver'].max())

    if convert_to_bin_trans:
        dftrain['transaction'] = (
            dftrain['transaction'].values.astype('bool').astype('int'))

    train_csr = csr_matrix(
        (dftrain['transaction'].values,
         (dftrain['sender'].values, dftrain['receiver'].values)),
        shape=(dim, dim), dtype=float)
    return dftrain, dftest, train_csr


def gen_nmf(A, n_components, save=False):
    """Factorize ``A`` with NMF and return the two factors.

    Parameters
    ----------
    A : matrix accepted by ``sklearn.decomposition.NMF.fit_transform``
    n_components : int
        Number of NMF components.
    save : bool, optional
        When true, both factors are also written via ``save_sparse_csr``
        under ``data/nmf_W_<n>`` and ``data/nmf_H_<n>``.
        NOTE(review): ``save_sparse_csr`` is not defined in the visible part
        of this notebook -- confirm it exists before enabling ``save``.

    Returns
    -------
    (W, H)
        ``W`` is the result of ``fit_transform(A)`` and ``H`` is the fitted
        model's ``components_``.
    """
    nmf_model = NMF(n_components=n_components)
    W = nmf_model.fit_transform(A)
    H = nmf_model.components_
    if save:
        # str(), not int(): the component count is concatenated into a path.
        suffix = str(n_components)
        save_sparse_csr("data/nmf_W_" + suffix, csr_matrix(W))
        save_sparse_csr("data/nmf_H_" + suffix, csr_matrix(H))
    return W, H


def get_predictions_svd(U, sigma, VT, df):
    """Score every (sender, receiver) row of ``df`` with the SVD factors.

    For each row the score is ``sum_k U[sender, k] * sigma[k] * VT[k, receiver]``.

    Parameters
    ----------
    U, sigma, VT : array-like
        Factors indexed as ``U[row, :]``, ``sigma[:]`` and ``VT[:, col]``.
    df : pandas.DataFrame
        Must provide integer-valued ``sender`` and ``receiver`` columns that
        are valid indices into ``U`` and ``VT`` respectively.

    Returns
    -------
    numpy.ndarray
        One float score per row of ``df``, in row order.
    """
    senders = df['sender'].values.astype(np.intp)
    receivers = df['receiver'].values.astype(np.intp)
    # Gather all rows/columns at once instead of looping with iterrows().
    left = np.asarray(U)[senders]
    right = np.asarray(VT)[:, receivers].T
    pred = (left * sigma * right).sum(axis=1)
    return np.asarray(pred).astype(float)


def get_predictions_nmf(W, H, df):
    """Score every (sender, receiver) row of ``df`` with the NMF factors.

    For each row the score is ``sum_k W[sender, k] * H[k, receiver]``.

    Parameters
    ----------
    W, H : array-like
        Factors indexed as ``W[row, :]`` and ``H[:, col]``.
    df : pandas.DataFrame
        Must provide integer-valued ``sender`` and ``receiver`` columns that
        are valid indices into ``W`` and ``H`` respectively.

    Returns
    -------
    numpy.ndarray
        One float score per row of ``df``, in row order.
    """
    senders = df['sender'].values.astype(np.intp)
    receivers = df['receiver'].values.astype(np.intp)
    # Gather all rows/columns at once instead of looping with iterrows().
    left = np.asarray(W)[senders]
    right = np.asarray(H)[:, receivers].T
    pred = (left * right).sum(axis=1)
    return np.asarray(pred).astype(float)


def are_in_same_component(node1, node2, component_list):
    """Tell whether ``node1`` and ``node2`` share a component.

    ``component_list`` is scanned in order; the first component containing
    either node decides the answer (true only if it contains both).

    Returns
    -------
    bool
        ``False`` is also returned when neither node appears in any
        component, so the result can always be cast to an integer.
    """
    for component in component_list:
        has_first = node1 in component
        if has_first or node2 in component:
            return has_first and node2 in component
    # Neither node was found anywhere: previously this fell through to None.
    return False

        
def compute_graph_metrics(g, un_dir_g, exact_centrality=False):
    """Compute the node-level metrics and component lists used as features.

    Parameters
    ----------
    g : networkx.DiGraph
        Directed transaction graph.
    un_dir_g : networkx.Graph
        Undirected view of the same transactions.
    exact_centrality : bool, optional
        ``False`` (default) keeps the notebook's historical behaviour:
        ``closeness`` is a copy of the in-degree centrality and
        ``betweenness`` a copy of the out-degree centrality, i.e. cheap
        stand-ins rather than the real metrics. ``True`` computes
        ``nx.closeness_centrality`` and ``nx.betweenness_centrality`` on
        ``g`` instead.
        NOTE(review): the exact variants are presumably far more expensive
        on a large graph -- confirm feasibility before enabling.

    Returns
    -------
    tuple
        ``(in_d, out_d, pagerank, closeness, betweenness, cc, scc, wcc)``
        where the first five are node -> value dicts and the last three are
        lists of node sets (connected components of ``un_dir_g``; strongly
        and weakly connected components of ``g``).
    """
    in_d = nx.centrality.in_degree_centrality(g)
    out_d = nx.centrality.out_degree_centrality(g)

    pagerank = nx.pagerank(g)

    if exact_centrality:
        closeness = nx.closeness_centrality(g)
        betweenness = nx.betweenness_centrality(g)
    else:
        # Same values as before, without recomputing the degree centralities.
        closeness = dict(in_d)
        betweenness = dict(out_d)

    cc = list(nx.connected_components(un_dir_g))
    scc = list(nx.strongly_connected_components(g))
    wcc = list(nx.weakly_connected_components(g))
    return in_d, out_d, pagerank, closeness, betweenness, cc, scc, wcc

        
def add_lin_alg_features(df, U, sigma, VT, W, H):
    """Attach the SVD and NMF pair scores to ``df`` in place.

    Adds column ``svd_vals`` (from ``get_predictions_svd``) followed by
    column ``nmf_vals`` (from ``get_predictions_nmf``). Returns nothing.
    """
    svd_scores = get_predictions_svd(U, sigma, VT, df)
    df['svd_vals'] = svd_scores

    nmf_scores = get_predictions_nmf(W, H, df)
    df['nmf_vals'] = nmf_scores


def add_centrality_features(df, in_d, out_d, closeness, betweenness, pagerank):
    """Attach per-node metric columns for sender and receiver to ``df`` in place.

    Each metric argument is indexed by node id. For every metric two columns
    are added, ``snd_<metric>`` looked up from ``df['sender']`` and
    ``rcv_<metric>`` looked up from ``df['receiver']``, in the order
    in_degree, out_degree, closeness, betweenness, pagerank. A node id
    missing from a metric table raises ``KeyError``. Returns nothing.
    """
    sender_ids = df['sender'].tolist()
    receiver_ids = df['receiver'].tolist()

    metric_tables = (
        ('in_degree', in_d),
        ('out_degree', out_d),
        ('closeness', closeness),
        ('betweenness', betweenness),
        ('pagerank', pagerank),
    )
    for metric_name, table in metric_tables:
        df['snd_' + metric_name] = [table[node] for node in sender_ids]
        df['rcv_' + metric_name] = [table[node] for node in receiver_ids]


def add_pair_features(df, pairs, un_dir_g, cc, scc, wcc):
    """Attach pair-level graph features to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it needs one row per entry of ``pairs``.
    pairs : iterable of (sender, receiver)
        Node pairs, in the same order as the rows of ``df``.
    un_dir_g : networkx.Graph
        Undirected graph used for the Adamic-Adar and Jaccard scores.
    cc, scc, wcc : list of node collections
        Component lists used for the ``connected``, ``strng_connected`` and
        ``wk_connected`` 0/1 columns respectively.

    Adds the columns ``adamic``, ``jaccard``, ``connected``,
    ``strng_connected`` and ``wk_connected``. Returns nothing.
    """
    # `pairs` is walked five times below; materialise it so a one-shot
    # iterable (e.g. a Python 3 zip object) is not exhausted after the first.
    pairs = list(pairs)

    df['adamic'] = [score for _, _, score in nx.adamic_adar_index(un_dir_g, pairs)]
    df['jaccard'] = [score for _, _, score in nx.jaccard_coefficient(un_dir_g, pairs)]

    component_columns = (
        ('connected', cc),
        ('strng_connected', scc),
        ('wk_connected', wcc),
    )
    for column, components in component_columns:
        # bool() guards the int cast against a non-boolean membership result.
        flags = [bool(are_in_same_component(snd, rcv, components))
                 for snd, rcv in pairs]
        df[column] = np.array(flags).astype('int')

In [3]:
# Configs
# Passed to load_data(): collapse training transaction values to 0/1.
convert_to_bin_trans = True

# Factorization sizes: k for svds() and n_components for NMF.
svd_k, nmf_n = 50, 12

# Training rows are chosen by sampling `selection_count` values
# of the `selection_column` column.
selection_column, selection_count = 'sender', 250

In [4]:
# Read both triplet files and build the sparse training matrix.
dftrain, dftest, train_csr = load_data(convert_to_bin_trans=convert_to_bin_trans)

In [5]:
# Common computations for both train & test data

# Perform SVD on train_csr (svd_k largest-magnitude singular triplets).
U, sigma, VT = svds(train_csr, k=svd_k, tol=1e-10, which='LM')
# Single-argument print(...) prints the same text on Python 2 and Python 3.
print("SVD Done!")

# Perform NMF on train_csr.
W, H = gen_nmf(train_csr, nmf_n)
print("NMF Done!")

# Graph based computations: a directed and an undirected graph are both
# built from the same sparse matrix.
# NOTE(review): from_scipy_sparse_matrix is not available in networkx >= 3.0
# (from_scipy_sparse_array replaces it) -- confirm the pinned version.
g = nx.from_scipy_sparse_matrix(train_csr, create_using=nx.DiGraph())
un_dir_g = nx.from_scipy_sparse_matrix(train_csr)
print("G represent Done!")

in_d, out_d, pagerank, closeness, betweenness, cc, scc, wcc = compute_graph_metrics(g, un_dir_g)

SVD Done!
NMF Done!
G represent Done!


In [6]:
# Pick `selection_count` *distinct* values of `selection_column`.
# np.random.choice samples with replacement by default, which could silently
# select fewer distinct values than requested; the min() keeps
# replace=False valid when fewer candidates exist.
_selection_candidates = dftrain[selection_column].unique()
random_senders = np.random.choice(
    _selection_candidates,
    min(selection_count, len(_selection_candidates)),
    replace=False)

# Keep every training transaction whose `selection_column` value was picked.
train_txns = dftrain.loc[dftrain[selection_column].isin(random_senders)].copy(deep=True)

# Materialised as a list so the pairs can be iterated more than once.
pairs = list(zip(train_txns['sender'].tolist(), train_txns['receiver'].tolist()))

In [7]:
# Extend train_txns in place: factorization scores first, then per-node
# metrics, then pair-level graph features.
add_lin_alg_features(df=train_txns, U=U, sigma=sigma, VT=VT, W=W, H=H)
add_centrality_features(df=train_txns, in_d=in_d, out_d=out_d,
                        closeness=closeness, betweenness=betweenness,
                        pagerank=pagerank)
add_pair_features(df=train_txns, pairs=pairs, un_dir_g=un_dir_g,
                  cc=cc, scc=scc, wcc=wcc)

In [9]:
# Materialised as a list so the pairs can be iterated more than once.
test_pairs = list(zip(dftest['sender'].tolist(), dftest['receiver'].tolist()))

# Same feature set as the training rows, added to dftest in place.
add_lin_alg_features(dftest, U, sigma, VT, W, H)
add_centrality_features(dftest, in_d, out_d, closeness, betweenness, pagerank)
add_pair_features(dftest, test_pairs, un_dir_g, cc, scc, wcc)

In [13]:
# Preview only the first rows rather than rendering the whole frame.
dftest.head(10)

Unnamed: 0,sender,receiver,transaction,svd_vals,nmf_vals,snd_in_degree,rcv_in_degree,snd_out_degree,rcv_out_degree,snd_closeness,rcv_closeness,snd_betweenness,rcv_betweenness,snd_pagerank,rcv_pagerank,adamic,jaccard,connected,strng_connected,wk_connected
0,1,16,0,2.636684e-03,4.832074e-05,0.044502,0.000027,0.037719,0.018729,0.044502,0.000027,0.037719,0.018729,7.849765e-03,4.976294e-06,52.975046,0.005823,1,1,1
1,1,66,0,3.551430e-01,5.390742e-01,0.044502,0.000122,0.037719,0.000032,0.044502,0.000122,0.037719,0.000032,7.849765e-03,1.165492e-05,2.483570,0.001259,1,1,1
2,1,12458,0,1.392981e-01,2.469361e-01,0.044502,0.000180,0.037719,0.000029,0.044502,0.000180,0.037719,0.000029,7.849765e-03,7.843179e-06,3.242614,0.001610,1,1,1
3,1,22506,1,1.787394e-02,1.057615e-03,0.044502,0.000005,0.037719,0.000002,0.044502,0.000005,0.037719,0.000002,7.849765e-03,3.414914e-07,0.000000,0.000000,1,1,1
4,1,29176,0,9.818377e-04,1.576791e-04,0.044502,0.000137,0.037719,0.000063,0.044502,0.000137,0.037719,0.000063,7.849765e-03,1.965032e-05,1.123157,0.000251,1,1,1
5,1,38898,0,3.799609e-04,7.220628e-05,0.044502,0.000045,0.037719,0.000009,0.044502,0.000045,0.037719,0.000009,7.849765e-03,4.917941e-06,0.000000,0.000000,1,1,1
6,1,99687,1,2.903712e-01,8.935075e-02,0.044502,0.000023,0.037719,0.000038,0.044502,0.000023,0.037719,0.000038,7.849765e-03,2.399287e-06,1.768725,0.000454,1,1,1
7,1,244013,0,2.194104e-02,1.208937e-03,0.044502,0.000153,0.037719,0.000007,0.044502,0.000153,0.037719,0.000007,7.849765e-03,3.218047e-06,0.339685,0.000101,1,1,1
8,1,246667,0,1.821761e-05,1.127830e-05,0.044502,0.000014,0.037719,0.000009,0.044502,0.000014,0.037719,0.000009,7.849765e-03,1.772463e-06,0.000000,0.000000,1,1,1
9,1,330731,0,3.574782e-01,1.001018e-01,0.044502,0.000020,0.037719,0.000011,0.044502,0.000020,0.037719,0.000011,7.849765e-03,2.541162e-06,0.780359,0.000302,1,1,1


In [15]:
# Preview only the first rows rather than rendering the whole frame.
train_txns.head(10)

Unnamed: 0,sender,receiver,transaction,svd_vals,nmf_vals,snd_in_degree,rcv_in_degree,snd_out_degree,rcv_out_degree,snd_closeness,rcv_closeness,snd_betweenness,rcv_betweenness,snd_pagerank,rcv_pagerank,adamic,jaccard,connected,strng_connected,wk_connected
1632399,2728,3,1,9.997116e-01,9.997402e-01,0.000007,0.388960,0.000005,0.370058,0.000007,0.388960,0.000005,0.370058,1.109603e-06,9.877454e-02,0.925602,0.000017,1,1,1
1632400,2728,331195,1,4.845484e-05,7.326564e-06,0.000007,0.000007,0.000005,0.000005,0.000007,0.000007,0.000005,0.000005,1.109603e-06,1.414307e-06,0.082880,0.142857,1,1,1
1675219,4787,29489,1,2.089685e-13,4.342117e-13,0.000009,0.000266,0.000005,0.000002,0.000009,0.000266,0.000005,0.000002,8.864922e-06,4.402120e-05,0.000000,0.000000,1,1,1
1675220,4787,31507,1,2.288052e-09,2.122231e-10,0.000009,0.000018,0.000005,0.000009,0.000009,0.000018,0.000005,0.000009,8.864922e-06,6.175372e-06,0.000000,0.000000,1,1,1
1683788,5318,1,1,1.305805e+00,3.708156e-01,0.000047,0.044502,0.000025,0.037719,0.000047,0.044502,0.000025,0.037719,6.542226e-06,7.849765e-03,1.667922,0.000605,1,1,1
1683789,5318,3,1,1.002287e+00,1.006626e+00,0.000047,0.388960,0.000025,0.370058,0.000047,0.388960,0.000025,0.370058,6.542226e-06,9.877454e-02,1.667922,0.000069,1,1,1
1683790,5318,9,1,1.054294e+00,1.194142e+00,0.000047,0.075384,0.000025,0.067081,0.000047,0.075384,0.000025,0.067081,6.542226e-06,1.313671e-02,1.667922,0.000347,1,1,1
1683791,5318,13,1,1.033241e+00,1.267967e+00,0.000047,0.118383,0.000025,0.106412,0.000047,0.118383,0.000025,0.106412,6.542226e-06,2.379108e-02,1.667922,0.000224,1,1,1
1683792,5318,14,1,1.015215e+00,1.073889e+00,0.000047,0.180026,0.000025,0.166977,0.000047,0.180026,0.000025,0.166977,6.542226e-06,4.145227e-02,1.667922,0.000148,1,1,1
1683793,5318,37,1,9.365325e-01,2.785961e-01,0.000047,0.035145,0.000025,0.029346,0.000047,0.035145,0.000025,0.029346,6.542226e-06,5.970384e-03,1.667922,0.000765,1,1,1
