In [1]:
##################################################################
##################################################################
#
#                   LIBRARIES
#
##################################################################
##################################################################
import collections
import csv
import datetime as dt
import itertools as it
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import re
import requests
import seaborn as sns
import sys

from collections import Counter
from collections import defaultdict
from community import best_partition
from igraph import *
from itertools import count
from itertools import zip_longest
from networkx.drawing.nx_agraph import graphviz_layout
from neo4j.v1 import GraphDatabase
from operator import itemgetter, attrgetter

In [2]:
##################################################################
##################################################################
#
#                    DEFINITIONS
#
##################################################################
##################################################################


def create_dict(keys, values):
    """Pair each key with the value found at the same position.

    Only the first ``len(keys)`` entries of ``values`` are considered, so any
    surplus values are ignored.  When ``values`` is shorter than ``keys`` the
    keys left without a partner are mapped to ``None``.
    """
    trimmed_values = values[:len(keys)]
    return {key: value for key, value in zip_longest(keys, trimmed_values)}

def combinatorial(lst):
    """Return every unordered pair of positions in ``lst`` as a list of tuples.

    Pairs are produced in the order ``(lst[0], lst[1]), (lst[0], lst[2]), ...,
    (lst[1], lst[2]), ...``.  Elements are paired by position, not by value, so
    repeated values yield repeated pairs (including ``(a, a)``).

    Any iterable is accepted (list, pandas Series, generator, ...); an input
    with fewer than two elements gives an empty list.
    """
    # itertools.combinations yields exactly the pairs the nested slice loop
    # would, in the same order, without requiring the input to be sliceable.
    return list(it.combinations(lst, 2))

def dedupe(data):
    """Merge rows that share a key, adding their weights together.

    ``data`` is an iterable of two-element rows ``(key, weight)``.  The return
    value is the ``items()`` view of the accumulating ``Counter``: one
    ``(key, total_weight)`` entry per distinct key, in first-seen order.
    """
    totals = Counter()
    for key, weight in data:
        # Counter.update with a mapping adds to the stored count.
        totals.update({key: weight})
    return totals.items()

def edgelist_joshwt(data):
    """Build a weighted edge list between ``node_category`` values, per user.

    For every user, the user's rows are sorted by the module-level
    ``node_category`` column and every pair of rows with *different* category
    values contributes one edge occurrence.  With ``n_a`` and ``n_b`` the
    number of that user's rows in each of the two categories, the occurrence
    is weighted

    * ``n_a`` when ``n_a == n_b``;
    * ``max(n_a, n_b) / abs(n_a - n_b)`` otherwise,

    rounded to two decimals.  Weights of identical edges (within a user and
    across users) are summed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'user'`` column and the ``node_category`` column.

    Returns
    -------
    list of (node, node, weight) tuples, ordered by edge.  An empty list is
    returned when no user has rows in two different categories.
    """
    edges = []
    weights = []
    # groupby visits every user present in the data in ascending order; a
    # half-open integer range from min to max id would skip the highest id.
    for _, user_rows in data.groupby('user', sort=True):
        categories = user_rows.sort_values(by=[node_category])[node_category]
        visits = categories.value_counts()
        pairs = remove_selfloops(list(combinatorial(categories)))
        for first, second in pairs:
            n_first, n_second = visits[first], visits[second]
            if n_first == n_second:
                weights.append(round(n_first, 2))
            else:
                weights.append(
                    round(max(n_first, n_second) / abs(n_first - n_second), 2)
                )
        edges.extend(pairs)

    # Nothing to unzip below when there are no edges at all.
    if not edges:
        return []

    weighted_edges = sorted(zip(edges, weights), key=itemgetter(0))
    # dedupe collapses repeated edges and adds their weights together.
    merged = list(dedupe(weighted_edges))
    return [(node_a, node_b, weight) for (node_a, node_b), weight in merged]

def edgelist_usrcount(data):
    """Build an edge list weighted by how many users link two categories.

    For every user, the distinct values of the module-level ``node_category``
    column are taken in sorted order and every pair of them forms an edge.
    The weight of an edge is the number of users for whom that pair occurs.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'user'`` column and the ``node_category`` column.

    Returns
    -------
    list of (node, node, weight) tuples, ordered by edge.  An empty list is
    returned when no user has two distinct categories.
    """
    edges = []
    # groupby visits every user present in the data; a half-open integer range
    # from min to max id would skip the highest id.
    for _, user_rows in data.groupby('user', sort=True):
        distinct = (
            user_rows
            .sort_values(by=[node_category])
            .drop_duplicates(node_category)
        )
        edges.extend(combinatorial(distinct[node_category]))

    # Nothing to count when there are no edges at all.
    if not edges:
        return []

    # Sorting first makes the Counter (and therefore the result) edge-ordered.
    tally = collections.Counter(sorted(edges))
    return [(node_a, node_b, users) for (node_a, node_b), users in tally.items()]

def edgelist_bynumber_forR(edgelst):
    """Re-key the endpoints of a weighted edge list through ``node_labels``.

    ``edgelst`` is a sequence of ``(node, node, weight)`` triples.  Each
    endpoint found in the module-level ``node_labels`` mapping is replaced by
    its mapped value; weights are passed through unchanged.

    NOTE(review): ``node_labels`` is read from module scope and is not defined
    in the visible part of this notebook -- confirm it exists before calling.
    """
    sources, targets, wts = zip(*edgelst)
    return list(zip(replace(sources, node_labels), replace(targets, node_labels), wts))

def edgelist_withnames(edgelst, node_labels):
    """Append the label of each endpoint to every edge.

    ``edgelst`` is a sequence of ``(node, node, weight)`` triples and
    ``node_labels`` maps a node to its label.  Returns a list of
    ``(node, node, weight, label_of_first, label_of_second)`` tuples; a node
    missing from ``node_labels`` is used as its own label.
    """
    sources, targets, wts = zip(*edgelst)
    source_names = replace(sources, node_labels)
    target_names = replace(targets, node_labels)
    return list(zip(sources, targets, wts, source_names, target_names))

def mod(communities):
    """Invert a ``{node: community}`` mapping into ``{community: [nodes]}``.

    Nodes appear in each list in the order ``communities`` yields them.
    """
    members_by_class = {}
    for node, community in communities.items():
        # First sighting of a community creates its list; later ones extend it.
        members_by_class.setdefault(community, []).append(node)
    return members_by_class

def remove_duplicate(alist):
    """Return the distinct elements of ``alist`` as a list.

    The elements pass through a ``set``, so they must be hashable and the
    order of the result is not guaranteed.
    """
    unique_items = set(alist)
    return list(unique_items)

# Remove self loops! ie (a,a)
def remove_selfloops(listofpairs):
    """Return the pairs of ``listofpairs`` whose two members differ.

    Pairs of the form ``(a, a)`` are dropped; the remaining pairs keep their
    original order and are returned as tuples in a new list.
    """
    return [(left, right) for left, right in listofpairs if left != right]

def replace(my_list, my_dict):
    """Translate the items of ``my_list`` through ``my_dict``.

    Items that are keys of ``my_dict`` are swapped for the mapped value; all
    other items are kept as they are.  Always returns a new list.
    """
    substituted = []
    for item in my_list:
        substituted.append(my_dict[item] if item in my_dict else item)
    return substituted

#def remove_duplicates(A):
#    [A.pop(count) for count,elem in enumerate(A) if A.count(elem)!=1]
#        return A

In [3]:
##################################################################
##################################################################
#
#                     Inputs
#
##################################################################
##################################################################

# Column whose values become the nodes of the network. Candidate columns:
# merchant_id, merchant_details, merchant_name, mcc_code, mcc_description.
# edgelist_joshwt and edgelist_usrcount read this module-level name directly,
# so it must be set before either of them is called.
node_category = 'mcc_code'

In [4]:
# Load df_travel.csv into a DataFrame.
# NOTE(review): this is an absolute path under one user's home directory; the
# notebook will not run on another machine without editing it.
df_travel = pd.read_csv('/Users/MonikaHeinig/Desktop/Insight/KOHO_Financial/TRAVEL/df_travel.csv')

In [5]:
# Weighted edge list (node, node, number of users sharing the pair), ordered
# heaviest edge first.
# `itemgetter` is used directly: the import cell only does
# `from operator import itemgetter, attrgetter`, so the bare name `operator`
# is never imported explicitly by this notebook.
edgelist_travel_mcccode_usrcount = sorted(
    edgelist_usrcount(df_travel), key=itemgetter(2), reverse=True
)

In [6]:
# Node table: one row per distinct node_category value, with the columns
# listed below removed.
nodes_travel_mcccode_usrcount = (
    df_travel
    .copy()
    .drop_duplicates(node_category)
    .drop(columns=[
        'merchant_id', 'merchant_details', 'merchant_name', 'index', 'user',
        'age', 'date', 'weekday', 'amount', 'user_numofpurchases',
        'transaction_code', 'authorization_timestamp',
    ])
)

In [7]:
# Lookup table from mcc_code to mcc_description (this is for R later on).
node_labels_mcccode_usrcount = create_dict(
    nodes_travel_mcccode_usrcount['mcc_code'],
    nodes_travel_mcccode_usrcount['mcc_description'].values,
)

In [8]:
# Attach the label of both endpoints to every edge, then tabulate the result.
edgelist_travel_mcccode_usrcount_withnames = edgelist_withnames(
    edgelist_travel_mcccode_usrcount,
    node_labels_mcccode_usrcount,
)
edgelist_travel_mcccode_usrcount_withnames_asdf = pd.DataFrame(
    edgelist_travel_mcccode_usrcount_withnames,
    columns=['from', 'to', 'weight', 'from_name', 'to_name'],
)

In [9]:
# Write the labelled edge list to CSV, without the DataFrame index.
# NOTE(review): absolute, machine-specific output path.
edgelist_travel_mcccode_usrcount_withnames_asdf.to_csv('/Users/MonikaHeinig/Desktop/Insight/KOHO_Financial/TRAVEL/mcc_code/edgelist_travel_mcccode_usrcount_withnames_asdf.csv', index=False)

In [10]:
##################################################################
##################################################################
#
#               GRAPHING   -- using networkx
#
##################################################################
##################################################################

# Undirected graph built from the (node, node, weight) triples only: a
# category that appears in no edge is therefore not a node of T.
T = nx.Graph()
T.add_weighted_edges_from(edgelist_travel_mcccode_usrcount)

In [11]:
# Louvain community detection.
# TODO: decide whether isolated nodes should each become their own community
# (original note: "fix this to make isolated nodes their own community?").
# T only contains categories that have at least one edge, so any other
# category is missing from the partition dict and its 'community' value in the
# node table is left as NaN by .map().
communities_travel_mcccode_usrcount = best_partition(T)
nx.set_node_attributes(T, communities_travel_mcccode_usrcount, 'modularity')
nodes_travel_mcccode_usrcount['community'] = nodes_travel_mcccode_usrcount[node_category].map(communities_travel_mcccode_usrcount)
modularity_travel_mcccode_usrcount = mod(communities_travel_mcccode_usrcount)  # <-- lets you look at each community, i.e.   cmd:  modularity_travel[#]

In [12]:
# degree
degree_dict_travel_mcccode_usrcount = dict(T.degree(T.nodes()))
nx.set_node_attributes(T, degree_dict_travel_mcccode_usrcount, 'degree')
# Categories that are not nodes of T (isolated, no edges) map to NaN and are
# recorded as degree 0.  The fill is chained into the assignment instead of
# calling fillna(inplace=True) on the selected column: that chained in-place
# form triggers a pandas warning and does not update the frame under
# Copy-on-Write.
nodes_travel_mcccode_usrcount['degree'] = (
    nodes_travel_mcccode_usrcount[node_category]
    .map(degree_dict_travel_mcccode_usrcount)
    .fillna(0)
)

In [13]:
# betweenness centrality
btwn_centrality_travel_mcccode_usrcount = nx.betweenness_centrality(T)  # returns {node: centrality}; T itself is not modified
nodes_travel_mcccode_usrcount['betweenness_centrality'] = nodes_travel_mcccode_usrcount[node_category].map(btwn_centrality_travel_mcccode_usrcount)  # adds the values as a column of the node table

# closeness centrality
close_centrality_travel_mcccode_usrcount = nx.closeness_centrality(T)
nodes_travel_mcccode_usrcount['closeness_centrality'] = nodes_travel_mcccode_usrcount[node_category].map(close_centrality_travel_mcccode_usrcount)

# eigenvector centrality
eigenvector_centrality_travel_mcccode_usrcount = nx.eigenvector_centrality(T)
nodes_travel_mcccode_usrcount['eigenvector_centrality'] = nodes_travel_mcccode_usrcount[node_category].map(eigenvector_centrality_travel_mcccode_usrcount)

In [14]:
# neighbors of every node of T, keyed by mcc_code
codes_travel_mcccode_usrcount = list(degree_dict_travel_mcccode_usrcount)
nbrs_travel_mcccode_usrcount = {
    code: list(T.neighbors(code)) for code in codes_travel_mcccode_usrcount
}
nodes_travel_mcccode_usrcount['neighbors'] = (
    nodes_travel_mcccode_usrcount[node_category].map(nbrs_travel_mcccode_usrcount)
)

In [17]:
# Shallow copy: the new dict has its own keys but shares the neighbor lists
# with nbrs_travel_mcccode_usrcount.
# NOTE(review): no conversion from codes to names is visible in this part of
# the notebook -- presumably it happens in a cell not shown; confirm.
nbrs_travel_mcccode_usrcount_byname = nbrs_travel_mcccode_usrcount.copy()

In [26]:
##################################################################
##################################################################
#
#               SAVE OUT INFO TO CSV FILES
#
##################################################################
##################################################################
# Write the node table to CSV. No index=False is passed here, so the DataFrame
# index is written as the first column (unlike the edge-list export).
# NOTE(review): absolute, machine-specific output path with a date baked into
# the file name.
nodes_travel_mcccode_usrcount.to_csv('/Users/MonikaHeinig/Desktop/Insight/KOHO_Financial/TRAVEL/mcc_code/nodes_travel_mcccode_usrcount_07012018.csv')

In [28]:
# Display the {community: [mcc_code, ...]} mapping produced by mod().
modularity_travel_mcccode_usrcount

{0: [4722,
  7011,
  4511,
  3009,
  3180,
  3000,
  3058,
  4582,
  7991,
  3640,
  3513,
  3007,
  3642,
  3035,
  3219,
  3008,
  3012,
  3043,
  3047,
  3261,
  3710,
  3750,
  3769,
  3751,
  3770,
  3050,
  3056,
  3075,
  3127,
  3782,
  3783,
  3017,
  3022,
  3034,
  3052,
  3076,
  3077,
  3079,
  3099,
  3196,
  3187,
  3206,
  3505,
  3533,
  3592,
  3628,
  3654,
  3676,
  3774,
  5962],
 1: [3502,
  3501,
  3615,
  3510,
  3665,
  3508,
  3692,
  3562,
  4131,
  3516,
  3072,
  3552,
  3555,
  3604,
  3687,
  3740,
  3765,
  3066,
  3515,
  3608,
  3670,
  3780],
 2: [3581,
  3690,
  3709,
  4411,
  3703,
  3638,
  3211,
  3005,
  3016,
  3649,
  3029,
  3771,
  3217,
  3650],
 3: [3001,
  3509,
  3573,
  3256,
  3260,
  3637,
  3715,
  3778,
  3553,
  3621,
  3700,
  3174,
  3613,
  3660,
  3161,
  3551,
  3667,
  3644,
  3132,
  3602,
  3528,
  3561,
  3631],
 4: [3590,
  3504,
  7012,
  3503,
  3530,
  3512,
  3695,
  3722,
  3020,
  3543,
  3545,
  3659,
  3039,
  365