In [9]:
## Production version of code for Insight Data Engineers program.

# Overall strategy - define a module "account" which contains a list of "contacts".
# Contacts are saved as a Python dictionary, using the account number as key and the "tier" level as value,
# "tier" being shorthand for the number of degrees of separation.
# Since Python dictionaries are extremely fast (they are hash tables), it should be possible to flag transactions quickly.

import pandas as pd
import csv
import numpy as np

In [2]:
## Locations of the input data files (relative to the notebook's working directory)
input_dir = 'paymo_input/'

# file names: historical batch, live stream, and a small test batch
batch_file = 'batch_payment.csv'
stream_file = 'stream_payment.csv'
test_file = 'batch_test_2.csv'

# full paths are simply the directory prefix followed by the file name
batch_path, stream_path, test_path = (
    input_dir + file_name for file_name in (batch_file, stream_file, test_file)
)

In [3]:
## first hurdle - PayMo messages can include commas, and therefore pandas read_csv fails due to an inconsistent number of columns.
## csv.DictReader copes with such rows: every field beyond the five header columns is collected in a list
## stored under the key None (DictReader's default `restkey`), so the full message is rebuilt below by
## re-joining those pieces with commas instead of silently dropping them.

#takes about 30 seconds to load the 3 million lines each of batch_payment and stream_payment on my macbook pro.

# one list per output column; row i of the file ends up at position i of every list
batch_dict = {'time': [], 'id1': [], 'id2': [], 'amount': [], 'message': []}

# newline='' leaves newline handling to the csv module, as the csv documentation recommends
with open(batch_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # the header names carry a leading space (' id1', ' id2', ...), hence the keys used here
        message = row[' message']
        extra_fields = row.get(None)
        if extra_fields:
            # the message itself contained commas - stitch the pieces back together
            message = ','.join([message] + extra_fields)

        batch_dict['time'].append(row['time'])
        batch_dict['id1'].append(row[' id1'])
        batch_dict['id2'].append(row[' id2'])
        batch_dict['amount'].append(row[' amount'])
        batch_dict['message'].append(message)

# explicit `columns` pins the column order regardless of dict ordering
df_batch = pd.DataFrame(batch_dict, columns=['time', 'id1', 'id2', 'amount', 'message'])

In [4]:
## load the data from stream_payment and turn it into a pandas dataframe.
## Messages may contain commas; csv.DictReader stores the surplus fields of such rows in a list under the
## key None (its default `restkey`), so the full message is rebuilt by re-joining those pieces with commas.

# one list per output column; row i of the file ends up at position i of every list
stream_dict = {'time': [], 'id1': [], 'id2': [], 'amount': [], 'message': []}

# newline='' leaves newline handling to the csv module, as the csv documentation recommends
with open(stream_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # the header names carry a leading space (' id1', ' id2', ...), hence the keys used here
        message = row[' message']
        extra_fields = row.get(None)
        if extra_fields:
            # the message itself contained commas - stitch the pieces back together
            message = ','.join([message] + extra_fields)

        stream_dict['time'].append(row['time'])
        stream_dict['id1'].append(row[' id1'])
        stream_dict['id2'].append(row[' id2'])
        stream_dict['amount'].append(row[' amount'])
        stream_dict['message'].append(message)

# explicit `columns` pins the column order regardless of dict ordering
df_stream = pd.DataFrame(stream_dict, columns=['time', 'id1', 'id2', 'amount', 'message'])

In [5]:
#columns built from a dictionary may come out in arbitrary order - put both frames into the expected order
column_order = ['time','id1','id2','amount','message']
df_batch, df_stream = (frame[column_order] for frame in (df_batch, df_stream))

In [6]:
#naming convention: the id1 user of a transaction is its 'giver', the id2 user its 'receiver'
#plan: group the batch transactions by giver to reach every partner <user_id> has paid,
#group them again by receiver to reach everyone who paid <user_id>, and merge both partner lists later

# filled in afterwards: user id -> partners found via the giver / receiver grouping respectively
partners_1, partners_2 = {}, {}

receivers = df_batch.groupby(by='id2')
givers = df_batch.groupby(by='id1')

In [7]:
def _collect_partners(grouped, partner_column, partners):
    """Record, for every user in `grouped`, the ids of the other party in their transactions.

    grouped        -- pandas groupby over the transaction frame, keyed by a user-id column
    partner_column -- column holding the other party's id ('id2' for givers, 'id1' for receivers)
    partners       -- dict updated in place: int user id -> numpy array of int partner ids

    Groups whose key or partner ids cannot be converted to int are reported and skipped.
    """
    for user_id, transactions in grouped:
        try:
            # `transactions` already is this user's group, so no second get_group lookup is needed
            partners[int(user_id)] = np.array(transactions[partner_column].astype(int))

        #some lines of batch_payment.txt and stream_payment.txt are off - omit malformed entries
        except (KeyError, ValueError):
            print("Skipping invalid key:",user_id)


## find all transactions where user <user_id> was giver; the partners are the receivers (id2) of those transactions
_collect_partners(givers, 'id2', partners_1)

## same for all transactions where <user_id> was receiver; the partners are the givers (id1).
## This has to read from the `receivers` grouping: looking the user up in `givers` returns the user's own id
## for every row (or raises KeyError, dropping anyone who only ever received).
_collect_partners(receivers, 'id1', partners_2)

Skipping invalid key:  no. Even if the union were a matter of economic indifference
Skipping invalid key:  and even if it were to be disadvantageous from the economic standpoint


In [8]:
## a user may appear only as a giver or only as a receiver, so the master list is the union of both key sets
## (in the provided batch_payment.txt every user gives at least once, but that is not safe to assume)
user_list_1 = np.array(list(partners_1))
user_list_2 = np.array(list(partners_2))

# union1d returns the sorted, de-duplicated values found in either array
user_list = np.union1d(user_list_1, user_list_2)

In [None]:
# NOTE(review): `User` is not defined or imported anywhere in the visible notebook - presumably it lives
# in user_class.py next to find_new_friends; confirm and import it in the first cell.
user_master_list = {}

#walk over every known user and pool the partners from both directions of their transactions
#a plain list is used for pooling because it is easy to extend; numpy.unique then removes the duplicates
for user_id in user_list:

    known_partners = []
    for partner_table in (partners_1, partners_2):
        if user_id in partner_table:
            known_partners.extend(partner_table[user_id])

    #np.unique yields the sorted, de-duplicated partner ids; stored as a list on the User
    unique_partners = list(np.unique(known_partners))
    user_master_list[user_id] = User(user_id, unique_partners)

In [None]:
## extend every user's friend tiers down to the depth of interest with find_new_friends (stored in user_class.py)
# NOTE(review): find_new_friends is not imported anywhere in the visible notebook - confirm the import exists.
#creating lists of friends down to 4th-degree connections takes about two minutes for 70,000 users on my macbook pro.
tier_depth = 4

#tiers are added one level at a time, for all users, from degree 2 up to and including tier_depth
tier = 2
while tier <= tier_depth:
    print("Building lists of connections of degree", tier, "for each user...")
    for user_info in user_master_list.values():
        user_info.friends[tier] = find_new_friends(user_master_list,user_info,tier)
    tier += 1

print("Done. Connections of degree n accessible via User.friends[n]")