In [1]:
import pandas as pd
import re
import csv
import time
import numpy as np
%install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
%load_ext autotime
from user_class import User, find_new_friends
from itertools import compress



Installed autotime.py. To use it, type:
  %load_ext autotime


In [2]:
input_dir = 'paymo_input/'
batch_file = 'batch_payment.csv'
stream_file = 'stream_payment.csv'
batch_path = input_dir + batch_file
stream_path = input_dir + stream_file

test_file = 'batch_test_2.csv'
test_path = input_dir + test_file

time: 2.52 ms


In [3]:
batch_dict = {}
batch_dict['time'] = {}
batch_dict['id1'] = {}
batch_dict['id2'] = {}
batch_dict['amount'] = {}
batch_dict['message'] = {}

with open(batch_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for i, row in enumerate(reader):
        batch_dict['time'][i] = row['time']
        batch_dict['id1'][i] = row[' id1']
        batch_dict['id2'][i] = row[' id2']
        batch_dict['amount'][i] = row[' amount']
        batch_dict['message'][i] = row[' message']
        
df_batch = pd.DataFrame.from_dict(batch_dict)

time: 29 s


In [4]:
stream_dict = {}
stream_dict['time'] = {}
stream_dict['id1'] = {}
stream_dict['id2'] = {}
stream_dict['amount'] = {}
stream_dict['message'] = {}

with open(stream_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for i, row in enumerate(reader):
        stream_dict['time'][i] = row['time']
        stream_dict['id1'][i] = row[' id1']
        stream_dict['id2'][i] = row[' id2']
        stream_dict['amount'][i] = row[' amount']
        stream_dict['message'][i] = row[' message']
        
df_stream = pd.DataFrame.from_dict(stream_dict)

time: 21.1 s


In [6]:
## BIG PROBLEM - SKIPPED OVER A BUNCH OF ENTRIES
batch_dict['message'][377592]
#print(len(batch_dict['message'].keys()))

' 🇨🇴🇨🇴🇨🇴🇨🇴👍🏼🎉 '

In [4]:
#dictionary columns are randomly ordered - reorder as expected
df_batch = df_batch[['time','id1','id2','amount','message']]
#df_stream = df_stream[['time','id1','id2','amount','message']]

time: 332 ms


In [8]:
df_batch.tail()

Unnamed: 0,time,id1,id2,amount,message
3934578,2016-11-02 09:49:29,20942,826,39.4,🍻🍾🍷
3934579,2016-11-02 09:49:29,2464,2425,6.03,Taco tuesday
3934580,2016-11-02 09:49:29,68862,25922,22.48,Comcast
3934581,2016-11-02 09:49:29,66910,21661,21.28,Signs
3934582,2016-11-02 09:49:29,64593,18179,31.57,blue kiwi


In [5]:
givers = df_batch.groupby('id1')
receivers = df_batch.groupby('id2')
partners_1 = {}
partners_2 = {}

time: 1.73 ms


In [6]:
## find all transactions where user <user_id> was giver, then find list of partners in those transactions
for user_id,transactions in givers:
    
    #store list of all transaction partners as list (easiest type to extend later)
    try:
        partners_1[int(user_id)] = list(givers.get_group(user_id)['id2'].astype(int))
   
    #some lines of batch_payment.txt and stream_payment.txt are off - omit malformed entries
    except (KeyError, ValueError) as BadLine:
        print("Skipping invalid key:",user_id)
    
## same as before for all transactions where <user_id> was receiver
for user_id,transactions in receivers:
    
    #store list of all transaction partners as list (easiest type to extend later)
    try: 
        partners_2[int(user_id)] = list(receivers.get_group(user_id)['id1'].astype(int))
        
    #some lines of batch_payment.txt and stream_payment.txt are off - omit malformed entries
    except (KeyError, ValueError) as BadLine:
        print("Skipping invalid key:",user_id)

Skipping invalid key:  no. Even if the union were a matter of economic indifference
Skipping invalid key:  and even if it were to be disadvantageous from the economic standpoint
time: 1min 38s


In [7]:
## it's possible that some users only show up as givers and others only as receivers - combine to master list of all IDs
## in actuality for the provided batch_payment.txt all users show up as givers at least once, but not safe to assume
user_list_1 = np.array(list(partners_1.keys()))
user_list_2 = np.array(list(partners_2.keys()))
user_list = np.unique(np.concatenate([user_list_1,user_list_2]))

time: 23.5 ms


In [8]:
user_master_list = {}

#cycle through all users and agglomerate partners from all transactions
#conversion back and forth between list and numpy array is pretty fast
#lists easier to append to, hence why stored as list, but also wanted to use numpy.unique function.
for user_id in user_list:
    
    pp = []
    
    if user_id in partners_1.keys():
        pp += partners_1[user_id]
        
    if user_id in partners_2.keys():
        pp += partners_2[user_id]
        
    #reduce to (sorted) list of all unique partners
    user_master_list[user_id] = User(user_id, list(np.unique(pp)))

time: 2.4 s


In [10]:
## use the find_new_friends function (stored in user_class.py) to supplement friend tiers down to level of interest
#creating lists of friends down to 4th-degree connections takes about two minutes for 70,000 users on my macbook pro.
tier_depth = 4

#successively add tiers of friendship to every user in user_master_list
for tier in range(2,tier_depth+1):
    print("Building lists of connections of degree", tier, "for each user...")
    for user_id, user_info in user_master_list.items():
        user_info.friends[tier] = find_new_friends(user_master_list,user_info,tier)
        
print("Done. Connections of degree n accessible via User.friends[n]")

Building lists of connections of degree 2 for each user...
time: 8.75 s


In [36]:
for user_id, user_info in user_master_list.items():
    user_info.friends[4] = find_new_friends(user_info,4)

time: 1min 27s


In [34]:
user_master_list[4].friends[2]

[3,
 15,
 17,
 25,
 46,
 57,
 84,
 89,
 91,
 92,
 95,
 111,
 125,
 140,
 143,
 145,
 147,
 165,
 166,
 167,
 176,
 177,
 182,
 184,
 209,
 239,
 272,
 282,
 283,
 376,
 395,
 398,
 468,
 505,
 510,
 576,
 625,
 647,
 653,
 699,
 710,
 729,
 818,
 837,
 913,
 923,
 996,
 1026,
 1042,
 1091,
 1224,
 1270,
 1361,
 1367,
 1373,
 1477,
 1560,
 1569,
 1699,
 1900,
 1902,
 1912,
 1938,
 2031,
 2077,
 2128,
 2138,
 2230,
 2319,
 2358,
 2366,
 2384,
 2483,
 2511,
 2529,
 2553,
 2571,
 2581,
 2588,
 2721,
 2811,
 2938,
 2961,
 3450,
 3451,
 3540,
 3827,
 3940,
 4111,
 4260,
 4366,
 4660,
 4661,
 4662,
 4663,
 4664,
 4665,
 4666,
 4667,
 4669,
 4670,
 4671,
 4672,
 4673,
 4674,
 4676,
 4677,
 4678,
 4679,
 4680,
 4681,
 4682,
 4683,
 4684,
 4685,
 4686,
 4687,
 4688,
 4689,
 4690,
 4691,
 4692,
 4693,
 4694,
 4695,
 4696,
 4697,
 4698,
 4699,
 4700,
 4701,
 4702,
 4703,
 4704,
 4705,
 4706,
 4707,
 4708,
 4709,
 4710,
 4711,
 4712,
 4713,
 4714,
 4715,
 4717,
 4718,
 4719,
 4720,
 4721,
 4722,
 4

time: 6.6 ms


In [17]:
user_master_list[0].friends[2] = find_new_friends(user_master_list[5],2)

time: 1.27 ms


In [50]:
type(users_master_list[5].friends)

dict

time: 1.91 ms


In [52]:
for k,v in users_master_list[5].friends.items():
    print(k)
    print(v)

1
[0, 4730, 4920, 4921, 67513]
2
[3, 17, 57, 89, 91, 92, 125, 145, 147, 151, 166, 167, 177, 184, 209, 217, 225, 282, 283, 376, 468, 625, 653, 699, 710, 729, 913, 1026, 1091, 1224, 1361, 1373, 1477, 1560, 1569, 1900, 2230, 2358, 2366, 2384, 2490, 2511, 2529, 2553, 2721, 2811, 2938, 2961, 3450, 3451, 3540, 3827, 3940, 4111, 4260, 4366, 4660, 4661, 4662, 4663, 4664, 4665, 4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678, 4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691, 4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704, 4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717, 4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743, 4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756, 4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4766, 4767, 4768, 4769, 4770, 4771

In [15]:
tst = User(0, partners[0])

time: 685 µs


In [16]:
tst.partners

AttributeError: 'User' object has no attribute 'partners'

time: 4.46 ms


In [29]:
## the tier parameter indicates what degree of friendship we're searching for - need it so we know what existing friends
#to account for.
def find_new_friends(current_user, new_tier):
    
    #sanity check in case we incorrectly specify new_tier somewhere else
    if new_tier <= 1:
        print('Function only works if there are at least some pre-existing friends (choose higher tier)')
        return
    
    #we have to pool together all existing first-degree friends, second-degree, etc.
    #by initializing existing_friends with the user themselves, avoid ever adding them to new_friends
    existing_friends = [current_user.user_id] 
    
    #don't need to check for redundancy of users in tiers - any given user can only belong to one tier
    for key in range(1,new_tier):
        existing_friends += current_user.friends[key]
    
    tentative_new = []
    
    #for each user who is currently a friend of degree new_tier-1, add all of their first-degree friends to tentative_new
    for friend_id in current_user.friends:
        tentative_new += user_master_list[friend_id].friends[1] 
    
    tentative_new = list(np.unique(tentative_new)) #get rid of duplicates
    #following line returns 
    new_friends = [ x for x in tentative_new if x not in existing_friends ]
    
    return new_friends

time: 8.62 ms
