# Cleaning and grouping Sarafu data
Based on code from "Mattsson, C.E.S., Criscione, T. & Takes, F.W. Circulation of a digital community currency. Sci Rep 13, 5864 (2023). https://doi.org/10.1038/s41598-023-33184-1". 

In [7]:
import numpy as np
import pandas as pd
import networkx as nx
import json
import re
import os

## Directory paths & transaction data

In [10]:
# Locations used throughout the notebook:
#   datadir -- folder holding the Sarafu CSV files and all generated outputs
#   projdir -- project root (left empty here)
datadir = "data_sarafu"
projdir = ""

In [12]:
# Read the raw transaction log, then discard the two token-identifier columns
raw_fn = os.path.join(datadir, "sarafu_txns_20200125-20210615.csv")
raw = pd.read_csv(raw_fn)
raw = raw.drop(columns=['token_name', 'token_address'])

In [13]:
# Read the user table with explicit dtypes, one row per xDAI address
users_fn = os.path.join(datadir, "sarafu_users_20210615.csv")
categoricals = ['gender', 'area_name', 'area_type', 'held_roles', 'business_type']
strings = ['start', 'old_POA_blockchain_address', 'xDAI_blockchain_address']
dtypes = {
    **dict.fromkeys(categoricals, "category"),
    **dict.fromkeys(strings, "string"),
}
users = (
    # na_filter=False keeps empty fields as empty strings instead of NaN
    pd.read_csv(users_fn, dtype=dtypes, na_filter=False)
    # keep only the first record for any repeated address
    .drop_duplicates(subset=['xDAI_blockchain_address'], keep='first')
    .set_index('xDAI_blockchain_address')
)

## Initial data cleaning 

Remove transactions made directly between system-run accounts, as well as all non-standard transactions.

In [15]:
# Addresses of the two system-run (admin) accounts
admins = ['0xBDB3Bc887C3b70586BC25D04d89eC802b897fC5F','0xEDA5C9B75Fdb3B9bdAB987A704632280Cf93084F']
txns_fn = os.path.join(datadir,"transactions","sarafu_txns.csv")

# Keep a transaction unless BOTH of its endpoints are admin accounts
txns = raw[~raw['source'].isin(admins) | ~raw['target'].isin(admins)].copy()

# Of those, keep only transfers whose subtype is STANDARD
# (drops the currency management and cash exchange operations)
txns_std = txns.loc[txns['transfer_subtype'].eq("STANDARD")].copy()

## Sarafu flow network (aggregate network by adding weights)

In [17]:
# Sum the transferred amount over every (source, target) pair
flow_net = (
    txns_std
    .groupby(by=['source', 'target'])
    .agg({'weight': 'sum'})
    .reset_index()
)

# Directed graph with one weighted edge per (source, target) pair
flow_nx = nx.from_pandas_edgelist(
    flow_net,
    source='source',
    target='target',
    edge_attr='weight',
    create_using=nx.DiGraph,
)

# Attach each user's table row to the matching node as attributes
nx.set_node_attributes(flow_nx, users.to_dict(orient='index'))

### Second data cleaning

In [19]:
# Flag accounts that are not regular users: by held role, or by business type
has_admin_role = users['held_roles'].isin(['ADMIN','VENDOR'])
has_admin_type = users['business_type'].isin(['system'])

# Regular users are the accounts flagged by neither criterion
reg_users = users.loc[~(has_admin_role | has_admin_type)].copy()

In [20]:
# Copy the edges (with their data) among regular users into a fresh graph;
# nodes are added implicitly as edge endpoints
flow_reg_nx = nx.DiGraph()
flow_reg_nx.add_edges_from(flow_nx.subgraph(reg_users.index).edges(data=True))

## Save data

In [22]:
# Output folder for the network files; exist_ok=True makes the call a no-op
# when the folder is already there (no separate existence check needed)
networks_fn = os.path.join(datadir,"networks")
os.makedirs(networks_fn, exist_ok=True)

In [23]:
# Write the regular-user flow network in Pajek format
flow_reg_fn = os.path.join(networks_fn,"sarafu_reg_users.net")
nx.write_pajek(flow_reg_nx, flow_reg_fn, encoding='UTF-8')

# Clean up the file: strip the ' 0.0 0.0 ellipse' text from the written file.
# The file was written as UTF-8, so it is reopened with the same encoding
# rather than whatever the platform default happens to be.
with open(flow_reg_fn, 'r', encoding='utf-8') as file:
    filedata = file.read()
filedata = filedata.replace(' 0.0 0.0 ellipse', '')
with open(flow_reg_fn, 'w', encoding='utf-8') as file:
    file.write(filedata)

In [24]:
# Size summary of the regular-user network: node count, edge count, total edge weight
print(f" nodes {flow_reg_nx.number_of_nodes()}")
print(f" edges {flow_reg_nx.number_of_edges()}")
print(f"weight {flow_reg_nx.size(weight='weight')}")

 nodes 40655
 edges 145659
weight 293688266.0480015


# Monthly data

In [26]:
from pandas.api.types import CategoricalDtype
from datetime import datetime, timedelta

In [28]:
# Parse the raw 'timeset' strings into pandas timestamps
txns_std['timestamp'] = pd.to_datetime(txns_std['timeset'],format="%Y-%m-%d %H:%M:%S.%f")

# Calendar-month label ("YYYY-MM") used as the grouping key
txns_std['Month'] = txns_std['timestamp'].dt.strftime("%Y-%m")

# Monthly volumes of STANDARD transactions -- February 2020 through May 2021.
# The raw file spans 2020-01-25 to 2021-06-15, so the first and last calendar
# months are incomplete and are dropped.
feb20 = datetime(2020, 2, 1)
jun21 = datetime(2021, 6, 1)

# Half-open window [feb20, jun21): the lower bound is inclusive so that a
# transaction stamped exactly at midnight on 1 Feb 2020 is not lost.
in_window = (txns_std['timestamp'] >= feb20) & (txns_std['timestamp'] < jun21)
txns_febmay = txns_std[in_window].copy()

## Monthly aggregate network

In [31]:
# Total weight per (Month, source, target) triple
flow_net_febmay = (
    txns_febmay
    .groupby(by=['Month', 'source', 'target'])
    .agg({'weight': 'sum'})
    .reset_index()
)

# One edge-list dataframe per month, keyed by the month label;
# the 'Month' column is dropped since the key already carries it
months = flow_net_febmay['Month'].unique()
flow_net_febmay_dic = {
    month: flow_net_febmay.loc[flow_net_febmay['Month'].eq(month)].drop(columns='Month')
    for month in months
}

In [33]:
# Build one regular-user flow network per month, keyed by the month label
flow_reg_febmay = {}

# The user attribute mapping does not depend on the month: build it once
# instead of converting the whole users table on every iteration.
user_attrs = users.to_dict('index')

for month, flow in flow_net_febmay_dic.items():
    # Full directed network for this month
    G = nx.from_pandas_edgelist(flow,edge_attr='weight',create_using=nx.DiGraph)
    nx.set_node_attributes(G, user_attrs)
    # Keep only the edges among regular users (edge data is copied along;
    # nodes are added implicitly as edge endpoints)
    G_reg = nx.DiGraph()
    G_reg.add_edges_from(G.subgraph(reg_users.index).edges(data=True))
    flow_reg_febmay[month] = G_reg

## Save data

In [35]:
# Output folder for the monthly networks; exist_ok=True makes the call a
# no-op when the folder is already there, and missing parents are created
monthly_fn = os.path.join(datadir,"networks", "monthly")
os.makedirs(monthly_fn, exist_ok=True)

In [36]:
# Write every monthly network in Pajek format and clean each file right away
for month, G in flow_reg_febmay.items():
    G_fn = os.path.join(monthly_fn,"sarafu_reg_"+month+".net")
    nx.write_pajek(G, G_fn, encoding='UTF-8')

    # Strip the ' 0.0 0.0 ellipse' text from the written file. The file was
    # written as UTF-8, so reopen it with the same encoding rather than the
    # platform default.
    with open(G_fn, 'r', encoding='utf-8') as file:
        filedata = file.read()
    filedata = filedata.replace(' 0.0 0.0 ellipse', '')
    with open(G_fn, 'w', encoding='utf-8') as file:
        file.write(filedata)

In [37]:
# Look up one regular user's row by xDAI address (the notebook displays the result)
reg_users.loc["0xa2b55ff5940297F42Ac638d37EC455754eA354cB"]

id                                                                  6594
start                                         2019-11-21 12:53:28.000000
final_bal                                               49.9098019725512
gender                                                            Female
area_name                                                   Misc Nairobi
area_type                                                          urban
held_roles                                                   BENEFICIARY
business_type                                                       food
old_POA_blockchain_address    0x25c6018e898395db6ac72eb6d6aa2bc28766e2cf
ovol_in                                                      5691.909802
ovol_out                                                            83.0
otxns_in                                                              34
otxns_out                                                              4
ounique_in                                         

# Creating communities data (in file 2 supplementary materials)

In [39]:
from collections import Counter

## Infomap

In [43]:
# Create an 'infomap' folder within the 'analysis' directory, and
# run Infomap using the following script, or via the command line.
# NOTE(review): the loading cell further down reads the .tree file from
# <datadir>/infomap -- adjust the paths below (or copy the output) to match.

#!/bin/bash

# Folder holding the exported networks, and the analysis working directory
DATA='/Users/mattssonc/Documents/Research/Sarafu/Sarafu2021_UKDS'
WORKING='/Users/mattssonc/Documents/Research/Sarafu/Exploration'

# The variables are expanded directly (no stray backslashes in front of '$')
# and quoted so that paths containing spaces are passed as single arguments.
# Infomap's console output is captured in the .out file.
infomap --flow-model rawdir --tree "$DATA/networks/sarafu_reg_users.net" "$WORKING/analysis/infomap/" > "$WORKING/analysis/infomap/sarafu_reg_users.out"

### Load infomap output

In [46]:
# Load the Infomap .tree output: one row per node, with its module path
# (colon-separated, e.g. '1:7:3:8'), flow, node name and index.
reg_users_mod_fn = os.path.join(datadir,"infomap","sarafu_reg_users.tree")
# NOTE(review): skiprows=9 hard-codes the length of the file header (an
# earlier note mentions it used to be 8) -- confirm against the Infomap
# version that produced the file.
reg_users_mod = pd.read_csv(reg_users_mod_fn,names=['module','flow','node','idx'],sep=" ",skiprows=9)

# Node names may be wrapped in double quotes; remove them
reg_users_mod['node'] = reg_users_mod['node'].str.strip('"')

# mod_k = the first k components of the module path. Slicing past the end of
# a shorter path simply returns the whole path, so no length check is needed.
module_parts = reg_users_mod['module'].str.split(':')
for depth in range(1, 8):
    reg_users_mod[f'mod_{depth}'] = module_parts.str[:depth].str.join(':')

# Keep only the module columns, indexed by node name
reg_users_mod = reg_users_mod.drop(['idx','flow'],axis=1)
reg_users_mod = reg_users_mod.set_index('node')

In [48]:
# Prefix every user's row with its module columns; the left join keeps
# exactly the nodes listed in the Infomap output
reg_users = reg_users_mod.join(other=reg_users, how='left')

# Push the combined rows onto the flow network as node attributes
nx.set_node_attributes(G=flow_reg_nx, values=reg_users.to_dict(orient='index'))

In [50]:
# Sanity check: the network's node count should equal the number of rows in reg_users
# NOTE(review): an earlier note here mentioned two lost nodes -- re-check if the counts ever differ
print(f"nodes {flow_reg_nx.number_of_nodes()}")
print(f"users {reg_users.index.size}")

nodes 40655
users 40655


In [51]:
# Inspect the attributes attached to a single node of the regular-user flow network
flow_reg_nx.nodes['0x0831252aE03010CeB7C0fd8032d4bC9aB3B84B80']

{'module': '1:7:3:8',
 'mod_1': '1',
 'mod_2': '1:7',
 'mod_3': '1:7:3',
 'mod_4': '1:7:3:8',
 'mod_5': '1:7:3:8',
 'mod_6': '1:7:3:8',
 'mod_7': '1:7:3:8',
 'id': 15126,
 'start': '2020-04-20 09:00:58.865892',
 'final_bal': '3593.46',
 'gender': 'Female',
 'area_name': 'Mukuru Nairobi',
 'area_type': 'urban',
 'held_roles': 'BENEFICIARY',
 'business_type': 'labour',
 'old_POA_blockchain_address': '',
 'ovol_in': 4519.86,
 'ovol_out': 216.4,
 'otxns_in': 25,
 'otxns_out': 4,
 'ounique_in': 3,
 'ounique_out': 1,
 'svol_in': 29410.0,
 'svol_out': 30120.0,
 'stxns_in': 114,
 'stxns_out': 95,
 'sunique_in': 14,
 'sunique_out': 13}

 ## Format & save

Networks by top-level and second-level infomap module

In [63]:
# Modules to export at each level of the hierarchy:
#   level 1 -- a fixed list of five top-level modules
#   level 2 -- every module with more than 100 nodes
#   level 3 -- every module with more than 10 nodes
mod_1s = ['1','2','3','4','5']

mod_2_sizes = Counter(mod for _, mod in flow_reg_nx.nodes(data="mod_2"))
mod_2s = [mod for mod, n_nodes in mod_2_sizes.items() if n_nodes > 100]

mod_3_sizes = Counter(mod for _, mod in flow_reg_nx.nodes(data="mod_3"))
mod_3s = [mod for mod, n_nodes in mod_3_sizes.items() if n_nodes > 10]

In [64]:
# Split the network by module: for each level, build one subnetwork per module
# and report how much of the total weight stays inside the selected modules.
flow_reg_nxs = {}

# Total weight of the whole network; constant across levels, so compute it once
network_weight = flow_reg_nx.size(weight="weight")

print("layer","modules","in_weight","tot_weight","fraction")
for label, modules in [('mod_1',mod_1s),('mod_2',mod_2s),('mod_3',mod_3s)]:
    total_weight = 0
    flow_reg_nxs[label] = {}
    for module in modules:
        # Gather the network subgraph: edges whose two endpoints both belong
        # to this module (nodes are added implicitly as edge endpoints)
        members = reg_users[reg_users[label]==module].index
        module_nx = nx.DiGraph()
        module_nx.add_edges_from(flow_reg_nx.subgraph(members).edges(data=True))
        flow_reg_nxs[label][module] = module_nx
        total_weight += module_nx.size(weight="weight")
    # Guard the ratio so a network with zero total weight reports nan instead of raising
    fraction = total_weight/network_weight if network_weight else float('nan')
    print(label,len(modules),total_weight,network_weight,fraction)

layer modules in_weight tot_weight fraction
mod_1 5 292913149.2380003 293688266.0480015 0.9973607498166965
mod_2 36 283681910.3480001 293688266.0480015 0.9659286500116218
mod_3 456 238295271.27400002 293688266.0480015 0.8113884646486085


In [66]:
# Create the output folders: networks/modules plus one subfolder per exported
# level. exist_ok=True makes each call a no-op when the folder already exists.
modules_fn = os.path.join(datadir,"networks","modules")
os.makedirs(modules_fn, exist_ok=True)

for module in ['mod_1', 'mod_2']:
    module_fn = os.path.join(modules_fn, module)
    os.makedirs(module_fn, exist_ok=True)

In [67]:
# Write each level-1 and level-2 module network in Pajek format and clean
# each file right after it is written
for label, modules in [('mod_1',mod_1s),('mod_2',mod_2s)]:
    for module in modules:
        # ':' in the module path is replaced by '~' in the file name
        flow_reg_fn = os.path.join(modules_fn ,label,"sarafu_reg_users_"+module.replace(':',"~")+".net")
        nx.write_pajek(flow_reg_nxs[label][module], flow_reg_fn, encoding='UTF-8')

        # Strip the ' 0.0 0.0 ellipse' text from the written file. The file
        # was written as UTF-8, so reopen it with the same encoding rather
        # than the platform default.
        with open(flow_reg_fn, 'r', encoding='utf-8') as file:
            filedata = file.read()
        filedata = filedata.replace(' 0.0 0.0 ellipse', '')
        with open(flow_reg_fn, 'w', encoding='utf-8') as file:
            file.write(filedata)

If an error occurs in the previous cell, try running the cell a second time.