## 태그 데이터 삽입

In [None]:
import csv
import sqlite3

In [None]:
indexpath = 'dbv3-index.db'
corepath = 'dbv3-core.db'
dbpath = 'dbv3-service.db'

# Open the service database as the main schema and attach the index and core
# databases so a single cursor can query across all three files.
conn = sqlite3.connect(dbpath)
cur = conn.cursor()
# Bind the file names as parameters instead of interpolating them into the
# SQL text, so a path containing a quote character cannot break the statement.
cur.execute('''ATTACH DATABASE ? AS DBINDEX;''', (indexpath,))
cur.execute('''ATTACH DATABASE ? AS DBCORE;''', (corepath,))
conn.commit()

In [None]:
tagpath = 'data/address_tags_210525.csv'

# Import (Address, Tag) pairs from the CSV: register every tag in AddrTagID
# and link each address id to its tag id in AddrTag.
skipped = 0

# `with conn` turns the whole import into one transaction: it is committed
# once when the loop finishes and rolled back if any row raises.
# newline='' is the mode the csv module expects for its input files.
with conn, open(tagpath, 'r', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        cur.execute('''SELECT id FROM DBINDEX.AddrID
                       WHERE DBINDEX.AddrID.addr = ?;''', (row['Address'],))
        found = cur.fetchone()
        if found is None:
            # The address is not present in the index database, so there is
            # no id to attach the tag to; skip the row instead of crashing.
            skipped += 1
            continue
        addrid = found[0]

        # INSERT OR IGNORE followed by SELECT yields the tag id whether the
        # tag was just created or already existed.
        cur.execute('''INSERT OR IGNORE INTO AddrTagID (tag)
                       VALUES (?);''', (row['Tag'],))
        cur.execute('''SELECT id FROM AddrTagID
                       WHERE AddrTagID.tag = ?;''', (row['Tag'],))
        tagid = cur.fetchone()[0]

        cur.execute('''INSERT OR IGNORE INTO AddrTag (addr, tag)
                       VALUES (?, ?);''', (addrid, tagid))

if skipped:
    print(f'Skipped {skipped} rows whose address is not in DBINDEX.AddrID')

In [None]:
# Release the connection opened on the service database (together with the
# schemas attached to it) now that the tag import is finished.
conn.close()

## 주소 특징 추출

In [None]:
import os
import csv
import sqlite3
import statistics
import collections

import numpy as np
import pandas as pd
from scipy.stats import moment

import matplotlib.pyplot as plt
%matplotlib inline
# %matplotlib notebook

In [None]:
# Cache DataFrame
# Resume from the pickled feature table when the cache file exists; otherwise
# start from an empty frame.
# NOTE: unpickling executes arbitrary code, so only load cache files that
# were written by this notebook.
cachepath = 'cache_address_210525.pickle'
df = pd.read_pickle(cachepath) if os.path.exists(cachepath) else pd.DataFrame()

In [None]:
# Connect DB
# The main schema is an in-memory database; the four on-disk databases are
# attached to it under the schema names used by the queries.
conn = sqlite3.connect(':memory:')
cur = conn.cursor()
attach_statements = (
    '''ATTACH DATABASE './dbv3-index.db' AS DBINDEX;''',
    '''ATTACH DATABASE './dbv3-core.db' AS DBCORE;''',
    '''ATTACH DATABASE './dbv3-util.db' AS DBUTIL;''',
    '''ATTACH DATABASE './dbv3-service.db' AS DBSERVICE;''',
)
for statement in attach_statements:
    cur.execute(statement)
conn.commit()

In [None]:
# Enqueue all of tagged addresses
# `queue` is seeded with every distinct address id that has a tag;
# `subqueue` starts empty.
subqueue = collections.deque()
queue = collections.deque(
    tagged[0]
    for tagged in cur.execute('''SELECT DISTINCT addr FROM DBSERVICE.AddrTag;''')
)
print(f'Ready the tagged queue: {len(queue)}')

In [None]:
# Process feature for addresses
#
# Every address id taken from `queue` (then `subqueue`) that is not yet in
# `df` gets one feature row. Addresses taken from `queue` additionally push
# the addresses they are co-spent with onto `subqueue`; addresses taken from
# `subqueue` are not expanded any further.

# Column order of the final feature table.
FEATURE_COLUMNS = [
    'Addr',
    'Tx', 'InTx', 'OutTx',
    'BTC',
    'InBTCMin', 'InBTCMax', 'InBTCSum', 'InBTCMedian',
    'InBTCM1', 'InBTCM2', 'InBTCM3', 'InBTCM4',
    'OutBTCMin', 'OutBTCMax', 'OutBTCSum', 'OutBTCMedian',
    'OutBTCM1', 'OutBTCM2', 'OutBTCM3', 'OutBTCM4',
    'Use', 'InUse', 'OutUse',
    'Age', 'AgeMin', 'AgeMax',
    'AgeM1', 'AgeM2', 'AgeM3', 'AgeM4',
    'InAge', 'InAgeMin', 'InAgeMax',
    'InAgeM1', 'InAgeM2', 'InAgeM3', 'InAgeM4',
    'OutAge', 'OutAgeMin', 'OutAgeMax',
    'OutAgeM1', 'OutAgeM2', 'OutAgeM3', 'OutAgeM4',
    'isBech32', 'isP2PKH', 'isP2SH',
]


def _fetch_value(sql, params):
    """Run *sql* on the shared cursor and return the first column of its first row."""
    cur.execute(sql, params)
    return cur.fetchone()[0]


def _fetch_series(sql, params):
    """Return the first column of every row of *sql*, or ``[0]`` when there are no rows.

    The ``[0]`` fallback keeps min/max/mean/moment defined for addresses
    without any matching row.
    """
    values = [row[0] for row in cur.execute(sql, params)]
    return values or [0]


def _moment_features(prefix, values):
    """Return the mean (``M1``) and the 2nd-4th ``scipy.stats.moment`` (``M2``-``M4``) of *values*."""
    return {
        f'{prefix}M1': statistics.mean(values),
        f'{prefix}M2': moment(values, moment=2),
        f'{prefix}M3': moment(values, moment=3),
        f'{prefix}M4': moment(values, moment=4),
    }


def _btc_features(prefix, values):
    """Return min/max/sum/median and moment features of an amount series."""
    features = {
        f'{prefix}Min': min(values),
        f'{prefix}Max': max(values),
        f'{prefix}Sum': sum(values),
        f'{prefix}Median': statistics.median(values),
    }
    features.update(_moment_features(prefix, values))
    return features


def _age_features(prefix, values):
    """Return span (max - min), min, max and moment features of a timestamp series."""
    oldest, newest = min(values), max(values)
    features = {
        prefix: newest - oldest,
        f'{prefix}Min': oldest,
        f'{prefix}Max': newest,
    }
    features.update(_moment_features(prefix, values))
    return features


def _address_features(addr):
    """Compute the feature row (a dict keyed by column name) for address id *addr*.

    NOTE(review): the In* features are keyed on ``Edge.src`` and the Out*
    features on ``Edge.dst``; the Edge schema is not visible here, so confirm
    that this matches the intended direction.
    """
    one = (addr,)
    both = (addr, addr)
    metric = {'Addr': addr}

    # Tx / InTx / OutTx: number of distinct transactions on the edges.
    metric['Tx'] = _fetch_value('''SELECT COUNT(DISTINCT DBUTIL.Edge.tx)
                                   FROM DBUTIL.Edge
                                   WHERE DBUTIL.Edge.src == ?
                                      OR DBUTIL.Edge.dst == ?;''', both)
    metric['InTx'] = _fetch_value('''SELECT COUNT(DISTINCT DBUTIL.Edge.tx)
                                     FROM DBUTIL.Edge
                                     WHERE DBUTIL.Edge.src == ?''', one)
    metric['OutTx'] = _fetch_value('''SELECT COUNT(DISTINCT DBUTIL.Edge.tx)
                                      FROM DBUTIL.Edge
                                      WHERE DBUTIL.Edge.dst == ?;''', one)

    # BTC: SUM() yields NULL when the address has no edge at all; use 0 then,
    # consistent with InBTCSum / OutBTCSum below.
    total_btc = _fetch_value('''SELECT SUM(DBUTIL.Edge.btc)
                                FROM DBUTIL.Edge
                                WHERE DBUTIL.Edge.src == ?
                                   OR DBUTIL.Edge.dst == ?;''', both)
    metric['BTC'] = 0 if total_btc is None else total_btc

    # InBTC / OutBTC series
    metric.update(_btc_features('InBTC', _fetch_series(
        '''SELECT DBUTIL.Edge.btc
           FROM DBUTIL.Edge
           WHERE DBUTIL.Edge.src == ?;''', one)))
    metric.update(_btc_features('OutBTC', _fetch_series(
        '''SELECT DBUTIL.Edge.btc
           FROM DBUTIL.Edge
           WHERE DBUTIL.Edge.dst == ?;''', one)))

    # Use / InUse / OutUse: number of edge rows (not de-duplicated by tx).
    metric['Use'] = _fetch_value('''SELECT COUNT(DBUTIL.Edge.tx)
                                    FROM DBUTIL.Edge
                                    WHERE DBUTIL.Edge.src == ?
                                       OR DBUTIL.Edge.dst == ?;''', both)
    metric['InUse'] = _fetch_value('''SELECT COUNT(DBUTIL.Edge.tx)
                                      FROM DBUTIL.Edge
                                      WHERE DBUTIL.Edge.src == ?''', one)
    metric['OutUse'] = _fetch_value('''SELECT COUNT(DBUTIL.Edge.tx)
                                       FROM DBUTIL.Edge
                                       WHERE DBUTIL.Edge.dst == ?;''', one)

    # Age: block times of every transaction that spends an output of the
    # address or creates an output for it.
    metric.update(_age_features('Age', _fetch_series(
        '''SELECT DBCORE.BlkTime.unixtime
           FROM DBCORE.BlkTime
           INNER JOIN DBCORE.BlkTx ON DBCORE.BlkTx.blk = DBCORE.BlkTime.blk
           WHERE DBCORE.BlkTx.tx IN
           (SELECT DBCORE.TxIn.tx
            FROM DBCORE.TxIn
            INNER JOIN DBCORE.TxOut ON DBCORE.TxOut.tx = DBCORE.TxIn.ptx AND
                                       DBCORE.TxOut.n = DBCORE.TxIn.pn
            WHERE DBCORE.TxOut.addr = ?
            UNION
            SELECT DISTINCT DBCORE.TxOut.tx
            FROM DBCORE.TxOut
            WHERE DBCORE.TxOut.addr = ?);''', both)))

    # InAge: only the transactions that spend an output of the address.
    metric.update(_age_features('InAge', _fetch_series(
        '''SELECT DBCORE.BlkTime.unixtime
           FROM DBCORE.BlkTime
           INNER JOIN DBCORE.BlkTx ON DBCORE.BlkTx.blk = DBCORE.BlkTime.blk
           WHERE DBCORE.BlkTx.tx IN
           (SELECT DBCORE.TxIn.tx
            FROM DBCORE.TxIn
            INNER JOIN DBCORE.TxOut ON DBCORE.TxOut.tx = DBCORE.TxIn.ptx AND
                                       DBCORE.TxOut.n = DBCORE.TxIn.pn
            WHERE DBCORE.TxOut.addr = ?);''', one)))

    # OutAge: only the transactions that create an output for the address.
    metric.update(_age_features('OutAge', _fetch_series(
        '''SELECT DBCORE.BlkTime.unixtime
           FROM DBCORE.BlkTime
           INNER JOIN DBCORE.BlkTx ON DBCORE.BlkTx.blk = DBCORE.BlkTime.blk
           WHERE DBCORE.BlkTx.tx IN
           (SELECT DISTINCT DBCORE.TxOut.tx
            FROM DBCORE.TxOut
            WHERE DBCORE.TxOut.addr = ?);''', one)))

    # isP2PKH / isP2SH / isBech32
    ## P2PKH which begin with the number 1, eg: 1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2.
    ## P2SH type starting with the number 3, eg: 3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy.
    ## Bech32 type starting with bc1, eg: bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq.
    cur.execute('''SELECT DBINDEX.AddrID.addr
                   FROM DBINDEX.AddrID
                   WHERE DBINDEX.AddrID.id = ?''', one)
    found = cur.fetchone()
    # An id without an AddrID row gets all three flags set to 0 instead of
    # crashing the whole run.
    addr_str = found[0] if found is not None else ''
    metric['isP2PKH'] = int(addr_str.startswith('1'))
    metric['isP2SH'] = int(addr_str.startswith('3'))
    metric['isBech32'] = int(addr_str.startswith('bc1'))
    return metric


def _co_spent_addresses(addr):
    """Return the address ids whose outputs are spent by a transaction that also spends an output of *addr*.

    The result is grouped by address, so each id appears once; *addr* itself
    is part of the result whenever it has spent anything.
    """
    return [row[0] for row in cur.execute(
        '''SELECT DBCORE.TxOut.addr
           FROM DBCORE.TxIn
           INNER JOIN DBCORE.TxOut ON DBCORE.TxIn.ptx = DBCORE.TxOut.tx
                  AND DBCORE.TxIn.pn = DBCORE.TxOut.n
           WHERE DBCORE.txIn.tx IN (
            SELECT DBCORE.TxIn.tx
            FROM DBCORE.TxIn
            INNER JOIN DBCORE.TxOut ON DBCORE.TxIn.ptx = DBCORE.TxOut.tx
                   AND DBCORE.TxIn.pn = DBCORE.TxOut.n
            WHERE DBCORE.TxOut.addr = ?)
           GROUP BY DBCORE.TxOut.addr;''', (addr,))]


# Address ids that already have a row, kept in a set so the membership test
# does not scan the whole cached column for every address.
seen = set(df['Addr']) if 'Addr' in df.columns else set()
new_rows = []

try:
    while queue or subqueue:
        # Tagged addresses are served first; only they get expanded.
        fill = bool(queue)
        addr = queue.popleft() if fill else subqueue.popleft()
        if addr in seen:
            continue

        new_rows.append(_address_features(addr))
        seen.add(addr)

        if fill:
            # Addresses that already have a row would be skipped when popped,
            # so they are not queued at all.
            subqueue.extend(a for a in _co_spent_addresses(addr) if a not in seen)
        print(f'Left: {len(queue)}, {len(subqueue)}', end='\r')
finally:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # per row; build the new rows in one go instead. Running this in `finally`
    # keeps the rows computed so far when the loop is interrupted.
    if new_rows:
        fresh = pd.DataFrame(new_rows)
        df = fresh if df.empty else pd.concat([df, fresh], ignore_index=True)

# reindex orders the columns like df[FEATURE_COLUMNS] does, but also works
# when no row has been produced yet.
df = df.reindex(columns=FEATURE_COLUMNS)
df

In [None]:
# Persist the feature table to the cache file so a later run can resume from
# it, then release the connection and the databases attached to it.
df.to_pickle(cachepath)
conn.close()