In [2]:
import concurrent.futures
import os
import random
import re

import numpy as np
import pandas as pd
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from web3 import Web3
# choose a Provider of your choice

import db_connection as db_conn
from alchemy_api import AlchemyApi

In [18]:
# Project-local Alchemy helper; later cells read its API keys and base URL
# through get_api_keys() and get_api_url().
alchemy = AlchemyApi()

In [154]:
# Load the scam dataset CSV and print how many rows it contains.
scam_df = pd.read_csv('data/hoptrail_scam_dataset.csv')
print(len(scam_df))

8473


In [34]:
# Extract the 'address' column of the scam dataset as a plain Python list so
# the classification cells can iterate over it.
scam_addresses = scam_df['address'].tolist()

In [None]:
def api_key_rotation(api_keys):
    """Return one entry of ``api_keys`` picked uniformly at random.

    Raises ValueError when ``api_keys`` is empty.
    """
    return api_keys[random.randrange(len(api_keys))]

# Serial classification: an address for which eth_getCode returns no bytecode
# is recorded as a user address, anything else as a contract address.
user_addresses = []
contract_addresses = []

for address in scam_addresses:
    check_sum_address = Web3.to_checksum_address(address)
    api_key = api_key_rotation(alchemy.get_api_keys())
    alchemy_url = alchemy.get_api_url() + api_key
    w3 = Web3(Web3.HTTPProvider(alchemy_url))
    response = w3.eth.get_code(check_sum_address)
    # Check the byte length instead of comparing response.hex() with '0x':
    # newer hexbytes releases return the hex string without the '0x' prefix,
    # which would make the string comparison False for every address.
    if len(response) == 0:
        print('User address: ', address)
        user_addresses.append(address)
    else:
        print('Contract address: ', address)
        contract_addresses.append(address)


In [45]:
def get_address_type(address, api_keys, alchemy_url):
    """Tell whether ``address`` is a user address (no deployed bytecode).

    Args:
        address: Ethereum address string; it is converted to checksum form
            before the RPC call.
        api_keys: sequence of API keys; one is picked at random per call.
        alchemy_url: base provider URL to which the chosen key is appended.

    Returns:
        True when eth_getCode returns empty bytes for the address, False when
        bytecode is present (i.e. the address is a contract).
    """
    check_sum_address = Web3.to_checksum_address(address)
    full_url = alchemy_url + api_key_rotation(api_keys)
    w3 = Web3(Web3.HTTPProvider(full_url))
    code = w3.eth.get_code(check_sum_address)
    # Check the byte length instead of comparing code.hex() with '0x': newer
    # hexbytes releases return the hex string without the '0x' prefix, which
    # would make the string comparison False for every address.
    return len(code) == 0

def api_key_rotation(api_keys):
    """Pick one API key uniformly at random from ``api_keys``.

    Raises ValueError when ``api_keys`` is empty.
    """
    last_position = len(api_keys) - 1
    chosen_position = random.randint(0, last_position)
    return api_keys[chosen_position]

# Classify every scam address with a small thread pool; results are sorted
# into the two lists in completion order, not submission order.
user_addresses, contract_addresses = [], []
api_keys = alchemy.get_api_keys()
alchemy_url = alchemy.get_api_url()

with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    # Map each submitted future back to the address it is classifying.
    future_to_address = {}
    for address in scam_addresses:
        submitted = executor.submit(get_address_type, address, api_keys, alchemy_url)
        future_to_address[submitted] = address

    for future in concurrent.futures.as_completed(future_to_address):
        address = future_to_address[future]
        is_user_address = future.result()
        destination = user_addresses if is_user_address else contract_addresses
        destination.append(address)

In [46]:
# Report how many addresses ended up in each of the two classification lists.
print('User addresses: ', len(user_addresses))
print('Contract addresses: ', len(contract_addresses))

User addresses:  7839
Contract addresses:  634


In [49]:
# save to pickle
import pickle

# Write each classification list to its own pickle file.
for pickle_path, addresses_to_save in (
    ('data/pickle_files/user_addresses.pkl', user_addresses),
    ('data/pickle_files/contract_addresses.pkl', contract_addresses),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(addresses_to_save, f)

In [125]:
# Reload the contract addresses from the pickle file so the cells below can
# run without repeating the RPC classification.
contract_addresses = pd.read_pickle('data/pickle_files/contract_addresses.pkl')

In [137]:
def chunk_list(input_list, chunk_size):
    """Lazily yield consecutive slices of ``input_list``, each at most ``chunk_size`` long.

    The final slice may be shorter when the length is not a multiple of
    ``chunk_size``.
    """
    yield from (
        input_list[offset:offset + chunk_size]
        for offset in range(0, len(input_list), chunk_size)
    )

# Look up the creator of every scam contract through Etherscan's
# `getcontractcreation` action, querying the addresses in batches of 5.
contract_addresses_chunks = list(chunk_list(contract_addresses, 5))

etherscan_url = 'https://api.etherscan.io/api'
# API keys are read from the environment instead of being stored in the
# notebook: set ETHERSCAN_API_KEYS to a comma-separated list of keys.
ethscan_keys = [key.strip() for key in os.environ.get('ETHERSCAN_API_KEYS', '').split(',') if key.strip()]
if not ethscan_keys:
    raise RuntimeError(
        'Set the ETHERSCAN_API_KEYS environment variable to a comma-separated '
        'list of Etherscan API keys.'
    )
contract_creators = []
# The headers do not change between requests, so build them once.
headers = {'content-type': 'application/json', 'content-encoding': 'gzip', 'charset': 'utf-8'}

for address_chunk in contract_addresses_chunks:
    etherscan_params = {
        'module': 'contract',
        'action': 'getcontractcreation',
        'contractaddresses': ",".join(address_chunk),
        # A key is picked at random per request to spread the load over all keys.
        'apikey': random.choice(ethscan_keys),
    }

    # The query string is still assembled by hand so the comma-separated
    # address list is sent verbatim, but into a per-request variable: the base
    # URL is never mutated, so a failed request cannot leave it polluted.
    query_string = '&'.join(f'{name}={value}' for name, value in etherscan_params.items())
    request_url = f'{etherscan_url}?{query_string}'

    response = requests.get(request_url, headers=headers, timeout=30)
    response.raise_for_status()
    response_json = response.json()

    results = response_json.get('result')
    if not isinstance(results, list):
        # Error payloads carry a non-list `result`; fail with the API's own
        # message instead of an opaque TypeError further down.
        raise RuntimeError(
            f"Etherscan request failed for {address_chunk}: "
            f"{response_json.get('message')!r} / {results!r}"
        )

    for result in results:
        creator_details = {'contract_creator': result['contractCreator'], 'contract_address': result['contractAddress']}
        contract_creators.append(creator_details)


In [139]:
# Build one row per creator, holding the list of contracts that creator
# deployed, and store the table as parquet.
contract_creators_df = (
    pd.DataFrame(contract_creators)
    .groupby('contract_creator')['contract_address']
    .agg(list)
    .reset_index()
)
contract_creators_df.to_parquet('data/parquet_files/contract_creators.parquet')

In [189]:
# Start from the saved user addresses and add the creators of the scam
# contracts; the last expression displays the combined length.
scam_users = pd.read_pickle('data/pickle_files/user_addresses.pkl')
contract_creators_df = pd.read_parquet('data/parquet_files/contract_creators.parquet')
scam_contract_creators = contract_creators_df['contract_creator'].tolist()

scam_users.extend(scam_contract_creators)
len(scam_users)

8385

In [148]:
# Connection settings for the two databases read in the next cell.
db_con_test = db_conn.config_sql
db_con_train = db_conn.config_train_sql

# Create (or reuse) the Spark session with explicit driver/executor memory.
spark = (
    SparkSession.builder
    .appName("process_tx")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "32g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/04 22:39:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [155]:
def _load_addresses_table(db_config):
    """Read the ``Addresses`` table of one MariaDB database through Spark JDBC.

    Args:
        db_config: mapping with the 'host', 'port', 'database', 'user' and
            'password' entries of the database to read.

    Returns:
        The Spark DataFrame produced by the JDBC reader for ``Addresses``.
    """
    jdbc_url = f"jdbc:mariadb://{db_config['host']}:{db_config['port']}/{db_config['database']}"
    return (
        spark.read.format('jdbc')
        .options(
            url=jdbc_url,
            dbtable='Addresses',
            user=db_config['user'],
            password=db_config['password'],
        )
        .load()
    )


# One helper call per database replaces the copy-pasted blocks and keeps the
# credentials out of notebook-level variables.
address_test_df = _load_addresses_table(db_con_test)
address_train_df = _load_addresses_table(db_con_train)

In [156]:
# Keep only the rows whose 'address' value appears in scam_users; the same
# predicate is applied to both tables.
is_scam_user = col('address').isin(scam_users)
train_matches = address_train_df.where(is_scam_user)
test_matches = address_test_df.where(is_scam_user)

In [161]:
def _collect_address_column(matches_df):
    """Collect ``matches_df`` and return the 'address' value of each row as a list."""
    return [row['address'] for row in matches_df.collect()]


train_addresses = _collect_address_column(train_matches)
test_addresses = _collect_address_column(test_matches)
# Union of both result sets, with duplicates removed.
scam_addresses_in_train_test = list(set(train_addresses) | set(test_addresses))

                                                                                

In [164]:
# Write the scam addresses found in the train/test tables to a pickle file.
with open('data/pickle_files/scam_addresses_in_train_test.pkl', 'wb') as f:
    pickle.dump(scam_addresses_in_train_test, f)

In [190]:
# Keep only the rows flagged as scams (FLAG == 1) in the transaction dataset
# and extract their addresses.
original_scam_df = pd.read_csv('data/transaction_dataset.csv').loc[
    lambda frame: frame['FLAG'] == 1
]
original_scam_addresses = original_scam_df['Address'].tolist()

In [191]:
# Split the original scam addresses into those already present in scam_users
# and those missing from it. Membership is tested against a set so each lookup
# is O(1) rather than a scan of the whole scam_users list.
scam_user_lookup = set(scam_users)
filtered_original_scams = [scam for scam in original_scam_addresses if scam in scam_user_lookup]
filtered_missing_scams = [scam for scam in original_scam_addresses if scam not in scam_user_lookup]

print('Original scams: ', len(original_scam_addresses))
print('Original scams in scam_users: ', len(filtered_original_scams))
print('Filtered missing scams from scam_users: ', len(filtered_missing_scams))

Original scams:  2179
Original scams in scam_users:  1928
Filtered missing scams from scam_users:  251


In [192]:
# Append the original-dataset scams that were not yet in scam_users and show
# the new length.
# NOTE(review): `+=` extends the list in place, so running this cell again
# without first re-running the cell that loads scam_users appends the same
# entries a second time.
scam_users += filtered_missing_scams
len(scam_users)

8636

In [None]:
def is_valid_ethereum_address(address):
    """Return True when ``address`` is '0x' followed by exactly 40 hex digits.

    Non-string values (for example None or NaN) are reported as invalid
    instead of raising. ``re.fullmatch`` is used because ``re.match`` with a
    trailing ``$`` also accepts a string that ends in a newline.
    """
    if not isinstance(address, str):
        return False
    return re.fullmatch('0x[a-fA-F0-9]{40}', address) is not None

def is_valid_ethereum_tx_hash(tx_hash):
    """Return True when ``tx_hash`` is '0x' followed by exactly 64 hex digits.

    Non-string values (for example None or NaN) are reported as invalid
    instead of raising. ``re.fullmatch`` is used because ``re.match`` with a
    trailing ``$`` also accepts a string that ends in a newline.
    """
    if not isinstance(tx_hash, str):
        return False
    return re.fullmatch('0x[a-fA-F0-9]{64}', tx_hash) is not None

# Collect every entry that is not a well-formed address, or that has the shape
# of a transaction hash. The set comprehension de-duplicates as it goes, so no
# entry is recorded twice.
invalid_addresses = list({
    address
    for address in scam_users
    if not is_valid_ethereum_address(address) or is_valid_ethereum_tx_hash(address)
})

# Remove the invalid entries from scam_users; membership is tested against a
# set so the filter does not rescan the invalid list for every address.
invalid_address_lookup = set(invalid_addresses)
scam_users = [address for address in scam_users if address not in invalid_address_lookup]

In [187]:
# Write the scam_users list to a pickle file.
with open('data/pickle_files/scam_users_and_contract_creators.pkl', 'wb') as f:
    pickle.dump(scam_users, f)