In [2]:
import requests
import pandas as pd
import db_connection as db
import pickle
from concurrent.futures import ThreadPoolExecutor
import threading

from random import sample

from pyspark.sql import SparkSession

In [2]:
# Connection settings for the two databases, taken from the project's db_connection module.
db_config = db.config_sql
db_config_train = db.config_train_sql

# One Spark session serves both JDBC reads below.
spark = (
    SparkSession.builder
    .appName("process_tx")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "32g")
    .getOrCreate()
)

# --- First database: load the Addresses table ---
url = "jdbc:mariadb://{host}:{port}/{database}".format(
    host=db_config['host'],
    port=db_config['port'],
    database=db_config['database'],
)
user = db_config['user']
password = db_config['password']
address_df = (
    spark.read.format('jdbc')
    .option('url', url)
    .option('dbtable', 'Addresses')
    .option('user', user)
    .option('password', password)
    .load()
)

# --- Second ("train") database: same table, separate connection settings ---
url_train = "jdbc:mariadb://{host}:{port}/{database}".format(
    host=db_config_train['host'],
    port=db_config_train['port'],
    database=db_config_train['database'],
)
user_train = db_config_train['user']
password_train = db_config_train['password']
address_df_train = (
    spark.read.format('jdbc')
    .option('url', url_train)
    .option('dbtable', 'Addresses')
    .option('user', user_train)
    .option('password', password_train)
    .load()
)

23/06/28 14:26:23 WARN Utils: Your hostname, NatRng-MBP.local resolves to a loopback address: 127.0.0.1; using 192.168.222.50 instead (on interface en0)
23/06/28 14:26:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/28 14:26:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/28 14:26:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/06/28 14:26:25 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Stack the rows read from both databases into a single DataFrame (duplicates are kept).
# NOTE(review): DataFrame.union matches columns by position, not by name — confirm both
# Addresses tables have the same column order (otherwise consider unionByName).
all_addresses_df = address_df.union(address_df_train)

In [5]:
# Assuming the column name is 'address'
# De-duplicate inside Spark so that only the unique addresses are shipped to the driver,
# instead of collecting every row and de-duplicating in Python afterwards.
unique_address_rows = all_addresses_df.select('address').distinct().collect()
address_list = [row['address'] for row in unique_address_rows]
# Nothing below this cell uses Spark, so shut the session down here.
spark.stop()

In [6]:
# Cache the address list on disk as a pickle file.
address_list_path = 'data/pickle_files/address_list.pkl'
with open(address_list_path, 'wb') as pickle_file:
    pickle.dump(address_list, pickle_file)

In [6]:
# Reload the cached address list from disk and report how many addresses it holds.
# NOTE: pickle.load can execute arbitrary code — only use it on files this project produced.
with open('data/pickle_files/address_list.pkl', 'rb') as pickle_file:
    address_list = pickle.load(pickle_file)
address_count = len(address_list)
print(address_count)

10876289


In [13]:
# Maximum number of addresses per sublist
chunk_size = 1000
# Start offset of every sublist within address_list
chunk_starts = range(0, len(address_list), chunk_size)
# Consecutive slices of at most `chunk_size` addresses each
sublists = [address_list[start:start + chunk_size] for start in chunk_starts]

# Number of sublists produced
len(sublists)

10877

In [41]:
def process_addresses(session, addresses, base_url=None, results=None, results_lock=None, timeout=30):
    """Query the check API for each address and collect the returned records.

    For every address a GET request is sent to ``base_url + address``. Addresses
    whose request fails, returns a non-200 status, returns a body that is not
    valid JSON, or returns an empty payload are skipped (failures are printed).
    When the payload is non-empty but none of its records carries the queried
    address, a synthetic record tagged ``"Tainted"`` is appended for that address.
    All records are then appended to ``results`` while holding ``results_lock``.

    Args:
        session: Object with a ``get(url, timeout=...)`` method, e.g. a
            ``requests.Session``.
        addresses: Iterable of address strings to look up.
        base_url: Endpoint prefix the address is appended to. Defaults to the
            notebook-level ``url``.
        results: List that receives the records. Defaults to the notebook-level
            ``scams_list``.
        results_lock: Lock guarding ``results``. Defaults to the notebook-level
            ``lock``.
        timeout: Timeout in seconds passed to ``session.get`` so a stalled
            connection cannot block a worker thread forever.

    Returns:
        None. Output is delivered through ``results``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if base_url is None:
        base_url = url
    if results is None:
        results = scams_list
    if results_lock is None:
        results_lock = lock

    for address in addresses:
        try:
            response = session.get(base_url + address, timeout=timeout)
        except requests.RequestException as exc:
            # A network failure for one address must not abort the rest of the chunk.
            print(f"Error: Request failed for address {address}: {exc}")
            continue

        # Check the status before parsing: error responses are not guaranteed to be JSON.
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code} for address {address}")
            continue

        try:
            response_json = response.json()
        except ValueError:
            # The JSON decode errors raised by requests are ValueError subclasses.
            print(f"Error: Invalid JSON in response for address {address}")
            continue

        if not response_json:
            continue

        # NOTE(review): this is an exact, case-sensitive match — confirm the API echoes
        # addresses in the same casing that was used in the query.
        returned_addresses = {record['address'] for record in response_json}
        if address not in returned_addresses:
            tainted = {
                "address": address,
                "tag": "Tainted",
                "type": "Tainted",
                "txhash": None,
                "direction": None,
                "timestamp": None,
                "subtype": None
            }
            response_json.append(tainted)

        # The results list is shared between worker threads.
        with results_lock:
            results.extend(response_json)

# Endpoint prefix; process_addresses appends the address directly to it.
# NOTE: this rebinds the notebook-level `url` that an earlier cell used for the JDBC connection string.
url = 'https://app.hoptrail.io/api/eth/check/'
eth_address = address_list
# Chunk size
chunk_size = 1000

# Create chunks
address_chunks = [eth_address[i:i+chunk_size] for i in range(0, len(eth_address), chunk_size)]

# Shared accumulator filled by the worker threads, plus the lock that guards it.
scams_list = []
lock = threading.Lock()

# Use ThreadPoolExecutor to send requests in parallel (only the first 200 chunks are processed).
with ThreadPoolExecutor(max_workers=10) as executor, requests.Session() as session:
    futures = [
        executor.submit(process_addresses, session, addresses)
        for addresses in address_chunks[0:200]
    ]
    # Wait for every chunk inside the `with` body: the contexts exit in reverse order, so
    # leaving early would close the session while workers are still using it. Inspecting
    # each future also surfaces worker exceptions that would otherwise be dropped silently.
    for chunk_index, future in enumerate(futures):
        error = future.exception()
        if error is not None:
            print(f"Error: Chunk {chunk_index} failed: {error!r}")


KeyboardInterrupt: 

In [43]:
# One row per record collected in scams_list.
scam_df = pd.DataFrame(scams_list)
# Rows tagged 'Tainted', kept with all of their columns.
tainted_df = scam_df[scam_df['tag'] == 'Tainted']
# Keep only the labelling columns and store the de-duplicated frame. Previously the
# de-duplicated result was only displayed and then discarded, so the frame persisted
# by the following cell did not match the table shown here.
scam_df = (
    scam_df[['address', 'tag', 'type', 'subtype']]
    .drop_duplicates()
    .reset_index(drop=True)
)
scam_df

Unnamed: 0,address,tag,type,subtype
0,0x3af4788ade7a75b90c0d8f59017f3ef56b51b84e,Phishing Scam,Scam,Phishing
1,0x54a6ab0e9ca7a4a0ede6651ac4eb6359514d5f3c,Tainted,Tainted,
2,0x34278f6f40079eae344cbac61a764bcf85afc949,Fake_Phishing4953,Scam,
3,0xD13017C013ae2eb708C4FCDB70B20D16ba3B64a9,os20.io (Opensea),Scam,
4,0x10cc08c32744761a78782c3879603abc478487df,Tainted,Tainted,
...,...,...,...,...
2265,0xd22886ce03dfd879e452cd1991c31d2701f664fa,Tainted,Tainted,
2266,0xc3746825f13c07dcd7e6fdb9c0c80a9affb18952,Tainted,Tainted,
2267,0xcdcfd7af70d21b83332582ab8f274f4ef0942887,Tainted,Tainted,
2268,0xf635c8996284dcba97e59c7752be51d6cef9d86f,Tainted,Tainted,


In [38]:
# Persist the lookup results as a parquet file.
# The "0_200" suffix presumably refers to the address_chunks[0:200] slice that was queried — keep them in sync.
scam_df.to_parquet('data/parquet_files/scam_df_sample_0_200.parquet')

In [4]:
# Load the saved lookup results, de-duplicate and renumber them, then drop the
# rows tagged 'Tainted' (the index keeps its post-deduplication numbering).
scams = (
    pd.read_parquet('data/parquet_files/scam_df_sample_0_200.parquet')
    .drop_duplicates()
    .reset_index(drop=True)
    .loc[lambda frame: frame['tag'] != 'Tainted']
)
scams

Unnamed: 0,address,tag,type,subtype
0,0x3af4788ade7a75b90c0d8f59017f3ef56b51b84e,Phishing Scam,Scam,Phishing
2,0x34278f6f40079eae344cbac61a764bcf85afc949,Fake_Phishing4953,Scam,
3,0xD13017C013ae2eb708C4FCDB70B20D16ba3B64a9,os20.io (Opensea),Scam,
6,0xab8674084c581f549681541ab18187a67ef1aa60,Fake_Phishing5087,Scam,
9,0xfd1edc29754b65f1f45b885589c90811054be7a9,Fake_Phishing2513,Scam,
...,...,...,...,...
2234,0xfa71e31efa184e3948c486e7f411707b4da290fb,Fake_Phishing6285,Scam,
2245,0x86c620ab6ece77634e5a41ea530bfcc3a23f032b,Fake_Phishing5313,Scam,
2249,0x7f48a221989661b7bf5a961c6620554a2a3b781c,Fake_Phishing4161,Scam,
2259,0x3a577dccfaeb49c18e10c6e750b316aa02e5c7f7,Fake_Phishing817,Scam,
