# Analyzing DeFi Data on Ethereum Blockchain to Understand Behaviors
## Preprocessing

#### Timestamp conversion for Spark

In [None]:
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd

# Source and destination of the conversion.
file_path = '../data/dataset/market.parquet'
output_path = '../data/dataset/market_converted.parquet'

# Read the whole Parquet file through pyarrow and hand it over to pandas.
parquet_file = pq.ParquetFile(file_path)
df = parquet_file.read().to_pandas()

# Every timedelta or datetime column is rewritten as plain strings.
temporal_columns = [
    column
    for column in df.columns
    if pd.api.types.is_timedelta64_dtype(df[column])
    or pd.api.types.is_datetime64_any_dtype(df[column])
]
for column in temporal_columns:
    df[column] = df[column].astype('str')

df.to_parquet(output_path, engine='pyarrow')

print(f'Le fichier a été converti avec succès et sauvegardé sous {output_path}')

In [None]:
import pandas as pd

def load_data(list, base_path='../data/dataset'):
    """
    Read one Parquet file per requested name and return them keyed by that name.

    Each file is looked up as ``<base_path>/<name>.parquet``. The column names
    of every loaded frame are printed as they are read.

    NOTE: the parameter name ``list`` shadows the built-in; it is kept as-is so
    existing callers (including keyword callers) keep working.
    """
    loaded = {}
    for name in list:
        frame = pd.read_parquet(f"{base_path}/{name}.parquet")
        print(f"DataFrame '{name}' contains columns:\n{frame.columns.tolist()}\n")
        loaded[name] = frame
    return loaded

# Load the 'users' and 'market' tables into a dict keyed by table name.
dataframes = load_data(['users', 'market'])

# Print the first five rows of every loaded table under a small banner.
for key, df in dataframes.items():
    preview = df.head(5)
    print(f"Dataframe {key}:\n=============================================\n")
    print(f"{preview}\n")

In [4]:
def clean_column_names(df):
    """
    Normalise the column labels of ``df`` in place and return the same frame.

    Each label is trimmed of surrounding whitespace, lowercased, and has its
    remaining spaces replaced with underscores.
    """
    trimmed = df.columns.str.strip()
    lowered = trimmed.str.lower()
    df.columns = lowered.str.replace(' ', '_')
    return df

## 1) Extraction of protocol types/names and count

In [None]:
import json

# Work on private copies so the frames stored in `dataframes` are left untouched.
users, market = (dataframes[name].copy() for name in ('users', 'market'))

#---------------------------------------------- Protocol Types

def parse_protocols(protocol_column):
    """
    Parse one cell holding a JSON object into a dictionary.

    Returns the decoded dictionary when the cell is a JSON document whose top
    level is an object. A cell that is already a dictionary is returned as-is.
    Anything else yields an empty dictionary: invalid JSON, JSON whose top
    level is not an object, or a non-text value such as ``None`` or NaN.
    """
    if isinstance(protocol_column, dict):
        return protocol_column

    try:
        protocols = json.loads(protocol_column)
    except (TypeError, ValueError):
        # json.loads raises TypeError for non-text input (None, float NaN, ...)
        # and JSONDecodeError, a ValueError subclass, for malformed text.
        return {}

    return protocols if isinstance(protocols, dict) else {}

def process_user_protocols(users):
    """
    Replace the raw 'protocol_types' column with one 'type_<protocol>' column per known protocol type.

    The value of each new column is whatever the parsed 'protocol_types'
    dictionary holds for that protocol type, or 0 when the type is absent.
    The 'protocol_types' column and the temporary parsed column are dropped
    from the returned frame.
    """
    protocol_names = ['DEX', 'Lending', 'Stablecoin', "Yield Farming", "NFT-Fi"]

    # `assign` returns a new frame, so the column writes below do not touch
    # the frame the caller passed in.
    zero_filled = {'type_' + name: 0 for name in protocol_names}
    users = users.assign(**zero_filled)

    users['parsed_protocols'] = users['protocol_types'].apply(parse_protocols)

    for name in protocol_names:
        users[f'type_{name}'] = users['parsed_protocols'].apply(
            lambda parsed: parsed.get(name, 0)
        )

    return users.drop(columns=['protocol_types', 'parsed_protocols'])

# Swap the raw 'protocol_types' column for the per-type 'type_*' columns.
users = process_user_protocols(users)

#---------------------------------------------- Protocols Used

def _parse_protocols_used(value):
    """Turn one cell of the protocols column into a Python object without executing code."""
    import ast  # local import: keeps this block self-contained

    if not isinstance(value, str):
        return value
    try:
        # NOTE(review): this replaces a bare `eval`, which would execute any
        # expression stored in the data file. literal_eval only accepts
        # Python literals (dicts, lists, strings, numbers, ...).
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # JSON-only spellings (true / false / null) are not Python literals.
        return json.loads(value)


def transform_protocols_column(df, column_name='protocols_used'):
    """
    Expand the protocols column into one '<protocol>_count' column per protocol.

    Each cell of ``column_name`` is expected to be a dictionary, or the text
    form of one, mapping a protocol name to a dictionary that may carry a
    'count' entry. For every protocol name encountered a '<protocol>_count'
    column is added, holding ``int(count)`` for the rows that mention the
    protocol and NaN for the rows that do not. A cell that is not a dictionary
    (for example a missing value) contributes no counts.

    Parameters:
        df: input frame; it is not modified.
        column_name: name of the column holding the per-protocol dictionaries.

    Returns:
        A new frame without ``column_name`` and with the count columns
        appended in order of first appearance.

    Raises:
        ValueError: when a text cell is neither a Python literal nor valid JSON.
    """
    parsed = df[column_name].apply(_parse_protocols_used)

    per_row_counts = [
        {
            f'{protocol_name}_count': int(protocol_data.get('count', 0))
            for protocol_name, protocol_data in protocols.items()
        }
        if isinstance(protocols, dict)
        else {}
        for protocols in parsed
    ]
    counts = pd.DataFrame(per_row_counts, index=df.index)

    result = df.drop(columns=[column_name])
    for count_column in counts.columns:
        # Positional assignment: rows are matched by order, so duplicated
        # index labels cannot spill one row's count into another row.
        result[count_column] = counts[count_column].to_numpy()

    return result

# Expand the protocols column into '<protocol>_count' columns.
users = transform_protocols_column(users)

#---------------------------------------------- Cleaning

# Normalise the header spelling, then replace every missing value with 0.
users = clean_column_names(users)
users = users.fillna(0)

protocols_counts = [
    'curve_dao_count',
    'aave_count',
    'tether_count',
    'uniswap_count',
    'maker_count',
    'yearn.finance_count',
    'usdc_count',
    'dai_count',
    'balancer_count',
    'harvest_finance_count',
]
# Cast the listed count columns to integers.
users = users.astype({name: int for name in protocols_counts})

print(users.columns)
users.head(10)

## 2) Extract transactions data

In [None]:
def extract_transactions(row):
    """
    Expand one user row into a list of flat records, one per transaction.

    ``row['transactions']`` is decoded from a JSON string into a list of
    transaction dictionaries. Every record repeats the user-level fields of
    ``row`` and adds the transaction's own fields under ``tx_*`` keys.
    """
    # User-level fields copied unchanged into every record, in output order.
    carried_fields = [
        'address',
        'first_seen',
        'last_seen',
        'received_count',
        'total_received_(eth)',
        'sent_count',
        'total_sent_(eth)',
        'transactions',
        'type_dex',
        'type_lending',
        'type_stablecoin',
        'type_yield_farming',
        'type_nft-fi',
        'curve_dao_count',
        'aave_count',
        'tether_count',
        'uniswap_count',
        'maker_count',
        'yearn.finance_count',
        'usdc_count',
        'dai_count',
        'balancer_count',
        'harvest_finance_count',
    ]
    # (output key, key inside the transaction dictionary), in output order.
    transaction_fields = [
        ('tx_timestamp', 'timestamp'),
        ('tx_protocol', 'protocol_name'),
        ('tx_value_(eth)', 'value (ETH)'),
        ('tx_is_sender', 'is_sender'),
        ('tx_gas_used', 'gas_used'),
    ]

    # Decode the JSON string into a list of dictionaries.
    transactions = json.loads(row['transactions'])

    records = []
    for txn in transactions:
        record = {field: row[field] for field in carried_fields}
        for target, source in transaction_fields:
            record[target] = txn[source]
        records.append(record)
    return records

# One list of flat records per user row.
expanded_rows = users.apply(extract_transactions, axis=1)

# Chain the per-user lists into a single table with one row per transaction.
flat_records = []
for sublist in expanded_rows:
    flat_records.extend(sublist)
users = pd.DataFrame(flat_records)

# The raw JSON string is no longer needed once it has been expanded.
users = users.drop(columns=['transactions'])
print(users.columns)
users.head(5)

In [None]:
# Show the column labels of the market table.
print(market.columns)

In [None]:
# Print the distinct protocol names of both tables: 'tx_protocol' in users
# and 'protocol_name' in market are the columns later matched against each other.
print(users['tx_protocol'].unique())
print(market['protocol_name'].unique())

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

def merge_users_with_market(df_users, df_market, batch_size=1000):
    """
    Attach to every user transaction the market row closest in time for the same protocol.

    For each row of ``df_users`` the rows of ``df_market`` whose
    'protocol_name' equals the row's 'tx_protocol' are searched for the one
    whose 'timestamp' has the smallest absolute distance to 'tx_timestamp'
    (the first such row on ties). All market columns except 'timestamp' and
    'protocol_name' are copied from that row. Rows whose protocol has no
    market data keep real missing values (NaN) in those columns, so a later
    ``fillna`` can fill them.

    Parameters:
        df_users: transaction-level frame with 'tx_timestamp' and 'tx_protocol'.
        df_market: market frame with 'timestamp' and 'protocol_name'.
        batch_size: number of user rows handled per progress-bar step; it does
            not change the result.

    Returns:
        A copy of ``df_users`` extended with the market columns.

    Side effects:
        'tx_timestamp' of ``df_users`` and 'timestamp' of ``df_market`` are
        converted to datetimes in place on the frames that were passed in.
    """
    df_users['tx_timestamp'] = pd.to_datetime(df_users['tx_timestamp'])
    df_market['timestamp'] = pd.to_datetime(df_market['timestamp'])

    market_columns = [col for col in df_market.columns if col not in ['timestamp', 'protocol_name']]

    merged_df = df_users.copy()
    for col in market_columns:
        # Object dtype lets values of any type be written cell by cell, while
        # unmatched rows stay genuinely missing rather than the text 'nan'.
        merged_df[col] = pd.Series(np.nan, index=merged_df.index, dtype=object)

    # Split the market table once instead of extracting and copying the
    # protocol's group again for every single user row.
    market_by_protocol = {
        protocol: group for protocol, group in df_market.groupby('protocol_name')
    }

    for batch_start in tqdm(range(0, len(df_users), batch_size), desc="Processing users", unit="lot"):
        batch_end = min(batch_start + batch_size, len(df_users))
        batch = df_users.iloc[batch_start:batch_end]

        for idx, user_row in batch.iterrows():
            protocol_market = market_by_protocol.get(user_row['tx_protocol'])
            if protocol_market is None:
                continue

            time_diff = (protocol_market['timestamp'] - user_row['tx_timestamp']).abs()
            closest_market_row = protocol_market.loc[time_diff.idxmin()]

            for col in market_columns:
                merged_df.at[idx, col] = closest_market_row[col]

    return merged_df

# Join users with market data, then normalise the column labels of the result.
data = clean_column_names(merge_users_with_market(users, market))

# Replace missing values with 0 before persisting.
data = data.fillna(0)

data.to_parquet('../data/data.parquet', engine='pyarrow')

print(data.columns)
data.head(5)