# Import/Install packages

In [30]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import math
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold

from numpy import mean, std
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from collections import Counter

# Input path

In [27]:
# Directory that holds the input CSV files.
# Set the NOTEBOOK_INPUT_PATH environment variable to point the notebook at a
# different location; when it is unset, the original local path is used so
# existing runs behave exactly as before.
input_path = os.environ.get(
    "NOTEBOOK_INPUT_PATH",
    "/Users/paritoshgupta/Desktop/nlp-notebooks/input/",
)

# Read input data

In [28]:
# Build the full path to the transactions CSV, then load it into a DataFrame.
transactions_csv = os.path.join(input_path, 'transaction_dataset.csv')
dataset = pd.read_csv(transactions_csv)

# Report (rows, columns) and preview the first rows as the cell output.
print(dataset.shape)
dataset.head()

(9841, 51)


Unnamed: 0.1,Unnamed: 0,Index,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,...,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name,ERC20 most sent token type,ERC20_most_rec_token_type
0,0,1,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,844.26,1093.71,704785.63,721,89,0,...,0.0,16831000.0,271779.92,0.0,0.0,0.0,39.0,57.0,Cofoundit,Numeraire
1,1,2,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,...,2.260809,2.260809,2.260809,0.0,0.0,0.0,1.0,7.0,Livepeer Token,Livepeer Token
2,2,3,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,,XENON
3,3,4,0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e,0,10219.6,15785.09,397555.9,25,9,0,...,100.0,9029.231,3804.076893,0.0,0.0,0.0,1.0,11.0,Raiden,XENON
4,4,5,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,...,0.0,45000.0,13726.65922,0.0,0.0,0.0,6.0,27.0,StatusNetwork,EOS


## Data Dictionary

    --Index: the index number of a row
    --Address: the address of the ethereum account
    --FLAG: whether the transaction is fraud or not
    --Avg min between sent tnx: Average time between sent transactions for account in minutes.
    --Avgminbetweenreceivedtnx: Average time between received transactions for account in minutes
    --TimeDiffbetweenfirstand_last(Mins): Time difference between the first and last transaction.
    --Sent_tnx: Total number of sent normal transactions.
    --Received_tnx: Total number of received normal transactions.
    --NumberofCreated_Contracts: Total Number of created contract transactions.
    --UniqueReceivedFrom_Addresses: Total Unique addresses from which account received transactions.
    --UniqueSentTo_Addresses: Total Unique addresses to which account sent transactions.
    --MinValueReceived: Minimum value in Ether ever received.
    --AvgValueReceived: Average value in Ether ever received.
    --MinValSent: Minimum value of Ether ever sent.
    --AvgValSent: Average value of Ether ever sent.
    --MinValueSentToContract: Minimum value of Ether sent to a contract
    --AvgValueSentToContract: Average value of Ether sent to contracts.
    --MaxValueSentToContract: Maximum value of Ether sent to a contract
    --TotalTransactions(IncludingTnxtoCreate_Contract): Total number of transactions
    --TotalEtherSent: Total Ether sent for account address
    --TotalEtherReceived: Total Ether received for account address
    --TotalEtherSent_Contracts: Total Ether sent to Contract addresses
    --TotalEtherBalance: Total Ether Balance following enacted transactions
    --TotalERC20Tnxs: Total number of ERC20 token transfer transactions
    --ERC20TotalEther_Received: Total ERC20 token received transactions in Ether
    --ERC20TotalEther_Sent: Total ERC20 token sent transactions in Ether
    --ERC20TotalEtherSentContract: Total ERC20 token transfer to other contracts in Ether
    --ERC20UniqSent_Addr: Number of ERC20 token transactions sent to Unique account addresses
    --ERC20UniqRec_Addr: Number of ERC20 token transactions received from Unique addresses.
    --ERC20UniqRecContractAddr: Number of ERC20 token transactions received from Unique contract addresses.
    --ERC20AvgTimeBetweenSent_Tnx: Average time between ERC20 token sent transactions in minutes
    --ERC20AvgTimeBetweenRec_Tnx: Average time between ERC20 token received transactions in minutes
    --ERC20AvgTimeBetweenContract_Tnx: Average time ERC20 token between sent token transactions
    --ERC20MinVal_Rec: Minimum value in Ether received from ERC20 token transactions for account.
    --ERC20MaxVal_Rec: Maximum value in Ether received from ERC20 token transactions for account
    --ERC20AvgVal_Rec: Average value in Ether received from ERC20 token transactions for account
    --ERC20MinVal_Sent: Minimum value in Ether sent from ERC20 token transactions for account
    --ERC20MaxVal_Sent: Maximum value in Ether sent from ERC20 token transactions for account
    --ERC20AvgVal_Sent: Average value in Ether sent from ERC20 token transactions for account
    --ERC20UniqSentTokenName: Number of Unique ERC20 tokens transferred
    --ERC20UniqRecTokenName: Number of Unique ERC20 tokens received
    --ERC20MostSentTokenType: Most sent token for account via ERC20 transaction
    --ERC20MostRecTokenType: Most received token for account via ERC20 transactions

## Data Pre-processing

In [32]:
# Keep every column from position 2 onwards (the first two columns by position
# are dropped). Take an explicit copy so that `input_df` is independent of
# `dataset`: relabelling or later edits must not depend on whether pandas
# handed back a view or a copy of the original frame.
input_df = dataset.iloc[:, 2:].copy()

# Remove every space from the column labels so the columns can be used with
# attribute access. `regex=False` makes the literal-string replacement explicit
# instead of relying on the pandas-version-dependent default.
input_df.columns = input_df.columns.str.replace(" ", "", regex=False)

print(f"N rows present in raw data --> {input_df.shape[0]}")
input_df.head()

N rows present in raw data --> 9841


Unnamed: 0,Address,FLAG,Avgminbetweensenttnx,Avgminbetweenreceivedtnx,TimeDiffbetweenfirstandlast(Mins),Senttnx,ReceivedTnx,NumberofCreatedContracts,UniqueReceivedFromAddresses,UniqueSentToAddresses,...,ERC20minvalsent,ERC20maxvalsent,ERC20avgvalsent,ERC20minvalsentcontract,ERC20maxvalsentcontract,ERC20avgvalsentcontract,ERC20uniqsenttokenname,ERC20uniqrectokenname,ERC20mostsenttokentype,ERC20_most_rec_token_type
0,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,844.26,1093.71,704785.63,721,89,0,40,118,...,0.0,16831000.0,271779.92,0.0,0.0,0.0,39.0,57.0,Cofoundit,Numeraire
1,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,5,14,...,2.260809,2.260809,2.260809,0.0,0.0,0.0,1.0,7.0,Livepeer Token,Livepeer Token
2,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.3,2,10,0,10,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,,XENON
3,0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e,0,10219.6,15785.09,397555.9,25,9,0,7,13,...,100.0,9029.231,3804.076893,0.0,0.0,0.0,1.0,11.0,Raiden,XENON
4,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,7,19,...,0.0,45000.0,13726.65922,0.0,0.0,0.0,6.0,27.0,StatusNetwork,EOS


In [40]:
# Compute the summary figures once, then report them.
n_rows = input_df.shape[0]
n_unique_addresses = len(Counter(dataset.Address))

print(f"N rows present in raw data --> {n_rows}")
# NOTE(review): the unique-address line is printed twice, exactly as in the
# original cell, so the output is unchanged. The second line looks like a
# copy-paste leftover -- confirm whether a different statistic was intended.
print(f"N unique addresses --> {n_unique_addresses}")
print(f"N unique addresses --> {n_unique_addresses}")

def counts(data):
    """Print the number of distinct values found in ``data``.

    ``data`` is any iterable accepted by ``collections.Counter``. The result
    is printed as ``length <n>``; nothing is returned.
    """
    tally = Counter(data)
    n_distinct = len(tally)
    print("length", n_distinct)

# Report the number of distinct values in the address column and in the two
# ERC20 token-type columns, in the same order as before.
for column_name in ("Address", "ERC20_most_rec_token_type", "ERC20mostsenttokentype"):
    counts(input_df[column_name])



N rows present in raw data --> 9841
N unique addresses --> 9816
N unique addresses --> 9816
length 9816
length 468
length 306
