# Label Harmonization -- RPHunter 

## Imports and Constants

In [1]:
import pandas as pd
import os
from pathlib import Path

from utils.main import clean_column

In [2]:
# Dataset identifier; used as the sub-folder name under data/interim.
name = 'rphunter'

# Project root: two directory levels above the current working directory.
PATH = Path.cwd().parents[1]

# <root>/data, and the dataset-specific folder <root>/data/interim/<name>.
DATA_PATH = os.path.join(PATH, 'data')
RP_PATH = os.path.join(
    DATA_PATH,
    f'interim/{name}',
)

## Read label files

In [3]:
# Load the interim label file for this dataset from RP_PATH.
dataset_csv = os.path.join(RP_PATH, "dataset.csv")
df = pd.read_csv(dataset_csv)

## Normalize addresses

In [4]:
# Lower-case every address string; the column keeps its name and position.
df = df.assign(Address=df['Address'].str.lower())

In [5]:
# Display the column index so the available columns can be inspected
# before selecting the ones to keep.
df.columns

Index(['Project Name', 'Chain', 'Address', 'Open Source', 'Source',
       'Address_lower', 'in_normal', 'in_rug', 'Hidden Balance Modification',
       'Hidden Mint/Burn', 'Address Restrict', 'Amount Restrict',
       'Modifiable External Call', 'TimeStamp Restrict',
       'Modifiable Tax Address', 'Modifiable Tax Rate', 'label_combo'],
      dtype='object')

## Select Columns

In [6]:
# Keep only the address plus the eight label columns, in a fixed order.
# Selecting by name already discards every other column, so the former
# `df.drop(columns=[...])` statement has been removed: its result was never
# assigned, which made it a no-op.
desired_order = [
    'Address',
    'Hidden Balance Modification',
    'Hidden Mint/Burn',
    'Address Restrict',
    'Amount Restrict',
    'Modifiable External Call',
    'TimeStamp Restrict',
    'Modifiable Tax Address',
    'Modifiable Tax Rate',
]

# `.copy()` yields an independent frame, so later column assignments on `df`
# do not operate on a slice of the loaded data (avoids SettingWithCopyWarning).
# A missing column still raises KeyError here, as before.
df = df[desired_order].copy()

In [7]:
# Preview the first rows of the frame after column selection.
df.head()

Unnamed: 0,Address,Hidden Balance Modification,Hidden Mint/Burn,Address Restrict,Amount Restrict,Modifiable External Call,TimeStamp Restrict,Modifiable Tax Address,Modifiable Tax Rate
0,0x93023f1d3525e273f291b6f76d2f5027a39bf302,0,1,0,0,0,0,0,1
1,0x2753dce37a7edb052a77832039bcc9aa49ad8b25,0,0,1,0,0,0,0,0
2,0x5404efafdd8cc30053069df2a1b0c4ba881b3e1e,0,1,0,0,0,0,0,0
3,0x10f6f2b97f3ab29583d9d38babf2994df7220c21,0,1,0,0,0,0,0,1
4,0x11cbc781dadaad13fc3a361772c80b1c027820af,0,0,1,0,0,0,0,0


In [8]:
# Print dtypes and non-null counts for each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 616 entries, 0 to 615
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Address                      614 non-null    object
 1   Hidden Balance Modification  616 non-null    int64 
 2   Hidden Mint/Burn             616 non-null    int64 
 3   Address Restrict             616 non-null    int64 
 4   Amount Restrict              616 non-null    int64 
 5   Modifiable External Call     616 non-null    int64 
 6   TimeStamp Restrict           616 non-null    int64 
 7   Modifiable Tax Address       616 non-null    int64 
 8   Modifiable Tax Rate          616 non-null    int64 
dtypes: int64(8), object(1)
memory usage: 43.4+ KB


In [9]:
# Find every float64 column, treat missing values as 0 and store the
# result back as integers. When there are no float64 columns the
# selection is empty.
float_cols = df.select_dtypes(include=['float64']).columns
filled_values = df[float_cols].fillna(0)
df[float_cols] = filled_values.astype(int)

In [10]:
# Print dtypes and non-null counts again to verify the result of the
# float-to-int conversion above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 616 entries, 0 to 615
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Address                      614 non-null    object
 1   Hidden Balance Modification  616 non-null    int64 
 2   Hidden Mint/Burn             616 non-null    int64 
 3   Address Restrict             616 non-null    int64 
 4   Amount Restrict              616 non-null    int64 
 5   Modifiable External Call     616 non-null    int64 
 6   TimeStamp Restrict           616 non-null    int64 
 7   Modifiable Tax Address       616 non-null    int64 
 8   Modifiable Tax Rate          616 non-null    int64 
dtypes: int64(8), object(1)
memory usage: 43.4+ KB


## Clean Column Names

In [11]:
# `clean_column` is a project helper imported from utils.main; its exact
# behavior is not visible in this notebook. Presumably it normalizes the
# column names (per this section's title) -- TODO confirm against utils.main.
df = clean_column(df)

## Save to CSV

In [12]:
# Write the harmonized labels next to the input file, without the row index.
output_csv = os.path.join(RP_PATH, "dataset-modified.csv")
df.to_csv(output_csv, index=False)