# Label Harmonization -- CRPWarner

## Imports and Constants

In [1]:
import pandas as pd
import os
from pathlib import Path

from utils.main import clean_column

In [2]:
# Dataset identifier used as the sub-folder name under each data stage.
name = 'crpwarner'

# Project root: two directory levels above the current working directory.
PATH = Path.cwd().parents[1]
DATA_PATH = os.path.join(PATH, 'data')

# Input location (external) and output location (interim) for this dataset.
CRP_PATH = os.path.join(DATA_PATH, f'external/{name}')
INTERIM_PATH = os.path.join(DATA_PATH, f'interim/{name}')

# Sub-folders of the external dataset that hold the label workbooks.
GROUND_PATH = os.path.join(CRP_PATH, 'groundtruth')
LARGE_PATH = os.path.join(CRP_PATH, 'large/sample')

## Read label files

In [3]:
# Ground-truth workbook lives in its own folder.
ground_truth = pd.read_excel(os.path.join(GROUND_PATH, "groundTruth.xlsx"))

# The three sample workbooks share a folder; load them in a fixed order
# and unpack into one dataframe per label.
mint, leak, limit = (
    pd.read_excel(os.path.join(LARGE_PATH, _workbook))
    for _workbook in ("mint.xlsx", "leak.xlsx", "limit.xlsx")
)

## Normalize addresses

In [4]:
# Lower-case the address column of every frame so the same address compares
# equal regardless of letter case. The ground-truth sheet names its column
# 'address'; the sample sheets use 'Address'.
for _frame, _column in (
    (ground_truth, 'address'),
    (mint, 'Address'),
    (leak, 'Address'),
    (limit, 'Address'),
):
    _frame[_column] = _frame[_column].str.lower()

## Set binary indicators (TP? == Yes → 1)

In [5]:
# Add a 0/1 label column to each sample sheet: 1 where the 'TP?' cell is
# exactly the string 'Yes', 0 for anything else (including missing values).
for _sheet, _label in ((mint, 'Mint'), (leak, 'Leak'), (limit, 'Limit')):
    _sheet[_label] = _sheet['TP?'].eq('Yes').astype(int)

## Merge everything into a unified dataframe

In [6]:
# Stack the address columns of the three sample sheets and keep the first
# occurrence of each address; this is the row set of the unified dataframe.
_stacked_addresses = pd.concat([mint['Address'], leak['Address'], limit['Address']])
_unique_addresses = _stacked_addresses.drop_duplicates()
df_all = pd.DataFrame({'address': _unique_addresses})

In [7]:
# Attach each label column to the address list with a left join, so every
# address in df_all is kept and addresses absent from a sheet get NaN.
#
# A sheet may list the same address on several rows. Joining such a sheet
# directly would fan out and produce one output row per duplicate, so each
# sheet is first collapsed to a single row per address. The max is taken so
# an address is flagged (1) when any of its rows is flagged.
# dropna=False keeps a row group for missing addresses instead of dropping it.
for _sheet, _label in ((mint, 'Mint'), (leak, 'Leak'), (limit, 'Limit')):
    _per_address = _sheet.groupby('Address', as_index=False, dropna=False)[_label].max()
    df_all = df_all.merge(
        _per_address, left_on='address', right_on='Address', how='left'
    ).drop(columns='Address')  # the right-hand key duplicates 'address'

## Fill missing labels with 0 (an address absent from a sheet is treated as a negative for that label)

In [8]:
# Number of rows in the unified dataframe after the three joins.
df_all.shape[0]

267

In [9]:
# Per-column count of missing values before filling.
df_all.isnull().sum()

address      0
Mint       175
Leak       179
Limit      175
dtype: int64

In [10]:
# Replace missing labels with 0 and cast back to int (the NaNs introduced by
# the left joins leave these columns as floats).
_label_columns = ['Mint', 'Leak', 'Limit']
_filled_labels = df_all[_label_columns].fillna(0)
df_all[_label_columns] = _filled_labels.astype(int)

In [11]:
# Re-check the per-column count of missing values after filling.
df_all.isnull().sum()

address    0
Mint       0
Leak       0
Limit      0
dtype: int64

## Clean Column Names

In [12]:
# Pass both frames through the project helper clean_column (imported from
# utils.main) and rebind each name to the helper's return value.
# NOTE(review): the helper's exact renaming rules are not visible here.
ground_truth, df_all = clean_column(ground_truth), clean_column(df_all)

## Save to CSV

In [13]:
# Make sure the output directory exists: to_csv does not create missing
# parent directories and raises if the target folder is absent.
os.makedirs(INTERIM_PATH, exist_ok=True)

# index=False keeps the dataframe's positional index out of the CSV files.
ground_truth.to_csv(os.path.join(INTERIM_PATH, "dataset-modified.csv"), index=False)
df_all.to_csv(os.path.join(INTERIM_PATH, "sample_dataset-modified.csv"), index=False)