# get dataframe from file

In [100]:
import pandas as pd 
import numpy as np 

# Load the mempool CSV into a DataFrame.
# NOTE: the path is absolute and machine-specific; adjust it before running elsewhere.
transactions_df = pd.read_csv("/Users/test/Projects/level-up/summer-of-btc/mempool.csv")

# data cleaning

In [101]:
# Strip leading/trailing whitespace from every column name (in place) so the
# plain-name lookups used further down ("fee", "weight", "parents") resolve.
transactions_df.rename(columns=str.strip, inplace=True)

def apply_parent_id_transformation(id_string):
    """Split a ';'-separated parent field into a list of parent ids.

    Args:
        id_string: Raw value of a ``parents`` cell. Either a string of ids
            joined by ';' or a missing value (NaN / None).

    Returns:
        A list of parent id strings. Missing or blank input yields an
        empty list.
    """
    if pd.isna(id_string):
        return []
    # Drop empty fragments: "" or a trailing/doubled ';' would otherwise
    # produce a phantom "" parent that can never be matched to a tx_id.
    return [parent_id for parent_id in id_string.split(";") if parent_id]

# Coerce the numeric columns, then turn each raw "parents" cell into a list of ids.
for numeric_column in ("fee", "weight"):
    transactions_df[numeric_column] = pd.to_numeric(transactions_df[numeric_column])
transactions_df["parents"] = transactions_df["parents"].apply(apply_parent_id_transformation)

# derive additional information about each transaction's parents

In [103]:
def sanity_check(parent_indices, index, row):
    """Validate the parent indices of a row against the row's own index.

    Args:
        parent_indices: Iterable of index values at which parents were found.
        index: Index of the row being checked.
        row: Mapping-like row; ``row['tx_id']`` is only read to build the
            error message.

    Raises:
        Exception: If the parents lie on both sides of ``index`` (some
            greater, some smaller), or if any parent index equals ``index``.
    """
    has_later_parent = any(candidate > index for candidate in parent_indices)
    has_earlier_parent = any(candidate < index for candidate in parent_indices)
    refers_to_itself = any(candidate == index for candidate in parent_indices)

    # The mixed-position check comes first, so it wins when both problems are present.
    if has_later_parent and has_earlier_parent:
        raise Exception(f"parents on {index} {row['tx_id']} have parents both greater and lesser in index")
    if refers_to_itself:
        raise Exception(f"circular index reference at {index} {row['tx_id']}")

def check_if_parent_later(df, index, row):
    """Report whether any parent of ``row`` sits at a larger index than the row.

    Args:
        df: DataFrame with a ``tx_id`` column in which parents are looked up.
        index: Index label of ``row`` within ``df``.
        row: Row providing ``total_parents`` and ``parents`` (list of parent
            tx ids); ``tx_id`` is read by ``sanity_check`` for error messages.

    Returns:
        True if at least one parent is found at an index greater than
        ``index``; False otherwise (including when the row has no parents).

    Raises:
        AssertionError: If a parent id does not match exactly one row of ``df``.
        Exception: Propagated from ``sanity_check`` (e.g. a parent located at
            the row's own index).
    """
    if row["total_parents"] == 0:
        return False

    for parent in row["parents"]:
        parent_indices = df.index[df["tx_id"] == parent].tolist()
        # Raised explicitly (not via `assert`) so the check survives
        # `python -O` and reports which lookup failed.
        if len(parent_indices) != 1:
            raise AssertionError(
                f"expected exactly one row with tx_id {parent!r} "
                f"(parent of row {index}), found {len(parent_indices)}"
            )
        # NOTE(review): sanity_check receives one parent at a time, so its
        # "both greater and lesser" check cannot fire from here - confirm
        # whether it was meant to see all parents of the row at once.
        sanity_check(parent_indices, index, row)
        if parent_indices[0] > index:
            return True
    return False

def get_earliest_parent_index(df, parents):
    """Return the smallest index in ``df`` at which any of ``parents`` occurs.

    Args:
        df: DataFrame with a ``tx_id`` column.
        parents: List of parent tx ids to locate.

    Returns:
        The minimum of the first matching index of each parent. When
        ``parents`` is empty, ``2 * len(df)`` is returned as a sentinel that
        is larger than any row position.

    Raises:
        IndexError: If a parent id is not present in ``df['tx_id']``.
    """
    if len(parents) == 0:
        # Sentinel: any value beyond the last row position works here.
        return 2 * len(df)

    tx_ids = df["tx_id"]
    first_matches = []
    for parent_id in parents:
        matching_indices = df.index[tx_ids == parent_id].tolist()
        first_matches.append(matching_indices[0])
    return min(first_matches)


# Per-transaction parent metadata: parent count, whether any parent appears
# later in the file, and the earliest index at which a parent occurs.
transactions_df["total_parents"] = transactions_df["parents"].apply(len)
transactions_df["parent_later"] = [
    check_if_parent_later(transactions_df, row_index, tx_row)
    for row_index, tx_row in transactions_df.iterrows()
]
transactions_df["earliest_parent_index"] = transactions_df["parents"].apply(
    lambda parent_ids: get_earliest_parent_index(transactions_df, parent_ids)
)

# Remove every transaction flagged as having a parent at a later index.
rows_with_later_parent = transactions_df.index[transactions_df["parent_later"] == True]
filtered_transactions = transactions_df.drop(rows_with_later_parent)

# Row counts before and after filtering.
print(len(transactions_df))
print(len(filtered_transactions))

filtered_transactions.to_csv("/Users/test/Projects/level-up/summer-of-btc/earlier_parent_filtered_transtions.csv")

5214
5087
