# get dataframe from file

In [1]:
import pandas as pd 
import numpy as np 

# Location of the mempool dump; later cells read the (whitespace-stripped)
# columns tx_id, fee, weight and parents from it.
MEMPOOL_CSV_PATH = "/Users/test/Projects/level-up/summer-of-btc/mempool.csv"

# Load the whole file into a DataFrame with pandas' default parsing options.
transactions_df = pd.read_csv(MEMPOOL_CSV_PATH)

# data cleaning

In [2]:
# Strip leading/trailing whitespace from every column label, in place, so the
# columns can be addressed by their bare names afterwards.
transactions_df.rename(columns=str.strip, inplace=True)

def apply_parent_id_transformation(id_string):
    """Turn one raw ``parents`` cell into a list of parent ids.

    Args:
        id_string: The cell value; either a missing value (as detected by
            ``pd.isna``) or a string of ids separated by ``;``.

    Returns:
        An empty list for a missing value, otherwise the pieces obtained by
        splitting the string on ``;``.
    """
    return [] if pd.isna(id_string) else id_string.split(";")

# Coerce the numeric columns; pd.to_numeric raises on values it cannot parse.
for numeric_column in ("fee", "weight"):
    transactions_df[numeric_column] = pd.to_numeric(transactions_df[numeric_column])

# Replace each raw "parents" cell with a list of parent ids (empty when missing).
transactions_df["parents"] = transactions_df["parents"].apply(apply_parent_id_transformation)

# more information on parents

In [3]:
def sanity_check(parent_indices, index, row):
    """Validate where a transaction's parents sit relative to the transaction.

    Args:
        parent_indices: Index values of the parent rows.
        index: Index value of the transaction being checked.
        row: Record of the transaction; only ``row['tx_id']`` is read, and
            only to build an error message.

    Raises:
        Exception: If some parents have a greater index and others a lesser
            index than ``index``, or (checked second) if any parent index
            equals ``index``.
    """
    has_later_parent = any(parent_index > index for parent_index in parent_indices)
    has_earlier_parent = any(parent_index < index for parent_index in parent_indices)
    has_self_reference = any(parent_index == index for parent_index in parent_indices)

    # Parents on both sides of the transaction are reported first.
    if has_later_parent and has_earlier_parent:
        raise Exception(f"parents on {index} {row['tx_id']} have parents both greater and lesser in index" )
    # A parent at the transaction's own position means it references itself.
    if has_self_reference:
        raise Exception(f"circular index reference at {index} {row['tx_id']}")

def check_if_parent_later(df, index, row):
    """Report whether a transaction should be treated as having a later parent.

    The current implementation does not inspect parent positions at all: every
    transaction that lists at least one parent is flagged, so transactions
    with unconfirmed parents are ignored entirely by the caller.

    Args:
        df: Accepted but not used by the current implementation.
        index: Accepted but not used by the current implementation.
        row: Record of the transaction; only ``row["total_parents"]`` is read.

    Returns:
        ``False`` when ``row["total_parents"]`` equals 0, otherwise ``True``.
    """
    lists_any_parent = row["total_parents"] != 0
    return bool(lists_any_parent)

#     parents = row["parents"]
    
#     for parent in parents: 
#         parent_indices = df.index[df['tx_id'] == parent].tolist()
#         assert len(parent_indices) == 1
#         sanity_check(parent_indices, index, row)
#         for parent_index in parent_indices: 
#             if parent_index > index:
#                 return True
#     return False

def get_earliest_parent_index(df, parents):
    """Return the smallest index label among the rows of the given parents.

    Args:
        df: DataFrame with a ``tx_id`` column to search.
        parents: Sequence of parent transaction ids.

    Returns:
        ``-1`` when ``parents`` is empty; otherwise the minimum, over all
        parents, of the first index label whose ``tx_id`` equals that parent.

    Raises:
        IndexError: If a parent id matches no row in ``df``.
    """
    if len(parents) > 0:
        first_match_per_parent = []
        for parent_id in parents:
            # All index labels whose tx_id equals this parent; keep the first.
            matching_labels = df.index[df["tx_id"] == parent_id].tolist()
            first_match_per_parent.append(matching_labels[0])
        return min(first_match_per_parent)
    return -1

def get_fee_weight_ratio(transaction):
    """Return the transaction's fee divided by its weight.

    Args:
        transaction: Record exposing ``"fee"`` and ``"weight"`` by key.

    Returns:
        The result of ``fee / weight``.
    """
    fee = transaction["fee"]
    weight = transaction["weight"]
    return fee / weight

# Count of parent ids listed for each transaction.
transactions_df["total_parents"] = transactions_df["parents"].apply(len)

# Flag per row, as decided by check_if_parent_later.
transactions_df["parent_later"] = [
    check_if_parent_later(transactions_df, row_label, tx_row)
    for row_label, tx_row in transactions_df.iterrows()
]

# Smallest index label among each transaction's parents (-1 when it has none).
transactions_df["earliest_parent_index"] = transactions_df["parents"].apply(
    lambda parent_ids: get_earliest_parent_index(transactions_df, parent_ids)
)

# Fee per unit of weight, computed row by row.
transactions_df["fee-weight-ratio"] = [
    get_fee_weight_ratio(tx_row) for _, tx_row in transactions_df.iterrows()
]

# Drop every flagged row, then remember each survivor's label in the full
# frame so the original ordering can be restored after sorting.
filtered_transactions = transactions_df.drop(
    index=transactions_df.index[transactions_df["parent_later"] == True]
)
filtered_transactions["original_index"] = filtered_transactions.index

# Selecting a high-fee set of transactions (greedy by fee-to-weight ratio)

In [4]:
# Upper bound on the summed weight of the selected transactions.
# NOTE(review): presumably Bitcoin's maximum block weight (BIP 141) - confirm.
MAX_BLOCK_WEIGHT = 4000000

# Greedy selection: consider candidates from the highest fee-per-weight down.
filtered_transactions = filtered_transactions.sort_values(by=['fee-weight-ratio'], ascending=False)

final_transactions = []  # (original_index, tx_id) pairs in selection order
current_weight = 0
current_fees = 0
for _, row in filtered_transactions.iterrows():
    if current_weight + row.weight > MAX_BLOCK_WEIGHT:
        # This transaction does not fit, but a lighter one further down the
        # ranking still might, so keep scanning instead of stopping here.
        continue
    final_transactions.append((row.original_index, row.tx_id))
    current_weight += row.weight
    current_fees += row.fee

# First output: the selected ids in selection (fee-weight-ratio) order.
with open('/Users/test/Projects/level-up/summer-of-btc/unsorted_block_removing_unconfirmed_parents.txt', 'w') as file:
    file.writelines(f"{tx_id}\n" for _, tx_id in final_transactions)

# Restore the order the transactions had in the input frame.
final_transactions = sorted(final_transactions, key=lambda item: item[0])

# Second output: the same ids, ordered by their original position.
with open('/Users/test/Projects/level-up/summer-of-btc/block_removing_unconfirmed_parents.txt', 'w') as file:
    file.writelines(f"{tx_id}\n" for _, tx_id in final_transactions)