In [None]:
import pandas as pd
import re
import os
import pathlib

In [None]:
# Root directory that is scanned below: each entry inside it is treated as one
# checkpoint directory containing the compression report text files.
# A raw string is used because '\c' is not a valid escape sequence; a plain
# literal only works by accident and triggers a warning on newer Pythons.
INPUT_PATH = pathlib.Path(r'D:\checkpoints')

def attempt_convert(v):
    """Coerce ``v`` to a number when possible.

    Tries ``int(v)`` first, then ``float(v)``; if neither conversion succeeds
    the value is returned unchanged.

    Args:
        v: Any value, typically a string captured by a regex group.

    Returns:
        An ``int``, a ``float``, or ``v`` itself when it is not numeric.
    """
    for caster in (int, float):
        try:
            return caster(v)
        # Only conversion failures are swallowed; a bare ``except`` would also
        # hide KeyboardInterrupt / SystemExit.
        except (ValueError, TypeError, OverflowError):
            continue
    return v

# Pattern for the single "Block Size: N" line of a report.
extract_block_size_regex = re.compile(r'Block Size: (?P<BlockSize>[0-9]+)')
# Pattern for one per-algorithm section of a report; every named group becomes
# a column of the resulting DataFrame.
extract_compression_regex = re.compile(r'Compression: (?P<CompressionType>[^\s]+)\s*Compression level: (?P<CompressionLevel>[0-9]+)\s*Size:\s*(?P<CompressedSize>[0-9]+)\s*Blocks:\s*(?P<NumBlocks>[0-9]+)\s*Time Taken:\s*(?P<TimeTakenMicros>[0-9]+)\s*microsecs\s*Compressed:\s*(?P<Compressed>[0-9]+)\s*\(\s*(?P<CompressedPct>.*?)%\)\s*Not compressed \(ratio\):\s*(?P<NotCompressedRatio>[0-9]+)\s*\(\s*(?P<NotCompressedRatioPct>.*?)%\)\s*Not compressed \(abort\):\s*(?P<NotCompressedAbort>[0-9]+)\s*\(\s*(?P<NotCompressedAbortPct>.*?)%\)')

# One dict per (report file, algorithm section); turned into a DataFrame once
# at the end instead of growing a frame row by row.
records = []
for checkpoint_path in INPUT_PATH.iterdir():
    # Stray files next to the checkpoint directories would make the inner
    # directory listing fail, so skip anything that is not a directory.
    if not checkpoint_path.is_dir():
        continue

    # NOTE: .stem drops everything after the last '.' in the directory name.
    checkpoint_name = checkpoint_path.stem

    for sst_path in checkpoint_path.iterdir():
        if not sst_path.match('*.compression*.txt'):
            continue

        contents = sst_path.read_text()
        # Reports containing this marker are skipped entirely.
        if 'Not able to read table properties' in contents:
            continue

        block_size_match = extract_block_size_regex.search(contents)
        if block_size_match is None:
            # Report the offending file explicitly instead of relying on an
            # AttributeError from calling .group() on None.
            print('No block size found in', sst_path)
            print(contents)
            continue
        block_size = int(block_size_match.group('BlockSize'))

        for algorithm in extract_compression_regex.finditer(contents):
            entry = {k: attempt_convert(v) for k, v in algorithm.groupdict().items()}
            entry['BlockSize'] = block_size
            entry['Path'] = str(sst_path)
            entry['Checkpoint'] = checkpoint_name
            records.append(entry)

df = pd.DataFrame(records)

# Sanitize compression algorithm names
def rename_compression_types(compression_type):
    """Map a raw compression type name to a short, lowercase label.

    Args:
        compression_type: Name as captured from the report, e.g.
            ``'kSnappyCompression'``.

    Returns:
        The short label (``'snappy'``, ``'zlib'``, ...). Names that are not in
        the mapping are returned unchanged instead of raising ``KeyError``, so
        a report mentioning an unlisted algorithm does not abort the analysis.
    """
    renamer = {
        'kNoCompression': 'uncompressed',
        'kSnappyCompression': 'snappy',
        'kZlibCompression': 'zlib',
        'kLZ4Compression': 'lz4',
        'kLZ4HCCompression': 'lz4hc',
        'kZSTD': 'zstd'
    }
    return renamer.get(compression_type, compression_type)

# Replace raw names such as 'kSnappyCompression' with their short labels.
df['CompressionType'] = df['CompressionType'].apply(rename_compression_types)

# Compute compression rate.
# The 'uncompressed' rows hold each file's size without compression in
# CompressedSize; pull that out as a per-Path lookup table.
uncompressed = (
    df.loc[df['CompressionType'] == 'uncompressed', ['Path', 'CompressedSize']]
    .drop_duplicates()
    .rename(columns={'CompressedSize': 'UncompressedSize'})
)
# Join on Path explicitly rather than on whatever columns happen to overlap.
# Inner join: files without an 'uncompressed' row are dropped.
df = df.merge(uncompressed, on='Path', how='inner')
del uncompressed

# Fraction of the uncompressed size that was saved (0 = no saving).
df['CompressionRate'] = 1.0 - df['CompressedSize'] / df['UncompressedSize']

# The uncompressed rows only served as the baseline above; drop them.
df = df[df['CompressionType'] != 'uncompressed']

In [None]:
# Persist the per-file results (uncompressed baseline rows already removed) to
# dataset.csv, relative to the current working directory. The DataFrame index
# is written as well, since to_csv includes it by default.
df.to_csv('dataset.csv')

In [None]:
# Total compressed / uncompressed bytes per (checkpoint, algorithm, block size).
# The two size columns are selected *before* summing so that unrelated columns
# (e.g. the string 'Path') are never aggregated.
cf = df.groupby(['Checkpoint', 'CompressionType', 'BlockSize'])[['CompressedSize', 'UncompressedSize']].sum()

# NOTE: here CompressionRate is compressed/uncompressed (smaller is better),
# unlike df['CompressionRate'], which is 1 - compressed/uncompressed.
cf['CompressionRate'] = cf['CompressedSize'] / cf['UncompressedSize']

# Snappy is the baseline: fetch its compressed size for every
# (Checkpoint, BlockSize) pair and broadcast it to all algorithms of that pair.
# Pairs with no snappy row get NaN instead of raising KeyError.
snappy_sizes = cf['CompressedSize'].xs('snappy', level='CompressionType')
baseline_sizes = snappy_sizes.reindex(cf.index.droplevel('CompressionType')).to_numpy()

cf['BaselineCompressionRate'] = baseline_sizes / cf['UncompressedSize']
# A smaller compression rate means a lower size, so a positive improvement
# means the algorithm produced smaller output than snappy.
cf['BaselineCompressionImprovement'] = cf['BaselineCompressionRate'] - cf['CompressionRate']
del snappy_sizes
del baseline_sizes

In [None]:
# Rank the (checkpoint, algorithm, block size) groups by their improvement over
# the snappy baseline, largest improvement first. Display only; cf is not modified.
cf.sort_values(by=['BaselineCompressionImprovement'], ascending=False)

In [None]:
# Show the aggregated table in its index (group key) order.
cf