In [1]:
import pandas as pd
import numpy as np
from tqdm import *
from pprint import pprint

In [2]:
# Traces to analyse, as (label, path-to-trace-file) pairs.
# The label is what the per-trace result dictionaries are keyed by.
traces = [
    ('T1', '/home/tiratatp/Repositories/snia_traces/T1/LiveMapsBackEnd/Combined/disk1_filtered.txt'),
    ('T2', '/home/tiratatp/Repositories/snia_traces/T2/DisplayAdsDataServer/Combined/disk0.txt'),
    ('T3', '/home/tiratatp/Repositories/snia_traces/T3/DisplayAdsPayload/Combined/disk0.txt'),
    ('T4', '/home/tiratatp/Repositories/snia_traces/T4/Exchange-Server-Traces/Combined/disk8.txt'),
    ('T5', '/home/tiratatp/Repositories/snia_traces/T5/MSNStorageCFS/Combined/disk6_filtered.txt'),
    ('T6', '/home/tiratatp/Repositories/snia_traces/T6/MSNStorageFileServer/Combined/disk5_filtered.txt'),
    ('T7', '/home/tiratatp/Repositories/snia_traces/T7/BuildServer/Combined/disk0_filtered.txt'),
    ('T8', '/home/tiratatp/Repositories/snia_traces/T8/DevelopmentToolsRelease/Combined/disk6_filtered.txt'),
    ('T9', '/home/tiratatp/Repositories/snia_traces/T9/RadiusAuthentication/Combined/disk0_filtered.txt'),
    ('T10', '/home/tiratatp/Repositories/snia_traces/T10/RadiusBackEndSQLServer/Combined/disk4_filtered.txt'),
]

# Width of one time bin, in milliseconds: 60 min * 60 s * 1000 ms = one hour.
bin_size = 60 * 60 * 1000

# Geometry of the preallocated shelters. Both values are multiplied by the
# sectors-per-MB factor in the following cell, so they are expressed in MB.
sheltersize = 10
shelterrange = 100

In [3]:
# Number of sectors in one MB (2 * 1024 = 2048, i.e. 512-byte sectors).
one_MB = 1024 * 2
# Shelter size (x) and shelter range (y), converted from MB to sectors.
x, y = sheltersize * one_MB, shelterrange * one_MB
# Sanity check: a shelter must be smaller than the range it belongs to.
assert y > x

In [12]:
def return_merged_trace(trace):
    """Load a block-I/O trace and coalesce runs of sequential requests.

    Two consecutive rows are considered sequential when they carry the same
    ``flag`` and the second one starts exactly where the first one ends
    (``blkno + blkcount`` of the previous row equals ``blkno`` of the
    current row). Every maximal run of sequential rows is collapsed into a
    single request.

    Parameters
    ----------
    trace : str
        Path of a space-delimited text file without a header row. Columns
        0, 2, 3 and 4 are read as ``offset``, ``blkno``, ``blkcount`` and
        ``flag``; column 1 is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per merged request, in trace order, on a fresh 0..n-1 index,
        with columns ``offset`` (of the first request in the run), ``blkno``
        (lowest in the run), ``blkcount`` (sum over the run) and ``flag``
        (of the first request in the run).
    """
    # Explicit fixed-width dtypes: ``np.float_`` no longer exists in
    # NumPy 2.0, and ``np.int_`` is platform dependent.
    # The flag column is assumed to be stored as 0/1 (callers compare it
    # against 0) -- TODO confirm; registering '1'/'0' as boolean literals
    # lets the C parser read it as bool directly, while textual True/False
    # keeps working.
    t = pd.read_csv(trace, delimiter=' ', usecols=[0, 2, 3, 4],
                    header=None, names=['offset', 'blkno', 'blkcount', 'flag'],
                    dtype={'offset': np.float64, 'blkno': np.int64,
                           'blkcount': np.int64, 'flag': np.bool_},
                    true_values=['1'], false_values=['0'],
                    na_filter=False, engine='c')

    # A row continues the previous one when the flag is unchanged and it
    # starts at the previous row's end. The first row has no predecessor:
    # shift() yields NaN there, so both comparisons are False.
    same_flag = t['flag'].shift(1) == t['flag']
    contiguous = (t['blkno'] + t['blkcount']).shift(1) == t['blkno']
    is_seq = same_flag & contiguous

    # Every non-sequential row opens a new run; the running count of such
    # rows therefore labels each row with the run it belongs to.
    run_id = (~is_seq).astype(int).cumsum()

    merged = (
        t.groupby(run_id, sort=False)
         .agg({
             'offset': 'first',
             'blkno': 'min',
             'blkcount': 'sum',
             'flag': 'first',
         })
         .reset_index(drop=True)
    )

    # Fix the column order explicitly instead of relying on dict ordering.
    return merged[['offset', 'blkno', 'blkcount', 'flag']]

In [13]:
# Per-trace results, keyed by trace label.
#   g2_result[label] = (small-write blocks before merging,
#                       small-write blocks after merging,
#                       reduction in percent)
#   g3_result[label] = -1 when all small writes of the trace fit into the
#                      shelters, otherwise
#                      (shelter_count, max_sheltered_block,
#                       largest windowed sum, period)
g2_result = {}
g3_result = {}

for trace, f in tqdm(traces):
    t = return_merged_trace(f)
    # first block past the end of each request; needed for several things
    t['tail'] = t['blkno'] + t['blkcount']
    # rows with flag == 0 (treated as writes, going by the variable name)
    # that are at most 64 blocks long; axis is passed by keyword because
    # pandas 2.0 no longer accepts it positionally
    t_smallwrite = t[(t['flag'] == 0) & (t['blkcount'] <= 64)].drop('flag', axis=1)

    # find the number of shelter
    t['shifted_tail'] = t['tail'] + ((t['blkno'] / y) * x)
    shelter_count = np.floor(t['shifted_tail'].max() / y)
    # this is the maximum blocks that can be sheltered in this trace
    max_sheltered_block = x * shelter_count

    # second graph
    before_merging = t_smallwrite['blkcount'].sum()

    # remove exact duplicate IO and then sort by start block and length
    # (DataFrame.sort was removed from pandas; sort_values replaces it)
    g2 = t_smallwrite.drop_duplicates(subset=['blkno', 'blkcount'])\
                     .sort_values(['blkno', 'blkcount'])\
                     .copy(deep=True)
    g2['tail'] = g2['blkno'] + g2['blkcount']

    # Merge overlapping (or touching) requests in one pass.
    # Comparing only against the immediately preceding tail misses cases like
    #   first IO  : ===========
    #   second IO :    ===
    #   third IO  :          ==
    # where the third IO overlaps the first but not the second. Comparing
    # against the running maximum of all earlier tails handles that without
    # having to iterate to a fixed point (block counts are assumed to be
    # non-negative).
    g2['is_ovrlp'] = g2['tail'].cummax().shift(1) >= g2['blkno']
    g2['io_num'] = (~g2['is_ovrlp']).astype(int).cumsum()
    g2 = g2.groupby('io_num', sort=False)\
           .agg({
               'blkno': 'min',
               'tail': 'max',
           })

    after_merging = (g2['tail'] - g2['blkno']).sum()
    g2_result[trace] = (
        before_merging,
        after_merging,
        float(before_merging - after_merging) / before_merging * 100,
    )

    # third graph
    # small-write blocks per time bin of bin_size, in order of first appearance
    sheltered_block_per_period = t_smallwrite['blkcount']\
        .groupby(np.floor(t_smallwrite['offset'] / bin_size), sort=False)\
        .sum()
    g3 = sheltered_block_per_period.copy(deep=True)
    if g3.sum() <= max_sheltered_block:
        # everything fits into the shelters, whatever the period
        g3_result[trace] = -1
    else:
        period = 0
        # Each iteration makes the window one bin longer: after `period`
        # iterations g3 holds the sum over the current bin and the `period`
        # bins before it (NaN where the window is incomplete). Stop at the
        # first window length whose sum exceeds the shelter capacity.
        # NOTE(review): bins without any small write are absent from the
        # grouped series, so shift() steps over them and a window covers
        # consecutive *non-empty* bins -- confirm this is intended.
        while not (g3 > max_sheltered_block).any():
            period += 1
            g3 = g3 + sheltered_block_per_period.shift(period)
        g3_result[trace] = (shelter_count, max_sheltered_block, g3.max(), period)

# the result
pprint(g2_result)
pprint(g3_result)

{'T1': (18127, 8551, 52.82727423180891),
 'T10': (2287446, 2140539, 6.422315543186594),
 'T2': (1747081, 186919, 89.30106846791878),
 'T3': (3761896, 407115, 89.17793049036975),
 'T4': (59741837, 11970701, 79.96261648265018),
 'T5': (518762, 423629, 18.338467351116698),
 'T6': (49487472, 17279296, 65.08349426295204),
 'T7': (142494264, 116285326, 18.39297755873177),
 'T8': (28194859, 12880391, 54.3165262858736),
 'T9': (12299174, 2979913, 75.77143798437196)}
{'T1': -1,
 'T10': -1,
 'T2': -1,
 'T3': -1,
 'T4': -1,
 'T5': -1,
 'T6': (1909.0, 39096320.0, 42962240.0, 4),
 'T7': -1,
 'T8': -1,
 'T9': -1}
