In [7]:
import argparse
import json
from collections import Counter, namedtuple
import itertools
import sys
import os

import pandas as pd


# Example input file name: rep_1000000s_sim_500000t_abcde__100_100.s
# (pattern: {st}_{sl}s_{tt}_{tl}t_{alp}__{rep_mut}_{sim_mut}.{ext})
class Input(namedtuple('fn', 'st sl tt tl alp rep_mut sim_mut rep_block'.split())):
    """Description of one generated input, parsed from its file name.

    File names follow the pattern
    ``{st}_{sl}s_{tt}_{tl}t_{alp}__{rep_mut}_{sim_mut}.{ext}``, for example
    ``rep_1000000s_sim_500000t_abcde__100_100.s``.

    Fields:
        st, tt: tag strings preceding the two lengths ('rep' / 'sim' in the
            example above).
        sl: integer parsed from the ``<sl>s`` token; ``s_mutations`` requires
            the ``.s`` file to consist of ``rep_block`` blocks of
            ``sl // rep_block`` characters each.
        tl: integer parsed from the ``<tl>t`` token (presumably the length of
            the ``.t`` file -- not checked anywhere in this class).
        alp: the alphabet token from the file name.
        rep_mut, sim_mut: the two integers after the double underscore.
        rep_block: the ``"rep_block"`` value read from the JSON config.

    The code is written to run unchanged on both Python 2 and Python 3.
    """

    @classmethod
    def from_fn(cls, fname, cfg_path='input_spec.json'):
        """Build an ``Input`` from a file name plus the JSON config.

        Args:
            fname: file name following the pattern in the class docstring; it
                must end in a dot plus a one-character extension, because the
                last two characters are dropped before parsing.
            cfg_path: path of a JSON file containing a ``"rep_block"`` key.

        Raises:
            ValueError: if ``fname`` does not split into the expected number
                of tokens or a numeric token is not an integer.
        """
        with open(cfg_path) as fd:
            rep_block = json.load(fd)['rep_block']

        name_part, mut_part = fname.split('__')
        st, sl, tt, tl, alp = name_part.split('_')
        # mut_part looks like "100_100.s": strip the two-character ".x" suffix.
        rep_mut, sim_mut = mut_part[:-2].split("_")
        # sl / tl carry a trailing unit letter ("...s" / "...t").
        return cls(st, int(sl[:-1]), tt, int(tl[:-1]), alp,
                   int(rep_mut), int(sim_mut), rep_block)

    @property
    def num_blocks(self):
        """Number of blocks the ``.s`` string is divided into (``rep_block``)."""
        return self.rep_block

    @property
    def block_len(self):
        """Length of one block: ``sl`` floor-divided by ``num_blocks``."""
        # Explicit floor division keeps this an int on Python 3 as well
        # (on Python 2 the original ``/`` on ints already floored).
        return self.sl // self.num_blocks

    def path(self, which):
        """Return the file name for this input with extension ``which``."""
        return "{st}_{sl}s_{tt}_{tl}t_{alp}__{rep_mut}_{sim_mut}.{w}".format(w=which, **self._asdict())

    def _read(self, which):
        """Read ``data_dir/<path(which)>`` and return its stripped content."""
        with open('data_dir/' + self.path(which)) as fd:
            return fd.read().strip()

    def _split_in_blocks(self):
        """Return the ``.s`` string cut into consecutive ``block_len`` chunks.

        The last chunk is shorter when the string length is not a multiple of
        ``block_len``.
        """
        s = self._read('s')
        size = self.block_len
        # Plain slicing yields the same chunks as grouping every character by
        # ``index // size`` but avoids per-character Python-level work.
        return [s[start:start + size] for start in range(0, len(s), size)]

    def s_mutations(self):
        """Count, for every block after the first, its mismatches with block 0.

        Returns:
            pandas.Series with one entry per block ``1 .. n-1``; each entry is
            the number of positions at which that block differs from block 0.

        Raises:
            AssertionError: if the ``.s`` file is empty or any block does not
                have exactly ``block_len`` characters.
        """
        blocks = self._split_in_blocks()
        size = self.block_len
        assert blocks, "no blocks found in %s" % self.path('s')
        assert all(len(block) == size for block in blocks), \
            "every block must be exactly block_len characters long"

        reference = blocks[0]
        mismatches = [
            sum(c1 != c2 for c1, c2 in zip(reference, other))
            for other in blocks[1:]
        ]
        return pd.Series(mismatches)

    def t_mutations(self):
        """Count positions at which the ``.s`` and ``.t`` strings differ.

        Only the common prefix is compared: ``zip`` stops at the end of the
        shorter of the two strings.
        """
        s = self._read('s')
        t = self._read('t')
        return sum(c1 != c2 for c1, c2 in zip(s, t))

    def report(self):
        """Print summary statistics of ``s_mutations`` and the ``t_mutations`` count."""
        smut = self.s_mutations()
        # Parenthesised form prints identically on Python 2 and Python 3.
        print(smut.describe())
        tmut = self.t_mutations()
        print(tmut)

# Load the experiment grid: the lists under "rep_mut", "sim_mut" and "alp"
# are combined into every possible (rep_mut, sim_mut, alphabet) triple.
with open('input_spec.json') as fd:
    cfg = json.load(fd)

# One row per input: file name, expected vs. measured mutation counts.
res = []
for rm, sm, alp in itertools.product(cfg['rep_mut'], cfg['sim_mut'], cfg['alp']):
    # `i` is deliberately left at module level; later cells inspect it.
    i = Input.from_fn('rep_100000s_sim_10000t_{alp}__{rm}_{sm}.s'.format(rm=rm, sm=sm, alp=alp))
    s_mut = i.s_mutations()
    res.append([i.path('s'), i.rep_mut, s_mut.mean(), s_mut.std(), i.sim_mut, i.t_mutations()])
    # Progress output; the parenthesised form works on Python 2 and 3 alike.
    print(i)

# Left as a bare expression so the notebook renders the table as cell output.
pd.DataFrame(data=res, columns=['path', 's_exp', 's_found_avg', 's_found_std', 't_exp', 't_found'])

KeyboardInterrupt: 

In [4]:
# Inspect how many blocks the current Input `i` is split into
# (the "rep_block" value from the config).
i.num_blocks

1000

In [82]:
# Read the raw (unstripped) contents of the .s and .t files of the current
# Input `i`; context managers make sure both file handles are closed.
with open('data_dir/' + i.path('s')) as s_file:
    s = s_file.read()
with open('data_dir/' + i.path('t')) as t_file:
    t = t_file.read()

# Bare expression so the notebook shows both lengths as the cell output.
len(s), len(t)

(10000, 500)