# Experiment 2

Compare error rates on **real data** for the following:

- 10% redundancy punctured repetition code (coset shifted): Trellis BMA without lookahead vs Trellis BMA with lookahead

## Load real data

In [None]:
import seaborn as sns
import numpy as np
sns.set()

from Levenshtein import distance, editops
from scipy.stats import mode
from tqdm import trange

print("Loading data...")

# Reference strands: the first whitespace-separated token of every line is
# stored twice, once as a list of single characters and once as a string.
# Both data files are looked up under DataToProcess/ relative to the
# working directory.
centers_list_chars = []
centers_str = []
with open('DataToProcess/TrainCenters.txt') as f:
    for line in f:
        strand = line.split()[0]
        centers_list_chars.append(list(strand))
        centers_str.append(strand)
centers_list_chars = np.array(centers_list_chars)

# Clusters: a line beginning with '=' opens a new (initially empty) cluster;
# every other line contributes one trace to the most recently opened cluster.
traces_list_chars = []
traces_str = []
with open('DataToProcess/TrainClusters.txt') as f:
    for line in f:
        if line.startswith('='):
            traces_list_chars.append([])
            traces_str.append([])
            continue
        read = line.split()[0]
        traces_list_chars[-1].append(np.array(list(read)))
        traces_str[-1].append(read)

def map2int(strarray, chars):
    """Translate a sequence of symbols into their integer positions in `chars`.

    Parameters
    ----------
    strarray : sequence of symbols to translate (indexed one element at a time).
    chars : sequence of symbols; a symbol's index here becomes its integer
        code. If a symbol is repeated, the last occurrence wins.

    Returns
    -------
    Integer numpy array with the same shape as `strarray`.

    Raises
    ------
    KeyError if `strarray` contains a symbol that is not in `chars`.
    """
    lookup = {symbol: code for code, symbol in enumerate(chars)}
    intarray = np.zeros_like(strarray, dtype=int)
    for position, symbol in enumerate(strarray):
        intarray[position] = lookup[symbol]
    return intarray


# Convert every center and every trace from characters to integer symbols,
# using each letter's position in `alphabet` as its code.
alphabet = ['A','C','G','T']
centers_list = []
traces_list = []
for idx in trange(len(centers_list_chars), desc = "Creating dataset"):
    centers_list.append(map2int(centers_list_chars[idx], alphabet))
    # One list of integer traces per cluster, kept aligned with centers_list.
    traces_list.append([map2int(trace, alphabet) for trace in traces_list_chars[idx]])

In [None]:
from helper_functions import *
import pandas as pd
import numpy as np
from scipy.stats import mode

from conv_code import *
from coded_ids_multiD import *
from bma import *
from trellis_bma import *

import time

import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

## Reverse engineering the code given the codeword and a random input sequence

In [None]:
# Input length and codeword length; their difference is the redundancy
# handed to the puncturing step.
in_len, N_cw = 100, 110
redundancy = N_cw - in_len

# Passed as the first two arguments of coded_ids_multiD.
# NOTE(review): presumably the input and codeword alphabet sizes -- confirm.
A_in = A_cw = 4

# Channel settings forwarded to coded_ids_multiD.
# NOTE(review): presumably deletion / substitution / insertion probabilities
# and the maximum tracked drift -- confirm against coded_ids_multiD.
num_traces = 1
p_del, p_sub, p_ins = 0.011, 0.017, 0.02
max_drift = 10

In [None]:
# Real data
# For each cluster size, decode the first `max_iters` clusters with Trellis BMA
# (with and without look-ahead) and record the number of mismatching input
# symbols per cluster. Results are collected in `errors` and saved to disk.
max_iters = 2000

errors = {}
errors["desc"] = "Experiment comparing error rates on real data with 10% repetition CC code \
(punctured and coset shifted). \n\
Algorithms compared: Trellis BMA no look-ahead, Trellis BMA with look-ahead.\n\
Experiment run on the first {} clusters of the real data.\
".format(max_iters)

errors["Tbma_noLA"] = []
errors["Tbma_LA"] = []
errors["cluster_size"] = [2,3,4,5,6]

for cluster_size in errors["cluster_size"]:
    Tbma_noLA_errors = []
    Tbma_LA_errors = []

    for it in trange(max_iters):
        if len(traces_list[it]) == 0:
            # An empty cluster has no traces to decode from. Skip it so that it
            # really is ignored, as the message states; previously execution
            # fell through and the empty cluster was still decoded and scored.
            # The same clusters are skipped for every cluster size, so the
            # per-size error arrays keep equal lengths.
            print("Encountered empty cluster, ignored it.")
            continue

        in_seq = np.random.choice(4,size=in_len)       # first generate a random input sequence

        # Build the punctured code, then pick the coset vector that makes the
        # encoding of `in_seq` equal to the real center of this cluster.
        cc = conv_code()
        G = np.array([[1],[1]])
        cc.quar_cc(G)
        cc.make_trellis(in_len)
        cc.puncture(redundancy=redundancy)
        cc.make_encoder()
        encoded_seq = cc.encode(in_seq)
        coset = np.mod(centers_list[it]-encoded_seq,4)   # determine appropriate coset vector
        cc.add_coset(coset)

        code_trellis_states = cc.trellis_states
        code_trellis_edges = cc.trellis_edges
        code_time_type = cc.time_type

        ids_trellis = coded_ids_multiD(A_in, A_cw, code_trellis_states,code_trellis_edges, code_time_type,\
                 num_traces, p_del, p_sub, p_ins, max_drift, input_prior = None)

        # Use at most `cluster_size` traces; smaller clusters contribute all
        # the traces they have.
        tr_list = traces_list[it][:cluster_size]
        Tbma_noLA_estimate = trellis_bma(ids_trellis,tr_list,cc.trellis_states[0][0],\
                                                 cc.trellis_states[-1],lookahead = False)
        Tbma_LA_estimate = trellis_bma(ids_trellis,tr_list,cc.trellis_states[0][0],\
                                                 cc.trellis_states[-1],lookahead = True)

        # Count of positions where the estimate differs from the true input.
        Tbma_noLA_errors.append((Tbma_noLA_estimate != in_seq).sum())
        Tbma_LA_errors.append((Tbma_LA_estimate != in_seq).sum())

    errors["Tbma_noLA"].append(np.array(Tbma_noLA_errors))
    errors["Tbma_LA"].append(np.array(Tbma_LA_errors))

    # The printed value is the mean number of symbol errors per decoded cluster.
    print("Trellis BMA (no LA) error rate for a cluster size: ",cluster_size,"is ",np.array(Tbma_noLA_errors).mean())
    print("Trellis BMA (with LA) error rate for a cluster size:",cluster_size,"is ",np.array(Tbma_LA_errors).mean())

    # NOTE(review): presumably a short pause so the printed lines do not
    # interleave with the next progress bar -- confirm.
    time.sleep(0.5)

errors["Tbma_noLA"] = np.array(errors["Tbma_noLA"])
errors["Tbma_LA"] = np.array(errors["Tbma_LA"])

# `errors` is a dict, so np.save stores it as a pickled object array; loading
# it back requires np.load(..., allow_pickle=True).
np.save("SavedData/Exp2.npy",errors)