### Scoring alignments

Previous steps include MARS to orient the sequences in a consistent way, extracting the reference into a separate file, and 
needleall to align the sequences to the reference. For example:




At the end we have a test.fasta with interleaved alignment pairs, the first is the reference and the second is the aligned sequence

### Some helper functions we'll need

In [32]:
def edit_distance(string1, string2):
    # this is very basic -- I think we'll want to improve this to deal with indels a bit better
    return sum([1 if base1 != base2 else 0 for base1,base2 in zip(string1,string2)])

def affine_scoring(string1,string2,match_cost=5.0,mismatch_cost=-4.0,gap_open_cost=-10.0,gap_extend_cost=0.5):
    """
    A more fancy approach: take the alignments and score with an affine gap penality approach; matches and mismatches
    have a set cost, but gaps have a cost to open and a cost to extend. We don't allow for both to be gaps (don't make
    sense), or for the aligner to switch from an deletion on one strand to another strand (rare, generally nonsense)
    """
    assert(len(string1) == len(string2))
    total_score = 0
    in_gap = None
    for base1,base2 in zip(string1,string2):
        if base1 == base2:
            total_score += match_cost
            in_gap = None
        elif base1 == '-' and base2 != '-':
            assert(in_gap != 'string2')
            if in_gap == 'string1':
                total_score += gap_extend_cost
            else: 
                total_score += gap_extend_cost
                in_gap = 'string1'
        elif base1 != '-' and base2 == '-':
            assert(in_gap != 'string1')
            if in_gap == 'string2':
                total_score += gap_extend_cost
            else: 
                total_score += gap_extend_cost
                in_gap = 'string2'
        elif base1 != base2:
            total_score += mismatch_cost
            in_gap = None
        else:
            assert(1 == 0) # we shouldn't be here...
    return(total_score)

def trim_to_minimal_size(string1, string2):
    """Find the minimum substring of two inputs where the maximal contiguous stretch of dashes is removed"""
    assert(len(string1) == len(string2))
    string1_lstrip = string1.lstrip("-")
    string1_size = len(string1_lstrip)
    
    string2_lstrip = string2.lstrip("-")
    string2_size = len(string2_lstrip)
    
    # we cut the longer string down to the same size as the smaller string
    if string1_size > string2_size:
        string1_lstrip = string1[(len(string2) - string2_size):len(string1)]
    else:
        string2_lstrip = string2[(len(string1) - string1_size):len(string2)]
    
    string1_rstrip = string1_lstrip.rstrip("-")
    string1_size = len(string1_rstrip)
    
    string2_rstrip = string2_lstrip.rstrip("-")
    string2_size = len(string2_rstrip)
    
    # again we cut the longer string down to the same size as the smaller string, just from the right
    if string1_size > string2_size:
        string1_rstrip = string1_rstrip[0:string2_size]
    else:
        string2_rstrip = string2_rstrip[0:string1_size]
        
    return((string1_rstrip,string2_rstrip))

def score_aligned_segment(string1,string2):
    minimal_overlaps = trim_to_minimal_size(string1.upper(), string2.upper())
    return(edit_distance(minimal_overlaps[0],minimal_overlaps[1]),len(minimal_overlaps[0]),
           affine_scoring(minimal_overlaps[0],minimal_overlaps[1]),len(minimal_overlaps[0]))
    
# a little test
print(score_aligned_segment("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT","------------ACGgACGTACGTACcTACGTACGTACGT----"))
print(score_aligned_segment("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT","------------ACGTACGTACGTACGTACGTACGTACGT----"))
print(score_aligned_segment("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT","------------ACGT---TACGTACGTACGTACGTACGT----"))

(2, 28, 122.0, 28)
(0, 28, 140.0, 28)
(3, 28, 126.5, 28)


### Now lets read in the file and see how the each sequences scores

We'll assume the reference is the first FASTA entry

In [35]:
from Bio import SeqIO

records = list(SeqIO.parse("test.fasta", "fasta"))

# needleall has a weird 
for i in range(0,len(records),2):
    scored = score_aligned_segment(records[i].seq,records[i+1].seq)
    print(records[i].id + "\t" + records[i+1].id + "\t" + 
          str(scored[0]) + "\t" + str(scored[1]) + "\t" + 
          str(scored[0]/float(scored[1])) + "\t" + str(scored[2]/float(scored[3])))

pY026	medaka	2	9770	0.00020470829068577277	4.998157625383828
pY026	pilon	5	9770	0.0005117707267144319	4.996775844421699
pY026	Sanger_Tn5_001	46	1105	0.0416289592760181	4.800452488687783
