### Python code for measuring conservation

In [11]:
# set up 

# import functions to generate a profile abd simMAtrix
from MultipleAlign import profile
from Alignments import DNA_1, BLOSUM62

#generates a random matrix that contains seven rows of 21 column vectors with A, C, T and G
import random
choices = ['A', 'C', 'T', 'G']
randomAlign = []

for i in range(7):
    randomAlign.append(random.choices(choices,k=21))
    
print(randomAlign)

[['G', 'T', 'A', 'T', 'G', 'G', 'A', 'T', 'G', 'G', 'G', 'G', 'A', 'G', 'C', 'T', 'T', 'C', 'G', 'C', 'A'], ['T', 'G', 'G', 'T', 'T', 'G', 'G', 'A', 'C', 'T', 'T', 'C', 'A', 'T', 'C', 'T', 'T', 'A', 'C', 'A', 'A'], ['G', 'G', 'C', 'C', 'C', 'C', 'A', 'A', 'T', 'A', 'T', 'A', 'A', 'G', 'A', 'C', 'T', 'C', 'C', 'C', 'T'], ['A', 'G', 'A', 'A', 'A', 'C', 'G', 'T', 'C', 'C', 'G', 'T', 'T', 'T', 'C', 'G', 'T', 'T', 'T', 'A', 'A'], ['C', 'C', 'C', 'G', 'C', 'T', 'G', 'C', 'A', 'T', 'G', 'C', 'T', 'C', 'G', 'A', 'G', 'T', 'G', 'C', 'G'], ['A', 'G', 'G', 'A', 'A', 'A', 'A', 'G', 'C', 'A', 'C', 'G', 'A', 'A', 'C', 'G', 'T', 'A', 'T', 'A', 'T'], ['C', 'G', 'T', 'A', 'C', 'G', 'G', 'T', 'T', 'T', 'C', 'A', 'C', 'C', 'T', 'T', 'A', 'G', 'T', 'G', 'A']]


In [12]:
# function for getting conservation
def getConservation(align, simMatrix):
    conservation = []
    prof = profile(align)
    
    for compDict in prof:
        items = list(compDict.items()) 
    
        items.sort(key=lambda x: x[1])
    
        score = 0.0
    
        for resA, compA in items:
            for resB, compB in items:
                score += compA * compB * simMatrix[resA][resB]
    
        bestLetter = items[-1][0]
        maxScore = simMatrix[bestLetter][bestLetter]
        
        score /= maxScore
        conservation.append(score)
        
    return conservation



print(getConservation(randomAlign, DNA_1))

[0.26530612244897955, 0.5510204081632654, 0.26530612244897955, 0.30612244897959184, 0.30612244897959184, 0.30612244897959184, 0.510204081632653, 0.30612244897959184, 0.30612244897959184, 0.30612244897959184, 0.346938775510204, 0.26530612244897955, 0.42857142857142855, 0.26530612244897955, 0.3877551020408163, 0.30612244897959184, 0.5510204081632654, 0.26530612244897955, 0.346938775510204, 0.3877551020408163, 0.42857142857142855]


### Python code for calculating substitution matrices

In [23]:
from math import log

def calcSubstitutionMatrix(alignments, alphabet, maxVal, smooth=5):
    matrix = {}
    counts = {}
    
    for letterA in alphabet:
        subDict = {}
        for letterB in alphabet:
            subDict[letterB] = 0
        matrix[letterA] = subDict
        counts[letterA] = 0
   
    totalRes = 0
    totalSub = 0
    
    
    for align in alignments:
        numPos = len(align[0])
        
        for i in range(numPos):
            letters = []
            
            for seq in align:
                letter = seq[i]
            
            if letter == '-':
                continue
                
            letters.append(letter)
            
    
    for letterA in letters:
        counts[letterA] += 1
        for letterB in letters:
            matrix[letterA][letterB] += 1
            
    numLetters = len(letters)
    totalRes += numLetters
    totalSub += numLetters * numLetters
    
    
    averageComp = {}
    for letter in alphabet:
        averageComp[letter] = counts[letter]/totalRes
    
    
    maxScore = None
    for resA in alphabet:
        for resB in alphabet:
            expected = averageComp[resA] * averageComp[resB]
            
            if not expected:
                continue
                
            observed = matrix[resA][resB]
            weight = 1.0 / (1.0+(observed/smooth))
            
            observed /= totalSub
            observed = weight*expected + (1-weight)*observed
            
            logOdds = log(observed/expected)
           
            if (maxScore is None) or (logOdds>maxScore):
                maxScore = logOdds
            matrix[resA][resB] = logOdds
    
    maxScore = abs(maxScore)
    
    for resA in alphabet:
        for resB in alphabet:
            matrix[resA][resB] = int(maxVal*matrix[resA][resB]/maxScore)
    
    return matrix


align2 = ['QPVHPFSRPAPVVIILIILCVMAGVIGTILLISYGIRLLIK-------------',
          'QLVHRFTVPAPVVIILIILCVMAGIIGTILLISYTIRRLIK-------------',
          'QLAHHFSEPE---ITLIIFGVMAGVIGTILLISYGIRRLIKKSPSDVKPLPSPD',
          'QLVHEFSELV---IALIIFGVMAGVIGTILFISYGSRRLIKKSESDVQPLPPPD',
          'MLEHEFSAPV---AILIILGVMAGIIGIILLISYSIGQIIKKRSVDIQPPEDED',
          'PIQHDFPALV---MILIILGVMAGIIGTILLISYCISRMTKKSSVDIQSPEGGD',
          'QLVHIFSEPV---IIGIIYAVMLGIIITILSIAFCIGQLTKKSSLPAQVASPED',
          '-LAHDFSQPV---ITVIILGVMAGIIGIILLLAYVSRRLRKRP-----PADVP-',
          'SYHQDFSHAE---ITGIIFAVMAGLLLIIFLIAYLIRRMIKKPLPVPKPQDSPD']


aminoAcids = BLOSUM62.keys()
print(calcSubstitutionMatrix([align2,], aminoAcids, 10))

ZeroDivisionError: float division by zero

### Python code for calculating distance matrices

In [22]:
from Alignments import sequenceAlign, calcSeqSimilarity
from math import exp

def getDistanceMatrix(seqs, simMatrix):
    
    n = len(seqs)
    matrix = [[0.0] * n for x in range(n)]
    maxScores = [calcSeqSimilarity(x, x, simMatrix) for x in seqs]
    
    for i in range(n-1):
        seqA = seqs[i]
       
        for j in range(i+1,n):
            seqB = seqs[j]
            score, alignA, alignB = sequenceAlign(seqA, seqB, simMatrix)
            maxScore = max(maxScores[i],maxScores[j])
            dist = maxScore - score
            matrix[i][j] = dist
            matrix[j][i] = dist
            
    return matrix



distMatrix = getDistanceMatrix(randomAlign, DNA_1)

print(distMatrix)

[[0.0, 14.0, 14.0, 15.0, 18.0, 16.0, 17.0], [14.0, 0.0, 15.0, 13.0, 18.0, 13.0, 15.0], [14.0, 15.0, 0.0, 18.0, 18.0, 15.0, 17.0], [15.0, 13.0, 18.0, 0.0, 17.0, 11.0, 15.0], [18.0, 18.0, 18.0, 17.0, 0.0, 21.0, 16.0], [16.0, 13.0, 15.0, 11.0, 21.0, 0.0, 17.0], [17.0, 15.0, 17.0, 15.0, 16.0, 17.0, 0.0]]
