In [511]:
import random 
import numpy as np
from typing import *

In [512]:
class GeneticAlgorithm:
    def __init__(
        self,
        pop_size: int,
        sequences: List[List[str]],
        crossover_prob: float = 0.8,
        mutation_prob: float = 0.01,
        max_iters: int = 1
    ):
        """
        Algoritmo genético para o problema do alinhamento múltiplo de sequências

        Args:
            n_sequences (int): número de sequências a serem alinhadas, dimensão do problema
            pop_size (int): tamanho da população
            sequences (List[str]): sequências a serem alinhadas
        """
        # constantes para melhor legibilidade
        self.GAP = "-"
        self.AMINO_ACIDS= set([
            'A', 'R', 'N', 'D', 'C', 'Q', 'E', 
             'G', 'H', 'I', 'L', 'K', 'M', 'F', 
             'P', 'S', 'T', 'W', 'Y', 'V'
        ])
        
        # matriz da relação de punição na troca de aminoácidos
        self.PAM250 = self.get_PAM250()
        
        # dimensões do problema
        self.n_sequences = len(sequences)
        self.pop_size = pop_size
        self.max_iters = max_iters
        
        self.population = []
        self.pop_fitness = np.zeros(pop_size)
        
        # criando os gaps para que todos os pontos tenham o mesmo tamanho
        # pega o tamanho da maior sequência
        largest_sequence = max(sequences, key=len)
        sequences.remove(largest_sequence)
        
        for i in range(pop_size):
            point = [largest_sequence]
            for sequence in sequences:
                while len(sequence) < len(largest_sequence):
                    idx = random.randint(0, len(sequence)-1)
                    sequence = sequence[:idx] + [self.GAP] + sequence[idx:]
                point.append(sequence)
                
            self.population.append(point)
            self.pop_fitness[i] = self.fitness(point)
            
        
        #self.print_population()
        
        print(self.fitness(self.population[1]))
        
    
    def print_population(self):
        for point in self.population:
            print(point)
            print("\n")
            
            
    def crossover(self, P1: str, P2: str):
        operator = 1 
        
        child_1 = [None] * self.n_sequences
        child_2 = [None] * self.n_sequences
        
        #idx = random.randint(0, len(S1))
        idx = 2
        # como fazer o gap é um problema ainda
        for i in range(self.n_sequences):
            if operator == 1:
                child_1[i] = P1[i][:idx] + P2[i][idx:]
                child_2[i] = P2[i][:idx] + P1[i][idx:]
            
            elif operator == 2:
                if i < idx:
                    child_1[i] = P1[i]
                    child_2[i] = P2[i]
                else:
                    child_1[i] = P2[i]
                    child_2[i] = P1[i]
                    
        return [child_1, child_2]
    
    
    def mutation(self, P: str):
        operator = 3
        
        i = random.randint(0, self.n_sequences-1)
        sequence_size = len(P[i])
        
        if operator == 1:
            while True:
                p1 = random.randint(0, sequence_size-1)
                p2 = random.randint(0, sequence_size-1)
                
                if p1 != p2:
                    temp = P[i][p1]
                    P[i][p1] = P[i][p2]
                    P[i][p2] = temp
                    break
        
        elif operator == 2:
            while True:
                p1 = random.randint(0, sequence_size-1)
                p2 = random.randint(0, sequence_size-1)
                p3 = random.randint(0, sequence_size-1)
                
                if p1 != p2 and p1 != p3 and p2 != p3:
                    temp1 = P[i][p1]
                    temp2 = P[i][p2]
                    
                    P[i][p1] = P[i][p3]
                    P[i][p2] = temp1
                    P[i][p3] = temp2
                    break
        
        elif operator == 3:
            while True:
                p1 = random.randint(0, sequence_size-1)
                p2 = random.randint(0, sequence_size-1)
                
                if p1 > p2:
                    p1, p2 = p2, p1
                
                if p1 != p2:
                    P[i] = P[i][:p1] + list(reversed(P[i][p1:p2])) + P[i][p2:]
                    break
        
        return P
                    
    
    def fitness(self, point):
        sum = 0
        for i in range(len(point)-1):
            for j in range(i+1, len(point)):
                print(point[i])
                print(point[j])

                sum += self.gap_penalty(point[i], point[j])
                print("\n")
        
        return sum
    
    
    def gap_penalty(self, S1: str, S2: str) -> int:
        penalty = 0
        for s1, s2 in zip(S1, S2):
            if s1 == self.GAP and s2 == self.GAP:
                penalty += 1
            
            elif s1 in self.AMINO_ACIDS and s2 == self.GAP:
                penalty += 2
            
            elif s1 == self.GAP and s2 in self.AMINO_ACIDS:
                penalty += 3
            
            elif s1 in self.AMINO_ACIDS and s2 in self.AMINO_ACIDS:
                penalty += self.PAM250[s1][s2]
        
        return penalty
        
    
    def get_PAM250(self): # PAM250 scoring matrix
        sMatrixTxt = '''
        A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  W  Y
        A  2 -2  0  0 -3  1 -1 -1 -1 -2 -1  0  1  0 -2  1  1  0 -6 -3
        C -2 12 -5 -5 -4 -3 -3 -2 -5 -6 -5 -4 -3 -5 -4  0 -2 -2 -8  0
        D  0 -5  4  3 -6  1  1 -2  0 -4 -3  2 -1  2 -1  0  0 -2 -7 -4
        E  0 -5  3  4 -5  0  1 -2  0 -3 -2  1 -1  2 -1  0  0 -2 -7 -4
        F -3 -4 -6 -5  9 -5 -2  1 -5  2  0 -3 -5 -5 -4 -3 -3 -1  0  7
        G  1 -3  1  0 -5  5 -2 -3 -2 -4 -3  0  0 -1 -3  1  0 -1 -7 -5
        H -1 -3  1  1 -2 -2  6 -2  0 -2 -2  2  0  3  2 -1 -1 -2 -3  0
        I -1 -2 -2 -2  1 -3 -2  5 -2  2  2 -2 -2 -2 -2 -1  0  4 -5 -1
        K -1 -5  0  0 -5 -2  0 -2  5 -3  0  1 -1  1  3  0  0 -2 -3 -4
        L -2 -6 -4 -3  2 -4 -2  2 -3  6  4 -3 -3 -2 -3 -3 -2  2 -2 -1
        M -1 -5 -3 -2  0 -3 -2  2  0  4  6 -2 -2 -1  0 -2 -1  2 -4 -2
        N  0 -4  2  1 -3  0  2 -2  1 -3 -2  2  0  1  0  1  0 -2 -4 -2
        P  1 -3 -1 -1 -5  0  0 -2 -1 -3 -2  0  6  0  0  1  0 -1 -6 -5
        Q  0 -5  2  2 -5 -1  3 -2  1 -2 -1  1  0  4  1 -1 -1 -2 -5 -4
        R -2 -4 -1 -1 -4 -3  2 -2  3 -3  0  0  0  1  6  0 -1 -2  2 -4
        S  1  0  0  0 -3  1 -1 -1  0 -3 -2  1  1 -1  0  2  1 -1 -2 -3
        T  1 -2  0  0 -3  0 -1  0  0 -2 -1  0  0 -1 -1  1  3  0 -5 -3
        V  0 -2 -2 -2 -1 -1 -2  4 -2  2  2 -2 -1 -2 -2 -1  0  4 -6 -2
        W -6 -8 -7 -7  0 -7 -3 -5 -3 -2 -4 -4 -6 -5  2 -2 -5 -6 17  0
        Y -3  0 -4 -4  7 -5  0 -1 -4 -1 -2 -2 -5 -4 -4 -3 -3 -2  0 10
    '''
        sMatrixList = sMatrixTxt.strip().split('\n')
        aaList = sMatrixList[0].split()
        sMatrix = dict()
        for aa in aaList:
            sMatrix[aa] = dict()
        for i in range(1, len(aaList) + 1):
            currRow = sMatrixList[i].split()
            for j in range(len(aaList)):
                sMatrix[currRow[0]][aaList[j]] = int(currRow[j + 1])
        return sMatrix
    
    
    def run(self):
        for gen in range(self.max_iters):
            self.population = [self.population[i] for i in np.argsort(self.pop_fitness)]
            self.pop_fitness = [self.pop_fitness[i] for i in np.argsort(self.pop_fitness)]
            
            children = []
            
            for i in range(0, int(0.6 * self.pop_size), 2):
                children += self.crossover(self.population[i], self.population[i+1])
            
            for i in range(0, int(0.4 * self.pop_size)):
                children.append(self.mutation(self.population[i]))
            
            
                

In [513]:
sequences = [
    ["A","A","A","A","A","A"],
    ["R","R","R"],
    ["L","L","L","L"],
    ["K","K","K","K","K"]
]

ga = GeneticAlgorithm(3, sequences)

['A', 'A', 'A', 'A', 'A', 'A']
['-', '-', 'R', 'R', '-', 'R']


['A', 'A', 'A', 'A', 'A', 'A']
['L', 'L', '-', 'L', '-', 'L']


['A', 'A', 'A', 'A', 'A', 'A']
['K', 'K', '-', 'K', 'K', 'K']


['-', '-', 'R', 'R', '-', 'R']
['L', 'L', '-', 'L', '-', 'L']


['-', '-', 'R', 'R', '-', 'R']
['K', 'K', '-', 'K', 'K', 'K']


['L', 'L', '-', 'L', '-', 'L']
['K', 'K', '-', 'K', 'K', 'K']


['A', 'A', 'A', 'A', 'A', 'A']
['-', '-', 'R', 'R', '-', 'R']


['A', 'A', 'A', 'A', 'A', 'A']
['L', 'L', '-', '-', 'L', 'L']


['A', 'A', 'A', 'A', 'A', 'A']
['K', 'K', '-', 'K', 'K', 'K']


['-', '-', 'R', 'R', '-', 'R']
['L', 'L', '-', '-', 'L', 'L']


['-', '-', 'R', 'R', '-', 'R']
['K', 'K', '-', 'K', 'K', 'K']


['L', 'L', '-', '-', 'L', 'L']
['K', 'K', '-', 'K', 'K', 'K']


['A', 'A', 'A', 'A', 'A', 'A']
['-', '-', '-', 'R', 'R', 'R']


['A', 'A', 'A', 'A', 'A', 'A']
['L', 'L', '-', '-', 'L', 'L']


['A', 'A', 'A', 'A', 'A', 'A']
['K', 'K', '-', 'K', 'K', 'K']


['-', '-', '-', 'R', 'R', 'R']
['L', 'L'

In [514]:
S1 = ["EGKVN---EDEVAGEAL-",
      "EDKVNEEE---VGGEAL-",
      "EGKVG--AHLGEYGAEAL",
      "ESKVAGHAA--GEYGAEAL"]

S2 = ["--WGKVDVDLVG-GEAL",
      "ED--KVNEEGVG-GEAL",
      "EGKVGA-AEGEYGAEAL",
      "ESKVAGHAGAY-GAEAL"]

C1, C2 = ga.crossover(S1, S2)

In [515]:
C1

['EGWGKVDVDLVG-GEAL',
 'ED--KVNEEGVG-GEAL',
 'EGKVGA-AEGEYGAEAL',
 'ESKVAGHAGAY-GAEAL']

In [516]:
C2

['--KVN---EDEVAGEAL-',
 'EDKVNEEE---VGGEAL-',
 'EGKVG--AHLGEYGAEAL',
 'ESKVAGHAA--GEYGAEAL']

In [517]:
for i in range(len(S1)):
    print(len(S1[i]))

print("\n")

for i in range(len(S2)):
    print(len(S2[i]))

18
18
18
19


17
17
17
17


In [518]:
for i in range(len(C1)):
    print(len(C1[i]))

print("\n")

for i in range(len(C2)):
    print(len(C2[i]))

17
17
17
17


18
18
18
19


In [519]:
C1

['EGWGKVDVDLVG-GEAL',
 'ED--KVNEEGVG-GEAL',
 'EGKVGA-AEGEYGAEAL',
 'ESKVAGHAGAY-GAEAL']

In [520]:
C2

['--KVN---EDEVAGEAL-',
 'EDKVNEEE---VGGEAL-',
 'EGKVG--AHLGEYGAEAL',
 'ESKVAGHAA--GEYGAEAL']

In [521]:
ga.population[0]

[['A', 'A', 'A', 'A', 'A', 'A'],
 ['-', '-', 'R', 'R', '-', 'R'],
 ['L', 'L', '-', 'L', '-', 'L'],
 ['K', 'K', '-', 'K', 'K', 'K']]

In [522]:
ga.mutation(ga.population[0])

[['A', 'A', 'A', 'A', 'A', 'A'],
 ['-', 'R', 'R', '-', '-', 'R'],
 ['L', 'L', '-', 'L', '-', 'L'],
 ['K', 'K', '-', 'K', 'K', 'K']]

In [523]:
ga.print_population()

[['A', 'A', 'A', 'A', 'A', 'A'], ['-', 'R', 'R', '-', '-', 'R'], ['L', 'L', '-', 'L', '-', 'L'], ['K', 'K', '-', 'K', 'K', 'K']]


[['A', 'A', 'A', 'A', 'A', 'A'], ['-', '-', 'R', 'R', '-', 'R'], ['L', 'L', '-', '-', 'L', 'L'], ['K', 'K', '-', 'K', 'K', 'K']]


[['A', 'A', 'A', 'A', 'A', 'A'], ['-', '-', '-', 'R', 'R', 'R'], ['L', 'L', '-', '-', 'L', 'L'], ['K', 'K', '-', 'K', 'K', 'K']]




In [524]:
ga.pop_fitness

array([ 5., 12.,  4.])

In [525]:
np.argsort(ga.pop_fitness)

array([2, 0, 1])

In [526]:
[ga.pop_fitness[i] for i in np.argsort(ga.pop_fitness)]

[np.float64(4.0), np.float64(5.0), np.float64(12.0)]

In [529]:
ga.run()

[np.float64(4.0), np.float64(5.0), np.float64(12.0)]
