## Biopython.Cluster

## Description

Simple experiments with the `Bio.Cluster` module: clustering sequences read from a FASTA file and measuring the distance between clusters.



## Setup
```shell
pip install biopython
pip install numpy
pip install scipy
```

## Utility

In [128]:
import numpy as np
from Bio import Seq, SeqIO
from Bio.Cluster import distancematrix, kcluster, clusterdistance, kmedoids

### Cluster sequences

In [150]:
# Integer code assigned to each of the four DNA bases (A=0, C=1, G=2, T=3).
base_mapping = {base: code for code, base in enumerate("ACGT")}


def map_sequence(seq: Seq) -> np.array:
    """Encode a sequence as a NumPy array of integer base codes.

    Every symbol of ``seq`` is looked up in ``base_mapping``; a symbol that
    is not a key of that table is encoded as ``-1``.
    """
    lookup = base_mapping.get
    codes = [lookup(symbol, -1) for symbol in seq]
    return np.array(codes)


def map_sequences(seqs: list[Seq]) -> np.array:
    """Encode each sequence with ``map_sequence`` and collect them in one array.

    NOTE(review): a rectangular (one row per sequence) result presumably
    relies on all sequences having the same length -- confirm against the
    input data.
    """
    encoded = list(map(map_sequence, seqs))
    return np.array(encoded)


def read_sequences(filename: str) -> list[Seq]:
    """Parse ``filename`` as FASTA and return its sequences, upper-cased.

    The sequences are returned in the order the records are yielded by
    ``SeqIO.parse``.
    """
    return [entry.seq.upper() for entry in SeqIO.parse(filename, "fasta")]


def convert_triangle_to_square_matrix(triangle_matrix: np.array) -> np.array:
    """Expand a lower-triangular matrix into a full symmetric square matrix.

    Row ``i`` of ``triangle_matrix`` supplies the entries for columns
    ``0 .. i-1``. Each entry is written to ``[i, j]`` and mirrored to
    ``[j, i]``; the diagonal stays at zero.

    Returns an ``n x n`` float array, where ``n`` is ``len(triangle_matrix)``.
    """
    size = len(triangle_matrix)
    square = np.zeros((size, size))

    for row_idx, row in enumerate(triangle_matrix):
        # Only the strictly-lower part (columns before the diagonal) is read.
        for col_idx in range(row_idx):
            value = row[col_idx]
            square[row_idx, col_idx] = value
            square[col_idx, row_idx] = value

    return square


def cluster_fasta(filename: str, nclusters: int = 2):
    """Cluster the sequences of a FASTA file with ``kcluster`` and ``kmedoids``.

    The sequences are read, integer-encoded, and turned into a square
    pairwise distance matrix, which is then passed to both clustering
    routines with the requested number of clusters.

    Returns a ``(kcluster_result, kmedoids_result)`` tuple.
    """
    encoded = map_sequences(read_sequences(filename))
    square = convert_triangle_to_square_matrix(distancematrix(encoded))

    # NOTE(review): kcluster receives the distance matrix as its data
    # argument here; confirm that clustering the rows of the distance
    # matrix (rather than the encoded sequences) is what is intended.
    kcluster_result = kcluster(square, nclusters=nclusters)
    kmedoids_result = kmedoids(square, nclusters=nclusters)

    return kcluster_result, kmedoids_result

In [151]:
# Run both clusterings on the sample FASTA file, asking for four clusters;
# the cell displays the (kcluster, kmedoids) result tuple.
cluster_fasta("grouping-algorithms/data/simple_small_diff.fasta", nclusters=4)

((array([2, 1, 0, 3], dtype=int32), 0.0, 1),
 (array([0, 1, 2, 3], dtype=int32), 0.0, 1))

### Measure distance between clusters

In [152]:
def clusterdistance_fasta(filename: str, indexes1, indexes2):
    """Compute ``clusterdistance`` between two groups of sequences in a FASTA file.

    The sequences are read, integer-encoded, and turned into a square
    pairwise distance matrix. ``indexes1`` and ``indexes2`` are forwarded as
    ``index1`` and ``index2`` to select the members of each group.

    Returns whatever ``clusterdistance`` returns for the two groups.
    """
    encoded = map_sequences(read_sequences(filename))
    square = convert_triangle_to_square_matrix(distancematrix(encoded))

    # NOTE(review): the distance matrix is passed as the data argument of
    # clusterdistance; confirm this is intended rather than the encoded
    # sequences themselves.
    return clusterdistance(
        square,
        mask=None,
        index1=indexes1,
        index2=indexes2,
    )

In [153]:
# Distance between the group holding index 0 and the group holding
# indexes 1, 2 and 3 of the sample FASTA file.
clusterdistance_fasta("grouping-algorithms/data/simple_small_diff.fasta", [0], [1, 2, 3])

33.837890625