In [1]:
!pip install biopython pandas scipy

Defaulting to user installation because normal site-packages is not writeable


In [4]:
# Importar bibliotecas necessárias
from Bio import AlignIO
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.stats import f_oneway
import numpy as np

In [3]:
from Bio import AlignIO

alignment = AlignIO.read("/home/nathan/Documents/Doutorado_diversidade_genética/Phylogeogra/AMOVA/nuc_files/exon2/cmyc-phased_exon2.fas", "fasta")
print(alignment)
populations = []
for record in alignment:
    if record.id.startswith(("AA", "TAP", "SM")):
        populations.append("Norte")
    elif record.id.startswith(("BC", "BMD", "FC")):
        populations.append("Sul")
    else:
        populations.append("Unknown")

print(populations)

Alignment with 84 rows and 301 columns
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG AA925-1
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG AA925-2
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG AA936-1
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG AA936-2
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG AA946-1
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG AA946-2
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG AA947-1
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG AA947-2
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG AA948-1
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG AA948-2
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG AA949-1
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG AA949-2
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG BC726-1
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG BC726-2
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG BC727-1
GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAG...CTG BC727-2
GAGAAAGTGGTGTCGGA

In [6]:
# Converter alinhamento para matriz de caracteres
sequence_matrix = np.array([list(str(record.seq)) for record in alignment])

#verificar a matriz
print(sequence_matrix.shape)

(84, 301)


In [7]:
# Definir a função p_distance para operar na matriz
def p_distance(seq1, seq2):
    differences = np.sum((seq1 != seq2) & (seq1 != '-') & (seq2 != '-'))
    length = np.sum((seq1 != '-') & (seq2 != '-'))
    return differences / length if length > 0 else 0

In [8]:
# Calcular a matriz de distâncias
distance_matrix = squareform(pdist(sequence_matrix, metric=lambda u, v: p_distance(u, v)))

# Exibir a matriz
print(distance_matrix)

[[0.         0.00996678 0.         ... 0.00996678 0.00332226 0.00332226]
 [0.00996678 0.         0.00996678 ... 0.00664452 0.00664452 0.00664452]
 [0.         0.00996678 0.         ... 0.00996678 0.00332226 0.00332226]
 ...
 [0.00996678 0.00664452 0.00996678 ... 0.         0.00664452 0.00664452]
 [0.00332226 0.00664452 0.00332226 ... 0.00664452 0.         0.        ]
 [0.00332226 0.00664452 0.00332226 ... 0.00664452 0.         0.        ]]


In [None]:
# Criar um DataFrame com as sequências e os grupos
df_sequences = pd.DataFrame({
    'Sequence': [''.join(seq) for seq in sequence_matrix],
    'Group': populations
})

# Exibir o DataFrame
print(df_sequences)

                                             Sequence  Group
0   GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAGGA...  Norte
1   GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAGGA...  Norte
2   GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAGGA...  Norte
3   GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAGGA...  Norte
4   GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAGGA...  Norte
..                                                ...    ...
79  GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAGGA...  Norte
80  GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAGGA...  Norte
81  GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAGGA...  Norte
82  GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAGGA...  Norte
83  GAGAAAGTGGTGTCGGAAAAGCTGGCGTCCTACCAGGCTTCTAGGA...  Norte

[84 rows x 2 columns]


In [10]:
# Calcular a variância total da matriz de distâncias
total_variance = np.var(distance_matrix)
print(f"Variância total: {total_variance}")

Variância total: 3.608383875982428e-05


In [11]:
# Separar as distâncias por grupo
distances_norte = distance_matrix[np.array(populations) == 'Norte']
distances_sul = distance_matrix[np.array(populations) == 'Sul']

# Calcular a variância entre os grupos
variance_between_groups = f_oneway(distances_norte.flatten(), distances_sul.flatten())

# Exibir a variância entre os grupos
print(f"Variância entre grupos: {variance_between_groups.statistic}")
print(f"p-value: {variance_between_groups.pvalue}")

Variância entre grupos: 305.0303236718971
p-value: 6.655107691347894e-67


In [12]:
# Calcular a variância dentro dos grupos
variance_within_norte = np.var(distances_norte)
variance_within_sul = np.var(distances_sul)

# Exibir as variâncias dentro dos grupos
print(f"Variância dentro do grupo Norte: {variance_within_norte}")
print(f"Variância dentro do grupo Sul: {variance_within_sul}")

Variância dentro do grupo Norte: 2.441102722807868e-05
Variância dentro do grupo Sul: 4.384011972335944e-05


In [13]:
from scipy.stats import f_oneway

# Realizar o teste de ANOVA
anova_result = f_oneway(distances_norte.flatten(), distances_sul.flatten())

# Exibir os resultados do teste de ANOVA
print(f"Estatística F: {anova_result.statistic}")
print(f"p-value: {anova_result.pvalue}")

# Verificar se o resultado é significativo
alpha = 0.05
if anova_result.pvalue < alpha:
    print("A diferença entre as variâncias é estatisticamente significativa.")
else:
    print("A diferença entre as variâncias não é estatisticamente significativa.")

Estatística F: 305.0303236718971
p-value: 6.655107691347894e-67
A diferença entre as variâncias é estatisticamente significativa.
