# 1. UMAP visualization for comparative analysis with public AMP sequences

In [8]:
import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt

In [58]:
# Tab-separated table without a header row, so pandas labels the columns
# by position (0, 1, 2); later cells refer to them by those integers.
similarity_path = 'all-to-all.2.similarity'
data = pd.read_csv(similarity_path, sep='\t', header=None)

In [59]:
# Reshape the long-format pairs into a wide table:
# column 0 -> row labels, column 1 -> column labels, column 2 -> cell values.
matrix = data.pivot(values=2, index=0, columns=1)

In [60]:
# Symmetrize (the key step): pivot() only fills the (row, column) pairs that
# appear in the input, so take every missing cell from the transposed matrix.
matrix = matrix.combine_first(matrix.T)
# similarity -> distance
# NOTE: this rebinds `matrix`, so running the cell a second time converts the
# distances back into similarities. Re-run the pivot cell first.
matrix = 1 - matrix
# A similarity matrix would get 1 on the diagonal; a distance matrix gets 0.
# Fill the diagonal on an explicit writable copy and rebuild the frame:
# writing through `matrix.values` is silently lost whenever pandas returns a
# copy, and fails on the read-only array returned under Copy-on-Write.
dist_values = matrix.to_numpy(copy=True)
np.fill_diagonal(dist_values, 0.0)
matrix = pd.DataFrame(dist_values, index=matrix.index, columns=matrix.columns)
# matrix.to_pickle('all-to-all.2.dist.pkl')

In [61]:
# Sanity check: the symmetrized matrix should be square (rows == columns).
matrix.shape

(13382, 13382)

In [68]:
# `matrix` already holds pairwise distances, so UMAP is told to use it as-is
# (metric="precomputed") instead of computing distances from feature vectors.
umap_settings = dict(
    metric="precomputed",
    n_neighbors=15,
    min_dist=0.3,
    # Optional tuning knobs, currently disabled:
    # spread=1.5,
    # repulsion_strength=3.0,
    n_components=2,
    init="spectral",
    random_state=42,  # fixed seed so the layout is reproducible
)
reducer = umap.UMAP(**umap_settings)
embedding = reducer.fit_transform(matrix)

  warn("using precomputed metric; inverse_transform will be unavailable")
  warn(


In [69]:
# One row per sequence: its label from the distance matrix, then the
# two UMAP coordinates.
umap_columns = ['UMAP1', 'UMAP2']
df = pd.DataFrame(embedding, columns=umap_columns)
df.insert(loc=0, column='name', value=matrix.index)
df.to_csv('all-to-all.2.dist.umap.tsv', sep='\t', index=False)

# 2. Generate peptide JSON files for OpenFold3 to predict peptide structures

In [30]:
import json
import pandas as pd
from Bio import SeqIO

In [50]:
fasta = 'final_AMPs.fa'
out_file = 'openfold3/final_AMPs.json.0'

# Map every FASTA record ID to a single-chain protein query.
queries = {}
for record in SeqIO.parse(fasta, 'fasta'):
    # Build the entry in the OpenFold3/AlphaFold3-style input layout.
    queries[record.id] = {
        'chains': [
            {
                'molecule_type': 'protein',
                'chain_ids': 'A',
                'sequence': str(record.seq),
            }
        ]
    }

# Assemble the top-level payload once, after the loop. Building it inside the
# loop repeated the work for every record and left `query_json` undefined
# (NameError at json.dump) when the FASTA file contained no records.
query_json = {'queries': queries}

with open(out_file, 'w') as f:
    json.dump(query_json, f, indent=2)

In [53]:
# Spot-check one generated entry, looked up by its FASTA record ID.
queries['Bacteriocins-0001']

{'chains': [{'molecule_type': 'protein',
   'chain_ids': 'A',
   'sequence': 'IGLSRLGHRSRWISRL'}]}

# 3. Calculate peptide properties using the modlAMP Python library

In [69]:
from Bio import SeqIO
from modlamp import descriptors
import pandas as pd

In [171]:
# fas = 'final_AMPs.fa'
fas = 'amps.pep.ref.removeXBZ.fa'
# Parse the FASTA once, then split it into two parallel lists
# (same order): record IDs and plain-string sequences.
fasta_records = list(SeqIO.parse(fas, 'fasta'))
seq_ids = [rec.id for rec in fasta_records]
seqs = [str(rec.seq) for rec in fasta_records]

In [172]:
# Use the Eisenberg consensus hydrophobicity scale for both descriptors.
eisenberg_scale = 'Eisenberg'
desc = descriptors.PeptideDescriptor(seqs, scalename=eisenberg_scale)
# Global Eisenberg hydrophobicity per sequence.
desc.calculate_global()
# Eisenberg hydrophobic moment, appended next to the global value
# instead of replacing it.
desc.calculate_moment(append=True)

In [173]:
# Two descriptor columns (hydrophobicity, hydrophobic moment), prefixed
# with the sequence ID so every row is identifiable in the TSV.
hydrophobicity_columns = ['H_Eisenberg', 'uH_Eisenberg']
df = pd.DataFrame(desc.descriptor, columns=hydrophobicity_columns)
df.insert(loc=0, column='name', value=seq_ids)
df.to_csv('amps.pep.ref.removeXBZ.H_uH.tsv', sep='\t', index=False)

In [174]:
# NOTE: this rebinds `desc`, replacing the PeptideDescriptor used for the
# hydrophobicity columns with a GlobalDescriptor over the same sequences.
desc = descriptors.GlobalDescriptor(seqs)
# Compute the global descriptors; the next cell reads the values from
# `desc.descriptor` and the column labels from `desc.featurenames`.
desc.calculate_all()

In [175]:
# Table of global descriptors, labelled with the names reported by the
# descriptor object and prefixed with the sequence ID.
descriptor_columns = desc.featurenames
df = pd.DataFrame(desc.descriptor, columns=descriptor_columns)
df.insert(loc=0, column='name', value=seq_ids)
df.to_csv('amps.pep.ref.removeXBZ.globalDescriptor.tsv', sep='\t', index=False)

# 4. Calculate amino acid (AA) composition

In [2]:
from Bio import SeqIO
from collections import Counter

In [12]:
# fas = 'final_AMPs.fa'
fas = 'amps.pep.ref.removeXBZ.fa'

# Running tally of residue letters across every sequence in the file.
amino_acid_count = Counter()

sequences = SeqIO.parse(fas, 'fasta')
for record in sequences:
    # Counter.update() iterates the sequence, adding one count per residue.
    amino_acid_count.update(record.seq)

In [13]:
# Turn raw residue counts into each residue's share of all residues.
# The output column is named 'pct' but holds fractions (values sum to 1),
# not percentages.
residue_counts = pd.Series(amino_acid_count)
series = residue_counts / residue_counts.sum()
series.to_frame(name='pct').to_csv('amps.pep.ref.removeXBZ.AA.pct.tsv', sep='\t', index=True, index_label='AA')