In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import re 
from utils import * 
import itertools 
from tqdm import tqdm 
import warnings 
import subprocess
import os 
from src.ggkbase import * 
import glob
import matplotlib as mpl 
from src.files import BLASTFileJSON, FASTAFile, InterProScanFileTSV, GenBankFile
import json


%load_ext autoreload
%autoreload 2

In [3]:
def print_amino_acid_composition(seq:str):
    aas = np.unique(list(seq))
    counts = np.array([seq.count(aa) for aa in aas])
    idxs = np.argsort(counts)[::-1]
    for aa, count in zip(aas[idxs], counts[idxs]):
        print(aa, f'{100 * count / len(seq):.2f}%')


In [4]:
# Several of the spacers from Methanoperedens seem to be targeting scaffolds in the database of putative DPANN scaffolds (<=1 mismatch). 
# However, upon closer inspection, one of the arrays is not a real CRISPR array (ck_bottom_scaffold_10379). 
# The remaining two matched spacers are:
#   (1) SR-VP_05_06_2024_N_top_scaffold_1847_2:4, aligned to SR-VP_05_06_2024_N_middle_scaffold_11254
#   (2) SR-VP_05_06_2024_ck_bottom_scaffold_3220_1:78, aligned to SR-VP_05_06_2024_N_middle_scaffold_11254

# The next question is to figure out what SR-VP_05_06_2024_N_middle_scaffold_11254 belongs to. It has no assigned taxonomy.  
# It seems like it does not actually align to a DPANN Archaeon, and was likely misbinned. It has mostly hypothetical proteins, 
# one of which is annotated as belonging to Methanoperedens (which is the genome in which the CRISPR spacer was identified), 
# and another seemingly belonging to a bacterium. 


# BLAST alignment against sequences from the vernal pool metagenomes shows that it matches
# a high-abundance extrachromosomal element


In [5]:
spacer_scaffold_ids = ['SR-VP_05_06_2024_ck_bottom_scaffold_3220'] # From Methanoperedens. 
spacer_scaffold_ids += ['SR-VP_05_06_2024_N_top_scaffold_1847']
spacer_ids = ['SR-VP_05_06_2024_ck_bottom_scaffold_3220_1:28']
spacer_ids += ['SR-VP_05_06_2024_N_top_scaffold_1847_2:4']

target_scaffold_id = 'SR-VP_05_06_2024_N_middle_scaffold_11254'

# scaffold_to_bin_map = load_scaffold_to_bin_map(cleaned=False)

new_bin_id_map = dict()
new_bin_id_map['SR-VP_05_06_2024_N_top_Candidatus_Methanoperedens_Black-host_type_44_27'] = 'SR-VP_05_06_2024_N_top.maxbin2.40'
new_bin_id_map['SR-VP_05_06_2024_ck_bottom_Methanoperedens_44_24'] = 'SR-VP_05_06_2024_ck_bottom.maxbin2.42'

# The target bin has no easily-assignable taxonomy, and seems to be a bit of a mess. 
# https://ggkbase.berkeley.edu/organisms/668036
target_bin_id = 'SR-VP_05_06_2024_N_middle.concoct.40'

In [6]:
spacer_bin_id = 'SR-VP_05_06_2024_N_top.maxbin2.40'
spacer_bin_df = FASTAFile.from_file(f'../data/spacers/orphan/{spacer_bin_id}.orphan_spacers.fa').to_df()
spacer_seq = spacer_bin_df.loc[spacer_ids[-1]].seq

print(spacer_seq)

ATAATTTATTACGATTACTGACTGATATATCAGG


In [7]:
# SR-VP_05_06_2024_N_middle_scaffold_11254_1 
# https://ggkbase.berkeley.edu/organisms/668036/contigs/1102155721 
# gc_content 25.89 
# length 17736 bp 

# All sequences which align to the target contig have nearly 100 percent identity and query coverage, 
# so almost certainly contain the spacer. 

# Two of the hits are to Final_SR-VP_05_06_2024_coassembly_19kb_linear_ECE_26_1334_complete 

In [8]:
ece_id = 'Final_SR-VP_05_06_2024_coassembly_19kb_linear_ECE_26_1334_complete'
ece_id = 'ece_26_1334'
GenBankFile.from_file(f'../data/{ece_id}.gbk').to_fasta(f'../data/{ece_id}_protein.fa')

In [None]:
ece_seq = FASTAFile.from_file(f'../data/{ece_id}.fasta').seqs[0]
ece_df = GenBankFile.from_file(f'../data/{ece_id}.gbk').to_df()

coordinate_pattern = r'(\d+)..(\d+)'
ece_df['start'] = [int(re.search(coordinate_pattern, coordinate).group(1)) for coordinate in ece_df.coordinate]
ece_df['stop'] = [int(re.search(coordinate_pattern, coordinate).group(2)) for coordinate in ece_df.coordinate]
ece_df['stop_codon'] = [ece_seq[stop - 3:stop] for stop in ece_df.stop]


1 TAA
2 TAA
3 TAA
4 TGA
5 TAA
6 TAA
7 TAA
8 TAA
9 TAG
10 TAA
11 TAA
12 TGA
13 TAA
14 TGA
15 TGA
16 TAA
17 TAA
18 TAA
19 TAA
20 TAA
21 TAA
22 TAA
23 TAA
24 TGA
25 TGA
26 TAA
27 TAA
28 TAG
29 TAA
30 TGA
31 TGA
32 TGA
33 TAA
34 TAA
35 TGA
36 TAA
37 CAT
38 CAT


In [9]:
def make_alphafold_query_file(fa_path:str, path:str):
    fa_df = FASTAFile.from_file(fa_path).to_df()

    content = list()
    for row_ in fa_df.itertuples():
        row = {'modelSeeds':[], 'version':1, 'dialect':'alphafoldserver'}
        row['name'] = row_.Index 
        row['sequences'] = [{'proteinChain':{'sequence':row_.seq, 'count':1, 'useStructureTemplate': False}}]
        content.append(row)

    with open(path, 'w') as f:
        json.dump(content, f)

make_alphafold_query_file(f'../data/{ece_id}_protein.fa', f'../data/{ece_id}_alphafold.json')
    

In [10]:
# foldseek easy-search ./data/ /shared/db/foldseek/latest/db/pdb  ece_26_1334_pdb_foldseek.tsv  tmp --format-mode 4  --format-output query,target,theader,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qcov,tcov,taxid,taxname,taxlineage  --threads 64
# foldseek easy-search ./data/ /shared/db/foldseek/latest/db/alphafold_esmatlas30  ece_26_1334_alphafold_esmatlas30.tsv  tmp --format-mode 4  --format-output query,target,theader,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qcov,tcov,taxid,taxname,taxlineage  --threads 64
# foldseek easy-search ./data/ /shared/db/foldseek/latest/db/alphafold_uniprot  ece_26_1334_uniprot.tsv  tmp --format-mode 4  --format-output query,target,theader,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qcov,tcov,taxid,taxname,taxlineage  --threads 64

get_protein_number = lambda protein_id : int(re.search(rf'{ece_id}_complete_(\d+)', protein_id).group(1))
dest_dir = '../data/biotite' # Dump all relevant models into this folder to port over to biotite. 
for i, path in enumerate(sorted(glob.glob('../data/alphafold/*/*model_0.cif'), key=get_protein_number)):
    file_name = f'{ece_id}_{i + 1}_model_0.cif'
    dest_path = os.path.join(dest_dir, file_name)
    if not os.path.exists(dest_path):
        subprocess.run(f'cp {path} {dest_path}', shell=True, check=True)


In [11]:
interproscan_df = InterProScanFileTSV.from_file(f'../data/{ece_id}_interproscan.tsv').to_df()
interproscan_df = interproscan_df[interproscan_df.signature_analysis != 'MobiDBLite'].copy() # Don't trust any of these. 
print('Num. proteins with InterProScan hits:', interproscan_df['id'].nunique())

has_signal_peptide = lambda df : df.signature_accession.str.contains('signal|SignalP', regex=True, case=False).sum() > 0 
has_transmembrane_helix = lambda df : df.signature_accession.str.contains('transmembrane|TMhelix', case=False).sum() > 0 

ids = interproscan_df.groupby('id')['id'].first()
ids_with_signal_peptide = ids[interproscan_df.groupby('id').apply(has_signal_peptide, include_groups=False)]
ids_with_transmembrane_helix = ids[interproscan_df.groupby('id').apply(has_transmembrane_helix, include_groups=False)]

print('Num. proteins with potential signal peptides:', len(ids_with_signal_peptide))
print('Num. proteins with potential transmembrane helices:', len(ids_with_transmembrane_helix))

Num. proteins with InterProScan hits: 14
Num. proteins with potential signal peptides: 7
Num. proteins with potential transmembrane helices: 11


In [82]:
fields = 'query,target,theader,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qcov,tcov,taxid,taxname,taxlineage'.split(',')
foldseek_df = pd.read_csv(f'../data/{ece_id}_pdb_foldseek.tsv', sep='\t') # , names=fields)
foldseek_df = pd.read_csv(f'../data/{ece_id}_alphafold_uniprot_foldseek.tsv', sep='\t') # , names=fields)
foldseek_df = foldseek_df[foldseek_df['evalue'] < 5].copy()
foldseek_df = foldseek_df.sort_values('evalue')

foldseek_hit_ids = foldseek_df['query'].unique()
print('Num. proteins with solid Foldseek hits:', foldseek_df['query'].nunique())
print(', '.join(foldseek_hit_ids))

# Very interesting to note that proteins 25 and 27 look a bit similar. 
# proteins_with_foldseek_hits = [get_protein_number(protein_id) for protein_id in foldseek_df['query'].unique()]

Num. proteins with solid Foldseek hits: 24
ece_26_1334_20_model_0, ece_26_1334_35_model_0, ece_26_1334_28_model_0, ece_26_1334_17_model_0, ece_26_1334_27_model_0, ece_26_1334_2_model_0, ece_26_1334_26_model_0, ece_26_1334_1_model_0, ece_26_1334_24_model_0, ece_26_1334_12_model_0, ece_26_1334_30_model_0, ece_26_1334_38_model_0, ece_26_1334_25_model_0, ece_26_1334_34_model_0, ece_26_1334_19_model_0, ece_26_1334_37_model_0, ece_26_1334_32_model_0, ece_26_1334_33_model_0, ece_26_1334_23_model_0, ece_26_1334_11_model_0, ece_26_1334_10_model_0, ece_26_1334_9_model_0, ece_26_1334_31_model_0, ece_26_1334_3_model_0


In [83]:
for id_, df in foldseek_df.groupby('query'):
    row = df.iloc[0]
    if id_ != 'ece_26_1334_1_model_0':
        continue
    print(id_)
    for row in df.itertuples():
        print(row._asdict())
        # print(row.theader, f'({row.evalue})', row.qstart, row.qend, row.qcov,  '.....', row.tstart, row.tend, row.tcov)
        # print(f'evalue={row.evalue}, start={row.qstart}, stop={row.qend}, description={row.theader}, taxname={row.taxname}')
    print()
    # Structural alignments are generally to the final 100 residues of the query protein and BamB, which is involved in outer membrane assembly. 

ece_26_1334_1_model_0
{'Index': 3136, 'query': 'ece_26_1334_1_model_0', 'target': 'AF-A0A845DF14-F1-model_v4', 'theader': 'AF-A0A845DF14-F1-model_v4 Uncharacterized protein', 'fident': 0.234, 'alnlen': 94, 'mismatch': 71, 'gapopen': 0, 'qstart': 1, 'qend': 94, 'tstart': 1, 'tend': 94, 'evalue': 0.04941, 'bits': 77, 'qcov': 0.959, 'tcov': 0.94, 'taxid': 2605265, 'taxname': 'Caldilineaceae bacterium SB0670_bin_27', 'taxlineage': '-_cellular organisms;d_Bacteria;-_Terrabacteria group;p_Chloroflexi;c_Caldilineae;o_Caldilineales;f_Caldilineaceae;-_unclassified Caldilineaceae;s_Caldilineaceae bacterium SB0670_bin_27'}
{'Index': 3137, 'query': 'ece_26_1334_1_model_0', 'target': 'AF-A0A6B0YR90-F1-model_v4', 'theader': 'AF-A0A6B0YR90-F1-model_v4 Uncharacterized protein', 'fident': 0.234, 'alnlen': 94, 'mismatch': 71, 'gapopen': 0, 'qstart': 1, 'qend': 94, 'tstart': 1, 'tend': 94, 'evalue': 0.09941, 'bits': 72, 'qcov': 0.959, 'tcov': 0.94, 'taxid': 2605260, 'taxname': 'Caldilineaceae bacterium S

In [79]:
# ncbi_blast_df = BLASTFileJSON.from_file(f'../data/{ece_id}_blast_ncbi.json').to_df()
# ncbi_blast_df = ncbi_blast_df[ncbi_blast_df.bit_score > 45].copy()
# ncbi_blast_df['query_coverage'] = (ncbi_blast_df.query_alignment_stop - ncbi_blast_df.query_alignment_start) / ncbi_blast_df.query_length
# ncbi_blast_df['percent_identity'] = (ncbi_blast_df.identity / ncbi_blast_df.alignment_length)
# # ncbi_blast_df.subject_taxon.value_counts()

1115-267 
644/3
(1115-644 )/ 3

157.0

In [15]:
# Protein 24 and 25 likely in the same operon. 
ece_seq = FASTAFile.from_file(f'../data/{ece_id}.fasta').seqs[0]
slippery_site_pattern = r'([ACGU])\1\1([AU])\2\2[ACU]'

In [None]:
protein_22_start, protein_22_stop = 7890 - 1, 8120
protein_21_start, protein_21_stop = 7600 - 1, 7887

protein_21_seq = FASTAFile.from_file(f'../data/{ece_id}_protein.fa').seqs[20]
protein_22_seq = FASTAFile.from_file(f'../data/{ece_id}_protein.fa').seqs[21]

print_amino_acid_composition(protein_21_seq)
re.findall(slippery_site_pattern, ece_seq[protein_21_start:protein_22_stop + 9])
print(ece_seq[protein_21_stop - 3:protein_21_stop])

N 14.74%
L 11.58%
H 8.42%
K 8.42%
I 8.42%
E 8.42%
G 7.37%
R 6.32%
S 5.26%
Y 4.21%
C 4.21%
D 3.16%
T 2.11%
F 2.11%
M 2.11%
A 2.11%
W 1.05%
TAA


In [42]:
(protein_14_start - protein_13_stop) % 3
(protein_15_start - protein_14_stop) % 3
(protein_16_start - protein_15_stop) % 3

2

In [43]:
protein_13_start, protein_13_stop = 3527 - 1, 3679
protein_14_start, protein_14_stop = 3663 - 1, 3797
protein_15_start, protein_15_stop = 3794 - 1, 3916
protein_16_start, protein_16_stop = 3913 - 1, 4116

seq = ece_seq[protein_13_start:protein_16_stop]
for match in re.finditer(slippery_site_pattern, seq):
    print(match)

for start, stop in [(protein_13_start, protein_13_stop), (protein_14_start, protein_14_stop), (protein_15_start, protein_16_stop), (protein_16_start, protein_16_stop)]:
    print(ece_seq[start - 6:stop])
print(seq)

<re.Match object; span=(290, 297), match='AAAAAAA'>
<re.Match object; span=(518, 525), match='AAAAAAA'>
GATAAAATGATTAAATCATGTATAATTTGTGATATCCAAAACTGTAGTCTTTATAATGAATTAAGAAATTTTGGGTTATTACAAAATCAATTATTAATTAATAATATCGCAGATAAATGTAAAGACTTTGTAAATAAGCCTGATGAAAAGAGATATTAA
AGCCTGATGAAAAGAGATATTAATTTTATGTCTAAAAATTATCAGCTTTATCCAAAATTTAAAAATCAATGTAAGTATTGCGGTATAGCTGTTTATAATTTAAAAGAACATCAAGAAATTAAACACGGAGTTAAAATATGA
TAAAATATGAATGATTTTGAAAGTATAGTAAAAAAAGGAAATTATAAAGCAATATTAGACCAGTGGGGTATAAGTGAGCCTGCTAAATATCCTAATTTAATAAAATTTATAGTGAAGATGATGATATGAAGATAGGATATACTTTTAATATTGGTAATTATGAATCGTTTAAAATTGAATCATCTGAATATGACAGTTCTGATATTAATATAAATTTACAAAAAGCTAATGAAGAAATTATAAATATTTTAGGAATTAAAAAAAGAGTTATTCCAGAACTTGAACACTGGTATAATAAATTATTATCTGAAATTAAATTGCAGTCATAA
GATGATATGAAGATAGGATATACTTTTAATATTGGTAATTATGAATCGTTTAAAATTGAATCATCTGAATATGACAGTTCTGATATTAATATAAATTTACAAAAAGCTAATGAAGAAATTATAAATATTTTAGGAATTAAAAAAAGAGTTATTCCAGAACTTGAACACTGGTATAATAAATTATTATCTGAAATTAAATTGCAGTCATAA
ATGATTAAATCATGTATAATTTGTGATATCCAAAACTGTAGTCTTTATAATGA

In [17]:

protein_25_seq = FASTAFile.from_file(f'../data/{ece_id}_protein.fa').seqs[24]
protein_24_seq = FASTAFile.from_file(f'../data/{ece_id}_protein.fa').seqs[23]

protein_24_start, protein_24_stop = 8796 - 1, 9149
protein_25_start, protein_25_stop = 9146 - 1, 10987

print('Length between proteins 23 and 24:', protein_25_start - protein_24_stop)
# print('protein 25 start codon:', ece_seq[protein_25_start:protein_25_start + 3])
print('Protein 24 stop codon:', ece_seq[protein_24_stop - 3:protein_24_stop])

print(ece_seq[protein_24_stop - 15:protein_24_stop])
print(ece_seq[protein_25_start:protein_25_start + 15])

protein_25_nt_seq = ece_seq[protein_25_start:protein_25_stop]
# re.search('AAATTTC', protein_25_nt_seq)
# re.search(r'([ACGU])\1\1([AU])\2\2[ACU]', ece_seq[protein_24_start:protein_25_stop]).start()

protein_25_codons = np.array([protein_25_nt_seq[i:i + 3] for i in range(0, len(protein_25_nt_seq), 3)])
start_codons = ['GTG', 'TTG', 'ATG']
for i in np.where(np.isin(protein_25_codons, start_codons))[0]:
    print(i, protein_25_codons[i])

protein_25_seq[300:]




Length between proteins 23 and 24: -4
Protein 24 stop codon: TGA
TACATAAAATTATGA
ATGAATAAATTTAAA
0 ATG
20 TTG
46 ATG
54 ATG
75 TTG
214 GTG
227 ATG
259 ATG
300 ATG
397 ATG
450 ATG
469 TTG
535 TTG
595 TTG
599 ATG
607 ATG


'MGVSFTSNIPNTNLGIFNYFIHGVGTINYQFVYPFHVIQPSINYNSTIFFNSNISLSGNIIDYNGAIVYQAKIITNLSNVTFTDINGDYNFSSLNPQMYNITVKRNGYIDNNVQINLSSNTILNFNLSTQNIRQLISAGNNLTYDSNTGIMNDIREFINGTNGINGTNGLNGINGINGTNGINGTNGFNGTTLIQGLSYPDIVYLSSNNILYKINNGNIDFKVILNNEIQGNVIDLNTSVIVTSNNTLYKINSSNGNIDYKVLIPEINKVKVTNNSIFVKSNRNLYTIEKINGNILFKTMINDNFDFMDREVI'

In [18]:
# Protein 25 is particularly weird, it AS A INGTNGI repeat; N is Arginine, T is Tryptophan, I is Isoleucine. 
# Seems to align to polyketide synthases, but seems far too short. Appears to be a linker loop thing (450-500) in between two high-confidence
# beta sheet structures in the AlphaFold structure. One of the regions flanking the linker domain (350-450) is a Carboxypeptidase. 
# Nearly the entire protein is predicted to be extracellular. 

# Residues 300-600 seem like they may nominally resemble BamB, which is a much shorter protein. 

# Honestly the whole N-terminus of the protein looks like absolute garbage. Possible frameshift? What is protein 24?
# There are alternate start codons which could N-terminally truncate Protein 25 and make it closer in size to other proteins involved in outer membrane
# construction; selecting the ATG at position 300 results in a similar AlphaFold structure, though the signal peptide is lost. It seems likely
# that the selected start site is correct. 


# https://www.nature.com/articles/s41589-025-01878-4 about polyketide synthase. 
# https://pmc.ncbi.nlm.nih.gov/articles/PMC8338781/ about polyketide synthase. 
