In [36]:
import pandas as pd
import numpy as np

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [37]:
def read_fasta(fasta_path):
    """Load a FASTA file into a two-column DataFrame.

    Parameters
    ----------
    fasta_path
        Path of the FASTA file; it is opened with the built-in ``open``.

    Returns
    -------
    pandas.DataFrame
        One row per record yielded by ``SeqIO.parse(..., "fasta")``, holding
        the record's ``id`` in column ``'id'`` and its ``seq`` attribute
        (stored as-is, not converted to ``str``) in column ``'seq'``.
    """
    with open(fasta_path) as fasta_file:
        rows = [[entry.id, entry.seq] for entry in SeqIO.parse(fasta_file, "fasta")]

    return pd.DataFrame(data=rows, columns=['id', 'seq'])



In [38]:
# # read in strain metadata:
# df = pd.read_csv("motif/actinomycetota_bvbrc_reference_genomes.csv")
# df = df[['Genome Name', 'NCBI Taxon ID', 'Genome ID', 'Order', 'Family', 'Genus']]

# df['name'] = df['Genome Name'].str.replace(" ", "_")

# # read in ind and dep samples, and merge
# ind = read_fasta("motif/biob_independent_subset100_protein.fa")
# dep = read_fasta("motif/biob_dependent_subset100_protein.fa")

# ind['type'] = 'ind'
# dep['type'] = 'dep'

# seqs = pd.concat([ind, dep]).reset_index(drop=True)

In [6]:
# Preview the first rows of the strain metadata table.
# NOTE(review): no live cell visible above assigns `df` (the loader cell is
# commented out), so this appears to rely on existing kernel state — confirm
# and restore the load step so the notebook survives Restart & Run All.
df.head()

Unnamed: 0,Genome Name,staxid,Order,Family,Genus,type,name
0,Kutzneria kofuensis strain DSM 43851,103725,Pseudonocardiales,Pseudonocardiaceae,Kutzneria,ind,Kutzneria_kofuensis_strain_DSM_43851
1,Thermocatellispora tengchongensis strain DSM 4...,1073253,Streptosporangiales,Streptosporangiaceae,Thermocatellispora,ind,Thermocatellispora_tengchongensis_strain_DSM_4...
2,Phytohabitans flavus strain NBRC 107702,1076124,Micromonosporales,Micromonosporaceae,Phytohabitans,ind,Phytohabitans_flavus_strain_NBRC_107702
3,Thermoactinospora rubra strain YIM 77501,1088767,Streptosporangiales,Streptosporangiaceae,Thermoactinospora,ind,Thermoactinospora_rubra_strain_YIM_77501
4,Thermocrispum municipale DSM 44069,1111737,Pseudonocardiales,Pseudonocardiaceae,Thermocrispum,ind,Thermocrispum_municipale_DSM_44069


In [7]:
# Preview the first sequence records (id + sequence).
# NOTE(review): `seqs` is only assigned in the commented-out loader cell
# visible above — presumably it comes from earlier kernel state; confirm.
seqs.head()

Unnamed: 0,id,seq
0,Escherichia_coli_CFT073,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
1,Candidatus_Microthrix_parvicella_RN1,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
2,Brevibacterium_luteolum_strain_NEB1784,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
3,Kocuria_tytonis_442,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
4,Knoellia_subterranea_KCTC_19937,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."


In [8]:
# Sequence ids that have no matching metadata name
# (an empty array means every sequence is covered by df['name']).
np.setdiff1d(seqs['id'].tolist(), df['name'].tolist())

array([], dtype='<U58')

In [11]:
# Reverse check: metadata names that have no sequence record in seqs.
np.setdiff1d(df['name'].tolist(), seqs['id'].tolist()).tolist()

['Catenulispora_acidiphila_DSM_44928',
 'Kribbella_jejuensis_strain_DSM_17305',
 'Mycobacterium_haemophilum_DSM_44634_ATCC_29548',
 'Mycobacterium_triplex_strain_DSM_44626',
 'Nocardia_sp._ET3-3',
 'Nocardia_sp._YIM_PH_21724']

In [13]:
# Drop the metadata rows whose name has no sequence record in seqs,
# then renumber the remaining rows from 0.
names_without_seq = np.setdiff1d(df['name'].tolist(), seqs['id'].tolist()).tolist()
has_seq = ~df['name'].isin(names_without_seq)
df = df[has_seq].reset_index(drop=True)

In [14]:
# missing_lines = pd.DataFrame([
#     ['Mycobacterium tuberculosis H37Rv', 83332, 83332,  "Corynebacteriales", "Mycobacteriaceae", "Mycobacterium", "Mycobacterium_tuberculosis_H37Rv"],
#     ['Mycobacterium smegmatis str. MC2 155', np.nan, np.nan, "Corynebacteriales", "Mycobacteriaceae", "Mycobacterium", "Mycobacterium_smegmatis_str._MC2_155"],
#         ], columns=['Genome Name', 'NCBI Taxon ID', 'Genome ID', 'Order', 'Family', 'Genus', 'name'])

# df = pd.concat([df, missing_lines]).reset_index(drop=True)

# df = seqs.merge(df, how='left', left_on='id', right_on='name')[['id', 'type', 'Order', 'Family', 'Genus']]

# df['id'] = df['id'].str.replace("_strain_not_applicable", "").str.replace("_strain_Not_applicable", "")

In [15]:
# (rows, columns) of the metadata table at this point.
df.shape

(188, 7)

In [16]:
# df now categorizes the independent and dependent species;
# next, read in the shared MSA file so it can be merged with df.

# msa_path = "motif/merged_biob_seqs_subset200_protein.mfa"
msa_path = "motif/biob_merged_subset200_pruned.msa"
msa = read_fasta(msa_path)

In [18]:
# np.setdiff1d(df.id.tolist(), msa.id.tolist())

In [19]:
# np.setdiff1d(msa.id.tolist(), df.id.tolist())

In [21]:
# Re-check the metadata table before it is merged with the MSA records.
df.head()


Unnamed: 0,Genome Name,staxid,Order,Family,Genus,type,name
0,Kutzneria kofuensis strain DSM 43851,103725,Pseudonocardiales,Pseudonocardiaceae,Kutzneria,ind,Kutzneria_kofuensis_strain_DSM_43851
1,Thermocatellispora tengchongensis strain DSM 4...,1073253,Streptosporangiales,Streptosporangiaceae,Thermocatellispora,ind,Thermocatellispora_tengchongensis_strain_DSM_4...
2,Phytohabitans flavus strain NBRC 107702,1076124,Micromonosporales,Micromonosporaceae,Phytohabitans,ind,Phytohabitans_flavus_strain_NBRC_107702
3,Thermoactinospora rubra strain YIM 77501,1088767,Streptosporangiales,Streptosporangiaceae,Thermoactinospora,ind,Thermoactinospora_rubra_strain_YIM_77501
4,Thermocrispum municipale DSM 44069,1111737,Pseudonocardiales,Pseudonocardiaceae,Thermocrispum,ind,Thermocrispum_municipale_DSM_44069


In [22]:
# Preview the parsed MSA records (id + aligned sequence).
msa.head()

Unnamed: 0,id,seq
0,Escherichia_coli_CFT073,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
1,Candidatus_Microthrix_parvicella_RN1,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
2,Brevibacterium_luteolum_strain_NEB1784,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
3,Kocuria_tytonis_442,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
4,Knoellia_subterranea_KCTC_19937,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."


In [26]:
# df['name'] = df['id'].str.replace("_", " ")
# Join each metadata row to its MSA record by matching df['name'] to msa['id'].
# merge defaults to an inner join, so only names present in both tables remain.
df = df.merge(msa, left_on='name', right_on='id')

In [39]:
## write df as metadata file
# df.to_csv("motif/merged_biob_msa_annotation.csv", index=None)

In [40]:
# df.head()

In [56]:
# Reload df from the filter3 annotation CSV and msa from the filter3/pruned2
# MSA file; both names are rebound, replacing the objects built in earlier cells.
df = pd.read_csv('motif/biob_merged_subset200_annotation_filter3.csv')
msa = read_fasta("motif/biob_merged_subset200_filter3_pruned2.msa")

In [57]:
# (records, columns) of the reloaded MSA table.
msa.shape

(184, 2)

In [58]:
# Inner-join annotation rows to MSA records on df['name'] == msa['id'];
# rows without a partner on either side are dropped, so the shape is displayed
# to compare against msa.shape.
df = df.merge(msa, left_on='name', right_on='id')
df.shape

(184, 9)

In [59]:
# Number of merged rows whose 'seq' value is missing.
df['seq'].isna().sum()

0

In [60]:
# Preview the merged annotation + sequence table.
df.head()

Unnamed: 0,Genome Name,staxid,Order,Family,Genus,type,name,id,seq
0,Nonomuraea soli strain DSM 45533,1032476,Streptosporangiales,Streptosporangiaceae,Nonomuraea,ind,Nonomuraea_soli_strain_DSM_45533,Nonomuraea_soli_strain_DSM_45533,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
1,Actinoplanes atraurantiacus strain CGMCC 4.6857,1036182,Micromonosporales,Micromonosporaceae,Actinoplanes,ind,Actinoplanes_atraurantiacus_strain_CGMCC_4.6857,Actinoplanes_atraurantiacus_strain_CGMCC_4.6857,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
2,Kutzneria kofuensis strain DSM 43851,103725,Pseudonocardiales,Pseudonocardiaceae,Kutzneria,ind,Kutzneria_kofuensis_strain_DSM_43851,Kutzneria_kofuensis_strain_DSM_43851,"(-, -, -, -, -, -, M, T, A, S, V, D, -, -, -, ..."
3,Thermocatellispora tengchongensis strain DSM 4...,1073253,Streptosporangiales,Streptosporangiaceae,Thermocatellispora,ind,Thermocatellispora_tengchongensis_strain_DSM_4...,Thermocatellispora_tengchongensis_strain_DSM_4...,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
4,Phytohabitans flavus strain NBRC 107702,1076124,Micromonosporales,Micromonosporaceae,Phytohabitans,ind,Phytohabitans_flavus_strain_NBRC_107702,Phytohabitans_flavus_strain_NBRC_107702,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."


In [61]:
## write the independent and dependent msas to their own msa files:

# Output file for each value of df['type'] that gets its own alignment file.
filter3_msa_paths = {
    'ind': "motif/biob_merged_to_independent_subset100_filter3_pruned2.msa",
    'dep': "motif/biob_merged_to_dependent_subset100_filter3_pruned2.msa",
}

# One loop replaces the two copy-pasted cells; records are built column-wise
# instead of through per-row df.loc lookups.
for seq_type, out_path in filter3_msa_paths.items():
    subset = df[df['type'] == seq_type]
    # description="" so that no description text follows the id in the header
    seq_records = [
        SeqRecord(seq, id=seq_id, description="")
        for seq_id, seq in zip(subset['id'], subset['seq'])
    ]
    with open(out_path, "w") as output_handle:
        SeqIO.write(seq_records, output_handle, "fasta")



In [190]:
## read in pruned msa file, to write separated msa files:

In [245]:

# Rebind msa_path to the pruned3 alignment and parse it into its own table
# (msa_pruned), leaving the earlier `msa` table untouched.
msa_path = "motif/merged_biob_seqs_subset200_protein_pruned3.mfa"
msa_pruned = read_fasta(msa_path)

In [247]:
# Preview the annotation table that is about to be merged onto msa_pruned.
# NOTE(review): the output recorded for this cell lists columns
# id/type/Order/Family/Genus, which differs from the df produced by the merge
# cells earlier in this notebook — presumably it was run against an older df;
# confirm which df is intended here.
df.head()

Unnamed: 0,id,type,Order,Family,Genus
0,Ornithinimicrobium_tianjinense_strain_CGMCC_1....,ind,Micrococcales,Ornithinimicrobiaceae,Ornithinimicrobium
1,Conexibacter_sp._Seoho-28,ind,Solirubrobacterales,Paraconexibacteraceae,Paraconexibacter
2,Conexibacter_arvalis_strain_DSM_23288,ind,Solirubrobacterales,Conexibacteraceae,Conexibacter
3,Nitriliruptor_alkaliphilus_DSM_45188,ind,Nitriliruptorales,Nitriliruptoraceae,Nitriliruptor
4,Aciditerrimonas_ferrireducens_MV1,ind,Acidimicrobiales,Acidimicrobiaceae,Aciditerrimonas


In [248]:
# Attach the type label and taxonomy columns from df to each pruned-MSA record.
# Any 'seq' column already on df (present once df has been merged with an MSA
# in an earlier cell) is dropped first; otherwise pandas would suffix the
# clashing columns to seq_x / seq_y and the msa_pruned 'seq' lookups in the
# writer cell below would raise KeyError. errors='ignore' keeps this working
# when df has no 'seq' column. how='left' keeps every MSA record, so records
# without a matching id in df get NaN annotation.
msa_pruned = msa_pruned.merge(df.drop(columns=['seq'], errors='ignore'), how='left', on='id')

In [249]:
# Preview the pruned MSA records with their merged annotation columns.
msa_pruned.head()

Unnamed: 0,id,seq,type,Order,Family,Genus
0,Lawsonella_clevelandensis_strain_X1036,"(-, -, -, M, S, Y, C, N, S, T, A, A, V, P, P, ...",ind,Corynebacteriales,Lawsonellaceae,Lawsonella
1,Corynebacterium_terpenotabidum_Y-11,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...",dep,Corynebacteriales,Corynebacteriaceae,Corynebacterium
2,Corynebacterium_variabile_strain_NBRC_15286,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...",dep,Corynebacteriales,Corynebacteriaceae,Corynebacterium
3,Corynebacterium_falsenii_strain_FDAARGOS_1493,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...",dep,Corynebacteriales,Corynebacteriaceae,Corynebacterium
4,Corynebacterium_sp._Sa1YVA5,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...",dep,Corynebacteriales,Corynebacteriaceae,Corynebacterium


In [250]:
## write the independent and dependent msas to their own msa files:

# Output file for each value of msa_pruned['type'] that gets its own file.
pruned3_msa_paths = {
    'ind': "motif/biob_merged_to_independent_subset100_pruned3.msa",
    'dep': "motif/biob_merged_to_dependent_subset100_pruned3.msa",
}

# msa_pruned was annotated with a left merge, so records without a matching
# annotation row carry a missing 'type' and would silently end up in neither
# file; report them instead of dropping them without notice.
unlabelled = ~msa_pruned['type'].isin(list(pruned3_msa_paths))
if unlabelled.any():
    print(f"{int(unlabelled.sum())} sequence(s) have no 'ind'/'dep' type and are written to neither file")

# One loop replaces the two copy-pasted cells; records are built column-wise
# instead of through per-row .loc lookups.
for seq_type, out_path in pruned3_msa_paths.items():
    subset = msa_pruned[msa_pruned['type'] == seq_type]
    # description="" so that no description text follows the id in the header
    seq_records = [
        SeqRecord(seq, id=seq_id, description="")
        for seq_id, seq in zip(subset['id'], subset['seq'])
    ]
    with open(out_path, "w") as output_handle:
        SeqIO.write(seq_records, output_handle, "fasta")


In [251]:
## read in pruned consensus sequences:

In [259]:
# Load the dependent and independent consensus sequences, each parsed from its
# own FASTA file with SeqIO.read (which returns a single record per file).
dep_consensus = SeqIO.read("motif/merged_biob_to_dependent_subset100_protein_pruned2_75consensus.fa", "fasta")
ind_consensus = SeqIO.read("motif/merged_biob_to_independent_subset100_protein_pruned2_75consensus.fa", "fasta")

In [260]:
# Print both consensus sequences FASTA-style: a header line, then the sequence
# upper-cased with every "X" shown as "-".
for header, consensus in (
    (">BsaP dependent BioB consensus", dep_consensus),
    (">BsaP independent BioB consensus", ind_consensus),
):
    print(header)
    print(str(consensus.seq).upper().replace("X", "-"))

>BsaP dependent BioB consensus
--------------------------------------------------------------------L--AR--VL--G--L-------VL-L-D------------LL-LAH-VR---CG--VEVEGI-SLKTGGCPEDCHFCSQSG-F-SPVR---------LV-AA------GATEFCIVAAV-GPD--L--Q------AI---------I--A-S-G-L---QV--L---GVHRYNHNLE-A-S-F--VVTTH--EER--T---V--AG-E-C-GGI-GMGE---QRAE-A--LA-L-P-EVP-NFL-P-PGTP-----------AL----AFRLA-P-T-LR-AGGRE--LGDLG---G-LGG-NA-IVGNYLT-LG-----D---------------L--PIK------L------------------
>BsaP independent BioB consensus
--------------------------------------------------------------------L--A---VL--G-GL--------L---D-------------L-LAH-VR---CGPEVEVEGI-S-KTGGCPEDCHFCSQSG-F--PVR---WLDIP-LV-AA--TA--GA-EFCIVAAVRGPD-RLM-Q-R-G--AI-------D-I--A-SLGML---QV--L---GVHRYNHNLE---S-F--VVTTH--EER--T--MV---GME-CCGG--G-GE--EQRAE-A--L--L-P-EVP-NFL-P-PGTP-----------ALR--A-FRLA-PR--LR-AGGRE-TLGDLG---G-LGG-NA-IVGNYLT-LGR----DL--L-----------L-MP-K-----AL-----------------L
