In [5]:
import pandas as pd
import numpy as np

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

import itertools

In [168]:
def read_in_fasta(fasta_path):
    seqs = []

    for record in SeqIO.parse(fasta_path, "fasta"):

        seqs.append([record.id, record.seq])

    seqs = pd.DataFrame(seqs, columns=['id', 'seq'])
    
    return seqs


In [169]:
def write_df_to_fasta(df, seq_col, name_col, fasta_out_path):
    
    sequences = []

    for i in df.index:
        
        seq_id = df.loc[i, name_col].replace(" ", "_")
    
        sequences.append(SeqRecord(Seq(df.loc[i, seq_col]), id=seq_id, description=""))

    with open(fasta_out_path, "w") as output_handle:
        SeqIO.write(sequences, output_handle, "fasta")
        
    return


## quick investigation of coordinates of protein features:

In [170]:
# # ecoli = SeqIO.read("motif/ecoli_biob_aa.fa", "fasta")
# # mtb = SeqIO.read("bioB_h37rv_protein.fa", "fasta")

# nt = SeqIO.read("bspA_h37rv.fa", "fasta")
# aa = SeqIO.read("bspA_h37rv_protein.fa", "fasta")

In [35]:
# nt

In [36]:
# nt[:3]

In [37]:
# (49*3)+3

In [38]:
# nt[126:147]

In [39]:
# nt[126:147].translate()

In [40]:
# aa[42:49]

In [41]:
# aa_cons = "------------------------------------------------------------YN-YTG-----V---G---A---------AARLG--LEPPRFC-QCGRRMVVQV-PDGW-ARCSRHG-VDS--L--R--------"
# nt_cons = "----------------------------------ATGGTGG---------------------------------------AAATCGTGGCTGGAAAACAACGCGCTCC------------------------GGTCGCTGC-----------------CGG----------CGTGTACAACGTGTACACCGGG--------------------------------------------------------------------GAACTGGCGGATACGGCCACGCCGACAGCG-----GCTCG-----GATGG---------GTC--TGGAGCCCCCCCGGTTCTGTGCGCAGTGCGGTCGCCGGATGGTCGTCCAGGTCCGGCCCGACGGCTGGTGGGCGCGCTGTTCTCGCCACGGGC---AGGTGGACTCGG---------------------CCGACTTGGCGACACAGCGGTGA---------"

In [43]:
# aa_cons_expand = ""
# for i in aa_cons:
#     aa_cons_expand = aa_cons_expand + " " + i + " "

In [44]:
# aa_cons_expand


In [33]:
# len(aa_cons_expand)

435

In [45]:
# nt_cons

In [46]:
# mtb[158:168]

In [51]:
# ecoli[127]

In [52]:
# ecoli[123:130]

In [53]:
# ecoli[187]

In [54]:
# ecoli[184:192]

In [55]:
# ecoli[90:100]

### write reference taxids to list

In [2]:
df = pd.read_csv("motif/actinomycetota_bvbrc_reference_genomes.csv")

In [3]:
df[['NCBI Taxon ID']].to_csv("atinomycetota_bvbrc_reference_taxid_list.csv", index=None, header=None)

### Read in bsap and biob blast hits against reference genomes; divide out bsap independent and bsap dependent biob copies:
- Filtered hits to include only hits with at least 33% the length of the biob/bsap

In [215]:
blast_cols = ['seqid', 'evalue', 'bitscore', 'pident', 'length', 'staxid', 'ssname', 'scomname', 'sstart', 'ssend', 'sseq']

bsap_all = pd.read_csv("motif_prev/bsap_bvbrc_blastn_hits_31Aug2023.tsv", delimiter="\t", names=blast_cols, index_col=False)
biob = pd.read_csv("motif_prev/biob_bvbrc_blastn_hits_31Aug2023.tsv", delimiter="\t", names=blast_cols, index_col=False)

In [216]:
# biob = biob[biob['length'] > 349].reset_index(drop=True)
bsap = bsap_all[bsap_all['length'] > 79].reset_index(drop=True)

In [217]:
biob = biob.drop_duplicates(subset='staxid')
bsap = bsap.drop_duplicates(subset='staxid')

In [218]:
bsap_phylogeny_tax_list = pd.read_csv("final_nt_workflow/bsap_nt_subset50_rooted_species_order.txt", header=None, index_col=False
                                     ).transpose()[0].str.replace("_", " ").tolist()

In [219]:
(349*3)/4

261.75

In [220]:
(79*3)/4

59.25

In [221]:
biob_ind_species_list = biob[~biob['staxid'].isin(bsap['staxid'].unique())]['scomname']

In [222]:
len(biob_ind_species_list)

2798

In [223]:
# unique genuses in bsap phylogeny analysis:
len(pd.Series(bsap_phylogeny_tax_list).str.split(" ", expand=True)[0].unique())

25

In [224]:
## allowing that there are some myco derivative species showing up in the bsap independent biob class; we're still going to exclude those
## and any species that sit too close phylogenetically to our bsap investigation --
## definetely going to keep track of these species that are dropped out and run a motif based analysis on them later on, to check in on the
## appearance of biob function. But generally these look like strains with uncertain taxonomy

In [225]:
bsap.shape

(511, 11)

In [226]:
biob.shape

(3309, 11)

In [227]:
pd.DataFrame(biob[~biob['staxid'].isin(bsap['staxid'].unique())]['staxid'].unique()
            ).to_csv("motif/putative_biob_independent_staxid_list.csv", index=False, header=False)


In [228]:
# biob_subset = biob[~biob['staxid'].isin(bsap['staxid'].unique()) & 
#                    ~(biob['ssname'].str.contains('Myco')) &
#                    ~(biob['ssname'].str.contains("Corynebacterium"))
#     ].reset_index(drop=True).copy() #.to_csv('motif/putative_biob_independent.csv', index=False)

biob_subset = biob[~biob['staxid'].isin(bsap['staxid'].unique())].reset_index(drop=True).copy()

In [229]:
biob_subset.head()

Unnamed: 0,seqid,evalue,bitscore,pident,length,staxid,ssname,scomname,sstart,ssend,sseq
0,gi|1888951912|ref|NZ_JACGWZ010000001.1|,1.73e-05,48.2,76.543,81,1000566,Halosaccharopolyspora lacisalsi,Halosaccharopolyspora lacisalsi,1356959,1357038,TCGTCACCAACCGGACGGCCTTCTCGCCGGCGACCGCGGAAGCCGT...
1,gi|1890800121|ref|NZ_JACHBQ010000001.1|,0.002,41.9,96.0,25,1001240,Cryobacterium roopkundense,Cryobacterium roopkundense,976391,976415,GCCTGAACCAGGACCAGGTTCTGGC
2,gi|1124504080|ref|NZ_FOJD01000002.1|,0.021,37.4,87.5,32,100225,Austwickia chelonae,Austwickia chelonae,539733,539763,CCGAGTTCCACCTCGTGGCCGCGG-GCGCGGA
3,gi|1441077360|ref|NZ_QQZY01000004.1|,1.55e-09,60.8,69.484,213,1002870,Gaiella occulta,Gaiella occulta,267788,267995,CTGCACGCTCGGCATGCTCACGGCCGAGCAAGCGGAGGCGCTCGC-...
4,gi|357407371|ref|NC_016113.1|,9.11e-05,47.3,75.294,85,1003195,Streptantibioticus cattleyicolor NRRL 8057 = D...,Streptantibioticus cattleyicolor NRRL 8057 = D...,1386387,1386469,CGGCGAGGCGGGCGCGGACGCCTACAACCACAACCTCAACACCGCC...


In [230]:
biob_subset['ssname'].str.split(" ", expand=True)[0].value_counts()

Streptomyces       532
Corynebacterium    117
Microbacterium     103
Nocardioides       100
Bifidobacterium     95
                  ... 
Specibacter          1
Rubneribacter        1
Baekduia             1
Peptidiphaga         1
Phytomonospora       1
Name: 0, Length: 433, dtype: int64

In [231]:
# write_df_to_fasta(biob_subset, "motif/putative1_biob_independent_seqs.fasta")

In [232]:
# for i in biob[ (~biob['staxid'].isin(bsap['staxid'].unique())) & 
#      (biob['scomname'].str.contains("Myco")) ]['seqid'].str.split("|", expand=True)[3].tolist():
#     print(i)

In [233]:
bsap_all[bsap_all['scomname'].isin(biob[ (~biob['staxid'].isin(bsap['staxid'].unique())) & 
                                        (biob['scomname'].str.contains("Mycobacter")) ]) ]

Unnamed: 0,seqid,evalue,bitscore,pident,length,staxid,ssname,scomname,sstart,ssend,sseq


## use taxid limitations to pull bioB sequences from bvbrc query

In [234]:
genome_map = pd.read_csv("motif_prev/actinomycetota_bvbrc_reference_genomes.csv")
biob_patric = pd.read_csv("motif_prev/PGF_01400330_family_feature_list_full_seq.tsv", delimiter='\t')

In [235]:
# genome_map.head()
genome_map = genome_map[['Genome Name', 'NCBI Taxon ID', 'Genome ID', 'Order', 'Family', 'Genus']]

biob_subset = biob_subset.merge(genome_map, how='left', left_on='staxid', right_on='NCBI Taxon ID')

In [236]:
bsap_phylogeny_genus_list = pd.Series(bsap_phylogeny_tax_list).str.split(" ", expand=True)[0].unique()
biob_subset = biob_subset[~biob_subset['Genus'].isin(bsap_phylogeny_genus_list)]

In [237]:
biob_subset = biob_subset.merge(biob_patric[['feature.genome_id', 'feature.patric_id', 'feature.na_sequence', 'feature.aa_sequence']],
                 how='left', left_on='Genome ID', right_on='feature.genome_id')

In [238]:
biob_subset = biob_subset.dropna().reset_index(drop=True)

In [239]:
biob_subset.shape

(1275, 21)

In [240]:
biob_subset.head()

Unnamed: 0,seqid,evalue,bitscore,pident,length,staxid,ssname,scomname,sstart,ssend,...,Genome Name,NCBI Taxon ID,Genome ID,Order,Family,Genus,feature.genome_id,feature.patric_id,feature.na_sequence,feature.aa_sequence
0,gi|1888951912|ref|NZ_JACGWZ010000001.1|,1.73e-05,48.2,76.543,81,1000566,Halosaccharopolyspora lacisalsi,Halosaccharopolyspora lacisalsi,1356959,1357038,...,Saccharopolyspora lacisalsi strain DSM 45975,1000566,1000566.3,Pseudonocardiales,Pseudonocardiaceae,Halosaccharopolyspora,1000566.3,fig|1000566.3.peg.1406,atgagctccaccttccagcagctcgccgattcggtcctcgtcggga...,MSSTFQQLADSVLVGTPATHDDALAVLRADEAELMSLVAAAGRLRR...
1,gi|1124504080|ref|NZ_FOJD01000002.1|,0.021,37.4,87.5,32,100225,Austwickia chelonae,Austwickia chelonae,539733,539763,...,Austwickia chelonae strain DSM 44178,100225,100225.5,Micrococcales,Dermatophilaceae,Austwickia,100225.5,fig|100225.5.peg.659,atgaccaccctcgatgtgaccgagacgaccggtacctccgagacgg...,MTTLDVTETTGTSETADLQHDGPTLDLTAMVEAGLAGRSITREQAM...
2,gi|1441077360|ref|NZ_QQZY01000004.1|,1.55e-09,60.8,69.484,213,1002870,Gaiella occulta,Gaiella occulta,267788,267995,...,Gaiella occulta strain F2-233,1002870,1002870.3,Gaiellales,Gaiellaceae,Gaiella,1002870.3,fig|1002870.3.peg.2128,atgcgtgacacacttcccccgatgcctgcagccgggtcgatccgcc...,MRDTLPPMPAAGSIRHDWTLREIEEIHALPLPELLFRAQSVHRAHH...
3,gi|357407371|ref|NC_016113.1|,9.11e-05,47.3,75.294,85,1003195,Streptantibioticus cattleyicolor NRRL 8057 = D...,Streptantibioticus cattleyicolor NRRL 8057 = D...,1386387,1386469,...,Streptomyces cattleya NRRL 8057 = DSM 46488,1003195,1003195.11,Streptomycetales,Streptomycetaceae,Streptantibioticus,1003195.11,fig|1003195.11.peg.1990,atggacctgctgaagacgctggtggacaagggactgcggggcgagt...,MDLLKTLVDKGLRGESPTREEALAVLATSDDELLDVVAAAGRVRRR...
4,gi|357407371|ref|NC_016113.1|,9.11e-05,47.3,75.294,85,1003195,Streptantibioticus cattleyicolor NRRL 8057 = D...,Streptantibioticus cattleyicolor NRRL 8057 = D...,1386387,1386469,...,Streptomyces cattleya NRRL 8057 = DSM 46488,1003195,1003195.11,Streptomycetales,Streptomycetaceae,Streptantibioticus,1003195.11,fig|1003195.11.peg.1258,atgcaactactcgacaccctcgtcggcaaggcgttacgccgcgaaa...,MQLLDTLVGKALRRETPTREEALAVLRTEDDDLLDVVAAAFRVRHH...


In [241]:
duplicate_genes_to_drop = [
    "fig|1123320.3.peg.4007",
    "fig|1381558.3.peg.5909",
    "fig|1530042.3.peg.5524"
    "fig|1894969.3.peg.5036",
    "fig|2017.4.peg.2522",
    "fig|2339229.3.peg.6990",
    "fig|2697569.3.peg.1071",
    "fig|2697569.3.peg.1072",
    "fig|2697569.3.peg.1073",
    "fig|2705253.3.peg.6283",
    "fig|2705254.3.peg.6198",
    "fig|469383.5.peg.1574",
    "fig|47989.4.peg.5630",
    "fig|516124.4.peg.3448",
    "fig|556532.3.peg.2151",
    "fig|589240.3.peg.7642",
    "fig|795644.3.peg.4543",
    "fig|795644.3.peg.4542",
    "fig|861266.6.peg.2194",
    "fig|1530042.3.peg.5524",
    "fig|1894969.3.peg.5036"
    ]

In [242]:
# biob_subset[biob_subset.duplicated(subset=['Genome Name'], keep=False)][
#     ['seqid', 'bitscore', 'pident', 'length', 'sstart', 'ssend', 'Genome Name', 'feature.genome_id', 'feature.patric_id']].tail(n=10)

In [243]:
biob_subset = biob_subset[~biob_subset['feature.patric_id'].isin(duplicate_genes_to_drop)].reset_index(drop=True)

In [244]:
# biob_subset[biob_subset['Genome Name'].duplicated(keep=False)]
biob_subset = biob_subset.drop_duplicates(subset=['Genome Name'])

In [245]:
biob_subset.isna().sum().sum()

0

In [246]:
# for i in biob_subset['Order'].unique():
#     print(i)
# biob_subset['Order'].value_counts()


biob_ind_orders = [
    "Streptomycetales", 
    "Micromonosporales",
    "Micrococcales",
    "Pseudonocardiales",
    "Propionibacteriales",
    "Streptosporangiales", 
    "Glycomycetales", 
    "Cryptosporangiales", 
    "Acidimicrobiales"]

In [247]:
biob_subset = biob_subset[biob_subset['Order'].isin(biob_ind_orders)]

In [248]:
biob_subset = biob_subset[biob_subset['length'] > 100]

In [278]:
biob_subset['feature.na_sequence'].apply(len).describe()

count     311.000000
mean     1003.533762
std        34.677193
min       477.000000
25%       996.000000
50%       996.000000
75%      1017.000000
max      1047.000000
Name: feature.na_sequence, dtype: float64

In [249]:
biob_subset = biob_subset.drop_duplicates(subset=['Genome Name']).reset_index(drop=True)

In [250]:
ecoli_protein = "MAHRPRWTLSQVTELFEKPLLDLLFEAQQVHRQHFDPRQVQVSTLLSIKTGACPEDCKYCPQSSRYKTGLEAERLMEVEQVLESARKAKAAGSTRFCMGAAWKNPHERDMPYLEQMVQGVKAMGLEACMTLGTLSESQAQRLANAGLDYYNHNLDTSPEFYGNIITTRTYQERLDTLEKVREAGIKVCSGGIVGLGETVKDRAGLLLQLANLPTPPESVPINMLVKVKGTPLADNDDVDAFDFIRTIAVARIMMPTSYVRLSAGREQMNEQTQAMCFMAGANSIFYGCKLLTTPNPEEDKDLQLFRKLGLNPQQTAVLAGDNEQQQRLEQALMTPDTDEYYNAAAL"
ecoli_gene_len = len(ecoli_protein)*3

biob_subset = biob_subset[biob_subset['feature.na_sequence'].apply(len) < 1050]

In [251]:
subset_cols_of_interest = ['Genome Name', 'NCBI Taxon ID', 'Genome ID', 
                           'seqid', 'Order', 'Family', 'Genus', 'feature.patric_id', 
                           'scomname', 'bitscore', 'pident', 'sstart', 'ssend', 'length',
                           'feature.na_sequence', 'feature.aa_sequence']

biob_subset[subset_cols_of_interest].to_csv("motif/biob_independent_seqs_filter2.csv", index=False)

In [252]:
# write_df_to_fasta(biob_subset, 'feature.na_sequence', 'Genome Name', "motif/putative2_biob_independent_seqs_nt.fasta")
# write_df_to_fasta(biob_subset, 'feature.aa_sequence', 'Genome Name', "motif/putative2_biob_independent_seqs_filter2_aa.fasta")
write_df_to_fasta(biob_subset, 'feature.aa_sequence', 'Genome Name', "motif/putative2_biob_independent_seqs_filter3_aa.fasta")


In [253]:
# biob_subset['Genus'].unique()

In [211]:
biob_subset[biob_subset['Order'] == 'Micrococcales']

Unnamed: 0,seqid,evalue,bitscore,pident,length,staxid,ssname,scomname,sstart,ssend,...,Genome Name,NCBI Taxon ID,Genome ID,Order,Family,Genus,feature.genome_id,feature.patric_id,feature.na_sequence,feature.aa_sequence
40,gi|499073258|ref|NZ_HF570956.1|,1.16e-169,592.0,73.887,1011,1193181,Phycicoccus elongatus Lp2,Phycicoccus elongatus Lp2,2909282,2910283,...,Tetrasphaera elongata Lp2,1193181,1193181.3,Micrococcales,Intrasporangiaceae,Phycicoccus,1193181.3,fig|1193181.3.peg.715,atgaccacgacggcggaaaccacgaccgacatcctcgacgtcgccc...,MTTTAETTTDILDVARAQVLERGEALTQAQIVEVLRTGDDRLADLL...
42,gi|1910128291|ref|NZ_JACXBW010000017.1|,6.71e-156,548.0,74.568,869,121292,Pseudarthrobacter sulfonivorans,Pseudarthrobacter sulfonivorans,1496,2357,...,Pseudarthrobacter sulfonivorans strain ALL,121292,121292.7,Micrococcales,Micrococcaceae,Pseudarthrobacter,121292.7,fig|121292.7.peg.1477,atgacgatgcagccagacctccaggccaaccccaacaccatcctcg...,MTMQPDLQANPNTILETARARVLEQGMGLSESQLVEILRLPDDALP...
96,gi|1148759302|ref|NZ_KV908325.1|,9.87e-172,600.0,76.319,853,1531955,Sinomonas mesophila,Sinomonas mesophila,52898,52050,...,Sinomonas mesophila strain MPKL 26,1531955,1531955.3,Micrococcales,Micrococcaceae,Sinomonas,1531955.3,fig|1531955.3.peg.1813,atgaccagcggctacgccattctcgagacggcccggaagcaggtcc...,MTSGYAILETARKQVLHEGRGLREAQILEVLNLPDEAIPAALQLAH...
100,gi|1905952393|ref|NZ_BMKV01000004.1|,6.1199999999999994e-124,442.0,70.672,982,158897,Pseudarthrobacter scleromae,Pseudarthrobacter scleromae,515291,516262,...,Pseudarthrobacter scleromae strain CGMCC 1.3601,158897,158897.4,Micrococcales,Micrococcaceae,Pseudarthrobacter,158897.4,fig|158897.4.peg.2682,atgacaaccatttcccgcaccacggaatatccaattctggagaccg...,MTTISRTTEYPILETARQQVLEDGIGLTEAQLVEVLRLPDPAVPAA...
106,gi|914681903|ref|NZ_LAIR01000002.1|,6.94e-168,587.0,73.851,979,1631356,Luteipulveratus halotolerans,Luteipulveratus halotolerans,3964322,3963350,...,Luteipulveratus halotolerans strain C296001,1631356,1631356.5,Micrococcales,Dermacoccaceae,Luteipulveratus,1631356.5,fig|1631356.5.peg.4010,gtgacctcgaccacgacgacgacgtccatcctcgaccgcgcccgcg...,MTSTTTTTSILDRAREQVLDRGEALDEQQILEVLQTSDDQLEPLLA...
109,gi|1823028355|ref|NZ_JAAOIV010000013.1|,2.1600000000000003e-160,563.0,74.646,919,1656884,Metallococcus carri,Metallococcus carri,108669,107766,...,Calidifontibacter sp. DB0510,1656884,1656884.3,Micrococcales,Dermacoccaceae,Metallococcus,1656884.3,fig|1656884.3.peg.1085,atgagcgagaccaccgcgcccgcgtccgtgctcgatccggctcgtg...,MSETTAPASVLDPAREVLDRARETVLVRGEPLGYADLVEILRTPDE...
110,gi|1721967527|ref|NZ_VCQV01000004.1|,4.28e-146,516.0,73.118,930,1660198,Leekyejoonella antrihumi,Leekyejoonella antrihumi,60703,59782,...,Dermacoccaceae bacterium C5-26,1660198,1660198.3,Micrococcales,Dermacoccaceae,Leekyejoonella,1660198.3,fig|1660198.3.peg.3462,atgacttcgaccaccaatgtcacctccatcctcgaccgcgcacgcg...,MTSTTNVTSILDRARDRVLVQGERLGYDDLVEVLQTGDDQLEDLLA...
111,gi|1852738371|ref|NZ_JABTYH010000004.1|,3.31e-134,476.0,73.309,828,1671,Pseudarthrobacter oxydans,Pseudarthrobacter oxydans,161419,162243,...,Pseudarthrobacter oxydans strain USM2,1671,1671.13,Micrococcales,Micrococcaceae,Pseudarthrobacter,1671.13,fig|1671.13.peg.3183,atgacaaccatttcccgcaccacggaatacccaatcctggagacgg...,MTTISRTTEYPILETARQQVLEGGIGLAEAQLLEVLRLPDPAVPAA...
112,gi|1906036206|ref|NZ_BMKU01000006.1|,7.23e-117,417.0,71.347,876,1676,Pseudarthrobacter polychromogenes,Pseudarthrobacter polychromogenes,121353,120488,...,Pseudarthrobacter polychromogenes strain CGMCC...,1676,1676.6,Micrococcales,Micrococcaceae,Pseudarthrobacter,1676.6,fig|1676.6.peg.2354,atgacaaccatttcccccaccacggaatatccaatcctggagaccg...,MTTISPTTEYPILETARQQVLEDGIGLTEAQLLEVLRLPDPAVPAA...
114,gi|1317271524|ref|NZ_PJNE01000001.1|,7.07e-173,603.0,74.419,989,173053,Phycicoccus duodecadis,Phycicoccus duodecadis,1986459,1985481,...,Tetrasphaera duodecadis strain DSM 12806,173053,173053.3,Micrococcales,Intrasporangiaceae,Phycicoccus,173053.3,fig|173053.3.peg.1892,atgacggctcaggtgaccggaaccgacatcctcgacgtcgcgcgcg...,MTAQVTGTDILDVAREQVLERGVALTHEQIVAVLETGDDRLQDLLA...


In [56]:
biob_subset.shape

(344, 21)

In [97]:
# biob_subset[biob_subset['Family'].str.contains('Pseudonocardiaceae')]

In [98]:

# biob_dep[biob_dep['Family'].str.contains('Pseudonocardiaceae')]

In [152]:
# biob_subset[subset_cols_of_interest]

# biob_subset[biob_subset['Genus'].str.contains('Pedo')]
# biob_subset[biob_subset['Family'] == 'Mycobacteriaceae']

biob_dep[biob_dep['Order'] == 'Micrococcales']

Unnamed: 0,seqid,evalue,bitscore,pident,length,staxid,ssname,scomname,sstart,ssend,...,Genome Name,NCBI Taxon ID,Genome ID,Order,Family,Genus,feature.genome_id,feature.patric_id,feature.na_sequence,feature.aa_sequence
349,gi|2179245779|ref|NZ_JAKEEC010000044.1|,1.61e-137,487.0,73.241,867,369936,Arthrobacter alkaliphilus,Arthrobacter alkaliphilus,2145,3001,...,Arthrobacter alkaliphilus ZX_2022a,369936,369936.3,Micrococcales,Micrococcaceae,Arthrobacter,369936.3,fig|369936.3.peg.3263,atgacgattcaggcagaccctcctgttaccagctacgccatcctcg...,MTIQADPPVTSYAILDTARKQVLEDGVGLTESQLVDVLRLPDDAVP...
424,gi|1869538579|ref|NZ_JACCAB010000001.1|,0.0,667.0,75.767,978,642776,Pedococcus badiiscoriae,Pedococcus badiiscoriae,2487278,2488251,...,Phycicoccus badiiscoriae strain DSM 23987,642776,642776.3,Micrococcales,Intrasporangiaceae,Pedococcus,642776.3,fig|642776.3.peg.2394,atgacaattgaggagtcggttgccagcgacgtcctgcaggtcgctc...,MTIEESVASDVLQVAREQVLDRGEPLSHGQILDVLRTPDDQLEELL...
430,gi|1595894816|ref|NZ_SMZQ01000012.1|,2.3399999999999998e-161,565.0,73.684,969,683150,Arthrobacter nitrophenolicus,Arthrobacter nitrophenolicus,28527,27568,...,Arthrobacter nitrophenolicus strain S-A1,683150,683150.17,Micrococcales,Micrococcaceae,Arthrobacter,683150.17,fig|683150.17.peg.367,atgacgattcaaccgcaactcctgaagcatccaatccttgagaccg...,MTIQPQLLKHPILETAREQVLEQGRGLSEAQLVEVLRLPDEALPAA...


## generate biob bsap dependent dataset:

In [254]:
## generate a proximity filter between biob and bsap positions:
bsap = bsap.merge(biob[biob['length'] > 100][['staxid', 'pident', 'length', 'sstart', 'ssend']], on='staxid', suffixes=("_bsap", "_biob"))

In [255]:
def get_feature_proximity(row):
    bsap_start = min(row.sstart_bsap, row.ssend_bsap)
    bsap_end = max(row.sstart_bsap, row.ssend_bsap)
    
    biob_start = min(row.sstart_biob, row.ssend_biob)
    biob_end = max(row.sstart_biob, row.ssend_biob)
    
    
    if biob_start < bsap_start:
        prox = biob_start - biob_end
        
    elif bsap_start <= biob_start:
        prox = biob_start - bsap_end
        
    return prox

In [256]:
bsap['proximity'] = bsap.apply(get_feature_proximity, axis=1)

In [257]:
bsap = bsap[bsap['proximity'].abs() < 1000]

In [258]:
biob_dep = biob[biob['staxid'].isin(bsap['staxid'].unique())].reset_index(drop=True).copy()

# genome_map.head()
genome_map = genome_map[['Genome Name', 'NCBI Taxon ID', 'Genome ID', 'Order', 'Family', 'Genus']]

biob_dep = biob_dep.merge(genome_map, how='left', left_on='staxid', right_on='NCBI Taxon ID')

In [259]:
biob_independent_genus_list = biob_subset['Genus'].unique()

biob_dep = biob_dep[~biob_dep['Genus'].isin(biob_independent_genus_list)]

In [260]:
biob_dep = biob_dep.merge(biob_patric[['feature.genome_id', 'feature.patric_id', 'feature.na_sequence', 'feature.aa_sequence']],
                 how='left', left_on='Genome ID', right_on='feature.genome_id')

In [261]:
bsap_filter_df = bsap.merge(genome_map, how='left', left_on='staxid', right_on='NCBI Taxon ID').merge(
    biob_patric[['feature.genome_id', 'feature.patric_id', 'feature.na_sequence', 'feature.aa_sequence']],
                 how='left', left_on='Genome ID', right_on='feature.genome_id').copy()

In [262]:
duplicate_genes_to_drop = [
    "fig|106370.16.peg.4543",
    "fig|111802.3.peg.9038",
    "fig|1120935.3.peg.6134",
    "fig|1123023.3.peg.5886",
    "fig|1202450.10.peg.2450",
    "fig|1206731.4.peg.7885",
    "fig|1206741.4.peg.4760",
    "fig|1210062.4.peg.6350",
    "fig|1210064.5.peg.5593",
    "fig|1210084.4.peg.9515",
    "fig|1220561.3.peg.4400",
    "fig|1220564.3.peg.6715",
    "fig|1238180.7.peg.744",
    "fig|1265311.3.peg.2802",
    "fig|1490222.3.peg.8831",
    "fig|1719.1356.peg.1786"
    ]

In [263]:
biob_dep['pident'].describe()

count    466.000000
mean      80.894691
std        3.591385
min       67.460000
25%       78.934250
50%       81.103000
75%       83.210000
max       92.944000
Name: pident, dtype: float64

In [264]:
# bsap_filter_df[bsap_filter_df['Genome Name'].isin(biob_dep[biob_dep['Genome Name'].duplicated(keep=False)]['Genome Name'].unique())][
#     ['seqid', 'bitscore', 'pident', 'length', 'sstart', 'ssend', 'Genome Name', 'feature.genome_id', 'feature.patric_id']].reset_index(drop=True).loc[30:41]

# biob_dep[biob_dep['Genome Name'].duplicated(keep=False)][
#     ['seqid', 'bitscore', 'pident', 'length', 'sstart', 'ssend', 'Genome Name', 'feature.genome_id', 'feature.patric_id']].reset_index(drop=True).loc[30:41]

In [265]:
biob_dep = biob_dep[~biob_dep['feature.patric_id'].isin(duplicate_genes_to_drop)].dropna().drop_duplicates(subset=['Genome ID']).reset_index(drop=True)

In [266]:
biob_dep[biob_dep['Genome Name'].duplicated(keep=False)]

Unnamed: 0,seqid,evalue,bitscore,pident,length,staxid,ssname,scomname,sstart,ssend,...,Genome Name,NCBI Taxon ID,Genome ID,Order,Family,Genus,feature.genome_id,feature.patric_id,feature.na_sequence,feature.aa_sequence


In [267]:
biob_dep.shape

(425, 21)

In [268]:
biob_dep.head()

Unnamed: 0,seqid,evalue,bitscore,pident,length,staxid,ssname,scomname,sstart,ssend,...,Genome Name,NCBI Taxon ID,Genome ID,Order,Family,Genus,feature.genome_id,feature.patric_id,feature.na_sequence,feature.aa_sequence
0,gi|1360579553|ref|NZ_CP027433.1|,0.0,836.0,78.926,987,1004901,Gordonia iterans,Gordonia iterans,2728595,2727610,...,Gordonia iterans strain Co17,1004901,1004901.3,Corynebacteriales,Gordoniaceae,Gordonia,1004901.3,fig|1004901.3.peg.2573,gtgaccaccgtcgacgaggcccccgcggccgccaccgaccagaccg...,MTTVDEAPAAATDQTESDILVVAREQVLECGEALRYEQILDVLRLP...
1,gi|400537151|ref|NZ_AFVW02000007.1|,0.0,1290.0,87.226,1049,1041522,Mycobacterium colombiense CECT 3035,Mycobacterium colombiense CECT 3035,261638,260602,...,Mycobacterium colombiense CECT 3035,1041522,1041522.3,Corynebacteriales,Mycobacteriaceae,Mycobacterium,1041522.3,fig|1041522.3.peg.3658,gtgactcaagcggcgacgcgaccgacggccgaagccggcagcgacg...,MTQAATRPTAEAGSDEDILAVARQQVLQDGQGLSRDQVLRVLQLPD...
2,gi|2228921663|ref|NZ_JALKBW010000001.1|,0.0,884.0,79.838,987,1053547,Gordonia alkaliphila,Gordonia alkaliphila,3497259,3496273,...,Gordonia alkaliphila WW102,1053547,1053547.23,Corynebacteriales,Gordoniaceae,Gordonia,1053547.23,fig|1053547.23.peg.3220,atgaccatcaccaccgcctccccgtcgatcgacgacaccgccgccg...,MTITTASPSIDDTAADRDILDVARTQVLDEGVGLNYDQLLQVLNLD...
3,gi|86738724|ref|NC_007777.1|,0.0,646.0,75.368,950,106370,Frankia casuarinae,Frankia casuarinae,4512150,4513092,...,Frankia sp. CcI3,106370,106370.16,Frankiales,Frankiaceae,Frankia,106370.16,fig|106370.16.peg.4098,gtgctcgccgtgctccggctcccggacgagacgctgaccgatctgc...,MLAVLRLPDETLTDLLALAHEVRMRWCGPEVEVEGIVSLKTGGCPE...
4,gi|487404592|ref|NZ_ARVW01000001.1|,0.0,782.0,78.452,956,1068980,Amycolatopsis nigrescens CSC17Ta-90,Amycolatopsis nigrescens CSC17Ta-90,5988697,5989649,...,Amycolatopsis nigrescens CSC17Ta-90,1068980,1068980.3,Pseudonocardiales,Pseudonocardiaceae,Amycolatopsis,1068980.3,fig|1068980.3.peg.5690,gtgaccgcagcacccggacaggccgagcaggtccaccgcgccgacg...,MTAAPGQAEQVHRADADVLAVAREQVLERGTGLGEQQLLEVLRLGD...


In [269]:
biob_dep['feature.na_sequence'].apply(len).describe()

count     425.000000
mean     1016.188235
std        57.653506
min       189.000000
25%      1005.000000
50%      1014.000000
75%      1038.000000
max      1101.000000
Name: feature.na_sequence, dtype: float64

In [271]:
biob_dep = biob_dep[biob_dep['feature.na_sequence'].apply(len) < 1050]

In [279]:
biob_dep[subset_cols_of_interest].to_csv("motif/biob_dependent_seqs.csv", index=False)

In [273]:
# write_df_to_fasta(biob_dep, 'feature.na_sequence', 'Genome Name', "motif/putative2_biob_dependent_seqs_nt.fasta")
write_df_to_fasta(biob_dep, 'feature.aa_sequence', 'Genome Name', "motif/putative2_biob_dependent_seqs_aa_filter3.fasta")


## hack together an alternate tremmer lm file to drop out the long biob copies

In [78]:
biob_subset.shape

(344, 21)

In [81]:
biob_subset['feature.na_sequence'].apply(len).describe()

count     344.000000
mean     1013.843023
std        48.426420
min       477.000000
25%       996.000000
50%       996.000000
75%      1020.000000
max      1194.000000
Name: feature.na_sequence, dtype: float64

In [82]:
lm = pd.read_csv("treemmer/biob_family_lm.csv")

In [85]:
biob_ind_lm_edit = biob_subset[['Genome Name', 'Family']]

In [88]:
biob_ind_lm_edit['name'] = biob_ind_lm_edit['Genome Name'].str.replace(" ", "_")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [89]:
biob_ind_lm_edit.head()

Unnamed: 0,Genome Name,Family,name
0,Nonomuraea soli strain DSM 45533,Streptosporangiaceae,Nonomuraea_soli_strain_DSM_45533
1,Actinoplanes atraurantiacus strain CGMCC 4.6857,Micromonosporaceae,Actinoplanes_atraurantiacus_strain_CGMCC_4.6857
2,Kutzneria kofuensis strain DSM 43851,Pseudonocardiaceae,Kutzneria_kofuensis_strain_DSM_43851
3,Actinoplanes rishiriensis strain NBRC 108556,Micromonosporaceae,Actinoplanes_rishiriensis_strain_NBRC_108556
4,Salinispora pacifica DSM 45546,Micromonosporaceae,Salinispora_pacifica_DSM_45546


In [97]:
biob_ind_lm_edit = lm.merge(biob_ind_lm_edit, how='left', on='name', suffixes=('_old', ''))

In [105]:
biob_ind_lm_edit = biob_ind_lm_edit[['name', 'Family']]

In [109]:
biob_ind_lm_edit['Family'] = biob_ind_lm_edit['Family'].fillna('drop')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [111]:
biob_ind_lm_edit.to_csv('treemmer/biob_family_lm_independentEDIT.csv', index=False)

In [114]:
indlist = pd.read_csv("motif/biob_independent_full_consensus_tree.nwk_trimmed_list_X_100", header=None)

In [119]:
indlist.shape

(100, 1)

In [121]:
biob_ind_lm_edit.merge(indlist, left_on='name', right_on=0)['Family'].isna().sum()

0

## read in aa msas and generate base frequency matrix

In [283]:
# ind = read_in_fasta("motif/putative2_biob_independent_seqs_aa.mfa")
# dep = read_in_fasta("motif/putative2_biob_dependent_seqs_aa.mfa")

# ind = read_in_fasta("motif/putative2_biob_independent_seqs_nt.mfa")
# dep = read_in_fasta("motif/putative2_biob_dependent_seqs_nt.mfa")


In [282]:
# dep_seq = pd.DataFrame(dep['seq'].apply(str).apply(lambda x: list(itertools.chain.from_iterable(x))).tolist(), index=dep['id'])
# ind_seq = pd.DataFrame(ind['seq'].apply(str).apply(lambda x: list(itertools.chain.from_iterable(x))).tolist(), index=ind['id'])



In [None]:
biob_subset

In [None]:
# write_df_to_fasta(biob_dep, 'feature.na_sequence', 'Genome Name', "motif/putative2_biob_dependent_seqs_nt.fasta")


In [284]:
# dep_seq

In [285]:
# ind_seq

## write Tremmer subset100 independent and dependent biob copies to a merged fasta:

In [288]:
# ind = pd.read_csv("motif/biob_independent_full_consensus_tree.nwk_trimmed_list_X_100", header=None)

# ind = pd.read_csv("motif/biob_independent_full_edit_consensus_tree.nwk_trimmed_list_X_100", header=None)
# dep = pd.read_csv("motif/biob_dependent_full_consensus_tree.nwk_trimmed_list_X_100", header=None)

ind = pd.read_csv("motif/putative2_biob_independent_seqs_aa_filter3_consensus.nwk_trimmed_list_X_100", header=None)
dep = pd.read_csv("motif/putative2_biob_dependent_seqs_aa_filter3_consensus.nwk_trimmed_list_X_100", header=None)

In [289]:
merged_df_ind = biob_subset[biob_subset['Genome Name'].isin(ind[0].str.replace("_", " "))].copy()
merged_df_dep = biob_dep[biob_dep['Genome Name'].isin(dep[0].str.replace("_", " "))].copy()

merged_df_ind['type'] = 'ind'
merged_df_dep['type'] = 'dep'

merged_df = pd.concat([merged_df_ind, merged_df_dep]).reset_index(drop=True)

In [290]:
merged_df.head()

Unnamed: 0,seqid,evalue,bitscore,pident,length,staxid,ssname,scomname,sstart,ssend,...,NCBI Taxon ID,Genome ID,Order,Family,Genus,feature.genome_id,feature.patric_id,feature.na_sequence,feature.aa_sequence,type
0,gi|1884111766|ref|NZ_JACDUR010000005.1|,1.47e-180,631.0,75.362,966,1032476,Nonomuraea soli,Nonomuraea soli,1028404,1029363,...,1032476,1032476.3,Streptosporangiales,Streptosporangiaceae,Nonomuraea,1032476.3,fig|1032476.3.peg.5624,gtgacgatcctcgacatcgcccgcacccaggtcctggagcagggca...,MTILDIARTQVLEQGKGLDREQALRCLLLDDALLPDLLALAHEVRM...,ind
1,gi|1254547583|ref|NZ_OBDY01000010.1|,1.22e-175,614.0,74.92,937,1036182,Actinoplanes atraurantiacus,Actinoplanes atraurantiacus,77996,77063,...,1036182,1036182.3,Micromonosporales,Micromonosporaceae,Actinoplanes,1036182.3,fig|1036182.3.peg.2302,atgcccgagatcctcgagatggcccgtgcccaggttctcgccggcg...,MPEILEMARAQVLAGGAGLGEAQILEVLRLGDDHLPELLQLAHDVR...,ind
2,gi|1892753580|ref|NZ_JACHIR010000001.1|,0.0,753.0,77.409,965,103725,Kutzneria kofuensis,Kutzneria kofuensis,5947414,5948375,...,103725,103725.4,Pseudonocardiales,Pseudonocardiaceae,Kutzneria,103725.4,fig|103725.4.peg.5640,gtgaccgccagcgtggacaccgacatcatcgccgtcgcccgtgagc...,MTASVDTDIIAVAREQVLERGEGLSQEQLLQVLKLGDDRLAELLQL...,ind
3,gi|1892940858|ref|NZ_JACHGN010000004.1|,0.0,725.0,76.737,993,1073253,Thermocatellispora tengchongensis,Thermocatellispora tengchongensis,576796,577782,...,1073253,1073253.3,Streptosporangiales,Streptosporangiaceae,Thermocatellispora,1073253.3,fig|1073253.3.peg.2482,gtggacattctgaccgtcgcccgcgaccaggtgctcgaccggggcg...,MDILTVARDQVLDRGEGLTAEQALECLRLPDERLPELLALAHEVRV...,ind
4,gi|1850378389|ref|NZ_AP022870.1|,0.0,655.0,75.602,955,1076124,Phytohabitans flavus,Phytohabitans flavus,9487101,9486152,...,1076124,1076124.3,Micromonosporales,Micromonosporaceae,Phytohabitans,1076124.3,fig|1076124.3.peg.9236,atgccagagatcctcgacatcgctcgtgcccaggtgctcgaacgcg...,MPEILDIARAQVLERGTGLDEAGVLAVLRLPDEHVPDALQLAHEVR...,ind


In [291]:
merged_df.columns

Index(['seqid', 'evalue', 'bitscore', 'pident', 'length', 'staxid', 'ssname',
       'scomname', 'sstart', 'ssend', 'sseq', 'Genome Name', 'NCBI Taxon ID',
       'Genome ID', 'Order', 'Family', 'Genus', 'feature.genome_id',
       'feature.patric_id', 'feature.na_sequence', 'feature.aa_sequence',
       'type'],
      dtype='object')

In [292]:
merged_df

Unnamed: 0,seqid,evalue,bitscore,pident,length,staxid,ssname,scomname,sstart,ssend,...,NCBI Taxon ID,Genome ID,Order,Family,Genus,feature.genome_id,feature.patric_id,feature.na_sequence,feature.aa_sequence,type
0,gi|1884111766|ref|NZ_JACDUR010000005.1|,1.470000e-180,631.0,75.362,966,1032476,Nonomuraea soli,Nonomuraea soli,1028404,1029363,...,1032476,1032476.3,Streptosporangiales,Streptosporangiaceae,Nonomuraea,1032476.3,fig|1032476.3.peg.5624,gtgacgatcctcgacatcgcccgcacccaggtcctggagcagggca...,MTILDIARTQVLEQGKGLDREQALRCLLLDDALLPDLLALAHEVRM...,ind
1,gi|1254547583|ref|NZ_OBDY01000010.1|,1.220000e-175,614.0,74.920,937,1036182,Actinoplanes atraurantiacus,Actinoplanes atraurantiacus,77996,77063,...,1036182,1036182.3,Micromonosporales,Micromonosporaceae,Actinoplanes,1036182.3,fig|1036182.3.peg.2302,atgcccgagatcctcgagatggcccgtgcccaggttctcgccggcg...,MPEILEMARAQVLAGGAGLGEAQILEVLRLGDDHLPELLQLAHDVR...,ind
2,gi|1892753580|ref|NZ_JACHIR010000001.1|,0.000000e+00,753.0,77.409,965,103725,Kutzneria kofuensis,Kutzneria kofuensis,5947414,5948375,...,103725,103725.4,Pseudonocardiales,Pseudonocardiaceae,Kutzneria,103725.4,fig|103725.4.peg.5640,gtgaccgccagcgtggacaccgacatcatcgccgtcgcccgtgagc...,MTASVDTDIIAVAREQVLERGEGLSQEQLLQVLKLGDDRLAELLQL...,ind
3,gi|1892940858|ref|NZ_JACHGN010000004.1|,0.000000e+00,725.0,76.737,993,1073253,Thermocatellispora tengchongensis,Thermocatellispora tengchongensis,576796,577782,...,1073253,1073253.3,Streptosporangiales,Streptosporangiaceae,Thermocatellispora,1073253.3,fig|1073253.3.peg.2482,gtggacattctgaccgtcgcccgcgaccaggtgctcgaccggggcg...,MDILTVARDQVLDRGEGLTAEQALECLRLPDERLPELLALAHEVRV...,ind
4,gi|1850378389|ref|NZ_AP022870.1|,0.000000e+00,655.0,75.602,955,1076124,Phytohabitans flavus,Phytohabitans flavus,9487101,9486152,...,1076124,1076124.3,Micromonosporales,Micromonosporaceae,Phytohabitans,1076124.3,fig|1076124.3.peg.9236,atgccagagatcctcgacatcgctcgtgcccaggtgctcgaacgcg...,MPEILDIARAQVLERGTGLDEAGVLAVLRLPDEHVPDALQLAHEVR...,ind
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,gi|2175968702|ref|NZ_CP090063.1|,0.000000e+00,744.0,77.503,969,715473,Amycolatopsis acidiphila,Amycolatopsis acidiphila,6252312,6251348,...,715473,715473.5,Pseudonocardiales,Pseudonocardiaceae,Amycolatopsis,715473.5,fig|715473.5.peg.6308,gtgacctcagcacccgaacagaccgacgtggtcgccgtcgcccggg...,MTSAPEQTDVVAVAREQVLEQGIGLSQRQVLDVLRLADDRLSELLA...,dep
188,gi|1697805133|ref|NZ_BJNG01000006.1|,0.000000e+00,866.0,80.144,972,76726,Pseudonocardia hydrocarbonoxydans,Pseudonocardia hydrocarbonoxydans,46003,45037,...,76726,76726.4,Pseudonocardiales,Pseudonocardiaceae,Pseudonocardia,76726.4,fig|76726.4.peg.840,atggccgaagccgccgccatcctcgccaccgcccgcacccgggtgc...,MAEAAAILATARTRVLEEGVGLDEAQVLEVLRLPDEALDDLLQLAH...,dep
189,gi|1986697868|ref|NZ_BOOC01000053.1|,0.000000e+00,673.0,75.635,985,83302,Microbispora corallina,Microbispora corallina,20894,21873,...,83302,83302.3,Streptosporangiales,Streptosporangiaceae,Microbispora,83302.3,fig|83302.3.peg.7395,gtgaacactgacatcgtggagatcgcccgcgtccaggtgctcgagg...,MNTDIVEIARVQVLEEGRGLDAAQALECLTLPDDRLPELLALAHEV...,dep
190,gi|1816837986|ref|NZ_BLKW01000004.1|,0.000000e+00,1098.0,84.438,996,84962,Mycobacterium botniense,Mycobacterium botniense,1033100,1034095,...,84962,84962.6,Corynebacteriales,Mycobacteriaceae,Mycobacterium,84962.6,fig|84962.6.peg.3194,gtgacgcaggcgatgacccggccggccgccgatgacagcaacaccg...,MTQAMTRPAADDSNTDVLAVARRQVLELGKGLTRDQVLQVLQLPDE...,dep


In [293]:
write_df_to_fasta(merged_df, 'feature.aa_sequence', 'Genome Name', "motif/biob_merged_subset200_filter3.fasta")


In [294]:
merged_df = merged_df[['Genome Name', 'staxid', 'Order', 'Family', 'Genus', 'type']]

In [295]:
# add manually added data:

data = [
    ["Escherichia coli CFT073", 199310, "Enterobacterales", "Enterobacteriaceae", "Escherichia", "ind"],
    ["Mycobacterium smegmatis MC2155", 246196, "Corynebacteriales", "Mycobacteriaceae", "Mycobacterium", "dep"],   
    ["Mycobacterium tuberculosis H37Rv", 83332, "Corynebacteriales", "Mycobacteriaceae", "Mycobacterium", "dep"]
    ]

merged_df = merged_df.append(pd.DataFrame(data, columns=merged_df.columns)).reset_index(drop=True)

In [296]:
merged_df['name'] = merged_df['Genome Name'].str.replace(" ", "_")

In [297]:
merged_df.to_csv("motif/biob_merged_subset200_annotation_filter3.csv", index=False)

## read in msa made from merged independent/dependent biob dataset and split into seperate independent and dependent mfa files

In [72]:
df = read_in_fasta("motif/merged_biob_seqs_aa.mfa")


In [75]:
independent_species = ind['id'].tolist()
dependent_species = dep['id'].tolist()

In [76]:
df[df['id'].isin(independent_species)]

Unnamed: 0,id,seq
0,Ornithinimicrobium_tianjinense_strain_CGMCC_1....,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
8,Sphaerimonospora_thailandensis_strain_NBRC_107569,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
10,Conexibacter_sp._Seoho-28,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
11,Conexibacter_arvalis_strain_DSM_23288,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
12,Nitriliruptor_alkaliphilus_DSM_45188,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
...,...,...
804,Dactylosporangium_siamense_strain_NBRC_106093,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
805,Virgisporangium_aliadipatigenens_strain_NBRC_1...,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
806,Planosporangium_thailandense_strain_TBRC_5610,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
807,Planosporangium_mesophilum_strain_NBRC_109066,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."


In [77]:
df[df['id'].isin(dependent_species)]

Unnamed: 0,id,seq
1,Nocardia_sp._YIM_PH_21724,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
2,Kribbella_jejuensis_strain_DSM_17305,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
3,Nocardia_sp._ET3-3,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
4,Mycobacterium_haemophilum_DSM_44634_ATCC_29548,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
5,Mycobacterium_vulneris_strain_DSM_45247,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
...,...,...
610,Microbispora_sp._NEAU-HEGS1-5,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
611,Microbispora_camponoti_strain_2C-HV3,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
612,Streptosporangium_roseum_DSM_43021,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
614,Streptosporangium_subroseum_strain_CGMCC_4.2132,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."


In [80]:
write_df_to_fasta(df[df['id'].isin(dependent_species)], 'seq', 'id', 'merged_align_biob_dependent_seqs_aa.mfa')
write_df_to_fasta(df[df['id'].isin(independent_species)], 'seq', 'id', 'merged_align_biob_independent_seqs_aa.mfa')