In [3]:
import pandas as pd
import numpy as np
import os

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

In [None]:
def write_df_to_fasta(df, seq_col, name_col, fasta_out_path):

    sequences = []

    for i in df.index:

        seq_id = df.loc[i, name_col].replace(" ", "_")

        # sequences.append(SeqRecord(Seq(df.loc[i, seq_col]), id=seq_id, description=""))
        sequences.append(SeqRecord(df.loc[i, seq_col], id=seq_id, description=""))

    with open(fasta_out_path, "w") as output_handle:
        SeqIO.write(sequences, output_handle, "fasta")

    return


def read_in_fasta(fasta_path):
    seqs = []

    for record in SeqIO.parse(fasta_path, "fasta"):

        seqs.append([record.id, record.seq])

    seqs = pd.DataFrame(seqs, columns=['id', 'seq'])

    return seqs

## Get Kingdom level biob dataset and sample down to n=1000:

In [None]:
df = pd.read_csv("motif/Bacteria_bvbrc_genomes.csv")

df = df[['Genome ID', 'Genome Name', 'NCBI Taxon ID', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Genome Status', 'Reference', 'BioProject Accession']]
df = df.dropna(subset=['Kingdom'])

# df.to_csv("motif/bacteria_bvbrc_genomes_cleaned.csv", index=None)
# df[['NCBI Taxon ID']].to_csv("motif/bacteria_bvbrc_genomes_cleaned_taxid_list", index=None, header=None)

In [None]:
ncbi_taxid_dict = df[['Genome ID', 'NCBI Taxon ID']].set_index('Genome ID').to_dict()['NCBI Taxon ID']

In [None]:
biob = pd.read_csv("motif_prev/PGF_01400330_family_feature_list_full_seq.tsv", delimiter="\t")

In [None]:
biob = biob.merge(df[['Genome ID', 'Phylum', 'Class', 'Order', 'Family', 'Genus']], left_on='feature.genome_id', right_on='Genome ID', how='inner')
biob = biob.drop_duplicates(subset=['Genome ID'])

biob = biob.dropna(subset=['feature.na_sequence'])
biob = biob[biob['feature.na_sequence'].apply(len) > 500]

biob['class_weight'] = biob['Class'].map(biob['Class'].value_counts().to_dict())

In [None]:
# biob_subset = biob.sample(n=1000, weights='class_weight')

# biob_subset['feature.genome_name'] = biob_subset['feature.genome_name'].str.replace(" strain Not applicable", "")
# biob_subset['feature.genome_name'] = biob_subset['feature.genome_name'].str.replace(" strain not applicable", "")

# biob_subset['name'] = biob_subset['feature.genome_name'].str.replace(" ", "_")

# write_df_to_fasta(biob_subset, 'feature.aa_sequence', 'name', 'motif/bacteria_subset1000_biob_aa.fasta')

In [None]:
biob_subset = read_in_fasta("motif/bacteria_subset1000_biob_aa.fasta")

In [None]:
biob['id'] = biob['feature.genome_name'].str.replace(" strain Not applicable", "")
biob['id'] = biob['id'].str.replace(" strain not applicable", "")

biob['id'] = biob['id'].str.replace(" ", "_")

In [None]:
biob_subset = biob.merge(biob_subset, how='right', on='id').drop(columns=['feature.na_sequence', 'feature.aa_sequence', 'feature.genome_id', 'family'])

In [None]:
# biob_subset.to_csv("motif/bacteria_subset1000_biob_metadata.csv", index=False)

In [None]:
# biob_subset[['id', 'Family']].to_csv("motif/bacteria_subset1000_family_lm.csv", index=False)

## filter biob subset by treemmer 500x and 200x lists and write new fastas:

In [None]:
x500 = pd.read_csv("motif/bacteria_subset1000_biob_aa2.msa.contree_trimmed_list_X_500", names=['id'])
x200 = pd.read_csv("motif/bacteria_subset1000_biob_aa2.msa.contree_trimmed_list_X_200", names=['id'])

In [None]:
x500 = biob_subset[biob_subset['id'].isin(x500['id'])]
x200 = biob_subset[biob_subset['id'].isin(x200['id'])]

In [None]:
x200[['id', 'seq']].head()

Unnamed: 0,id,seq
11,Hydrogenophaga_pseudoflava_strain_DSM_1084,"(M, N, H, I, A, E, A, P, V, T, L, H, R, P, A, ..."
12,Thioalkalivibrio_sulfidophilus_HL-EbGr7,"(M, S, P, A, S, P, M, S, E, I, R, H, D, W, Q, ..."
19,Nocardiopsis_gilva_YIM_90087,"(M, I, F, V, M, V, K, F, D, A, L, A, D, K, A, ..."
22,Candidatus_Methylospira_mobilis_strain_Shm1,"(M, L, I, R, S, I, D, A, D, E, A, T, G, C, V, ..."
23,Thiomicrorhabdus_sp._13-15A,"(M, S, E, N, A, Q, T, S, Q, I, G, Q, I, R, H, ..."


In [None]:
# write_df_to_fasta(x500, 'seq', 'id', 'motif/bacteria_subset500_biob_aa.fasta')
# write_df_to_fasta(x200, 'seq', 'id', 'motif/bacteria_subset200_biob_aa.fasta')

## setup bsap independent/dependent categorization

In [None]:
blast_cols = ['seqid', 'evalue', 'bitscore', 'pident', 'length', 'staxid', 'ssname', 'scomname', 'sstart', 'ssend', 'sseq']

bsap_hits = pd.read_csv("motif/bsap_bvbrc_blastn_hits_1Oct2023.tsv", delimiter="\t", names=blast_cols, index_col=False)
biob_hits = pd.read_csv("motif/biob_bvbrc_blastn_hits_3Oct2023.tsv", delimiter="\t", names=blast_cols, index_col=False)

bsap_hits = bsap_hits[ (bsap_hits['length'] > 75) & (bsap_hits['pident'] > 50) ]
biob_hits = biob_hits[ (biob_hits['length'] > 100) ]

In [None]:
biob_subset['staxid'] = biob_subset['Genome ID'].map(ncbi_taxid_dict)

In [None]:
cols_to_merge = ['pident', 'length', 'staxid', 'ssname', 'sstart', 'ssend']
bsap_hits = biob_hits[cols_to_merge].merge(bsap_hits[cols_to_merge], how='inner', on='staxid', suffixes=("_biob", "_bsap"))

In [None]:
def get_feature_proximity(row):
    bsap_start = min(row.sstart_bsap, row.ssend_bsap)
    bsap_end = max(row.sstart_bsap, row.ssend_bsap)

    biob_start = min(row.sstart_biob, row.ssend_biob)
    biob_end = max(row.sstart_biob, row.ssend_biob)


    if biob_start < bsap_start:
        prox = biob_start - biob_end

    elif bsap_start <= biob_start:
        prox = biob_start - bsap_end

    return prox


In [None]:
bsap_hits['proximity'] = bsap_hits.apply(get_feature_proximity, axis=1)

In [None]:
bsap_hits = bsap_hits[bsap_hits['proximity'] < 1000]

In [None]:
bsap_hits.columns

Index(['pident_biob', 'length_biob', 'staxid', 'ssname_biob', 'sstart_biob',
       'ssend_biob', 'pident_bsap', 'length_bsap', 'ssname_bsap',
       'sstart_bsap', 'ssend_bsap', 'proximity'],
      dtype='object')

In [None]:
biob_subset['type'] = 'ind'
biob_subset.loc[biob_subset['staxid'].isin(bsap_hits['staxid']), 'type'] = 'dep'

In [None]:
biob_subset['type'].value_counts()

type
ind    931
dep     69
Name: count, dtype: int64

In [None]:
biob_subset[['id', 'type']].to_csv('motif/biob_subset1000_species_type.csv', index=False)

## seperate x200 and x500 datasets into independent and dependent msa files

In [None]:
dep_species_list = biob_subset[biob_subset['type'] == 'dep']['id'].tolist()

In [None]:
x500[x500['id'].isin(dep_species_list)].shape

(12, 11)

In [None]:
x200[x200['id'].isin(dep_species_list)].shape

(2, 11)

In [None]:
x503 = read_in_fasta("motif/bacteria_subset503_biob_aa.msa")

In [None]:
write_df_to_fasta(x503[x503['id'].isin(dep_species_list)], 'seq', 'id', 'motif/bacteria_subset503_biob_dependent_aa.msa')
write_df_to_fasta(x503[~x503['id'].isin(dep_species_list)], 'seq', 'id', 'motif/bacteria_subset503_biob_independent_aa.msa')

In [None]:
# biob_subset.append()


Index(['feature.genome_name', 'feature.patric_id', 'Genome ID', 'Phylum',
       'Class', 'Order', 'Family', 'Genus', 'class_weight', 'id', 'seq',
       'staxid', 'type'],
      dtype='object')

In [None]:
# write_df_to_fasta(x500, 'seq', 'id', 'motif/bacteria_subset500_biob_aa.fasta')
# write_df_to_fasta(x200, 'seq', 'id', 'motif/bacteria_subset200_biob_aa.fasta')

## set up labeling files for subset300 (for whitch bacteria x200 was cat-ted with a n100 subset of bsap-dependent biob copies:

In [4]:
import pandas as pd
import os
from google.colab import drive


In [8]:

drive.mount('/content/drive')


Mounted at /content/drive


In [14]:
x300 = pd.read_csv("/content/drive/MyDrive/seds_final_files/bsaP_homology/motif/bacteria_subset300_type.csv")
dep = pd.read_csv("/content/drive/MyDrive/seds_final_files/bsaP_homology/motif/biob_merged_subset200_annotation_filter3.csv")

In [16]:
x300.head()

Unnamed: 0.1,Unnamed: 0,id,type
0,0,Lawsonella_clevelandensis_strain_X1036,ind
1,1,Corynebacterium_terpenotabidum_Y-11,dep
2,2,Corynebacterium_variabile_strain_NBRC_15286,dep
3,3,Mycobacterium_terrae_strain_NCTC10856,dep
4,4,Corynebacterium_sp._Sa1YVA5,dep


In [20]:

np.setdiff1d(x300[x300['type'] == 'dep']['id'].tolist(), dep['name'].tolist())

array(['Actinomadura_macra_NBRC_14102',
       'Alloactinosynnema_iranicum_strain_IBRC-M_10403',
       'Amycolatopsis_pretoriensis_strain_DSM_44654',
       'Amycolatopsis_rhizosphaerae_strain_TBRC_6029',
       'Amycolatopsis_sp._K13G38', 'Corynebacterium_phoceense_4QC4O2',
       'Corynebacterium_sp._Sa1YVA5',
       'Kribbella_albertanoniae_strain_JCM_30547',
       'Kribbella_aluminosa_strain_DSM_18824',
       'Mycobacterium_conspicuum_strain_JCM_14738',
       'Mycobacterium_heidelbergense_strain_JCM_14842',
       'Mycobacterium_sp._CECT_8779',
       'Mycobacterium_thermoresistibile_strain_NCTC10409',
       'Mycobacterium_timonense_CCUG_56329',
       'Mycobacteroides_salmoniphilum_strain_DSM_43276',
       'Mycolicibacterium_litorale_strain_JCM_17423',
       'Mycolicibacterium_sp._CECT_8783', 'Nocardia_anaemiae_NBRC_100462',
       'Nocardia_fluminea_strain_DSM_44489',
       'Nocardia_inohanensis_NBRC_100128',
       'Nocardia_pseudobrasiliensis_strain_DSM_44290',
       '

In [18]:

np.setdiff1d(x300['id'])

In [17]:
dep['name'].tolist()

Unnamed: 0,Genome Name,staxid,Order,Family,Genus,type,name
0,Nonomuraea soli strain DSM 45533,1032476,Streptosporangiales,Streptosporangiaceae,Nonomuraea,ind,Nonomuraea_soli_strain_DSM_45533
1,Actinoplanes atraurantiacus strain CGMCC 4.6857,1036182,Micromonosporales,Micromonosporaceae,Actinoplanes,ind,Actinoplanes_atraurantiacus_strain_CGMCC_4.6857
2,Kutzneria kofuensis strain DSM 43851,103725,Pseudonocardiales,Pseudonocardiaceae,Kutzneria,ind,Kutzneria_kofuensis_strain_DSM_43851
3,Thermocatellispora tengchongensis strain DSM 4...,1073253,Streptosporangiales,Streptosporangiaceae,Thermocatellispora,ind,Thermocatellispora_tengchongensis_strain_DSM_4...
4,Phytohabitans flavus strain NBRC 107702,1076124,Micromonosporales,Micromonosporaceae,Phytohabitans,ind,Phytohabitans_flavus_strain_NBRC_107702


In [None]:
x300 = read_in_fasta("motif/bacteria_subset300FIX_biob_aa.msa")

# dep = read_in_fasta("motif/putative2_biob_dependent_seqs_aa_filter3.fasta")
dep_subset = read_in_fasta("motif/biob_merged_to_dependent_subset100.msa")

# write_df_to_fasta(dep[dep['id'].isin(dep_subset['id'])], 'seq', 'id', 'motif/biob_dependent_subset100.fasta')

In [None]:
x300['type'] = 'ind'
x300.loc[x300['id'].isin(dep_subset['id']), 'type'] = 'dep'

In [None]:

x300[['id', 'type']].to_csv("motif/bacteria_subset300_type.csv")

In [None]:

write_df_to_fasta(x300[x300['type'] == 'ind'], 'seq', 'id', 'motif/bacteria_biob_subset300_independent.msa')
write_df_to_fasta(x300[x300['type'] == 'dep'], 'seq', 'id', 'motif/bacteria_biob_subset300_dependent.msa')

In [None]:
x300

Unnamed: 0,id,seq,type
0,Lawsonella_clevelandensis_strain_X1036,"(M, S, Y, C, N, S, T, A, A, V, P, P, L, A, D, ...",ind
1,Corynebacterium_terpenotabidum_Y-11,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...",dep
2,Corynebacterium_variabile_strain_NBRC_15286,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...",dep
3,Mycobacterium_terrae_strain_NCTC10856,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...",dep
4,Corynebacterium_sp._Sa1YVA5,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...",dep
...,...,...,...
275,Clostridium_botulinum_A_str._ATCC_3502,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...",ind
276,Megamonas_hypermegale_strain_NCTC10570,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...",ind
277,Lachnospiraceae_bacterium_NSJ-38,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...",ind
278,Clostridium_beijerinckii_strain_DSM_791,"(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ...",ind
