# Build BLAST DB from sequences, then query against them

In [157]:
# pip install pandas
# conda install -c bioconda blast

import csv
from subprocess import run
import pandas as pd
import os

def make_unique_index(index):
    """Return the labels of *index* with duplicates made distinct.

    The first occurrence of a label is kept unchanged. A repeated label gets
    a ``-<k>`` suffix, where ``k`` starts at the occurrence number (2 for the
    second occurrence) and is increased until the resulting label has not
    been produced before.

    Args:
        index: Iterable of hashable labels (for example a pandas Index).

    Returns:
        A list with one label per input element, in input order, containing
        no duplicates.
    """
    occurrences = {}  # original label -> highest counter consumed so far
    emitted = set()   # every label already placed in the result
    unique_index = []
    for idx in index:
        count = occurrences.get(idx, 0)
        candidate = idx if count == 0 else f"{idx}-{count + 1}"
        # Compare against the labels actually emitted, not just the raw
        # inputs seen so far: otherwise a generated "a-2" can collide with a
        # literal "a-2" that appears later in the input.
        while candidate in emitted:
            count += 1
            candidate = f"{idx}-{count + 1}"
        emitted.add(candidate)
        unique_index.append(candidate)
        occurrences[idx] = count + 1
    return unique_index

def df_to_fasta(tab, fasta_filename, id_cols=('Enhancer_ID',), seq_col='Enhancer_sequence'):
    """Write the sequences of a pandas DataFrame to a FASTA file.

    Rows whose ``seq_col`` value is missing are skipped. Each record name is
    built by converting the ``id_cols`` values to strings and joining them
    with ``'_'``; repeated names are made distinct with
    ``make_unique_index``. The caller's DataFrame is not modified.

    Args:
        tab: DataFrame holding the identifier and sequence columns.
        fasta_filename: Path of the FASTA file to write (give the whole path).
        id_cols: Column name, or sequence of column names, used to build the
            record name.
        seq_col: Name of the column holding the sequences.
    """
    # Accept a single column name as well as a list/tuple of names.
    id_cols = [id_cols] if isinstance(id_cols, str) else list(id_cols)

    with_sequence = tab.loc[tab[seq_col].notna()]
    id_parts = with_sequence[id_cols].astype(str)
    labels = make_unique_index(
        ['_'.join(parts) for parts in id_parts.itertuples(index=False, name=None)]
    )

    # Labels and sequences are in the same row order, so they can be zipped
    # directly instead of looking every row up by label.
    with open(fasta_filename, 'w') as fasta_file:
        fasta_file.writelines(
            f'>{label}\n{sequence}\n'
            for label, sequence in zip(labels, with_sequence[seq_col])
        )

def create_blast_db(fasta_filename, db_name, env_bin_path=''):
    """Create a nucleotide BLAST database from a FASTA file.

    Runs ``makeblastdb -in <fasta_filename> -dbtype nucl -out <db_name>``.

    Args:
        fasta_filename: Path of the FASTA file to index.
        db_name: Value passed to ``-out`` (name/path of the database).
        env_bin_path: Directory containing the ``makeblastdb`` executable;
            with the default ``''`` the bare command name is used.

    Raises:
        subprocess.CalledProcessError: If ``makeblastdb`` exits with a
            non-zero status.
    """
    makeblastdb = os.path.join(env_bin_path, 'makeblastdb')
    cmd = [makeblastdb, '-in', fasta_filename, '-dbtype', 'nucl', '-out', db_name]
    # check=True so a failed build is reported here rather than surfacing
    # later as a confusing error when the database is queried.
    run(cmd, check=True)

def query_blast_db(test_seq, db_name, output_file, other_args=None, env_bin_path=''):
    """Query a BLAST database with ``blastn`` and write the hits to a file.

    Runs ``blastn -query <test_seq> -db <db_name> -out <output_file>
    -outfmt 7`` followed by any extra arguments. Format 7 is the commented
    tabular output.

    Args:
        test_seq: Value passed to ``-query`` (the query sequence file).
        db_name: Value passed to ``-db`` (name/path of the database).
        output_file: Path the results are written to.
        other_args: Optional list or tuple of additional command-line
            arguments appended to the ``blastn`` call.
        env_bin_path: Directory containing the ``blastn`` executable; with
            the default ``''`` the bare command name is used.

    Raises:
        TypeError: If ``other_args`` is a single string instead of a
            sequence of arguments.
        subprocess.CalledProcessError: If ``blastn`` exits with a non-zero
            status.
    """
    if isinstance(other_args, str):
        # A bare string would be split into single characters below.
        raise TypeError('other_args must be a list of arguments, not a string')

    cmd = [
        os.path.join(env_bin_path, 'blastn'),
        '-query', test_seq,
        '-db', db_name,
        '-out', output_file,
        '-outfmt', '7',
    ]
    if other_args:
        cmd.extend(other_args)
    # check=True so a failed search is not mistaken for "no hits".
    run(cmd, check=True)

def read_blast_output7(output):
    """Read BLAST outfmt 7 results into a pandas DataFrame.

    Lines starting with ``#`` are skipped and the remaining tab-separated
    rows are labelled with the 12 column names listed below.

    Args:
        output: Path (or file-like object) of the BLAST result file.

    Returns:
        A DataFrame with one row per hit. If the file contains no hit rows
        at all, an empty DataFrame with the same 12 columns is returned.
    """
    columns = [
        "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
        "qstart", "qend", "sstart", "send", "evalue", "bitscore",
    ]
    try:
        return pd.read_csv(output, comment='#', sep='\t', header=None, names=columns)
    except pd.errors.EmptyDataError:
        # No data rows (only comment lines, or an empty file): treat this as
        # "no hits" rather than as an error.
        return pd.DataFrame(columns=columns)



"\nif __name__ == '__main__':\n    # 1. Convert CSV to FASTA\n    csv_filename = '~/Matthew/data/'\n    fasta_filename = 'dna_sequences.fasta'\n    csv_to_fasta(csv_filename, fasta_filename)\n\n    # 2. Create BLAST database\n    db_name = 'dna_sequences_db'\n    create_blast_db(fasta_filename, db_name)\n\n    # 3. Query the BLAST database with test sequences\n    test_seq_filename = 'test_sequences.fasta'\n    output_filename = 'blast_results.txt'\n    query_blast_db(test_seq_filename, db_name, output_filename)\n"

## Building the database

In [158]:
# Locations used by the following cells: BLAST executables, output
# directory, and the input enhancer table.
env_bin_path = '/allen/programs/celltypes/workgroups/rnaseqanalysis/EvoGen/Team/Matthew/utils/miniconda3/envs/blaster/bin/'
out_path = '/home/matthew.schmitz/Matthew/deliveries/enhancer_blast/'
csv_filename = '~/Matthew/data/Enhancer_library.csv'

# The table is loaded here, outside the FASTA writer, so it can be inspected
# or filtered before the sequences are written out.
encoding = 'ISO-8859-1'
tab = pd.read_csv(
    csv_filename,
    sep=',',
    header=0,
    encoding=encoding,
)
print(tab)


      Serial Enhancer_ID Enhancer_Alias Target_cell_population_standard  \
0        1.0    AiE0001h      eHGT_001h                             Vip   
1        1.0    AiE0001m      eHGT_001m                             Vip   
2        2.0    AiE0002m      eHGT_002m                           Pvalb   
3        2.0    AiE0002h      eHGT_002h                           Pvalb   
4        3.0    AiE0003h      eHGT_003h                           Pvalb   
...      ...         ...            ...                             ...   
3852     NaN         NaN            NaN                             NaN   
3853     NaN         NaN            NaN                             NaN   
3854     NaN         NaN            NaN                             NaN   
3855     NaN         NaN            NaN                             NaN   
3856     NaN         NaN            NaN                             NaN   

     Target_cell_population_raw Target_brain_region  \
0                     VIP class             

In [160]:
# Write the sequences in `tab` to <out_path>/enhancers.fa in FASTA format,
# using the Enhancer_ID column as the record name.
df_to_fasta(tab, fasta_filename=os.path.join(out_path,'enhancers.fa'),id_cols=['Enhancer_ID'],seq_col='Enhancer_sequence')

In [161]:
# Build a nucleotide BLAST database from enhancers.fa; the database is
# written under the name <out_path>/enhancers.db using the makeblastdb
# executable found in env_bin_path.
create_blast_db(fasta_filename=os.path.join(out_path,'enhancers.fa'),
                db_name=os.path.join(out_path,'enhancers.db'),
               env_bin_path=env_bin_path)



Building a new DB, current time: 10/23/2023 17:26:25
New DB name:   /home/matthew.schmitz/Matthew/deliveries/enhancer_blast/enhancers.db
New DB title:  /home/matthew.schmitz/Matthew/deliveries/enhancer_blast/enhancers.fa
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/matthew.schmitz/Matthew/deliveries/enhancer_blast/enhancers.db
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 2567 sequences in 0.18595 seconds.


## Querying the database

In [162]:
# Small set of made-up query sequences, keyed by the name each one will get
# in the query FASTA file.
test_dict = {
    'silly_seq1': 'GGGGGGGGGGGGGGG',
    'silly_seq2': 'AATAGACCA',
    'silly_seq3': 'CATCATCATCATCATCATCATTTTTTTTTTTT',
    'silly_seq4': 'CCAAATTCACCTCTTTATTCGGATGAGTGAAGGGAATTGCCTGAAGCCCAGAGGAGTATTGATTTTTGAACAATAATTAAAATTTAATGGTGTTAATCAGTTTCACATAGAACTATAAATTTCACCCAAAAGAGCAGATAAATTGTTGAAGAAGAATATCTGAC',
}

# Two-column table (name, sequence) using the same column names as the
# enhancer table, so the same FASTA writer can be reused for the queries.
test_df = pd.DataFrame(
    {
        'Enhancer_ID': list(test_dict.keys()),
        'Enhancer_sequence': list(test_dict.values()),
    }
)

In [163]:
# Write the test query sequences in `test_df` to <out_path>/test.fa.
df_to_fasta(test_df, fasta_filename=os.path.join(out_path,'test.fa'),id_cols=['Enhancer_ID'],seq_col='Enhancer_sequence')

In [164]:
# Search the enhancer database with the sequences in test.fa; blastn writes
# its commented tabular (outfmt 7) results to test.blast.out.
query_blast_db(test_seq=os.path.join(out_path,'test.fa'),
               db_name=os.path.join(out_path,'enhancers.db'),
               output_file='/home/matthew.schmitz/test.blast.out',
               env_bin_path=env_bin_path)

In [173]:
# Load the blastn results file into a DataFrame (one row per hit); as the
# last expression in the cell, the table is displayed by the notebook.
output='/home/matthew.schmitz/test.blast.out'
read_blast_output7(output)

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,silly_seq4,AiE2071m,100.0,164,0,0,1,164,1,164,6.760000000000001e-84,303
