In [3]:
## Libraries 
import re
from Bio import Entrez, SeqIO
# from packages import orf_scan_analyze, query_proteins, blast_proteins
import random
import time

In [2]:
Entrez.email = "neel.mehtani@gmail.com"
handle = Entrez.einfo()

record = Entrez.read(handle)
handle.close()

record.keys()

dict_keys(['DbList'])

In [3]:
record["DbList"]

['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'ncbisearch', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'biosystems', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']

In [3]:
handle = Entrez.esearch(db="nucleotide", term="Brassica napus")

record = Entrez.read(handle)
handle.close()

record.keys()

dict_keys(['Count', 'RetMax', 'RetStart', 'IdList', 'TranslationSet', 'TranslationStack', 'QueryTranslation'])

In [5]:
record["IdList"]

['937575234', '2194135529', '2193981524', '2191950016', '2191950006', '2191387657', '2191387649', '2191387642', '2191387633', '2191387610', '2191387605', '2191387595', '2191387586', '2191387579', '2191387573', '2191387564', '2191387556', '2191387550', '2191387542', '2191387535']

In [4]:
## https://www.ncbi.nlm.nih.gov/Traces/wgs/?display=contigs&page=1
Entrez.email = "neel.mehtani@gmail.com"
handle = Entrez.efetch(db="nucleotide", rettype="gb", id="QGKT01000001")
record = SeqIO.read(handle, "genbank")
record.seq

Seq('TCAACCACAAAGTCAAAAAACCAAATCCATCCTTTTAATTTTAAAAAAAAGTAT...CGT')

In [7]:
print(len(record.seq)/1000000, "Mb")

0.373899 Mb


### Scan genomic strands for potential ORFs longer than 50 codons & translate to proteins

In [49]:
seq = record.seq

table = 11
min_pro_len = 50

## Lists for ORF protein details
orf_proteins = []
protein_wts = []
protein_lens = []

## Start scan of both genomic strands
for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
    
    ## Scan each possible frame
    for frame in range(3):
        
        ## Transalte the frame
        for pro in nuc[frame:].translate(table).split("*"):
            
            if len(pro) >= min_pro_len:
                
                ## Run a protein "param" analysis -- returns molecular weight/amino acid freqs etc.
                analysis = ProteinAnalysis(str(pro))
                
                ## Add protein details 
                orf_proteins.append(pro)
                protein_wts.append(round(analysis.molecular_weight(), 3))
                protein_lens.append(len(pro))
                
                ## Output to screen
#                 print("%s...%s - length %i, strand %i, frame %i" % (pro[:30], pro[-3:], len(pro), strand, frame))

protein_wts = np.array(protein_wts)
protein_lens = np.array(protein_lens)
orf_proteins = np.array(orf_proteins, dtype='object')

query_protein_idxs = np.where(protein_lens < 1000)[0][:]

orf_proteins_arr = np.array(orf_proteins, dtype='object')
query_proteins = orf_proteins_arr[query_protein_idxs]



In [50]:
query_protein_idxs.shape

(1449,)

In [32]:
query_proteins = orf_proteins_arr[query_protein_idxs]
query_proteins[:2]

array([Seq('VFGMLPNLCLLAGKQITKNPTTKPYKVGTTHPKNGDVVIFENPNYNNPNVSEQE...ICF'),
       Seq('AVIKIKESTFPNVKPNSALVMILKQPKAWKSHRKQSFWFPQELKHLFLGTKFY')],
      dtype=object)

### BLAST 5 Proteins of length <= 1000 amino acids

In [33]:
Entrez.email = "neel.mehtani@gmail.com"
query_ct = 1
for q in query_proteins:
    
    out_file = open("blast_{}.xml".format(query_ct), "w") 
    blast_handle = NCBIWWW.qblast("blastp", "nr", q)

    out_file.write(blast_handle.read())
    out_file.close()

    blast_handle.close()
    
    result_handle = open("blast_{}.xml".format(query_ct))
    blast_records = NCBIXML.read(result_handle)
    
    save_file = open("BLAST_query_{}_result_log".format(query_ct), "w")
    save_file.write("BLAST Query #{} Results -- Sequence Example\n".format(query_ct) + "--------------------------" + "\n\n")

    e_value_thresh = 0.25
    
    for alignment in blast_records.alignments:

        for hsp in alignment.hsps:
            if hsp.expect < e_value_thresh:
                
#                 content = ["sequence: {} \n".format(alignment.title), "length: {} \n".format(alignment.length),
#                            "E Value: {} \n".format(hsp.expect), "{} ...  \n".format(hsp.query[0:20]), "{} ...  \n".format(hsp.match[0:20]), 
#                            "{} ...  \n".format(hsp.sbjct[0:20]), "\n\n"]
                
                save_file.write("sequence: {} \n".format(alignment.title))
                save_file.write("length: {} \n".format(alignment.length))
                save_file.write("E Value: {} \n".format(hsp.expect))
                save_file.write("{} ...  \n".format(hsp.query[0:20]))
                save_file.write("{} ...  \n".format(hsp.match[0:20]))
                save_file.write("{} ...  \n".format(hsp.sbjct[0:20]))
                save_file.write("\n\n")
            else:
                save_file.write("NULL RESULT -- No hits returned!!")
    
    result_handle.close()
    query_ct += 1

In [4]:
## Libraries
import numpy as np
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.Blast import NCBIWWW, NCBIXML
import random
import time

def orf_scan_analyze(seq, log_dir, min_pro_len=50, table=11):
    
    ## Lists for ORF protein details
    orf_proteins = []
    protein_wts = []
    protein_lens = []
    
    ## Start scan of both genomic strands
    for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
    
        ## Scan each possible frame
        for frame in range(3):

            ## Transalte the frame
            for pro in nuc[frame:].translate(table).split("*"):

                if len(pro) >= min_pro_len:

                    ## Run a protein "param" analysis -- returns molecular weight/amino acid freqs etc.
                    analysis = ProteinAnalysis(str(pro))

                    ## Add protein details 
                    orf_proteins.append(pro)
                    protein_wts.append(round(analysis.molecular_weight(), 3))
                    protein_lens.append(len(pro))
                    
                    # Output to screen
                    # print("%s...%s - length %i, strand %i, frame %i" % (pro[:30], pro[-3:], len(pro), strand, frame))

    protein_wts = np.array(protein_wts)
    protein_lens = np.array(protein_lens)
    orf_proteins_arr = np.array(orf_proteins, dtype='object')

    save_file_path = log_dir + "/" + "protein_weights"
    save_file = open(save_file_path, "w")
    save_file.write("Protein Molar Mass Results\n" + "--------------------------------------------" + "\n\n")
    
    for i in range(len(protein_wts)):
        save_file.write("Protein " + str(orf_proteins[i]) + ":  " + str(protein_wts[i]) + "\n\n")

    return orf_proteins_arr, protein_lens

def query_proteins(orf_proteins, protein_lens):
    
    min_idxs = np.where(protein_lens < 1000)[0][:]
    query_protein_idxs = np.random.choice(5, min_idxs)
    query_proteins = orf_proteins[query_protein_idxs]
    
    return query_proteins

def blast_proteins(query_proteins, log_dir, e_value_thresh=0.04):
    
    Entrez.email = "neel.mehtani@gmail.com"
    query_ct = 1
    
    for q in query_proteins:
        
        out_file_path = log_dir + "/" + "blast_{}.xml".format(query_ct)
        out_file = open(out_file_path, "w") 
        
        blast_handle = NCBIWWW.qblast("blastp", "nr", q)
        out_file.write(blast_handle.read())
        out_file.close()
        blast_handle.close()

        result_handle = open(out_file_path)
        blast_records = NCBIXML.read(result_handle)
        
        save_file_path = log_dir + "/" + "BLAST_query_{}_result_log".format(query_ct)
        save_file = open(save_file_path, "w")
        save_file.write("BLAST Query #{} Results -- Sequence Example\n".format(query_ct) + "----------------------------------------" + "\n\n")
        
        for alignment in blast_records.alignments:

            for hsp in alignment.hsps:
                if hsp.expect < e_value_thresh:

    #                 content = ["sequence: {} \n".format(alignment.title), "length: {} \n".format(alignment.length),
    #                            "E Value: {} \n".format(hsp.expect), "{} ...  \n".format(hsp.query[0:20]), "{} ...  \n".format(hsp.match[0:20]), 
    #                            "{} ...  \n".format(hsp.sbjct[0:20]), "\n\n"]

                    save_file.write("sequence: {} \n".format(alignment.title))
                    save_file.write("length: {} \n".format(alignment.length))
                    save_file.write("E Value: {} \n".format(hsp.expect))
                    save_file.write("{} ...  \n".format(hsp.query[0:20]))
                    save_file.write("{} ...  \n".format(hsp.match[0:20]))
                    save_file.write("{} ...  \n".format(hsp.sbjct[0:20]))
                    save_file.write("\n\n")
                else:
                    save_file.write("NULL RESULT -- No hits returned!!")

        result_handle.close()
        query_ct += 1

    return 

In [5]:
## main

random.seed(time.time())
Entrez.email = "neel.mehtani@gmail.com"
handle = Entrez.efetch(db="nucleotide", rettype="gb", id="QGKT01000001")
record = SeqIO.read(handle, "genbank")
handle.close()

seq = record.seq

min_pro_len = 50
table = 11
threshold = 0.3

log_dir = "../data"

orf_proteins_arr, protein_lens = orf_scan_analyze(seq, log_dir, table=table)
q_proteins = query_proteins(orf_proteins_arr, protein_lens)
blast_proteins(q_proteins, log_dir, e_value_thresh=threshold)



ValueError: maximum supported dimension for an ndarray is 32, found 1449