In [1]:
from Bio import Entrez
from src.files.blast import BLASTJsonFile
import re
import os 
import pandas as pd 
from tqdm import tqdm

Entrez.email = 'prichter@berkeley.edu'

In [None]:
def get_ids(result):
    ids = [match.group(1) for match in re.finditer(r'<Id>(\d+)</Id>', result)]
    return ids

def get_coordinates(result): # Takes a protein fetch result as input.
    pattern = r'<GBQualifier_name>coded_by</GBQualifier_name>\s+<GBQualifier_value>(.+)</GBQualifier_value>'
    match = re.search(pattern, result, flags=re.MULTILINE)
    return match.group(1) if (match is not None) else 'none'

def get_taxonomy(result): # Takes a nuccore fetch result as input.
    pattern = r'<GBSeq_taxonomy>(.+)</GBSeq_taxonomy>'
    match = re.search(pattern, result, flags=re.DOTALL)
    return match.group(1) if (match is not None) else 'none'

def get_organism(result): # Takes a nuccore fetch result as input.
    pattern = r'<GBSeq_organism>(.+)</GBSeq_organism>'
    match = re.search(pattern, result, flags=re.DOTALL)
    return match.group(1) if (match is not None) else 'none'

# def get_nuccore_accession(result):
#     pattern = r'<GBSeq_accession-version>(.+)</GBSeq_accession-version>'
#     match = re.search(pattern, result, flags=re.DOTALL)
#     return match.group(1) if (match is not None) else 'none'

def get_nuccore_accession(result): # Takes a protein fetch result as input. 
    pattern = r'<GBSeq_source-db>accession (.+)</GBSeq_source-db'
    match = re.search(pattern, result, flags=re.DOTALL)
    return match.group(1) if (match is not None) else 'none'


In [None]:
def download_fasta(nuccore_id:str, fn_dir=None):
    fn_path = os.path.join(fn_dir, f'{nuccore_id}.fn')
    if not os.path.exists(fn_path):
        result = Entrez.efetch(db='nuccore', id=nuccore_id, rettype='fasta').read()
        with open(fn_path, 'w') as f:
            f.write(result)
    return fn_path

In [3]:
blast_df = BLASTJsonFile('../data/arf1-2.json').to_df()
blast_df = blast_df.drop_duplicates('subject_id')
print('Num. BLAST hits:', len(blast_df))

Num. BLAST hits: 549


In [None]:
def download_nr_protein_info(id_, fn_dir='../data/ncbi/nucleotides'):
    assert id_.startswith('WP'), f'download_nr_protein_info: Expected a protein accession beginning with WP, but got {id_}'
    result = Entrez.efetch(db='ipg', id=id_, rettype='html').read().decode('utf-8')
    
    info = list()
    pattern = r'<CDS  accver="(.+)" start="(\d+)" stop="(\d+)" strand="(.+)" taxid="(.+)" org="([^"]+)".+</CDSList></Protein>'
    for match in re.finditer(pattern, result):
        info_ = dict()
        info_['nuccore_id'] = match.group(1)
        info_['start'] = match.group(2)
        info_['stop'] = match.group(3)
        info_['strand'] = match.group(4)
        info_['organism'] = match.group(6)
        info_['html'] = match.group(0)
        info.append(info_)

        download_fasta

    return info

In [None]:
def download_protein_info(id_, fn_dir:str='../data/ncbi/nucleotides'):
    # TODO: The non-redundant protein sequences need to be handled differently.  
    info = {'id':id_}
    
    result = Entrez.efetch(db='protein', id=id_, rettype='html').read().decode('utf-8')
    nuccore_id = get_nuccore_accession(result)
    if nuccore_id == 'none':
        return info

    info['coordinates'] = get_coordinates(result)
    info['nuccore_id'] = nuccore_id

    result = Entrez.efetch(db='nuccore', id=nuccore_id, rettype='html').read().decode('utf-8')

    info['organism'] = get_organism(result)
    info['taxonomy'] = get_taxonomy(result)
    info['fn_path'] = download_fasta(nuccore_id, fn_dir=fn_dir)
    
    return info

# arf12_df = list()
# for id_ in tqdm(blast_df.subject_id, desc='Downloading aRF1-2 sequences...'):
#     arf12_df.append(download_protein_info(id_))
# arf12_df = pd.DataFrame(arf12_df).set_index('id')
# arf12_df.to_csv('../data/arf1-2_blast_hits.csv')


In [None]:
# print(Entrez.efetch(db='protein', id='2154289639', rettype='html').read().decode('utf-8'))
# print(Entrez.esearch(db='protein', term='2154289639', rettype='html').read().decode('utf-8'))
# print(Entrez.efetch(db='protein', term='2154289639', rettype='html').read().decode('utf-8'))
# print(Entrez.elink(dbfrom='protein', db='protein', id='WP_229389239').read().decode('utf-8')) # 2154289639
print(Entrez.efetch(db='ipg', id='WP_229389239', rettype='html').read().decode('utf-8')) # 2154289639
# print(Entrez.efetch(db='nuccore', id='2154289639', rettype='html').read().decode('utf-8'))

get_ipg_info()


<?xml version="1.0" encoding="UTF-8"  ?>
<IPGReportSet xmlns:xsi="https://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://www.ncbi.nlm.nih.gov/data_specs/schema/other/seq_report/IPGReportSet.xsd">

<IPGReport  ipg="501664124" product_acc="WP_229389239.1">
<Product  accver="WP_229389239.1" name="peptide chain release factor aRF-1" taxid="2605639" slen="415" org="Methanosarcina sp. DH2" kingdom_taxid="2157" kingdom="Archaea"/>
<ProteinList>
<Protein  accver="WP_229389239.1" source="RefSeq" name="peptide chain release factor aRF-1" taxid="2605639" org="Methanosarcina sp. DH2" kingdom_taxid="2157" kingdom="Archaea" priority="0">
<CDSList>
<CDS  accver="NZ_WJBI01000002.1" start="378548" stop="379795" strand="-" taxid="2605639" org="Methanosarcina sp. DH2" kingdom_taxid="2157" kingdom="Archaea" strain="DH2" assembly="GCF_020804225.1"/></CDSList></Protein>
<Protein  accver="MCC4769673.1" source="INSDC" name="peptide chain release factor 1" taxid="2605639" org="Metha

[{'nuccore_id': 'NZ_WJBI01000002.1',
  'start': '378548',
  'stop': 'NZ_WJBI01000002.1',
  'strand': 'NZ_WJBI01000002.1',
  'organism': 'Methanosarcina sp. DH2'},
 {'nuccore_id': 'WJBI01000002.1',
  'start': '378548',
  'stop': 'WJBI01000002.1',
  'strand': 'WJBI01000002.1',
  'organism': 'Methanosarcina sp. DH2'}]

In [7]:
wp_ids = [id_ for id_ in arf12_df.index if id_.startswith('WP')]

NameError: name 'arf12_df' is not defined

In [None]:
arf12_df.fn_path.dropna()
arf12_df[arf12_df.fn_path.isnull()]

Unnamed: 0_level_0,coordinates,nuccore_id,organism,taxonomy,fn_path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WP_229389239,,,,,
WP_048180500,,,,,
WP_048140256,,,,,
WP_300104941,,,,,
WP_322225704,,,,,
...,...,...,...,...,...
WP_292518519,,,,,
WP_301664146,,,,,
WP_042705156,,,,,
WP_253488479,,,,,


In [None]:
# def download_protein_info(id_, fn_dir:str='../data/ncbi/nucleotides'):
    
#     info = list()
    
    # result = Entrez.esearch(db='protein', term=id_, rettype='html').read().decode('utf-8')
    # protein_ids = get_ids(result)
    # if len(protein_ids) == 0:
    #     print(f'download_protein_info: No results for {id_}')
    #     return info
    
#     for protein_id in protein_ids:
#         info = {'id':id_, 'numerical_protein_id':protein_id}
    
#         result = Entrez.efetch(db='protein', id=protein_id, rettype='html').read().decode('utf-8')
#         info['nuccore_id'], info['coordinates'] = get_coordinates(result)
        
#         result = Entrez.elink(dbfrom='protein', db='nuccore', id=protein_id, rettype='html').read().decode('utf-8')
#         nuccore_id = get_ids(result)
#         if len(nuccore_id) == 0:
#             info.append(info)
#             continue 
#         else:
#             nuccore_id = nuccore_id[1] # First ID tag is just the ID itself. 

#         result = Entrez.efetch(db='nuccore', id=nuccore_id, rettype='html').read().decode('utf-8')

#         info['organism'] = get_organism(result)
#         info['taxonomy'] = get_taxonomy(result)
#         # info['nuccore_id'] = get_nuccore_accession(result)
#         info['numerical_nuccore_id'] = nuccore_id

#         fn_path = os.path.join(fn_dir, f'{nuccore_id}.fn')
#         if not os.path.exists(fn_path):
#             result = Entrez.efetch(db='nuccore', id=nuccore_id, rettype='fasta').read()
#             with open(fn_path, 'w') as f:
#                 f.write(result)
#             info['fn_path'] = fn_path
#         info.append(info)
    
#     return info

# arf12_df = list()
# for id_ in tqdm(blast_df.subject_id, desc='Downloading aRF1-2 sequences...'):
#     arf12_df += download_protein_info(id_)
# arf12_df = pd.DataFrame(arf12_df).set_index('id')
