In [1]:
from utils.data_assistant import *



### Get the accession number using OpenAI

---

In [2]:
# The first string reads as a role/system-style instruction and the second as the
# actual question; how get_accession uses them is defined outside this notebook
# (presumably in utils.data_assistant via the star import — TODO confirm).
role_prompt = 'You are an expert in the NCBI accession number formatting.'
question = 'What is the NCBI accession number for the nucleotide sequence of the ACE2 receptor in humans?'
accession = get_accession(role_prompt, question)

2025-04-28 10:54:33.192 
  command:

    streamlit run /Users/owenrogers/anaconda3/envs/QAC387/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]


#### Print accession number to make sure that it looks valid

In [3]:
# Sanity check: show the returned accession before it is used to fetch a record.
print(accession)

NM_021804.3


### Get the SeqRecord object based on that accession number

---

In [4]:
# Fetch the record for the accession; the second return value is discarded here.
# NOTE(review): the contact email is hardcoded (and repeated in the BLAST-hit loop
# further down) — consider reading it from a single config constant or env var.
record, _ = fetch_sequence(accession, email='orogers@wesleyan.edu')

The sequence in GB format for ID NM_021804.3:
GGCACTCATACATACACTCTGGCAATGAGGACACTGAGCTCGCTTCTGAA

Homo sapiens angiotensin converting enzyme 2 (ACE2), transcript variant 2, mRNA


In [5]:
# Keep just the sequence attribute of the fetched record; it is the BLAST query below.
sequence = record.seq

In [6]:
# Bare expression so the notebook displays the sequence's repr.
sequence

Seq('GGCACTCATACATACACTCTGGCAATGAGGACACTGAGCTCGCTTCTGAAATTT...CTC')

### Perform a Nucleotide BLAST

---

In [7]:
# Run the query sequence through nucleotide_blast; the returned handle is consumed
# by process_stream in the next step.
# NOTE(review): presumably a remote BLAST request (slow, network-dependent) — confirm
# in the helper module and consider caching the result.
handle = nucleotide_blast(sequence)

#### Process the results handle

In [8]:
# Turn the BLAST result handle into the hit table used in the rest of the notebook.
# NOTE(review): save=True / save_name presumably persist the results under that
# name — confirm against process_stream's implementation.
processed_stream = process_stream(handle, save=True, save_name='ACE2_28Apr')

In [9]:
# Plain-text dump of the hit table; at this point each title still carries the
# full pipe-delimited 'gi|...|<db>|<accession>| description' prefix.
print(processed_stream)

                                                title  length  e_value  \
0   gi|1700998533|ref|NM_021804.3| Homo sapiens an...    3596      0.0   
1   gi|66863987|dbj|AB193259.1| Homo sapiens ace2 ...    3605      0.0   
2   gi|2493543526|ref|XM_055376642.1| PREDICTED: G...    3596      0.0   
3   gi|2694562693|ref|XM_063634561.1| PREDICTED: S...    3597      0.0   
4   gi|1743233336|ref|XM_003261084.3| PREDICTED: N...    3593      0.0   
5   gi|66863989|dbj|AB193260.1| Homo sapiens ace2 ...    3470      0.0   
6   gi|8650465|gb|AF241254.1|AF241254 Homo sapiens...    3405      0.0   
7   gi|2490809741|ref|XM_008974180.3| PREDICTED: P...    6570      0.0   
8   gi|1700998531|ref|NM_001371415.1| Homo sapiens...    3339      0.0   
9   gi|37747876|gb|BC059378.1| Homo sapiens cDNA c...    3375      0.0   
10  gi|2697861032|ref|XM_016942979.3| PREDICTED: P...    3540      0.0   
11  gi|2695108862|ref|XM_063702989.1| PREDICTED: G...    3477      0.0   
12  gi|2695108865|ref|XM_019019204.3| 

In [10]:
def extract_accession(title):
    """Return the accession field of a BLAST hit title.

    Hit titles in this table look like 'gi|<gi>|<db>|<accession>| <description>',
    so the accession is the fourth pipe-delimited field. A title that contains
    no '|' is treated as already reduced to its accession and is returned
    unchanged, which makes this cell safe to re-run (the original lambda raised
    IndexError on a second execution).

    Raises:
        IndexError: if a pipe-delimited title has fewer than four fields.
    """
    if '|' not in title:
        return title
    return title.split('|')[3]


# Overwrites the 'title' column in place because later cells read the accession
# from processed_stream['title'].
processed_stream['title'] = processed_stream['title'].apply(extract_accession)

In [11]:
# Re-print to confirm the title column now holds bare accession numbers.
print(processed_stream)

             title  length  e_value   score  query_start  query_end
0      NM_021804.3    3596      0.0  7192.0            1       3596
1       AB193259.1    3605      0.0  7186.0            4       3596
2   XM_055376642.1    3596      0.0  7047.0            1       3596
3   XM_063634561.1    3597      0.0  6860.0            1       3596
4   XM_003261084.3    3593      0.0  6860.0            1       3596
5       AB193260.1    3470      0.0  6790.0          202       3596
6       AF241254.1    3405      0.0  6786.0          204       3596
7   XM_008974180.3    6570      0.0  6686.0          202       3596
8   NM_001371415.1    3339      0.0  6678.0          258       3596
9       BC059378.1    3375      0.0  6669.0          258       3594
10  XM_016942979.3    3540      0.0  6657.0          202       3596
11  XM_063702989.1    3477      0.0  6655.0          202       3596
12  XM_019019204.3    3522      0.0  6655.0          202       3596
13  XM_055376641.1    3577      0.0  6654.0     

### Create and fill a dictionary storing accession numbers, sequence, and summary for clustering

In [13]:
# Collect, for every BLAST hit, its accession, sequence and 'comment' annotation.
# The three lists are kept index-aligned so they can become DataFrame columns.
blast_accessions = {'title': [],
                    'sequence': [],
                    'summary': []}

# The loop variable is deliberately not named `record`, so the query record
# fetched earlier in the notebook is not overwritten by an accession string.
for hit_accession in processed_stream['title']:

    rec, _ = fetch_sequence(hit_accession, email='orogers@wesleyan.edu')

    # Append only after the fetch succeeded so a failure cannot leave the
    # lists with different lengths.
    blast_accessions['title'].append(hit_accession)
    blast_accessions['sequence'].append(rec.seq)

    try:
        blast_accessions['summary'].append(rec.annotations['comment'])
    except KeyError:
        # Only a missing 'comment' annotation is expected here; any other
        # error (network failure, interrupt, ...) should surface.
        print(f'Accession number {hit_accession} does not have a summary')
        blast_accessions['summary'].append('There is no summary provided for this accession number')
    

The sequence in GB format for ID NM_021804.3:
GGCACTCATACATACACTCTGGCAATGAGGACACTGAGCTCGCTTCTGAA

Homo sapiens angiotensin converting enzyme 2 (ACE2), transcript variant 2, mRNA
The sequence in GB format for ID AB193259.1:
ACTCATACATACACTCTGGCAATGAGGACACTGAGCTCGCTTCTGAAATT

Homo sapiens ace2 mRNA for angiotensin-converting enzyme 2, complete cds, tissue_type: lung
Accession number AB193259.1 does not have a summary
The sequence in GB format for ID XM_055376642.1:
GGCACTCATACATACACTCTGGCAATGAGAACACTGAGCTCGCTTCTGAA

PREDICTED: Gorilla gorilla gorilla angiotensin converting enzyme 2 (ACE2), transcript variant X4, mRNA
The sequence in GB format for ID XM_063634561.1:
GGCACTCATACATACACTCTGGCAGTGAGGACACTGAGCTCGCTTCTGAA

PREDICTED: Symphalangus syndactylus angiotensin converting enzyme 2 (ACE2), transcript variant X1, mRNA
The sequence in GB format for ID XM_003261084.3:
GGCACTCATACATACACTCTGGCAGTGAGGACACTGAGCTCGCTTCTGAA

PREDICTED: Nomascus leucogenys angiotensin I converting enzyme 2 (ACE2)

In [14]:
# One row per BLAST hit, with columns 'title', 'sequence' and 'summary' taken
# from the dictionary keys.
extended_data = pd.DataFrame(blast_accessions)

In [16]:
# Guard against stale notebook state: the fetched records must line up one-to-one,
# in order, with the BLAST hit table. Plain lists are compared (instead of
# `.values == .values`) so that a length mismatch fails this assertion with the
# message below rather than surfacing as a NumPy shape-mismatch error.
fetched_titles = extended_data['title'].tolist()
blast_titles = processed_stream['title'].tolist()
assert fetched_titles == blast_titles, 'Something went wrong in the way you generated the dictionary'

In [17]:
# Left-join the BLAST statistics onto the fetched sequences/summaries by accession.
all_data = extended_data.merge(
    processed_stream,
    how='left',
    on='title',
)