# **Retrieve flanking sequencing and output FASTA format for input RS numbers**

### Import required Python modules

In [36]:

from urllib.request import urlopen
import json
import re
import requests
import time

### Defined a function named 'get_rs_flank' to retrieve flanks of given rs IDs.

In [37]:
def get_rs_flank(rsList, len_of_flank=100):  #Get RefSNP(rs) flanks and output FASTA format; 
  for id in rsList:
    rsid=str(id)
    summary_url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=snp&id='+rsid+'&retmode=json' #eSummary to get RS docsum including chr/pos.
    response = urlopen(summary_url)
    data_json = json.loads(response.read()) #load eSummary results 

    chr_gi={'1': 568815597, '2': 568815596,'3': 568815595,'4': 568815594,'5': 568815593,'6': 568815592,'7': 568815591,'8': 568815590,
  '9': 568815589,'10': 568815588,'11': 568815587,'12': 568815586,'13': 568815585,'14': 568815584,'15': 568815583,'16': 568815582,
  '17': 568815581,'18': 568815580,'19': 568815579,'20': 568815578,'21': 568815577,'22': 568815576,'X': 568815575,'Y': 568815574, 'MT': 251831106}

    snp_class=data_json['result'][rsid]['snp_class']          #get variant type (SNV, DELETIONs, etc.)
    [chr, pos]=data_json['result'][rsid]['chrpos'].split(':') #get chromosome and position
    docsum=data_json['result'][rsid]['docsum']
    allele=re.findall('\|SEQ=\[(.*)\]\|',docsum)[0]           #get alleles

    seq=str(chr_gi[chr])
    seq_start=0
    seq_stop=0

    if snp_class=='snv': # true SNP
      seq_start_5=int(pos)-len_of_flank
      seq_stop_5=int(pos)-1
      seq_start_3=int(pos)+1
      seq_stop_3=int(pos)+len_of_flank
    elif allele.split('/')[0]=='-':  #INSERTIONs
      seq_start_5=int(pos)-len_of_flank+1
      seq_stop_5=int(pos)
      seq_start_3=int(pos)+1
      seq_strop_3=int(pos)+len_of_flank
    else: #DELETIONs or MNVs
      seq_start_5=int(pos)-len_of_flank
      seq_stop_5=int(pos)-1
      seq_start_3=int(pos)+len(allele.split('/')[0])
      seq_stop_3=int(pos)+len(allele.split('/')[0])+len_of_flank
   
    #retrieve 5' flanks from nucleotide database using eFetch
    seq_url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id='+seq+'&seq_start='+str(seq_start_5)+'&seq_stop='+str(seq_stop_5)+'&rettype=fasta'
    response = requests.get(seq_url)
    data = response.text
    five_prime_flank=''.join(data.split('\n')[1:]).strip()

     #retrieve 3' flanks from nucleotide database using eFetch
    seq_url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id='+seq+'&seq_start='+str(seq_start_3)+'&seq_stop='+str(seq_stop_3)+'&rettype=fasta'
    response = requests.get(seq_url)
    data = response.text
    three_prime_flank=''.join(data.split('\n')[1:]).strip()
    
    #format and print FASTA results
    fasta_header='>rs'+rsid+'|'+snp_class+'|'+chr+':'+pos+'|'+allele

    print(fasta_header)
    print(five_prime_flank)
    print('['+allele+']')
    print(three_prime_flank)

    time.sleep(1) # set (1 request/sec) to fix HTTP Error 429: Too Many Requests; for faster requests get eUtils API key https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/)

### An example of calling the function

In [38]:
flank_len = 25 #flank length default is 100
get_rs_flank([328,1639546602], flank_len)

>rs328|snv|8:19962213|C/A/G
CATGACAAGTCTCTGAATAAGAAGT
[C/A/G]
AGGCTGGTGAGCATTCTGGGCTAAA
>rs1639546602|delins|1:10130|TAACC/-
ACCCAACCCTAACCCTAACCCTAAC
[TAACC/-]
CCCTAACCCTAACCCCTAACCCTAAC
