# 16S rRNA Web Scraping

#### Vibrio cholerae O1 biovar El Tor str. N16961 chromosome I, complete sequence
: https://www.ncbi.nlm.nih.gov/nuccore/NC_002505.1?report=gbwithparts&log$=seqview

위 예시된 링크에 대해 BeautifulSoup과 Selenium의 WebDriver를 기반으로 Chrome Browser에서 NCBI 사이트를 조작하여 16S rRNA의 FASTA 파일을 자동으로 다운받을 것이다. 마찬가지로, 아래 최종적으로 만들어진 함수인 sixteen_s_crawler()에 충분히 Annotation된 원하는 대상의 GenBank (full) 링크를 입력하면 16S rRNA의 FASTA 파일을 자동으로 다운받을 수 있다.

## Step By Step

In [4]:
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

In [5]:
# GenBank (full) page of the Vibrio cholerae N16961 chromosome I record used as the worked example.
url = 'https://www.ncbi.nlm.nih.gov/nuccore/NC_002505.1?report=gbwithparts&log$=seqview'

In [25]:
# Launch Chrome and navigate to the record page.
driver = webdriver.Chrome() # webdriver.Chrome('path to chromedriver.exe')
driver.get(url)

# time.sleep(60)

In [27]:
# Bring up the feature search bar by clicking the page's 'pseudolink' element.
# find_element(By.…) is used because the find_element_by_* helpers were
# removed in Selenium 4.3.
make_feature_search_bar = driver.find_element(By.CLASS_NAME, 'pseudolink')
make_feature_search_bar.click()

In [28]:
# Click the feature-type dropdown of the search bar.
# find_element(By.…) replaces find_element_by_id, removed in Selenium 4.3.
bar_click1 = driver.find_element(By.ID, 'fh_bar_select')
bar_click1.click()

In [29]:
# Pick the "rRNA" entry of the dropdown.
# find_element(By.XPATH, …) replaces find_element_by_xpath, removed in Selenium 4.3.
driver.find_element(By.XPATH, "//select[option/@value='rRNA']/option[text()='rRNA']").click()

In [30]:
# Read the number of rRNA features from the search bar's counter element.
html = driver.page_source
# Name the parser explicitly so bs4 does not pick a different one per machine.
soup = BeautifulSoup(html, 'html.parser')
total = soup.find_all("span", {"id":"fh_bar_total"})

pattern = 'of ([0-9]+)'
text = str(total)

# Capture the digits with a regex group instead of slicing the repr of the
# match list, which breaks as soon as there are zero or several matches.
count_match = re.search(pattern, text)
if count_match is None:
    raise RuntimeError("could not find the 'of <N>' feature count in #fh_bar_total")
total_num = int(count_match.group(1))
print(total_num) # number of results for the rRNA filter

25


In [31]:
# Step through every rRNA feature and open the FASTA view of each 16S hit
# in a new window/tab.

new_window_num = 0
for i in range(total_num):
    html = driver.page_source
    # Explicit parser: avoids bs4 guessing a different one per machine.
    soup = BeautifulSoup(html, 'html.parser')
    prodList = soup.find_all("div", {"id":"fh_bar_details_body"})
    if 'product="16S ribosomal RNA"' in str(prodList):
        # find_element(By.…) replaces the find_element_by_* helpers removed
        # in Selenium 4.3.
        fasta_click = driver.find_element(By.ID, 'fh_bar_fasta')
        # Ctrl+Enter on the FASTA link is what spawns the extra window that
        # is counted here and visited later through window_handles.
        fasta_click.send_keys(Keys.CONTROL +"\n")
        new_window_num += 1
        print("Yes")
        print(i+1)
        print(prodList)
    else:
        print("No")
    # Advance the feature bar to the next rRNA entry.
    bar_next = driver.find_element(By.ID, 'fh_bar_next')
    bar_next.click()
    print("Next")

print("New Window Number :", new_window_num)

Yes
1
[<div id="fh_bar_details_body" style="height: 92px; left: 549px; width: 450px; top: -107px;"><div>53815..55363
</div><div> /locus_tag="VC_RS00270"
</div><div> /old_locus_tag="VC_r001"
</div><div> /old_locus_tag="VCr001"
</div><div> /product="16S ribosomal RNA"
</div></div>]
Next
No
Next
No
Next
Yes
4
[<div id="fh_bar_details_body" style="height: 92px; left: 549px; width: 450px; top: -107px;"><div>151051..152599
</div><div> /locus_tag="VC_RS00735"
</div><div> /old_locus_tag="VC_r004"
</div><div> /old_locus_tag="VCr004"
</div><div> /product="16S ribosomal RNA"
</div></div>]
Next
No
Next
No
Next
Yes
7
[<div id="fh_bar_details_body" style="height: 92px; left: 549px; width: 450px; top: -107px;"><div>324139..325691
</div><div> /locus_tag="VC_RS01505"
</div><div> /old_locus_tag="VC_r007"
</div><div> /old_locus_tag="VCr007"
</div><div> /product="16S ribosomal RNA"
</div></div>]
Next
No
Next
No
Next
Yes
10
[<div id="fh_bar_details_body" style="height: 92px; left: 549px; width: 450px; top:

In [364]:
# Switch the driver to the window at index 1 of window_handles.
driver.switch_to.window(driver.window_handles[1])

# time.sleep(5)

In [223]:
# Extract the FASTA viewer element from the page shown in the current window.
html = driver.page_source
# Explicit parser: avoids bs4 guessing a different one per machine.
soup = BeautifulSoup(html, 'html.parser')
fasta_seq = soup.find_all("div", {"id":"viewercontent1"})
print(fasta_seq)

[<div class="seq gbff" id="viewercontent1" sequencesize="3470537" style="display: block;" val="15640032" virtualsequence=""><pre>&gt;NC_002505.1:53815-55363 Vibrio cholerae O1 biovar El Tor str. N16961 chromosome I, complete sequence
TTAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGG
CAGCACAGAGGAACTTGTTCCTTGGGTGGCGAGCGGCGGACGGGTGAGTAATGCCTGGGAAATTGCCCGG
TAGAGGGGGATAACCATTGGAAACGATGGCTAATACCGCATAACCTCGCAAGAGCAAAGCAGGGGACCTT
CGGGCCTTGCGCTACCGGATATGCCCAGGTGGGATTAGCTAGTTGGTGAGGTAAGGGCTCACCAAGGCGA
CGATCCCTAGCTGGTCTGAGAGGATGATCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGA
GGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCC
TTCGGGTTGTAAAGTACTTTCAGTAGGGAGGAAGGTGGTTAAGTTAATACCTTAATCATTTGACGTTACC
TACAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGA
ATTACTGGGCGTAAAGCGCATGCAGGTGGTTTGTTAAGTCAGATGTGAAAGCCCTGGGCTCAACCTAGGA
ATCGCATTTGAAACTGACAAGCTAGAGTACTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGC
GTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAGATACTGACA

In [319]:
# Extract the FASTA header line from the rendered viewer element.
# The capture group returns the text after the escaped '>' directly, so no
# hard-coded slice offset is needed.
pattern = '<pre>&gt;(.+)'
text = str(fasta_seq)

r = re.compile(pattern)

# Build the one-element list explicitly; splitting on a sentinel string that
# "never occurs" would silently break on a header that does contain it.
header = ['>' + r.findall(text)[0]]
header

['>NC_002505.1:53815-55363 Vibrio cholerae O1 biovar El Tor str. N16961 chromosome I, complete sequence']

In [320]:
# Collect every newline-prefixed line of the rendered viewer element.  The
# last match is dropped (presumably the closing-markup line, not sequence
# data — verify against the page).
text = str(fasta_seq)
pattern = '\n.+'
line_matches = re.findall(pattern, text)
sequence = line_matches[:-1]
sequence

['\nTTAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGG',
 '\nCAGCACAGAGGAACTTGTTCCTTGGGTGGCGAGCGGCGGACGGGTGAGTAATGCCTGGGAAATTGCCCGG',
 '\nTAGAGGGGGATAACCATTGGAAACGATGGCTAATACCGCATAACCTCGCAAGAGCAAAGCAGGGGACCTT',
 '\nCGGGCCTTGCGCTACCGGATATGCCCAGGTGGGATTAGCTAGTTGGTGAGGTAAGGGCTCACCAAGGCGA',
 '\nCGATCCCTAGCTGGTCTGAGAGGATGATCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGA',
 '\nGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCC',
 '\nTTCGGGTTGTAAAGTACTTTCAGTAGGGAGGAAGGTGGTTAAGTTAATACCTTAATCATTTGACGTTACC',
 '\nTACAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGA',
 '\nATTACTGGGCGTAAAGCGCATGCAGGTGGTTTGTTAAGTCAGATGTGAAAGCCCTGGGCTCAACCTAGGA',
 '\nATCGCATTTGAAACTGACAAGCTAGAGTACTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGC',
 '\nGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAGATACTGACACTCAGATGCGAAAG',
 '\nCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCTACTTGGAGGTTGTGC',
 '\nCCTAGAGGTGTGGCTTTCGGAGCTAACGCGTTAAGTAGACCGCCTGGGGAGTACGGTCGCAAGATTAAAA',

In [321]:
# Join the one-element header list and the list of sequence lines into a single FASTA record.
fasta = header + sequence
fasta

['>NC_002505.1:53815-55363 Vibrio cholerae O1 biovar El Tor str. N16961 chromosome I, complete sequence',
 '\nTTAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGG',
 '\nCAGCACAGAGGAACTTGTTCCTTGGGTGGCGAGCGGCGGACGGGTGAGTAATGCCTGGGAAATTGCCCGG',
 '\nTAGAGGGGGATAACCATTGGAAACGATGGCTAATACCGCATAACCTCGCAAGAGCAAAGCAGGGGACCTT',
 '\nCGGGCCTTGCGCTACCGGATATGCCCAGGTGGGATTAGCTAGTTGGTGAGGTAAGGGCTCACCAAGGCGA',
 '\nCGATCCCTAGCTGGTCTGAGAGGATGATCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGA',
 '\nGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCC',
 '\nTTCGGGTTGTAAAGTACTTTCAGTAGGGAGGAAGGTGGTTAAGTTAATACCTTAATCATTTGACGTTACC',
 '\nTACAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGA',
 '\nATTACTGGGCGTAAAGCGCATGCAGGTGGTTTGTTAAGTCAGATGTGAAAGCCCTGGGCTCAACCTAGGA',
 '\nATCGCATTTGAAACTGACAAGCTAGAGTACTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGC',
 '\nGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAGATACTGACACTCAGATGCGAAAG',
 '\nCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCG

In [322]:
# Write the record pieces to disk.  The context manager closes the file even
# if a write fails; the pieces already carry their own leading newlines.
with open("16S_rRNA_Sequence_test.fasta", 'w') as f:
    f.writelines(str(part) for part in fasta)

## Macro

In [10]:
# URL
url = 'https://www.ncbi.nlm.nih.gov/nuccore/NC_002505.1?report=gbwithparts&log$=seqview'

# Open the page, then pause for a fixed 60 s (presumably so the full record
# can finish loading before it is searched).
driver = webdriver.Chrome()
driver.get(url)
time.sleep(60)

# Bring up the feature search bar.
# find_element(By.…) replaces the find_element_by_* helpers removed in Selenium 4.3.
make_feature_search_bar = driver.find_element(By.CLASS_NAME, 'pseudolink')
make_feature_search_bar.click()
time.sleep(2)

# Click the feature-type dropdown.
bar_click1 = driver.find_element(By.ID, 'fh_bar_select')
bar_click1.click()
time.sleep(2)

# Pick the "rRNA" entry.
driver.find_element(By.XPATH, "//select[option/@value='rRNA']/option[text()='rRNA']").click()
time.sleep(2)

# Read the total number of rRNA features from the counter element.
html = driver.page_source
# Explicit parser: avoids bs4 guessing a different one per machine.
soup = BeautifulSoup(html, 'html.parser')
total = soup.find_all("span", {"id":"fh_bar_total"})

# Capture the digits with a regex group instead of slicing the repr of the
# match list.
count_match = re.search('of ([0-9]+)', str(total))
if count_match is None:
    raise RuntimeError("could not find the 'of <N>' feature count in #fh_bar_total")
total_num = int(count_match.group(1))

# Step through every rRNA feature; open each 16S hit's FASTA view in a new window.
new_window_num = 0
for i in range(total_num):
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    prodList = soup.find_all("div", {"id":"fh_bar_details_body"})
    if 'product="16S ribosomal RNA"' in str(prodList):
        fasta_click = driver.find_element(By.ID, 'fh_bar_fasta')
        # Ctrl+Enter on the FASTA link spawns the extra window counted here.
        fasta_click.send_keys(Keys.CONTROL +"\n")
        new_window_num += 1
        print("Yes")
        print(i+1)
        print(prodList)
    else:
        print("No")
    bar_next = driver.find_element(By.ID, 'fh_bar_next')
    bar_next.click()
    print("Next")

print("New Window Number :", new_window_num)

Yes
1
[<div id="fh_bar_details_body" style="height: 92px; left: 549px; width: 450px; top: -107px;"><div>53815..55363
</div><div> /locus_tag="VC_RS00270"
</div><div> /old_locus_tag="VC_r001"
</div><div> /old_locus_tag="VCr001"
</div><div> /product="16S ribosomal RNA"
</div></div>]
Next
No
Next
No
Next
Yes
4
[<div id="fh_bar_details_body" style="height: 92px; left: 549px; width: 450px; top: -107px;"><div>151051..152599
</div><div> /locus_tag="VC_RS00735"
</div><div> /old_locus_tag="VC_r004"
</div><div> /old_locus_tag="VCr004"
</div><div> /product="16S ribosomal RNA"
</div></div>]
Next
No
Next
No
Next
Yes
7
[<div id="fh_bar_details_body" style="height: 92px; left: 549px; width: 450px; top: -107px;"><div>324139..325691
</div><div> /locus_tag="VC_RS01505"
</div><div> /old_locus_tag="VC_r007"
</div><div> /old_locus_tag="VCr007"
</div><div> /product="16S ribosomal RNA"
</div></div>]
Next
No
Next
No
Next
Yes
10
[<div id="fh_bar_details_body" style="height: 92px; left: 549px; width: 450px; top:

In [16]:
clustalw_input = []
for n in range(new_window_num):
    # Visit the (n+1)-th window handle (index 0 is skipped — presumably the
    # original record page) and pause 5 s before reading it.
    driver.switch_to.window(driver.window_handles[n+1])
    time.sleep(5)

    html = driver.page_source
    # Explicit parser: avoids bs4 guessing a different one per machine.
    soup = BeautifulSoup(html, 'html.parser')
    fasta_seq = soup.find_all("div", {"id":"viewercontent1"})
    text = str(fasta_seq)

    # Header: the capture group yields the text after the escaped '>' without
    # a hard-coded slice offset or the split-on-sentinel trick.
    header = ['>' + re.findall('<pre>&gt;(.+)', text)[0]]

    # Sequence: every newline-prefixed line; the last match is dropped
    # (presumably closing markup, not sequence data).
    sequence = re.findall('\n.+', text)[:-1]

    # Assemble the FASTA record and write it out.
    fasta = header + sequence

    # Output files are numbered counting down from new_window_num.
    number = new_window_num - n
    with open("16S_rRNA_Sequence_%d.fasta" % number, 'w') as f:
        f.writelines(fasta)
    clustalw_input.extend(fasta)
    # Blank line between consecutive records in the combined file.
    clustalw_input.append('\n\n')
    print( n+1, 'Done')

# ClustalW Input: all records concatenated into one file.
with open("ClustalW_Input.txt", 'w') as f:
    f.writelines(clustalw_input)
print( 'ClustalW Input Done')

1 Done
2 Done
3 Done
4 Done
5 Done
6 Done
7 Done
8 Done
ClustalW Input Done


## Def

In [1]:
def sixteen_s_crawler(URL, z=None) :
    """Save the 16S rRNA FASTA records reachable from an NCBI GenBank page.

    Drives Chrome through Selenium: opens ``URL``, restricts the feature bar
    to rRNA, and for every feature whose details contain
    ``product="16S ribosomal RNA"`` opens the FASTA view in a new window.
    Each opened view is written to ``16S_rRNA_Sequence_<k>.fasta`` and all
    records are concatenated (separated by blank lines) into
    ``ClustalW_Input.txt``, whose content is printed at the end.

    Args:
        URL: Address of the GenBank (full) page to crawl.
        z: Unused. Kept (now optional) so existing two-argument calls still
            work.

    Raises:
        RuntimeError: If the rRNA feature count or the header of a FASTA
            view cannot be found in the page source.
    """
    import re
    import time

    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys

    driver = webdriver.Chrome()
    try:
        # Open the page, then pause for a fixed 60 s (presumably so the full
        # record can finish loading before it is searched).
        driver.get(URL)
        time.sleep(60)

        # Bring up the feature search bar.  find_element(By.…) replaces the
        # find_element_by_* helpers removed in Selenium 4.3.
        driver.find_element(By.CLASS_NAME, 'pseudolink').click()
        time.sleep(2)

        # Click the feature-type dropdown.
        driver.find_element(By.ID, 'fh_bar_select').click()
        time.sleep(2)

        # Pick the "rRNA" entry.
        driver.find_element(By.XPATH, "//select[option/@value='rRNA']/option[text()='rRNA']").click()
        time.sleep(2)

        # Read the total number of rRNA features.  The parser is named
        # explicitly so bs4 does not guess a different one per machine.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        total = soup.find_all("span", {"id":"fh_bar_total"})
        count_match = re.search('of ([0-9]+)', str(total))
        if count_match is None:
            raise RuntimeError("could not find the 'of <N>' feature count in #fh_bar_total")
        total_num = int(count_match.group(1))

        # Step through every rRNA feature; open each 16S hit in a new window.
        new_window_num = 0
        for _ in range(total_num):
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            prodList = soup.find_all("div", {"id":"fh_bar_details_body"})
            if 'product="16S ribosomal RNA"' in str(prodList):
                # Ctrl+Enter on the FASTA link spawns the extra window.
                driver.find_element(By.ID, 'fh_bar_fasta').send_keys(Keys.CONTROL + "\n")
                new_window_num += 1
            driver.find_element(By.ID, 'fh_bar_next').click()

        clustalw_input = []
        for n in range(new_window_num):
            # Index 0 of window_handles is skipped (presumably the original
            # record page); pause 5 s before reading the opened view.
            driver.switch_to.window(driver.window_handles[n+1])
            time.sleep(5)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            text = str(soup.find_all("div", {"id":"viewercontent1"}))

            # Header: text after the escaped '>' that opens the <pre> block.
            header_match = re.search('<pre>&gt;(.+)', text)
            if header_match is None:
                raise RuntimeError("no FASTA header found in #viewercontent1 of window %d" % (n + 1))

            # Sequence: every newline-prefixed line; the last match is
            # dropped (presumably closing markup, not sequence data).
            fasta = ['>' + header_match.group(1)] + re.findall('\n.+', text)[:-1]

            # Output files are numbered counting down from new_window_num.
            number = new_window_num - n
            with open("16S_rRNA_Sequence_%d.fasta" % number, 'w') as f:
                f.writelines(fasta)
            clustalw_input.extend(fasta)
            # Blank line between consecutive records in the combined file.
            clustalw_input.append('\n\n')
    finally:
        # Always shut the browser down; otherwise every call leaves a Chrome
        # instance with all its opened windows behind.
        driver.quit()

    # ClustalW Input: all records concatenated into one file.
    with open("ClustalW_Input.txt", 'w') as f:
        f.writelines(clustalw_input)

    print('Done.')

    # Echo the combined file so the result can be checked at a glance.
    with open('ClustalW_Input.txt') as file_object:
        print(file_object.read())

In [2]:
# Run the crawler on the V. cholerae chromosome I page; the second argument is not read by the function body.
sixteen_s_crawler('https://www.ncbi.nlm.nih.gov/nuccore/NC_002505.1?report=gbwithparts&log$=seqview', 5)

Done.
>NC_002505.1:c2939009-2937460 Vibrio cholerae O1 biovar El Tor str. N16961 chromosome I, complete sequence
TTAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGG
CAGCACAGAGGAACTTGTTCCTTGGGTGGCGAGCGGCGGACGGGTGAGTAATGCCTGGGAAATTGCCCGG
TAGAGGGGGATAACCATTGGAAACGATGGCTAATACCGCATAACCTCGCAAGAGCAAAGCAGGGGACCTT
CGGGCCTTGCGCTACCGGATATGCCCAGGTGGGATTAGCTAGTTGGTGAGGTAAGGGCTCACCAAGGCGA
CGATCCCTAGCTGGTCTGAGAGGATGATCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGA
GGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCC
TTCGGGTTGTAAAGTACTTTCAGTAGGGAGGAAGGTGGTTAAGTTAATACCTTAATCATTTGACGTTACC
TACAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGA
ATTACTGGGCGTAAAGCGCATGCAGGTGGTTTGTTAAGTCAGATGTGAAAGCCCTGGGCTCAACCTAGGA
ATCGCATTTGAAACTGACAAGCTAGAGTACTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGC
GTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAGATACTGACACTCAGATGCGAAAG
CGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCTACTTGGAGGTTGTGC
CCTAGAGGTGTGGCTTTCGGAGCTAACGCGTTAAG

In [3]:
# Another example: Vibrio campbellii ATCC BAA-1116 chromosome I, complete sequence
sixteen_s_crawler('https://www.ncbi.nlm.nih.gov/nuccore/NC_009783.1?report=gbwithparts&log$=seqview', 5)

Done.
>NC_009783.1:c3765265-3763708 Vibrio campbellii ATCC BAA-1116 chromosome I, complete sequence
AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGGA
AACGAGTTATCTGAACCTTCGGGGAACGATAACGGCGTCGAGCGGCGGACGGGTGAGTAATGCCTAGGAA
ATTGCCCTGATGTGGGGGATAACCATTGGAAACGATGGCTAATACCGCATAACGCCTACGGGCCAAAGAG
GGGGACCTTTGGGCCTCTCGCGTCAGGATATGCCTAGGTGGGATTAGCTAGTTGGTGAGGTAAGGGCTCA
CCAAGGCGACGATCCCTAGCTGGTCTGAGAGGATGATCAGCCACACTGGAACTGAGACACGGTCCAGACT
CCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTGTG
AAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGTCGTGAGGAAGGTAGTGTAGTTAATAGCTGCATTATTT
GACGTTAGCGACAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCG
TTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGTGGTTTGTTAAGTCAGATGTGAAAGCCCGGGGCTC
AACCTCGGAATTGCATTTGAAACTGGCAGACTAGAGTACTGTAGAGGGGGGTAGAATTTCAGGTGTAGCG
GTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAGATACTGACACTCAG
ATGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCTACTTGG
AGGTTGTGGCCTTGAGCCGTGGCTTTCGGAGCTAACGCGTTAAGTAGA

In [11]:
# Another example: Vibrio campbellii ATCC BAA-1116 chromosome II, complete sequence
sixteen_s_crawler('https://www.ncbi.nlm.nih.gov/nuccore/NC_009784.1?report=gbwithparts&log$=seqview', 3)

Done.
>NC_009784.1:c1944850-1943289 Vibrio campbellii ATCC BAA-1116 chromosome II, complete sequence
AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGGA
AACGAGTTATCTGAACCTTCGGGGAACGATAACGGCGTCGAGCGGCGGACGGGTGAGTAATGCCTAGGAA
ATTGCCCTGATGTGGGGGATAACCATTGGAAACGATGGCTAATACCGCATAATGCCTACGGGCCAAAGAG
GGGGACCTTCGGGCCTCTCGCGTCAGGATATGCCTAGGTGGGATTAGCTAGTTGGTGAGGTAAGGGCTCA
CCAAGGCGACGATCCCTAGCTGGTCTGAGAGGATGATCAGCCACACTGGAACTGAGACACGGTCCAGACT
CCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTGTG
AAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGTCGTGAGGAAGGTAGTGTAGTTAATAGCTGCATTATTT
GACGTTAGCGACAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCG
TTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGTGGTTTGTTAAGTCAGATGTGAAAGCCCGGGGCTC
AACCTCGGAATTGCATTTGAAACTGGCAGACTAGAGTACTGTAGAGGGGGGTAGAATTTCAGGTGTAGCG
GTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAGATACTGACACTCAG
ATGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCTACTTGG
AGGTTGTGGCCTTGAGCCGTGGCTTTCGGAGCTAACGCGTTAAGTAG

## ClustalW
https://www.genome.jp/tools-bin/clustalw

sixteen_s_crawler()를 이용해 얻은 FASTA 파일을 위 링크에서 하나씩 입력할 필요가 없도록 Input 내용을 만들어주었다. 이 내용을 입력하여 결과를 받는 과정을 아래에서 진행한다.

In [3]:
# Load the combined FASTA text from ClustalW_Input.txt (the file written by
# sixteen_s_crawler) and keep it as a one-element list.
filename = 'ClustalW_Input.txt'
with open(filename) as file_object:
    contents = file_object.read()
w_input = [contents]

print(w_input)

['>NC_002505.1:c2939009-2937460 Vibrio cholerae O1 biovar El Tor str. N16961 chromosome I, complete sequence\nTTAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGG\nCAGCACAGAGGAACTTGTTCCTTGGGTGGCGAGCGGCGGACGGGTGAGTAATGCCTGGGAAATTGCCCGG\nTAGAGGGGGATAACCATTGGAAACGATGGCTAATACCGCATAACCTCGCAAGAGCAAAGCAGGGGACCTT\nCGGGCCTTGCGCTACCGGATATGCCCAGGTGGGATTAGCTAGTTGGTGAGGTAAGGGCTCACCAAGGCGA\nCGATCCCTAGCTGGTCTGAGAGGATGATCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGA\nGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCC\nTTCGGGTTGTAAAGTACTTTCAGTAGGGAGGAAGGTGGTTAAGTTAATACCTTAATCATTTGACGTTACC\nTACAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGA\nATTACTGGGCGTAAAGCGCATGCAGGTGGTTTGTTAAGTCAGATGTGAAAGCCCTGGGCTCAACCTAGGA\nATCGCATTTGAAACTGACAAGCTAGAGTACTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGC\nGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAGATACTGACACTCAGATGCGAAAG\nCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCTACTTGGAGGTTGTGC\nCCTAGAGGTGTGGCTTTCGGAGCTAA

In [25]:
# Open the ClustalW web form and type the combined FASTA text into its
# 'sequence' field.
driver = webdriver.Chrome()
driver.get('https://www.genome.jp/tools-bin/clustalw')
time.sleep(10)

# find_element(By.…) replaces find_element_by_id, removed in Selenium 4.3.
w_sequence = driver.find_element(By.ID, 'sequence')
w_sequence.send_keys(w_input)
time.sleep(100)

In [26]:
# Submit the form.  find_element(By.XPATH, …) replaces find_element_by_xpath,
# removed in Selenium 4.3.
w_submit = driver.find_element(By.XPATH, "//input[@type='submit']")
w_submit.click()

위의 결과에서 clustalw.aln를 클릭하여 다운받는다.

In [4]:
def clustalw_align(input_file) :
    """Open the ClustalW web form in Chrome and type a file's text into it.

    The content of ``input_file`` is read and sent to the form's 'sequence'
    field.  The form is not submitted by this function and the browser is
    left open, so submitting and downloading the result are manual steps.

    Args:
        input_file: Path of the text file whose content is pasted into the
            form (e.g. the ``ClustalW_Input.txt`` written by
            ``sixteen_s_crawler``).
    """
    import time

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    with open(input_file) as file_object:
        contents = file_object.read()

    driver = webdriver.Chrome()
    driver.get('https://www.genome.jp/tools-bin/clustalw')
    # Fixed pause before looking up the input field.
    time.sleep(5)

    # find_element(By.…) replaces find_element_by_id, removed in Selenium 4.3.
    w_sequence = driver.find_element(By.ID, 'sequence')
    # send_keys expects text, so pass the string itself rather than a list
    # wrapping it.
    w_sequence.send_keys(contents)

In [6]:
# Paste the content of ClustalW_Input.txt into the ClustalW web form.
clustalw_align('ClustalW_Input.txt')