### Web scraper for extracting newly published names, strains and accession numbers from the weekly IJSEM email (saved as HTML). The script then compares the IJSEM names to the NCBI names and generates a report used for taxonomy updates.

In [1]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
#PATH = "C:\Users\mcveigh\Documents\PythonPC\chrome.exe"
#driver = webdriver.Chrome(PATH)
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
import re
import os
import sys
import bs4
from bs4 import BeautifulSoup
import requests
import numpy as np

### List of URLs to search (short list, entered manually)

In [None]:
# Short, hand-entered list of IJSEM article pages to search.
# A later cell rebinds `urls` with the links pulled from the saved email.
urls = [
    'https://www.microbiologyresearch.org/content/journal/ijsem/10.1099/ijsem.0.006404',
    'https://www.microbiologyresearch.org/content/journal/ijsem/10.1099/ijsem.0.006406?emailalert=true',
    'https://www.microbiologyresearch.org/content/journal/ijsem/10.1099/ijsem.0.006401?emailalert=true',
    'https://www.microbiologyresearch.org/content/journal/ijsem/10.1099/ijsem.0.006417?emailalert=true',
]

### Input URLs from a saved email in HTML - use "Save As" in Outlook and choose the .htm format

In [2]:
# File name of the IJSEM alert email saved in .htm format.
# NOTE(review): this name shadows the built-in input() for the rest of the
# session; it is kept because the following cell opens the file through `input`.
input = r'IJSEMemail6.htm'

In [3]:
# Read the saved email and parse it into a BeautifulSoup tree.
# NOTE(review): the 'unicode_escape' codec decodes the bytes as Latin-1 and
# also interprets backslash escape sequences found in the file; confirm this
# is the intended way to read the Outlook export.
with open(input, encoding='unicode_escape') as email_file:
    content = email_file.read()

soup = BeautifulSoup(content, 'html.parser')
print(soup.prettify())

<html xmlns="http://www.w3.org/TR/REC-html40" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns:ns0="http://www.w3.org/1999/xhtml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w="urn:schemas-microsoft-com:office:word">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="Word.Document" name="ProgId"/>
  <meta content="Microsoft Word 15" name="Generator"/>
  <meta content="Microsoft Word 15" name="Originator"/>
  <link href="IJSEMemail6_files/filelist.xml" rel="File-List"/>
  <link href="IJSEMemail6_files/editdata.mso" rel="Edit-Time-Data"/>
  <!--[if !mso]>
<style>
v\:* {behavior:url(#default#VML);}
o\:* {behavior:url(#default#VML);}
w\:* {behavior:url(#default#VML);}
.shape {behavior:url(#default#VML);}
</style>
<![endif]-->
  <!--[if gte mso 9]><xml>
 <o:OfficeDocumentSettings>
  <o:AllowPNG/>
 </o:OfficeDocumentSettings>
</xml><![endif]-->
  <link href="IJSEMemail6_files/them

In [4]:
# Collect the target of every <a> tag whose href starts with "https:".
# find_all is the current Beautiful Soup spelling of the legacy findAll alias.
urls = [
    link.get('href')
    for link in soup.find_all('a', attrs={'href': re.compile("^https:")})
]
# Number of https links found in the email, counted before any filtering.
numberoflinks = len(urls)

# Links containing any of these markers are discarded (terms and conditions,
# DOI and account links, going by the marker text).
remove_list = ('TandC', 'doi.org', 'myaccount')
urls = [e for e in urls if not any(marker in e for marker in remove_list)]

### Selenium web scraper - title and author note (footnote 1), some of these functions can be turned off following additional testing

In [5]:
#Function to scrape the title, footnote 1 and description section using Selenium
def scrape_with_selenium(url, counter):
    """Open an article page in Chrome and pull out title, footnote 1 and description.

    Parameters
    ----------
    url : str
        Address of the article page to load.
    counter : int
        Starting value of the section-header counter; it is incremented once
        per section header found on the page.

    Returns
    -------
    tuple
        ``(title, footnote, description)``.  ``title`` is the text of the last
        element with class ``item-meta-data__item-title`` (an empty list when
        none is found), ``footnote`` is the text of the element with id
        ``FN1`` (``None`` when the page has none) and ``description`` is the
        text of the last section header, or of the ``s<N>`` element looked up
        for a "Description of" header (``None`` when no header is found).

    The same three values are also stored in the module-level names
    ``title``, ``footnote`` and ``description``.
    """
    global title
    global footnote
    global description
    title = []
    # Reset on every call so a page without footnote 1 neither raises
    # NameError nor returns the footnote of a previously scraped page.
    footnote = None
    description = None
    options = Options()
    options.headless = False  # Set to True for headless mode
    # NOTE(review): confirm that the installed Selenium release still honours
    # the `headless` attribute on Options.
    driver = webdriver.Chrome(options=options)

    # The browser is closed in `finally` so a failing step cannot leave a
    # Chrome process running.
    try:
        #Navigate to the webpage
        driver.get(url)

        #Allow time for dynamic content to load (you may need to use WebDriverWait for more robust waiting)
        time.sleep(3)

        for element in driver.find_elements(By.CLASS_NAME, "item-meta-data__item-title"):
            title = element.text

        for element in driver.find_elements(By.ID, "FN1"): #retrieves just footnote 1
            footnote = element.text

        for header in driver.find_elements(By.CSS_SELECTOR, "div.tl-main-part.title"): #finds section headers
            counter += 1
            description = header.text
            print(description)
            if "Description of" in description:
                print('found', description)
                # The section element id is derived from the header position.
                # NOTE(review): the offset of 4 presumably reflects the page
                # layout at the time of writing - confirm.
                snumber = 's' + str(counter - 4)
                print('snumber is', snumber)
                for section in driver.find_elements(By.ID, snumber):
                    description = section.text
    finally:
        #Close the browser window
        driver.quit()
    return title, footnote, description



### Beautifulsoup webscraper - returns abstract, can be turned off following additional testing

In [6]:
def scrape_with_beautifulsoup(url, timeout=30):
    """Download a page with requests and return the text of its abstract.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before requests gives up.  Without a
        timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    str or None
        Text of the first element with class ``articleabstract``, or ``None``
        when the page has no such element.

    The result is also stored in the module-level name ``abstract_text``.
    """
    global abstract_text
    abstract_text = None
    response = requests.get(url, timeout=timeout)
    # The "lxml" parser needs the lxml package to be installed.
    soup = BeautifulSoup(response.text, "lxml")
    abstract = soup.find(class_="articleabstract")
    if abstract is not None:
        abstract_text = abstract.get_text()
    return abstract_text

### Main body 

In [7]:
# One row is appended to pub_df for every description section scraped below.
pub_df = pd.DataFrame(columns=['PublishedName', 'Accessions', 'Strains','URL'])
pd.set_option('display.max_columns', None)

# The extraction patterns do not change between articles, so they are compiled
# once here instead of once per scraped section.

# Organism names: the two whitespace-separated tokens right before
# "sp. nov." or "nom. nov.".
# NOTE(review): the dots in these patterns are unescaped and match any character.
match = [r'(\S+\s+){2}(?=sp. nov.)', r'(\S+\s+){2}(?=nom. nov.)']
orgname_regex = re.compile(r'\b(' + '|'.join(match) + r')\b')

# Accessions: upper-case letter prefixes followed by fixed-length digit runs.
pattern = [r'[A-Z]{2}\d{6}', r'[A-Z]{4}\d{8}', r'([A-Z]+)(_[A-Z]+)\d{6}', r'[A-Z]{6}\d{9}']
accession_regex = re.compile(r'\b(' + '|'.join(pattern) + r')\b')

# Strains: the text between "type strain" and the next closing parenthesis.
strainpattern = [r'(?<=type strain).*?(?=\))']
strain_regex = re.compile(r'\b(' + '|'.join(strainpattern) + r')\b')

for url in urls:
    counter = 1
    strains = []
    accessions = []
    description = None
    # The return value is not used here; the call sets the module-level abstract_text.
    scrape_with_beautifulsoup(url)
    options = Options()
    options.headless = False  # Set to True for headless mode
    driver = webdriver.Chrome(options=options)

    # The browser is closed in `finally`, so an exception while scraping one
    # article cannot leave its Chrome process running.
    try:
        #Navigate to the webpage
        driver.get(url)

        #Allow time for dynamic content to load (you may need to use WebDriverWait for more robust waiting)
        time.sleep(3)

        for element in driver.find_elements(By.CLASS_NAME, "item-meta-data__item-title"):
            title = element.text
            print(title)

        for header in driver.find_elements(By.CSS_SELECTOR, "div.tl-main-part.title"): #finds section headers
            counter += 1
            description = header.text
            if "Description of" in description:
                # The section element id is derived from the header position.
                # NOTE(review): the offset of 4 presumably reflects the page
                # layout at the time of writing - confirm.
                snumber = 's' + str(counter - 4)
                for section in driver.find_elements(By.ID, snumber):
                    description = section.text

                    #find the organism names
                    orgname = [m.group() for m in orgname_regex.finditer(description)]
                    print('orgname', orgname)

                    #find the accessions
                    accessions = [m.group() for m in accession_regex.finditer(description)]
                    print('accessions', accessions)

                    #find the strains
                    strains = [m.group() for m in strain_regex.finditer(description)]
                    print('strain names', strains)

                    #load data into pandas dataframe
                    pub_df.loc[len(pub_df)] = [orgname, accessions, strains, url]
                print('BREAK')
    finally:
        #Close the browser window
        driver.quit()

SessionNotCreatedException: Message: session not created: Chrome failed to start: exited normally.
  (session not created: DevToolsActivePort file doesn't exist)
  (The process started from chrome location /home/mcveigh/.cache/selenium/chrome/linux64/126.0.6478.126/chrome is no longer running, so ChromeDriver is assuming that Chrome has crashed.)
Stacktrace:
#0 0x5567334f9c7a <unknown>
#1 0x5567331dce2c <unknown>
#2 0x556733211d6a <unknown>
#3 0x55673320e17b <unknown>
#4 0x556733258c49 <unknown>
#5 0x55673324c363 <unknown>
#6 0x55673321c247 <unknown>
#7 0x55673321cb9e <unknown>
#8 0x5567334c024b <unknown>
#9 0x5567334c42f1 <unknown>
#10 0x5567334abafe <unknown>
#11 0x5567334c4e52 <unknown>
#12 0x55673349079f <unknown>
#13 0x5567334e9638 <unknown>
#14 0x5567334e9810 <unknown>
#15 0x5567334f8dac <unknown>
#16 0x7f75a8d951ca start_thread


In [54]:
pd.set_option('display.max_colwidth', None)
# Collapse each list of strain matches into one comma-separated string.
# Values that are not lists (for example strings produced by an earlier run of
# this cell) are left untouched, so re-running the cell does not split them
# into comma-separated characters.
pub_df['Strains'] = [
    ','.join(map(str, strain_matches)) if isinstance(strain_matches, (list, tuple)) else strain_matches
    for strain_matches in pub_df['Strains']
]
pub_df

Unnamed: 0,PublishedName,Accessions,Strains,URL
0,[Rhodoferax lithotrophicus ],"[AP024238, LC658658]",is MIZ03T (=JCM 34246T=DSM 113266T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
1,[Rhodoferax lithotrophicus ],[],,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
2,[Rhodoferax koreensis ],[],is DCY110T (=KCTC 52288T=JCM 31441T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
3,[Rhodoferax koreensis ],[],,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
4,[],[],,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
5,[],[],,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
6,[Svornostia abyssi ],"[OP012654, CP088295]",", J379T (=DSM 113746T=CCM 9300T",https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
7,[Svornostia abyssi ],[],,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
8,"[Tepidibacillus marianensis , Tepidibacillus marianensis ]","[PP728962, CP149939, SAMN40373382]",is LSZ-M11000T (=CCAM 1008T=JCM 39431T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
9,[Tepidibacillus marianensis ],[],,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0


In [55]:
# Keep only the first row for each distinct PublishedName value.
# The column still holds lists of names at this point (it is exploded in a
# later cell), so a row is a duplicate only when its whole list repeats.
pub_df = pub_df.drop_duplicates(subset='PublishedName', keep="first")
pub_df

Unnamed: 0,PublishedName,Accessions,Strains,URL
0,[Rhodoferax lithotrophicus ],"[AP024238, LC658658]",is MIZ03T (=JCM 34246T=DSM 113266T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
2,[Rhodoferax koreensis ],[],is DCY110T (=KCTC 52288T=JCM 31441T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
4,[],[],,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
6,[Svornostia abyssi ],"[OP012654, CP088295]",", J379T (=DSM 113746T=CCM 9300T",https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
8,"[Tepidibacillus marianensis , Tepidibacillus marianensis ]","[PP728962, CP149939, SAMN40373382]",is LSZ-M11000T (=CCAM 1008T=JCM 39431T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
9,[Tepidibacillus marianensis ],[],,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
10,[Nocardia implantans ],"[OR958755, OR994074, JAYKYQ000000000, JAYESH000000000]",is CDC186T (=GDMCC 4.206T= JCM 34959T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006422%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536789981%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=dZ7qF2x1XR4TsGA10U%2BeXyEVuwiBe3%2FJsNKvemc7FmE%3D&reserved=0
12,[Aequorivita marina ],"[OR056292, JAVMBW000000000]",", S2608T (KCTC 92652T=MCCC 1H01361T",https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006423%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536800818%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=mHhSZxJANwMD8L8wMckH3kWIpOUU5CJLDHRK3GUnnr4%3D&reserved=0


In [56]:
# Drop rows whose Accessions list repeats one already seen (first occurrence wins).
# NOTE(review): rows with an empty accession list all compare equal, so only
# the first of them survives - confirm that dropping later names that have no
# accessions is intended.
pub_df = pub_df.drop_duplicates(subset='Accessions', keep="first")
# An unassigned pub_df.explode(['PublishedName']).reset_index(drop=True) call
# used to follow here; its result was discarded, so it has been removed.
# PublishedName is exploded in a later cell.
pub_df

Unnamed: 0,PublishedName,Accessions,Strains,URL
0,[Rhodoferax lithotrophicus ],"[AP024238, LC658658]",is MIZ03T (=JCM 34246T=DSM 113266T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
2,[Rhodoferax koreensis ],[],is DCY110T (=KCTC 52288T=JCM 31441T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
6,[Svornostia abyssi ],"[OP012654, CP088295]",", J379T (=DSM 113746T=CCM 9300T",https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
8,"[Tepidibacillus marianensis , Tepidibacillus marianensis ]","[PP728962, CP149939, SAMN40373382]",is LSZ-M11000T (=CCAM 1008T=JCM 39431T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
10,[Nocardia implantans ],"[OR958755, OR994074, JAYKYQ000000000, JAYESH000000000]",is CDC186T (=GDMCC 4.206T= JCM 34959T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006422%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536789981%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=dZ7qF2x1XR4TsGA10U%2BeXyEVuwiBe3%2FJsNKvemc7FmE%3D&reserved=0
12,[Aequorivita marina ],"[OR056292, JAVMBW000000000]",", S2608T (KCTC 92652T=MCCC 1H01361T",https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006423%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536800818%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=mHhSZxJANwMD8L8wMckH3kWIpOUU5CJLDHRK3GUnnr4%3D&reserved=0


In [57]:
# Give every accession its own row; ignore_index renumbers the rows from 0.
pub2_df = pub_df.explode('Accessions', ignore_index=True)
pub2_df

Unnamed: 0,PublishedName,Accessions,Strains,URL
0,[Rhodoferax lithotrophicus ],AP024238,is MIZ03T (=JCM 34246T=DSM 113266T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
1,[Rhodoferax lithotrophicus ],LC658658,is MIZ03T (=JCM 34246T=DSM 113266T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
2,[Rhodoferax koreensis ],,is DCY110T (=KCTC 52288T=JCM 31441T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
3,[Svornostia abyssi ],OP012654,", J379T (=DSM 113746T=CCM 9300T",https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
4,[Svornostia abyssi ],CP088295,", J379T (=DSM 113746T=CCM 9300T",https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
5,"[Tepidibacillus marianensis , Tepidibacillus marianensis ]",PP728962,is LSZ-M11000T (=CCAM 1008T=JCM 39431T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
6,"[Tepidibacillus marianensis , Tepidibacillus marianensis ]",CP149939,is LSZ-M11000T (=CCAM 1008T=JCM 39431T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
7,"[Tepidibacillus marianensis , Tepidibacillus marianensis ]",SAMN40373382,is LSZ-M11000T (=CCAM 1008T=JCM 39431T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
8,[Nocardia implantans ],OR958755,is CDC186T (=GDMCC 4.206T= JCM 34959T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006422%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536789981%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=dZ7qF2x1XR4TsGA10U%2BeXyEVuwiBe3%2FJsNKvemc7FmE%3D&reserved=0
9,[Nocardia implantans ],OR994074,is CDC186T (=GDMCC 4.206T= JCM 34959T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006422%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536789981%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=dZ7qF2x1XR4TsGA10U%2BeXyEVuwiBe3%2FJsNKvemc7FmE%3D&reserved=0


In [58]:
# Give every published name its own row and rename the accession column.
pub4_df = (
    pub2_df
    .explode('PublishedName', ignore_index=True)
    .rename(columns={'Accessions' : 'accession'})
)

# Keep every row that has no accession, plus the first row for each distinct
# accession value.
missing_accession = pub4_df['accession'].isnull()
repeated_accession = pub4_df.duplicated(subset='accession', keep='first')
pub4_df = pub4_df[missing_accession | ~repeated_accession]
pub4_df

Unnamed: 0,PublishedName,accession,Strains,URL
0,Rhodoferax lithotrophicus,AP024238,is MIZ03T (=JCM 34246T=DSM 113266T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
1,Rhodoferax lithotrophicus,LC658658,is MIZ03T (=JCM 34246T=DSM 113266T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
2,Rhodoferax koreensis,,is DCY110T (=KCTC 52288T=JCM 31441T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
3,Svornostia abyssi,OP012654,", J379T (=DSM 113746T=CCM 9300T",https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
4,Svornostia abyssi,CP088295,", J379T (=DSM 113746T=CCM 9300T",https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
5,Tepidibacillus marianensis,PP728962,is LSZ-M11000T (=CCAM 1008T=JCM 39431T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
7,Tepidibacillus marianensis,CP149939,is LSZ-M11000T (=CCAM 1008T=JCM 39431T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
9,Tepidibacillus marianensis,SAMN40373382,is LSZ-M11000T (=CCAM 1008T=JCM 39431T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
11,Nocardia implantans,OR958755,is CDC186T (=GDMCC 4.206T= JCM 34959T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006422%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536789981%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=dZ7qF2x1XR4TsGA10U%2BeXyEVuwiBe3%2FJsNKvemc7FmE%3D&reserved=0
12,Nocardia implantans,OR994074,is CDC186T (=GDMCC 4.206T= JCM 34959T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006422%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536789981%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=dZ7qF2x1XR4TsGA10U%2BeXyEVuwiBe3%2FJsNKvemc7FmE%3D&reserved=0


In [59]:
# Accessions to look up at NCBI: one row per accession, then discard every row
# that still has a missing value in any column (dropna with default arguments).
df_unique = (
    pub4_df
    .drop_duplicates(subset=["accession"], keep="first")
    .dropna()
)
df_unique

Unnamed: 0,PublishedName,accession,Strains,URL
0,Rhodoferax lithotrophicus,AP024238,is MIZ03T (=JCM 34246T=DSM 113266T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
1,Rhodoferax lithotrophicus,LC658658,is MIZ03T (=JCM 34246T=DSM 113266T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
3,Svornostia abyssi,OP012654,", J379T (=DSM 113746T=CCM 9300T",https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
4,Svornostia abyssi,CP088295,", J379T (=DSM 113746T=CCM 9300T",https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
5,Tepidibacillus marianensis,PP728962,is LSZ-M11000T (=CCAM 1008T=JCM 39431T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
7,Tepidibacillus marianensis,CP149939,is LSZ-M11000T (=CCAM 1008T=JCM 39431T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
9,Tepidibacillus marianensis,SAMN40373382,is LSZ-M11000T (=CCAM 1008T=JCM 39431T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
11,Nocardia implantans,OR958755,is CDC186T (=GDMCC 4.206T= JCM 34959T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006422%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536789981%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=dZ7qF2x1XR4TsGA10U%2BeXyEVuwiBe3%2FJsNKvemc7FmE%3D&reserved=0
12,Nocardia implantans,OR994074,is CDC186T (=GDMCC 4.206T= JCM 34959T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006422%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536789981%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=dZ7qF2x1XR4TsGA10U%2BeXyEVuwiBe3%2FJsNKvemc7FmE%3D&reserved=0
13,Nocardia implantans,JAYKYQ000000000,is CDC186T (=GDMCC 4.206T= JCM 34959T,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006422%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536789981%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=dZ7qF2x1XR4TsGA10U%2BeXyEVuwiBe3%2FJsNKvemc7FmE%3D&reserved=0


### Create a dataframe of unique accessions and look up NCBI taxonomy information of each accession with srcchk

In [None]:
#df_unique.dtypes

In [60]:
# Make sure accessions are plain strings before they are written to the srcchk
# input file.  .assign() builds a new frame instead of writing into df_unique,
# which comes from boolean filtering and can trigger SettingWithCopyWarning on
# direct column assignment.
df_unique = df_unique.assign(accession=df_unique['accession'].astype('str'))

In [61]:
# Write the unique accessions, one per line, to the file srcchk reads as input.
with open('acclist', 'w') as acclist_file:
    acclist_file.writelines(
        accession + '\n' for accession in df_unique['accession'].tolist()
    )

In [62]:
# Run srcchk on the accession list, requesting taxname, taxid and strain, and
# write the result to acclist.taxdata for the next cell.
# NOTE(review): the absolute path to the srcchk binary is site-specific; adjust
# it when running this notebook on another machine.
srcchk_command = "/netopt/ncbi_tools64/bin/srcchk -i acclist -f taxname,taxid,strain -o acclist.taxdata"
srcchk_status = os.system(srcchk_command)

# Stop here on failure so the next cell cannot silently read a stale or missing
# acclist.taxdata left over from an earlier run.
if srcchk_status != 0:
    raise RuntimeError(
        "srcchk failed with status %s: %s" % (srcchk_status, srcchk_command)
    )
srcchk_status


0

In [63]:
# Load the tab-separated srcchk report and prepare it for joining on 'accession'.
taxdata_file_name = r'acclist.taxdata'
srcchk_df = (
    pd.read_csv(taxdata_file_name, sep='\t', index_col=None, low_memory=False)
    # 'Unnamed: 4' is an empty extra column pandas creates when reading this
    # file (presumably from a trailing tab on each line -- TODO confirm);
    # errors='ignore' keeps the cell working if the column is absent.
    .drop(columns=['Unnamed: 4'], errors='ignore')
    .rename(columns={'organism': 'NCBIname'})
    # Strip the trailing ".<version>" suffix so accessions match the
    # unversioned accessions taken from the publications.
    .assign(
        accession=lambda frame: frame['accession']
        .astype(str)
        .replace(r'\.\d+$', '', regex=True)
    )
    # Drop rows where any remaining field is missing.
    .dropna()
)
srcchk_df

Unnamed: 0,accession,NCBIname,taxid,strain
0,AP024238,Rhodoferax sp. MIZ03,2798804.0,MIZ03
1,LC658658,Rhodoferax sp. MIZ03,2798804.0,MIZ03
2,OP012654,Parviterribacter sp.,2005953.0,J379
3,CP088295,Parviterribacteraceae bacterium J379,2898438.0,J379
4,PP728962,Tepidibacillus sp. LSZ-M11000,3131995.0,LSZ-M11000
5,CP149939,Tepidibacillus sp. LSZ-M11000,3131995.0,LSZ-M11000
7,OR958755,Nocardia sp.,1821.0,CDC186
8,OR994074,Nocardia sp.,1821.0,CDC 192
9,JAYKYQ000000000,Nocardia sp. CDC186,3108168.0,CDC186
10,JAYESH000000000,Nocardia sp. CDC192,3109368.0,CDC192


### Combine dataframes into one

In [64]:
# Outer-join the published names with the NCBI lookup on accession, then put
# the columns in the order used for the report.
report_columns = ['PublishedName', 'NCBIname', 'Strains', 'accession', 'strain', 'taxid', 'URL']
combine_df = pub4_df.merge(
    srcchk_df,
    how='outer',
    left_on='accession',
    right_on='accession',
)
combine_df = combine_df[report_columns]
combine_df

Unnamed: 0,PublishedName,NCBIname,Strains,accession,strain,taxid,URL
0,Rhodoferax lithotrophicus,Rhodoferax sp. MIZ03,is MIZ03T (=JCM 34246T=DSM 113266T,AP024238,MIZ03,2798804.0,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
1,Rhodoferax lithotrophicus,Rhodoferax sp. MIZ03,is MIZ03T (=JCM 34246T=DSM 113266T,LC658658,MIZ03,2798804.0,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
2,Rhodoferax koreensis,,is DCY110T (=KCTC 52288T=JCM 31441T,,,,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006439%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536739767%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=6%2F2sJEBPHidPXFHfnAVZ4L9NxB7qWboXDf08nqyr8Us%3D&reserved=0
3,Svornostia abyssi,Parviterribacter sp.,", J379T (=DSM 113746T=CCM 9300T",OP012654,J379,2005953.0,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
4,Svornostia abyssi,Parviterribacteraceae bacterium J379,", J379T (=DSM 113746T=CCM 9300T",CP088295,J379,2898438.0,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006432%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536756576%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=khJrAMfgNyIuL3qWq9aMbEhMgEiCWKULKoHBjQx4Vus%3D&reserved=0
5,Tepidibacillus marianensis,Tepidibacillus sp. LSZ-M11000,is LSZ-M11000T (=CCAM 1008T=JCM 39431T,PP728962,LSZ-M11000,3131995.0,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
6,Tepidibacillus marianensis,Tepidibacillus sp. LSZ-M11000,is LSZ-M11000T (=CCAM 1008T=JCM 39431T,CP149939,LSZ-M11000,3131995.0,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
7,Tepidibacillus marianensis,,is LSZ-M11000T (=CCAM 1008T=JCM 39431T,SAMN40373382,,,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006438%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536779354%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=2LPhoXkZ977zFnrHBIKMsVkXYWm0Txd0Jqo84pPL1uE%3D&reserved=0
8,Nocardia implantans,Nocardia sp.,is CDC186T (=GDMCC 4.206T= JCM 34959T,OR958755,CDC186,1821.0,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006422%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536789981%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=dZ7qF2x1XR4TsGA10U%2BeXyEVuwiBe3%2FJsNKvemc7FmE%3D&reserved=0
9,Nocardia implantans,Nocardia sp.,is CDC186T (=GDMCC 4.206T= JCM 34959T,OR994074,CDC 192,1821.0,https://gcc02.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.microbiologyresearch.org%2Fcontent%2Fjournal%2Fijsem%2F10.1099%2Fijsem.0.006422%3Femailalert%3Dtrue&data=05%7C02%7Cmcveigh%40ncbi.nlm.nih.gov%7C388b4976017c4779855a08dc9761f4a7%7C14b77578977342d58507251ca2dc2b06%7C0%7C0%7C638551694536789981%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C0%7C%7C%7C&sdata=dZ7qF2x1XR4TsGA10U%2BeXyEVuwiBe3%2FJsNKvemc7FmE%3D&reserved=0


### write output to excel

In [66]:
# Save the combined report as an Excel workbook without the index column;
# missing values are written as empty cells.
combine_df.to_excel(
    'NameCheckweek4.xlsx',
    index=False,
    na_rep='',
    engine='xlsxwriter',
)

### taxonerd test -- skip this

In [None]:
# Experimental cell (see heading: skip this). Builds a TaxoNERD instance with
# prefer_gpu=False and loads the "en_ner_eco_md" model with the "taxref" linker
# and a threshold of 0.7.
# NOTE(review): this import sits outside the notebook's top import cell; move
# it there if this experiment is ever promoted into the main workflow.
from taxonerd import TaxoNERD
taxonerd = TaxoNERD(prefer_gpu=False)
nlp = taxonerd.load(model="en_ner_eco_md", exclude=[], linker="taxref", threshold=0.7)
# Last expression of the cell: displays the pipe_names attribute of the loaded object.
nlp.pipe_names

### Option to use Biopython to retrieve organism names based on accessions -- this approach works, but srcchk is used for now, so this isn't needed

In [None]:
import Bio
from Bio import Entrez
from Bio import SeqIO

In [None]:
# Fetch a single GenBank record from the NCBI nucleotide database and display
# the organism name stored in its annotations.
# NOTE(review): NCBI asks E-utilities users to identify themselves; Entrez.email
# is not set in this cell -- confirm it is set elsewhere before running.
handle = Entrez.efetch(db="nucleotide", id="AY851612", rettype="gb", retmode="text")
try:
    x = SeqIO.read(handle, 'genbank')
finally:
    # Always release the network handle, even if parsing the record fails.
    handle.close()
x.annotations['organism']

### Extra code not currently in use; kept here for reference

In [None]:
# Flag rows whose published (IJSEM) name differs from the NCBI name.  Rows with
# no NCBI record (NCBIname is NaN) compare unequal and are therefore flagged too.
combine_df['boo'] = combine_df['PublishedName'] != combine_df['NCBIname']


def highlight_name_mismatch(column):
    """Return one CSS string per cell of `column`.

    Cells in rows where combine_df['boo'] is True get a red background; all
    other cells get an empty style string.
    """
    flagged = combine_df.loc[column.index, 'boo']
    return ['background-color: red' if is_mismatch else '' for is_mismatch in flagged]


# The styling hook needs a callable; passing the bare CSS string raised a
# TypeError.  The Styler is the last expression so the highlighted table is
# what the cell displays.
combine_df.style.apply(highlight_name_mismatch, subset=['PublishedName'])

In [None]:
 # Row-wise text colouring for the name-comparison table.
def color_positive_green(val):
    """Return the text-colour CSS for every cell of one table row.

    Parameters
    ----------
    val : pandas.Series
        One row of combine_df, as passed by Styler.apply(..., axis=1).  The row
        must contain the boolean 'boo' column (True when the published name and
        the NCBI name differ).

    Returns
    -------
    list of str
        'color: red' for each cell when the row is flagged, otherwise
        'color: black'; one entry per column in the row.
    """
    color = 'red' if val['boo'] else 'black'
    return ['color: %s' % color] * len(val)


# axis=1 hands the function a whole row, so it can read that row's 'boo' flag;
# an element-wise hook only sees a single cell value and cannot do that.
combine_df.style.apply(color_positive_green, axis=1)

In [None]:
#combine_df.apply_style_by_indexes(combine_df[combine_df['PublishedName'] != combine_df['NCBIname']], styler_obj=Styler(bg_color='red'), cols_to_style=['PublishedName', 'NCBIname'])

In [None]:
#df.style.apply(color, axis=None).to_excel('styled.xlsx', engine='openpyxl')