#**Web scraping validated sgRNAs databases**

This notebook retrieves information about validated sgRNAs for one or multiple queries across different databases.

Some databases allow searching validated sgRNAs designed for different species. For those databases, you can select or provide a specific species in the corresponding field.

**Bugs**
- If you encounter any bugs, please report the issue to https://github.com/paolabc/ValidatedSgRNADatabases/issues

**Cite**

**If you use this notebook, please cite the database article and this notebook:**
  
- Chari R, Yeo N, Chavez A, Church GM (2017). sgRNA Scorer 2.0 – a species independent model to predict CRISPR/Cas9 activity. ACS Synthetic Biology.

- Wen C, Guoqiang Z, Jing L, Xuan Z, Shulan H, Shuanglin X, Xiang H, Changning L (2019). CRISPRlnc: a manually curated database of validated sgRNAs for lncRNAs. Nucleic Acids Research.

- Sanjana NE, Shalem O, Zhang F. Improved vectors and genome-wide libraries for CRISPR screening (2014). Nat Methods.

- This Notebook : https://github.com/paolabc/ValidatedSgRNADatabases
   


In [2]:
#@title Install and load libraries

##Selenium in colab 

#install selenium and the chromium driver
!pip install selenium &> /dev/null
!apt-get update &> /dev/null
!apt install chromium-chromedriver &> /dev/null

#import modules and submodules inside of these libraries
from selenium import webdriver 
from selenium.webdriver.common.keys import Keys as keys
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
from google.colab import files
from selenium.webdriver.support.ui import Select

#Build the headless Chrome session that the scraping cells below rely on

chrome_options = webdriver.ChromeOptions()   #chrome browser
for chrome_flag in (
    '--headless',                  # Do not open the browser
    '--no-sandbox',                # for linux only
    '--disable-dev-shm-usage',
    'ignore-certificate-errors',
):
    chrome_options.add_argument(chrome_flag)
driver = webdriver.Chrome('chromedriver', options=chrome_options)
print("All libraries are installed and loaded")


All libraries are installedand loaded


#sgRNA scorer

In [9]:
#@title
# Colab form inputs for this cell: the gene names to query, and whether each
# result table is written to "<gene>.csv" and downloaded.
Genes = ["BRCA1","BRCA2"] #@param 
Download = True #@param {type:"boolean"}
# NOTE(review): triu_indices_from is not referenced anywhere in the visible notebook;
# this looks like an accidental auto-import — confirm before removing.
from numpy.lib.twodim_base import triu_indices_from

def do_all_in_site(gene_name:str):
    """Query the sgRNA Scorer dbguide site for one gene and return its result table.

    Uses the module-level selenium ``driver``. As a side effect, the pandas display
    options ``display.max_column`` and ``display.max_rows`` are set to ``None``.

    Parameters
    ----------
    gene_name : str
        Text typed into the site's search box.

    Returns
    -------
    pandas.DataFrame
        The second table parsed from the result page, without the 'Select' column.

    Raises
    ------
    Exception
        Any selenium or pandas error is propagated to the caller (for example a
        missing page element, or a page without the expected table/column).
    """
    driver.get("https://sgrnascorer.cancer.gov/dbguide/")

    #Then submit your query
    driver.find_element(by='name', value="search").click()
    driver.find_element(by='name', value="search").send_keys(gene_name)
    driver.find_element(by='name', value="submitButton").click()

    #Ask for all results: picks the 4th entry of the page-length dropdown
    #(presumably the "All" entry -- TODO confirm against the live page)
    driver.find_element(by='xpath',value='/html/body/div/div[1]/div[2]/label/select/option[4]').click()
    resu = driver.page_source

    #Finally transform your query in a pandas dataframe
    df = pd.read_html(resu)[1]
    pd.set_option('display.max_column',None)
    pd.set_option('display.max_rows',None)

    # DataFrame.drop returns a new frame; the result has to be kept, otherwise
    # the 'Select' column silently stays in the returned table.
    return df.drop(['Select'], axis=1)

def create_data_of_sgRNA(info):
    """Turn a list of ``[gene, row_count]`` pairs into a two-column summary table.

    The returned DataFrame has the columns 'Gene' and 'Data_Length'.
    """
    return pd.DataFrame(info, columns=['Gene', 'Data_Length'])

def search(list_of_genes:list,download):
    """Query sgRNA Scorer for every gene and either download or collect the tables.

    Each gene is first queried as given; if that query raises, it is retried once
    with the lower-cased name. Genes whose queries both fail are reported and
    skipped.

    Parameters
    ----------
    list_of_genes : list
        Gene names to query.
    download : bool
        If true, every result table is written to "<gene>.csv" and handed to
        ``files.download``; otherwise the tables are collected in memory.

    Returns
    -------
    pandas.DataFrame
        With ``download`` true: a summary with one row per successfully queried
        gene (columns 'Gene' and 'Data_Length'). Otherwise: the concatenation of
        all result tables, or an empty DataFrame when no table was retrieved.
    """
    info = []
    data = []
    for gene in list_of_genes:
        final_table = None
        # str.lower must be *called*; passing the bound method made the retry
        # fail unconditionally.
        for query in (gene, str(gene).lower()):
            try:
                final_table = do_all_in_site(query)
                break
            except Exception as error:
                print(f"Query {query!r} failed: {error!r}")
        if final_table is None:
            print(f"Skipping {gene!r}: no result could be retrieved.")
            continue

        info.append([gene, len(final_table)])
        if download:
            name = str(gene) + '.csv'
            try:
                final_table.to_csv(name)
                files.download(name)
            except Exception as error:
                # Keep going with the remaining genes, but say what went wrong.
                print(f"Could not export {name!r}: {error!r}")
        else:
            data.append(final_table)

    if download:
        return create_data_of_sgRNA(info)
    # pd.concat raises ValueError on an empty list, so guard that case.
    return pd.concat(data) if data else pd.DataFrame()
      

## Result
# With Download enabled this displays the per-gene row-count summary;
# otherwise it displays the concatenated result tables.
search(Genes,Download) 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Gene,Data_Length
0,BRCA1,18
1,BRCA2,677


#CRISPRlnc

In [6]:
#@title
# Colab form inputs for this cell: the gene names to query, the value that the
# 'Species' column of the result table must equal, and whether each result
# table is written to "<gene>.csv" and downloaded.
Genes = ["THOR","HOTAIR"] #@param 
Specie = "Homo sapiens" #@param {type:"string"}
Download = True #@param {type:"boolean"}
def do_all_in_site_2(gene_name:str,specie):
    """Search CRISPRlnc for one gene and return the rows of the requested species.

    Uses the module-level selenium ``driver``.

    Parameters
    ----------
    gene_name : str
        Text typed into the site's search field.
    specie : str
        Value the 'Species' column must equal for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns 'Gene', 'Species', 'Chromosome', 'sgRNA', 'PAM', 'Type',
        'Validity' and 'PMID', re-indexed from 0. Empty when no row matches.

    Raises
    ------
    Exception
        Any selenium or pandas error is propagated to the caller (for example a
        missing page element, no table on the page, or a missing column).
    """
    driver.get("http://www.crisprlnc.org/")

    ## Search in the database
    driver.find_element(by='xpath', value="/html/body/header/nav/div/form/input[2]").send_keys(gene_name)
    driver.find_element(by='xpath',value = "/html/body/header/nav/div/form/button").click()
    resu = driver.page_source

    ## Parse the page; the first table holds the search results
    results = pd.read_html(resu)[0]

    ## Keep only the rows of the requested species.
    ## 'Chromsome' is the column header as spelled in the scraped table.
    wanted_columns = ['Gene', 'Species', 'Chromsome', 'sgRNA', 'PAM', 'Type', 'Validity', 'PMID']
    matches = results.loc[results['Species'] == specie, wanted_columns]

    ## Expose the corrected spelling and a fresh 0..n-1 index
    return matches.rename(columns={'Chromsome': 'Chromosome'}).reset_index(drop=True)

def create_data_of_sgRNA_2(info):
    """Build the per-gene summary table from ``[gene, row_count]`` pairs.

    The result has exactly two columns: 'Gene' and 'Data_Length'.
    """
    summary_columns = ['Gene', 'Data_Length']
    return pd.DataFrame(info, columns=summary_columns)

def search_2(list_of_genes:list,specie,download,wait_seconds=30,retry_wait_seconds=180):
    """Query CRISPRlnc for every gene and either download or collect the tables.

    Before each gene the function pauses ``wait_seconds``. If the query raises,
    it pauses ``retry_wait_seconds`` and retries once with the lower-cased gene
    name; a gene whose retry also fails is reported and skipped.

    ``do_all_in_site_2`` works on the module-level ``driver``, so no additional
    browser is started here (a locally created Chrome instance would never be
    used and would be left running).

    Parameters
    ----------
    list_of_genes : list
        Gene names to query.
    specie : str
        Species filter forwarded to ``do_all_in_site_2``.
    download : bool
        If true, every result table is written to "<gene>.csv" and handed to
        ``files.download``; otherwise the tables are collected in memory.
    wait_seconds : int or float, optional
        Pause before each query (default 30).
    retry_wait_seconds : int or float, optional
        Pause before the single retry (default 180).

    Returns
    -------
    pandas.DataFrame
        With ``download`` true: a summary with one row per successfully queried
        gene (columns 'Gene' and 'Data_Length'). Otherwise: the concatenation of
        all result tables, or an empty DataFrame when no table was retrieved.
    """
    info = []
    data = []
    for gene in list_of_genes:
        sleep(wait_seconds)
        try:
            final_table = do_all_in_site_2(gene, specie)
        except Exception as first_error:
            print(f"Query {gene!r} failed: {first_error!r}")
            print("Need time to new conection with the database")
            print("Sleep....Zzz")
            sleep(retry_wait_seconds)
            print("Finish sleeping.")
            try:
                # str.lower must be *called*; passing the bound method made the
                # retry fail unconditionally.
                final_table = do_all_in_site_2(str(gene).lower(), specie)
            except Exception as retry_error:
                print(f"Skipping {gene!r}: retry failed with {retry_error!r}")
                continue

        info.append([gene, len(final_table)])
        if download:
            name = str(gene) + '.csv'
            try:
                final_table.to_csv(name)
                files.download(name)
            except Exception as export_error:
                # Keep going with the remaining genes, but say what went wrong.
                print(f"Could not export {name!r}: {export_error!r}")
        else:
            data.append(final_table)

    if download:
        return create_data_of_sgRNA_2(info)
    # pd.concat raises ValueError on an empty list, so guard that case.
    return pd.concat(data) if data else pd.DataFrame()
    

## Result
# With Download enabled this displays the per-gene row-count summary;
# otherwise it displays the concatenated result tables.
search_2(Genes,Specie,Download)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Gene,Data_Length
0,THOR,7
1,HOTAIR,11


#Genscript

In [4]:
#@title
# Colab form inputs for this cell: the gene names to query, the species
# (do_all_in_site_3 accepts only "Human" or "Mouse"), and whether each result
# table is written to "<gene>.csv" and downloaded.
Genes = ["IDS","IDUA"] #@param 
Specie ="Human"  #@param  ["Human","Mouse"]
Download = True #@param {type:"boolean"}
def do_all_in_site_3(gene_name:str,specie):
      """Look up one gene in the GenScript gRNA database and return its table.

      Uses the module-level selenium ``driver``.

      Parameters
      ----------
      gene_name : str
          Gene name; it is capitalized for "Mouse" and upper-cased for "Human"
          before being typed into the form.
      specie : str
          Either "Human" or "Mouse".

      Returns
      -------
      pandas.DataFrame or str
          The first table parsed from the selected window, without the 'Price'
          and 'Select' columns. For any other species value, an error message
          string is returned instead and no page is opened.
      """
      if specie not in ('Human', "Mouse"):
          return "Error: This database support only Humans and Mouse validated sgRNA information as input."

      # Gene names are typed with the casing chosen per species
      if specie == "Mouse":
          gene_name = gene_name.capitalize()
      else:
          gene_name = gene_name.upper()

      # Absolute locations of the three form controls used below
      species_menu_xpath = "/html/body/div[1]/div[3]/div[4]/div[1]/div[1]/div/form/table/tbody/tr[1]/td/select"
      gene_box_xpath = "/html/body/div[1]/div[3]/div[4]/div[1]/div[1]/div/form/table/tbody/tr[2]/td/input"
      submit_xpath = "/html/body/div[1]/div[3]/div[4]/div[1]/div[1]/div/form/table/tbody/tr[3]/td/input"

      driver.get("https://www.genscript.com/gRNA-database.html")
      driver.maximize_window()

      # The form is submitted once through the (still empty) gene box before it is filled in
      driver.find_element(by='xpath', value=gene_box_xpath).submit()
      species_menu = Select(driver.find_element(by="xpath", value=species_menu_xpath))
      species_menu.select_by_visible_text(specie)
      driver.find_element(by='xpath', value=gene_box_xpath).send_keys(gene_name)
      driver.find_element(by='xpath', value=submit_xpath).submit()

      # Walk through the open windows until one has the gene name in its title;
      # if none matches, the last window stays selected.
      window_count = len(driver.window_handles)
      for position in range(window_count):
          driver.switch_to.window(driver.window_handles[position])
          if gene_name in driver.title:
              new_url = driver.current_url
              break

      tables = pd.read_html(driver.page_source)
      return tables[0].drop(['Price','Select'],axis=1)


def create_data_of_sgRNA_3(info):
    """Return a summary DataFrame ('Gene', 'Data_Length') built from ``info``.

    ``info`` is a list of ``[gene, row_count]`` pairs, one per queried gene.
    """
    return pd.DataFrame(data=info, columns=['Gene', 'Data_Length'])

def search_3(list_of_genes:list,specie,download,wait_seconds=60,retry_wait_seconds=180):
    """Query the GenScript gRNA database for every gene; download or collect the tables.

    Before each gene the function pauses ``wait_seconds``. If the query raises,
    it pauses ``retry_wait_seconds`` and retries once with the lower-cased gene
    name; a gene whose retry also fails is reported and skipped.

    ``do_all_in_site_3`` works on the module-level ``driver``, so no additional
    browser is started here (a locally created Chrome instance would never be
    used and would be left running).

    Parameters
    ----------
    list_of_genes : list
        Gene names to query.
    specie : str
        Species forwarded to ``do_all_in_site_3``.
    download : bool
        If true, every result table is written to "<gene>.csv" and handed to
        ``files.download``; otherwise the tables are collected in memory.
    wait_seconds : int or float, optional
        Pause before each query (default 60).
    retry_wait_seconds : int or float, optional
        Pause before the single retry (default 180).

    Returns
    -------
    pandas.DataFrame
        With ``download`` true: a summary with one row per successfully queried
        gene (columns 'Gene' and 'Data_Length'). Otherwise: the concatenation of
        all result tables, or an empty DataFrame when no table was retrieved.
    """
    info = []
    data = []
    for gene in list_of_genes:
        sleep(wait_seconds)
        try:
            final_table = do_all_in_site_3(gene, specie)
        except Exception as first_error:
            print(f"Query {gene!r} failed: {first_error!r}")
            print("Need time to new conection with the database")
            print("Sleep....Zzz")
            sleep(retry_wait_seconds)
            print("Finish sleeping.")
            try:
                # str.lower must be *called*; passing the bound method made the
                # retry fail unconditionally.
                final_table = do_all_in_site_3(str(gene).lower(), specie)
            except Exception as retry_error:
                print(f"Skipping {gene!r}: retry failed with {retry_error!r}")
                continue

        if isinstance(final_table, str):
            # do_all_in_site_3 signals an unsupported species by returning the
            # message text instead of a table. The species is the same for every
            # gene, so the remaining queries would be rejected the same way.
            print(final_table)
            break

        info.append([gene, len(final_table)])
        # Use the `download` argument, not the notebook-level `Download` variable.
        if download:
            name = str(gene) + '.csv'
            try:
                final_table.to_csv(name)
                files.download(name)
            except Exception as export_error:
                # Keep going with the remaining genes, but say what went wrong.
                print(f"Could not export {name!r}: {export_error!r}")
        else:
            data.append(final_table)

    if download:
        return create_data_of_sgRNA_3(info)
    # pd.concat raises ValueError on an empty list, so guard that case.
    return pd.concat(data) if data else pd.DataFrame()

# With Download enabled this displays the per-gene row-count summary;
# otherwise it displays the concatenated result tables.
search_3(Genes,Specie,Download)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Gene,Data_Length
0,IDS,13
1,IDUA,12
