##### Example scrape segment polarity
- Get the segment polarity genes from SDB online (The Interactive Fly, https://www.sdbonline.org/sites/fly/)

In [None]:
import os
import re
import sys
from urllib.error import HTTPError
# https://www.tutorialspoint.com/how-can-beautifulsoup-package-be-used-to-parse-data-from-a-webpage-in-python
from urllib.request import urlopen, Request

import pandas as pd
from bs4 import BeautifulSoup

# Make the project-local helper module importable before importing from it
sys.path.append('../scripts')
from annotation_utilities import *


In [None]:
# Saved HTML copy of the segment polarity gene table that this notebook parses
table_file = '../../resources/genelists/segment_polarity.html'

# Directory that receives the scraped gene list
outdir = '../Figures/Examples/'
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable
os.makedirs(outdir, exist_ok=True)

In [None]:
# https://www.geeksforgeeks.org/beautifulsoup-find-all-li-in-ul/
# Getting the data via Request with 403 error
# https://itsmycode.com/python-urllib-error-httperror-http-error-403-forbidden/
# HTTPError: HTTP Error 404: Not Found
#
# Scrape the FlyBase gene IDs for every gene linked from the saved table:
#   1. parse the saved table and walk every link found inside a <ul>
#   2. for relative links, download the gene page from parent_site
#   3. keep the first flybase.org/reports link on that page, plus any further
#      ones that start fewer than 5 source lines after it
# Result: genedict maps FlyBase ID -> link text (gene name).
parent_site = 'https://www.sdbonline.org/sites/fly/'
# Context manager so the table file handle is closed once parsing is done
with open(table_file) as table_handle:
    soup = BeautifulSoup(table_handle, "html.parser")
# https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
tables = soup.find_all('ul')
genedict = {}
# Beginning of section
p = 0  # running count of gene links visited
for pg in tables:
    # href=True skips anchors that have no href attribute; indexing
    # row['href'] on those would raise KeyError
    gene_pgs = pg.find_all('a', href=True)
    # Each row is a different gene page, i.e. link to a gene page
    for row in gene_pgs:
        gene_ids = []
        p += 1
        name = row.text
        href = row['href']
        if href.startswith('http'):
            # Absolute link: the ID is only reported, nothing is added to
            # genedict for these rows
            link = href
            gene_id = link.split('/')[-1]
            # Remove the '.html' suffix only. str.rstrip('.html') treats its
            # argument as a character set and would also eat trailing
            # 'h', 't', 'm', 'l' and '.' characters of the name itself.
            if gene_id.endswith('.html'):
                gene_id = gene_id[:-len('.html')]
            print('gene id', gene_id)
        else:
            # lstrip drops every leading '.' and '/' character, so a link such
            # as '../dir/page.htm' is resolved directly under parent_site
            link = os.path.join(parent_site, href.lstrip('../'))
            try:
                req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
                # Close the HTTP response as soon as the body has been read
                with urlopen(req) as response:
                    html = response.read()
            except HTTPError:
                print(f'{name} not found')
            else:
                # Separate name so the table soup parsed above is not clobbered
                gene_soup = BeautifulSoup(html, features="html.parser")
                first_sourceline = None
                for a in gene_soup.find_all('a', href=True):
                    if 'flybase.org/reports' not in a['href']:
                        continue
                    sourceline = a.sourceline
                    fb_id = a['href'].split('/')[-1].split('.')[0].strip()
                    # First FBg on the page will be the one we want
                    if first_sourceline is None:
                        gene_ids.append(fb_id)
                        first_sourceline = sourceline
                    elif (sourceline - first_sourceline) < 5:
                        # Also keep IDs linked within 5 source lines of the first
                        gene_ids.append(fb_id)
        # hash by FBg ID b/c these will be unique and names might not be unique
        for g in gene_ids:
            genedict[g] = name

In [None]:
# Manually put this one in as Shank because the page was not found
shank = 'FBgn0040752'
# Unique FlyBase IDs collected by the scrape, plus the manual addition
sg_genes = set(genedict)
sg_genes.add(shank)
# A set has no stable iteration order; sort so the CSV rows are identical
# from run to run (and so pandas receives an ordered sequence, not a set)
pd.DataFrame(sorted(sg_genes))[0].to_csv(
    os.path.join(outdir, 'segment_pol_genes_SBD.csv'), index=False, header=False
)

In [None]:
# Add the patterning genes and NB temporal genes
# Manually look up the gene IDs in Flybase v6.32
# This corresponds to the NB7-1 lineage, in fig. 1 of Doe review 2017
embryo_nb = ['hb', 'svp', 'Kr', 'pdm2', 'nub', 'cas', 'grh']
# These from the Syed & Doe eLife paper, 2017
larval_nb = ['Imp', 'Syp', 'br', 'chinmo', 'Eip93F', 'svp', 'lin-28']
# Additional ones from the Chris Doe review 2017
extra_embryo_nb = ['sqz', 'nab']
opc_nbs = ['hth', 'klu', 'ey', 'slp1', 'slp2', 'D', 'tll']
topc_nbs = ['Dll', 'ey', 'slp1', 'slp2', 'D']
mb_nbs = ['Imp', 'chinmo', 'Syp', 'br', 'mir-let7']
ad_nbs = ['Imp', 'chinmo', 'Kr', 'Syp']
thoracic_nbs = ['Imp', 'chinmo', 'cas', 'svp', 'Syp', 'br']
typeII_nbs = ['cas', 'D', 'Imp', 'chinmo', 'lin-28', 'EcR', 'Syp', 'br', 'Eip93F']
# pair-rule and gap genes reported by Interactive Fly
# Classic set of pair-rule genes doesn't include Ten-m
pairrule = ['eve', 'ftz', 'h', 'opa', 'odd', 'prd', 'runt', 'slp1', 'slp2', 'Ten-m']
# Build a new list: list.remove() mutates in place and returns None, which
# left pairrule_classic as None and silently dropped Ten-m from pairrule.
pairrule_classic = [symbol for symbol in pairrule if symbol != 'Ten-m']
gap = ['btd', 'cnc', 'cad', 'kn', 'croc', 'ems', 'gt', 'hb', 'hkb', 'Kr', 'kni', 'oc', 'slp1', 'slp2', 'tll']

# Gene symbol -> FlyBase gene ID; each symbol is listed exactly once
genes = {'hb':'FBgn0001180', 'Kr':'FBgn0001325', 'pdm2':'FBgn0004394', 'cas':'FBgn0004878', 'eve':'FBgn0000606', 'ftz':'FBgn0001077',
         'h':'FBgn0001168', 'opa':'FBgn0003002', 'odd':'FBgn0002985', 'prd':'FBgn0003145', 'runt':'FBgn0003300', 'slp2':'FBgn0004567',
         'slp1':'FBgn0003430', 'Ten-m':'FBgn0004449', 'btd':'FBgn0000233', 'cnc':'FBgn0262975', 'cad':'FBgn0000251', 'kn':'FBgn0001319',
         'croc':'FBgn0014143', 'ems':'FBgn0000576', 'gt':'FBgn0001150', 'hkb':'FBgn0261434',
         'kni':'FBgn0001320', 'oc':'FBgn0004102', 'tll':'FBgn0003720', 'Imp':'FBgn0285926', 'Syp':'FBgn0038826', 'nub':'FBgn0085424',
         'grh':'FBgn0259211', 'br':'FBgn0283451', 'chinmo':'FBgn0086758', 'Eip93F':'FBgn0264490', 'svp':'FBgn0003651', 'lin-28':'FBgn0035626',
         'sqz':'FBgn0010768', 'nab':'FBgn0259986', 'hth':'FBgn0001235', 'klu':'FBgn0013469', 'ey':'FBgn0005558', 'D':'FBgn0000411',
         'Dll':'FBgn0000157', 'mir-let7':'FBgn0262406', 'EcR':'FBgn0000546'}

# De-duplicate the symbols shared between lists, then translate to FlyBase IDs.
# sorted() keeps the resulting order stable between runs.
all_nb_factors = [genes[i] for i in sorted(set(embryo_nb + extra_embryo_nb + larval_nb + opc_nbs + topc_nbs + mb_nbs + ad_nbs + thoracic_nbs + typeII_nbs))]
# Uses the full pair-rule list, i.e. including Ten-m
all_pattern_factors = [genes[i] for i in sorted(set(pairrule + gap))]

In [None]:
# Build the ID mapping table from the two FlyBase annotation-ID files.
# The index is the old ID and the columns contain the new ID.
id_dir = '../../resources/id_conversion/'
dmel632_file, dmel628_file = (
    os.path.join(id_dir, annotation_file)
    for annotation_file in ('fbgn_annotation_ID_fb_2020_01.tsv',
                            'fbgn_annotation_ID_fb_2019_03.tsv')
)
# Mapping between 628 -> 632
cdf = update_ids(dmel632_file, dmel628_file)

In [None]:
# Used the Flybase ID validator v6.32 to validate IDs for the segment polarity genes
# Remove ones from another species => current_symbol contains a backslash \
m = re.compile(r'\\')
sp_df = pd.read_csv('../../resources/genelists/segpol_conversion_632.txt', sep='\t')
sp_df['nonDmel'] = sp_df['current_symbol'].apply(lambda x: bool(m.search(x)))
sp_df = sp_df.query('~nonDmel').copy()
# Drop submitted IDs that the validator could not resolve
sp_df = sp_df.loc[sp_df['current_id'] != 'unknown ID'].copy()
sp_df = sp_df.set_index('# submitted_id', drop=False)
sp_df.index.name = 'index'
# Snapshot the frame before resolve_splits so the check below compares the
# input with the output; the previous check compared sp_df with itself and
# therefore always printed True. copy() guards against in-place changes.
sp_df_input = sp_df.copy()
sp_df = resolve_splits(sp_df, old_sym='# submitted_id', new_sym='current_symbol', new_ID='converted_id')
# Check if any genes have been converted
print(f'Genes after resolve split are equal to input genes: {sp_df.equals(sp_df_input)}')

In [None]:
outdir2 = '../../resources/'
# Flag each row of the mapping table by membership of its new ID in a gene set
cdf['seg_pol'] = cdf['new_ID'].isin(sp_df['current_id'])
cdf['patterning'] = cdf['new_ID'].isin(all_pattern_factors)
cdf['neuraldev'] = cdf['new_ID'].isin(all_nb_factors)
# Write the index values of the flagged rows, one headerless CSV per gene set
for flag_column, csv_name in (
    ('seg_pol', 'segpol_genes_628.csv'),
    ('patterning', 'patterning_genes_628.csv'),
    ('neuraldev', 'neuraldev_genes_628.csv'),
):
    flagged_index = cdf.query(flag_column).index
    csv_path = os.path.join(outdir2, csv_name)
    pd.DataFrame(flagged_index).to_csv(csv_path, index=False, header=False)

In [None]:
# Tally how many FlyBase IDs were stored under each gene name, most frequent
# first, so that names with two IDs can be checked against the web page
from collections import Counter
counter = Counter()
counter.update(genedict.values())
counter.most_common()