# Biomart

#### First we have to install pybiomart:

In [1]:
# Install pybiomart if it is not already available (uncomment and run once).
# %pip install pybiomart

In [2]:
# Connect to biomart server
import pybiomart
from pybiomart import Server
import pandas as pd
# Host of the BioMart service queried throughout this notebook.
ENSEMBL_HOST = 'http://www.ensembl.org'
server = Server(host=ENSEMBL_HOST)

#### Show all services available

#### `server.marts` returns a dictionary; we use pandas to convert it into a DataFrame

In [3]:
# Keys are mart names and values are the matching Mart objects,
# so orient='index' gives one row per mart.
marts = server.marts
pd.DataFrame.from_dict(data=marts, orient='index')

Unnamed: 0,0
ENSEMBL_MART_ENSEMBL,"<biomart.Mart name='ENSEMBL_MART_ENSEMBL', dis..."
ENSEMBL_MART_MOUSE,"<biomart.Mart name='ENSEMBL_MART_MOUSE', displ..."
ENSEMBL_MART_SEQUENCE,"<biomart.Mart name='ENSEMBL_MART_SEQUENCE', di..."
ENSEMBL_MART_ONTOLOGY,"<biomart.Mart name='ENSEMBL_MART_ONTOLOGY', di..."
ENSEMBL_MART_GENOMIC,"<biomart.Mart name='ENSEMBL_MART_GENOMIC', dis..."
ENSEMBL_MART_SNP,"<biomart.Mart name='ENSEMBL_MART_SNP', display..."
ENSEMBL_MART_FUNCGEN,"<biomart.Mart name='ENSEMBL_MART_FUNCGEN', dis..."


#### Select a specific Biomart and list all available datasets

In [4]:
# Take the ENSEMBL_MART_ENSEMBL mart, then tabulate its datasets
# (dataset name -> Dataset object, one row per dataset).
ensembl_mart = server.marts['ENSEMBL_MART_ENSEMBL']
ensembl = ensembl_mart.datasets
pd.DataFrame.from_dict(data=ensembl, orient='index')

Unnamed: 0,0
mmoschiferus_gene_ensembl,<biomart.Dataset name='mmoschiferus_gene_ensem...
otshawytscha_gene_ensembl,<biomart.Dataset name='otshawytscha_gene_ensem...
smerianae_gene_ensembl,<biomart.Dataset name='smerianae_gene_ensembl'...
eburgeri_gene_ensembl,"<biomart.Dataset name='eburgeri_gene_ensembl',..."
csabaeus_gene_ensembl,"<biomart.Dataset name='csabaeus_gene_ensembl',..."
...,...
rbieti_gene_ensembl,"<biomart.Dataset name='rbieti_gene_ensembl', d..."
gmorhua_gene_ensembl,"<biomart.Dataset name='gmorhua_gene_ensembl', ..."
pnyererei_gene_ensembl,<biomart.Dataset name='pnyererei_gene_ensembl'...
cdromedarius_gene_ensembl,<biomart.Dataset name='cdromedarius_gene_ensem...


#### Look for the human genome dataset

#### Use the filter function paired with a lambda to select the dataset names that contain the keyword "sapiens"

In [5]:
# Keep only the dataset names that contain "sapiens".
list(filter(lambda dataset_name: 'sapiens' in dataset_name,
            ensembl.keys()))

['hsapiens_gene_ensembl']

In [6]:
# Select the human dataset by the name found in the previous cell.
HUMAN_DATASET = 'hsapiens_gene_ensembl'
hg38 = ensembl[HUMAN_DATASET]

In [7]:
# Retrieve the gene ID and gene name columns, restricted to chromosomes 1 and 2.
gene_columns = ['ensembl_gene_id', 'external_gene_name']
chromosome_filter = {'chromosome_name': ['1', '2']}
hg38.query(attributes=gene_columns, filters=chromosome_filter)

Unnamed: 0,Gene stable ID,Gene name
0,ENSG00000223972,DDX11L1
1,ENSG00000227232,WASH7P
2,ENSG00000278267,MIR6859-1
3,ENSG00000243485,MIR1302-2HG
4,ENSG00000284332,MIR1302-2
...,...,...
9666,ENSG00000261186,LINC01238
9667,ENSG00000220804,LINC01881
9668,ENSG00000224160,CICP10
9669,ENSG00000244528,SEPTIN14P2


#### What are filters

In [8]:
# Tabulate the dataset's filters (name -> Filter object) and preview the first rows.
filters_table = pd.DataFrame.from_dict(data=hg38.filters, orient='index')
filters_table.head()

Unnamed: 0,0
link_so_mini_closure,"<biomart.Filter name='link_so_mini_closure', t..."
link_go_closure,"<biomart.Filter name='link_go_closure', type='..."
link_ensembl_transcript_stable_id,<biomart.Filter name='link_ensembl_transcript_...
gene_id,"<biomart.Filter name='gene_id', type='text'>"
transcript_id,"<biomart.Filter name='transcript_id', type='te..."


#### What are attributes

In [9]:
# Tabulate the dataset's attributes (name -> Attribute object) and preview the first rows.
attributes_table = pd.DataFrame.from_dict(data=hg38.attributes, orient='index')
attributes_table.head()

Unnamed: 0,0
ensembl_gene_id,"<biomart.Attribute name='ensembl_gene_id', dis..."
ensembl_gene_id_version,<biomart.Attribute name='ensembl_gene_id_versi...
ensembl_transcript_id,<biomart.Attribute name='ensembl_transcript_id...
ensembl_transcript_id_version,<biomart.Attribute name='ensembl_transcript_id...
ensembl_peptide_id,"<biomart.Attribute name='ensembl_peptide_id', ..."


#### Ways to look for specific attributes

In [10]:
# List every attribute name that contains "affy".
[attribute_name
 for attribute_name in hg38.attributes.keys()
 if 'affy' in attribute_name]

['affy_hc_g110',
 'affy_hg_focus',
 'affy_hg_u133a',
 'affy_hg_u133a_2',
 'affy_hg_u133b',
 'affy_hg_u133_plus_2',
 'affy_hg_u95a',
 'affy_hg_u95av2',
 'affy_hg_u95b',
 'affy_hg_u95c',
 'affy_hg_u95d',
 'affy_hg_u95e',
 'affy_hta_2_0',
 'affy_huex_1_0_st_v2',
 'affy_hugenefl',
 'affy_hugene_1_0_st_v1',
 'affy_hugene_2_0_st_v1',
 'affy_primeview',
 'affy_u133_x3p']

In [11]:
# Retrieve gene ID, gene description and the AFFY HG U133A probe column,
# restricted to chromosome 1.
# The chromosome is given as the string '1', matching the earlier query that
# used ['1', '2']: chromosome names are labels (e.g. 'X', 'MT'), not integers.
hg38.query(attributes=['ensembl_gene_id', 'description', 'affy_hg_u133a'],
           filters={'chromosome_name': '1'})

Unnamed: 0,Gene stable ID,Gene description,AFFY HG U133A probe
0,ENSG00000223972,DEAD/H-box helicase 11 like 1 (pseudogene) [So...,
1,ENSG00000227232,"WASP family homolog 7, pseudogene [Source:HGNC...",
2,ENSG00000278267,microRNA 6859-1 [Source:HGNC Symbol;Acc:HGNC:5...,
3,ENSG00000243485,MIR1302-2 host gene [Source:HGNC Symbol;Acc:HG...,
4,ENSG00000284332,microRNA 1302-2 [Source:HGNC Symbol;Acc:HGNC:3...,
...,...,...,...
7340,ENSG00000171163,zinc finger protein 692 [Source:HGNC Symbol;Ac...,
7341,ENSG00000227237,novel transcript,
7342,ENSG00000185220,piggyBac transposable element derived 2 [Sourc...,
7343,ENSG00000200495,"RNA, U6 small nuclear 1205, pseudogene [Source...",
