In [35]:
report('''
# Sample analysis of Cancer Dependency Map (DepMap) data 
## Request
* Identify the most frequent genetic alterations (could be mutations or copy number variations) in the cancer cell lines
* Match them with the best genetic dependencies that could be used for drug development for the cancers that carry those mutations
* Take into account the lineage of cancer cell lines (certain mutations/CNVs may be restricted to a specific lineage)

## Resources
### DepMap (https://depmap.org/portal) Data 
* Cell line metadata
* Expression
 * RNAseq
 * Protein
* Copy number variation
* Mutations
* Genetic dependency
 * Crispr (Achilles)
 * RNAi (DEMETER2)

### NCBI (via https://humanmine.org)
* Entrez gene IDs mapped to symbol, name, descrion, uniprot

### Reactome
* Entrez gene IDs mapped to Reactome pathways

### MFR (http://bmbl.sdstate.edu/MFR)
* A Machine Learning Model for measuring relatedness between a pair of genes

### Jupyter (https://jupyter.org/)
* Python programming framework for analysis prototyping and reporting

### GitHub (https://github.com/)
* Revision control for Python code
* Reporting mechanism for analysi summary and details

''')


# Sample analysis of Cancer Dependency Map (DepMap) data 
## Request
* Identify the most frequent genetic alterations (could be mutations or copy number variations) in the cancer cell lines
* Match them with the best genetic dependencies that could be used for drug development for the cancers that carry those mutations
* Take into account the lineage of cancer cell lines (certain mutations/CNVs may be restricted to a specific lineage)

## Resources
### DepMap (https://depmap.org/portal) Data 
* Cell line metadata
* Expression
 * RNAseq
 * Protein
* Copy number variation
* Mutations
* Genetic dependency
 * Crispr (Achilles)
 * RNAi (DEMETER2)

### NCBI (via https://humanmine.org)
* Entrez gene IDs mapped to symbol, name, descrion, uniprot

### Reactome
* Entrez gene IDs mapped to Reactome pathways

### MFR (http://bmbl.sdstate.edu/MFR)
* A Machine Learning Model for measuring relatedness between a pair of genes

### Jupyter (https://jupyter.org/)
* Python programming framework for analysis prototyping and reporting

### GitHub (https://github.com/)
* Revision control for Python code
* Reporting mechanism for analysi summary and details



In [36]:
from __future__ import print_function
import pandas as pd
import os, sys, re, pickle, wget, shutil
from intermine.webservice import Service
from IPython.display import Markdown, display

# Load data direct from source
class A(object):
    data = {}
    url = {}
    url['expression'] = 'https://ndownloader.figshare.com/files/16757690'
    url['sample_info'] = 'https://ndownloader.figshare.com/files/16757723'
    url['copy_number'] = 'https://ndownloader.figshare.com/files/17857886'
    url['mutations'] = 'https://ndownloader.figshare.com/files/16757702'
    url['achilles_crispr'] = 'https://ndownloader.figshare.com/files/16757666'
    url['d2_rnai'] = 'https://ndownloader.figshare.com/files/13515395'
    url['sensitivity'] = 'https://ndownloader.figshare.com/files/17008628'
    url['mfr'] = 'http://bmbl.sdstate.edu/MFR/data/resource%20data/tr_dv_ts.dataset.zip'
    url['reactome'] = 'https://reactome.org/download/current/NCBI2Reactome.txt'
    summary = 'README.md'
    
os.system("echo '' > "+A.summary)
    
def printmd(string):
    """Prints formatted markdown text"""
    display(Markdown(string))
    
def report(text):
    """Print markdown in this notebook and saves markdown-only summary in a file"""
    printmd(text)
    
    f = open(A.summary,'a')
    f.write(text+'\n')
    
def get_gene_descriptions():
    """Load gene names and descriptions from humanmine (http://www.humanmine.org),
    an integrated database of human genome information.  Use cached data if available"""
    
    archive = 'data/gene_info.p'
    if os.path.exists(archive):
        df = pd.read_pickle(archive)
        return df
    
    service = Service("https://www.humanmine.org/humanmine/service")
    query = service.new_query("Gene")
    cols = ["primaryIdentifier", "symbol", "briefDescription", "description","proteins.uniprotAccession"]
    query.add_view(*cols)
    query.add_constraint("organism.taxonId", "=", "9606", code = "A")    
    df_rows = []

    for row in query.rows():
        df_rows.append(
            [row["primaryIdentifier"], 
             row["symbol"], 
             row["briefDescription"], 
             row["description"],
             row["proteins.uniprotAccession"]
            ])

    df = pd.DataFrame(data=df_rows,columns=cols)
    df.to_pickle(archive)
    return df
    
def get_data(key):
    """Load input data.
    Arguments: key for the data source (eg: expression, sample_info...)
    1) If the data is cached on the filesystem, load and return the dataframe
    2) Otherwise, load the data from the source URL, cache, return the dataframe
    """
#     if A.data.get(key) is not None:
#         return A.data[key]
    
    data_cache = 'data/'+key+'.p'
    if os.path.exists(data_cache):
        df = pd.read_pickle(data_cache)
    else:
        print("Downloading",key,"from source")
        df = pd.read_csv(A.url[key],low_memory=False,index_col=0)
        df.to_pickle(data_cache)
        
    A.data[key] = df    
    return df

def ncbi_gene_ids(genes):
    """Converts gene name "symbol (ncbi_id)"
    to integer NCBI ID
    """
    ncbi_cols = []
    for g in genes:
        match = re.search(' \((\d+)\)',g)
        if match:
            g = match.group(1)
        else:
            print("No match",g)
        ncbi_cols.append(int(g))

    return ncbi_cols

def get_pathway_info():
    # Map NCBI IDs to reactome pathways
    # File downloaded from https://reactome.org/download/current/NCBI2Reactome.txt
    archive = 'data/pathway_info.p'
    
    if os.path.exists(archive):
        pathway_info = pickle.load(open(archive,'rb'))
        return pathway_info
    
    pathway_info = {}
    if not os.path.exists('data/NCBI2Reactome.txt'):
        wget.download(A.url['reactome'],out='data/NCBI2Reactome.txt')
    with open('data/NCBI2Reactome.txt') as n2r:
        for line in n2r.readlines():
            ncbi, pathway_id, url, pathway_name, type, species = line.strip().split('\t')
            # only human pathways
            if species != 'Homo sapiens':
                continue
            # only curated pathways
            if type == 'IEA':
                continue
            pathway_info[ncbi] = [pathway_id, url, pathway_name]

    pickle.dump(pathway_info,open(archive,'wb'))
    return pathway_info


In [2]:
report('''
## Get gene descriptions, etc
Use data from humanmine (http://www.humanmine.org/) to map NCBI gene IDs to name, summary, symbol, uniprot
''')


## Get gene descriptions, etc
Use data from humanmine (http://www.humanmine.org/) to map NCBI gene IDs to name, summary, symbol, uniprot


In [21]:
gd = get_gene_descriptions()
report('sample:\n'+gd.head(2).to_html(index=False))
ncbi2name    = {}
ncbi2symbol  = {}
ncbi2desc    = {}
ncbi2uniprot = {}
uniprot2ncbi = {}

for i, r in gd.iterrows():
    ncbi, symbol, name, description, uniprot = list(r)
    ncbi = int(ncbi)
    ncbi2name[ncbi] = name
    ncbi2symbol[ncbi] = symbol
    ncbi2desc[ncbi] = description
    
    # ncbi <-> uniprot can be 1:many
    if ncbi2uniprot.get(ncbi) is None:
        ncbi2uniprot[ncbi] = set()
    ncbi2uniprot[ncbi].add(uniprot)
    uniprot2ncbi[uniprot] = ncbi
    
print("Done mapping gene info")

sample:
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>primaryIdentifier</th>
      <th>symbol</th>
      <th>briefDescription</th>
      <th>description</th>
      <th>proteins.uniprotAccession</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>2632</td>
      <td>GBE1</td>
      <td>1,4-alpha-glucan branching enzyme 1</td>
      <td>The protein encoded by this gene is a glycogen branching enzyme that catalyzes the transfer of alpha-1,4-linked glucosyl units from the outer end of a glycogen chain to an alpha-1,6 position on the same or a neighboring glycogen chain. Branching of the chains is essential to increase the solubility of the glycogen molecule and, consequently, in reducing the osmotic pressure within cells. Highest level of this enzyme are found in liver and muscle. Mutations in this gene are associated with glycogen storage disease IV (also known as Andersen's disease). [provided by RefSeq, Jul 2008]</td>
      <td>Q04446</td>
    </tr>
    <tr>
      <td>3248</td>
      <td>HPGD</td>
      <td>15-hydroxyprostaglandin dehydrogenase</td>
      <td>This gene encodes a member of the short-chain nonmetalloenzyme alcohol dehydrogenase protein family. The encoded enzyme is responsible for the metabolism of prostaglandins, which function in a variety of physiologic and cellular processes such as inflammation. Mutations in this gene result in primary autosomal recessive hypertrophic osteoarthropathy and cranioosteoarthropathy. Multiple transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, Mar 2009]</td>
      <td>E9PBZ2</td>
    </tr>
  </tbody>
</table>

Done mapping gene info


In [22]:
report('''
## Aggregate cell lines by lineage
<b>DepMap source file:</b> sample_info.csv
* Group all cell lines by main (parent) lineage
* If a lineage has > 1 defined sublineages, also aggregate cell lines by sublineage (eg: leukemia -> AML)
''')


## Aggregate cell lines by lineage
<b>DepMap source file:</b> sample_info.csv
* Group all cell lines by main (parent) lineage
* If a lineage has > 1 defined sublineages, also aggregate cell lines by sublineage (eg: leukemia -> AML)


In [26]:
sample_info = get_data('sample_info')
lineages = set(sample_info.lineage.dropna())

lineage2cell    = {}
sublineage2cell = {}
cell2lineage    = {}
cell2sublineage = {}
has_sub = set()

for l in lineages:
    ldf = sample_info[sample_info.lineage == l]
    subtypes = set(ldf.lineage_subtype.dropna())
    
    # cell lines for sublineage
    if len(subtypes) > 1:
        has_sub.add(l)
        for sub in subtypes:
            if l in sub:
                lname = sub
            else:
                lname = l + '_' + sub
            lname = l + '_' + sub
            sub_df = ldf[ldf.lineage_subtype == sub]

            # map sublineage to cell lines
            sublineage2cell[lname] = list(sub_df.index)
    
            # map cell lines to sub-lineage
            for cell in sublineage2cell[lname]:
                cell2sublineage[cell] = sub
                
    # all cell lines for lineage
    lineage2cell[l] = list(ldf.index)
    
    # parent lineage of each cell line
    for cell in lineage2cell[l]:
        cell2lineage[cell] = l
    

report('<pre>')
report("Number of cell lines: "+str(len(cell2lineage)))
report("Number of lineages: "+str(len(lineage2cell)))
report("Number of lineages with sub-lineages: "+str(len(has_sub)))
report("Number of sub-lineages "+str(len(sublineage2cell)))
report('</pre>')

<pre>

Number of cell lines: 1429

Number of lineages: 33

Number of lineages with sub-lineages: 16

Number of sub-lineages 61

</pre>

In [6]:
report('''
## Mutations 
<b>DepMap source file:</b> CCLE_mutations.csv
* Keep track of TCGA and COSMIC hotspot genes by lineage
* Track deleterious mutations by lineage for future reference
''')


## Mutations 
<b>DepMap source file:</b> CCLE_mutations.csv
* Keep track of TCGA and COSMIC hotspot genes by lineage
* Track deleterious mutations by lineage for future reference


In [7]:
mutations = get_data('mutations')

In [27]:
# Filter hotspot genes
hotspots = []
hotspots.append(mutations[mutations.isTCGAhotspot])
hotspots.append(mutations[mutations.isCOSMIChotspot])
hotspots = pd.concat(hotspots).drop_duplicates()

genes = set(hotspots.Entrez_Gene_Id)

lineage_hotspots = {}
sublineage_hotspots = {}

for g in genes:
    lineage_hotspots[g] = {}
    sublineage_hotspots[g] = {}
    
    gdf = hotspots[hotspots.Entrez_Gene_Id == g]
    
    for i, r in gdf.iterrows():
        cell = r.DepMap_ID
        if cell2lineage.get(cell) is not None:
            lineage = cell2lineage[cell]
            if lineage_hotspots[g].get(lineage) is None:
                lineage_hotspots[g][lineage] = 0
            lineage_hotspots[g][lineage] += 1
            
        if cell2sublineage.get(cell) is not None:
            sublineage = cell2sublineage[cell]
            if sublineage_hotspots[g].get(sublineage) is None:
                sublineage_hotspots[g][sublineage] = 0
            sublineage_hotspots[g][sublineage] += 1
print("Done mapping hotspots")
cell_lineage[cell]

Done mapping hotspots


NameError: name 'cell_lineage' is not defined

In [9]:
# Filter deleterious mutations
damaging = mutations[mutations.Variant_annotation.isin(['damaging','non-conserving'])]
damaging = damaging[damaging.isDeleterious == True]
genes = set(hotspots.Entrez_Gene_Id)


genes = set(damaging.Entrez_Gene_Id)

lineage_mutations = {}
sublineage_mutations = {}

for g in genes:
    lineage_mutations[g] = {}
    sublineage_mutations[g] = {}
    
    gdf = damaging[damaging.Entrez_Gene_Id == g]
    
    for i, r in gdf.iterrows():
        cell = r.DepMap_ID
        if cell2lineage.get(cell) is not None:
            lineage = cell2lineage[cell]
            if lineage_mutations[g].get(lineage) is None:
                lineage_mutations[g][lineage] = 0
            lineage_mutations[g][lineage] += 1
            
        if cell2sublineage.get(cell) is not None:
            sublineage = cell2sublineage[cell]
            if sublineage_mutations[g].get(sublineage) is None:
                sublineage_mutations[g][sublineage] = 0
            sublineage_mutations[g][sublineage] += 1
            
print("Done mapping deleterious mutations")

Done mapping deleterious mutations


In [10]:
report('''
## Ingest DEMETER2 Data v5 combines RNAi gene dependency data
DEMETER2 Data v5
Cancer cell line genetic dependencies estimated using the DEMETER2 model. DEMETER2 is applied to three large-scale RNAi screening datasets: the Broad Institute Project Achilles, Novartis Project DRIVE, and the Marcotte et al. breast cell line dataset. The model is also applied to generate a combined dataset of gene dependencies covering a total of 712 unique cancer cell lines.

<b>DepMap source file:</b> D2_combined_gene_dep_scores.csv 

<b>Citation:</b> Jordan G. Bryan, John M. Krill-Burger, Thomas M. Green, Francisca Vazquez, Jesse S. Boehm, Todd R. Golub, William C. Hahn, David E. Root, Aviad Tsherniak. (2018). Improved estimation of cancer dependencies from large-scale RNAi screens using model-based normalization and data integration. Nature Communications 9, 1. https://doi.org/10.1038/s41467-018-06916-5</font>

* Data source uses CCLE names rather than DepMap cell line IDS
* Translate the cell line names to IDS for consistency with other data sources
* Also deal with rows in the table with multiple gene names (eg 'GTF2IP4&GTF2IP1 (100093631&2970)')
''')


## Ingest DEMETER2 Data v5 combines RNAi gene dependency data
DEMETER2 Data v5
Cancer cell line genetic dependencies estimated using the DEMETER2 model. DEMETER2 is applied to three large-scale RNAi screening datasets: the Broad Institute Project Achilles, Novartis Project DRIVE, and the Marcotte et al. breast cell line dataset. The model is also applied to generate a combined dataset of gene dependencies covering a total of 712 unique cancer cell lines.

<b>DepMap source file:</b> D2_combined_gene_dep_scores.csv 

<b>Citation:</b> Jordan G. Bryan, John M. Krill-Burger, Thomas M. Green, Francisca Vazquez, Jesse S. Boehm, Todd R. Golub, William C. Hahn, David E. Root, Aviad Tsherniak. (2018). Improved estimation of cancer dependencies from large-scale RNAi screens using model-based normalization and data integration. Nature Communications 9, 1. https://doi.org/10.1038/s41467-018-06916-5</font>

* Data source uses CCLE names rather than DepMap cell line IDS
* Translate the cell line names to IDS for consistency with other data sources
* Also deal with rows in the table with multiple gene names (eg 'GTF2IP4&GTF2IP1 (100093631&2970)')


In [11]:
# Ingest data
d2 = get_data('d2_rnai')
print("Initial number of rows in dataframe:",d2.shape[0])
# split rows with multuple genes
cols = ['gene'] + list(d2.columns)
rows = []

print("Splitting multigene rows...")
for i, r in d2.iterrows():
    if '&' in i:
        symbols, ncbi = i.split(' ')
        symbols = symbols.split('&')
        ncbi = re.sub('\(|\)','',ncbi)
        ncbi = ncbi.split('&')
        assert len(symbols) == len(ncbi), "Length mismatch"
        for s, symbol in enumerate(symbols):
            gid = ncbi[s]
            row = [symbol+' ('+gid+')'] + list(r)
            rows.append(row)
    else:
        rows.append([i]+list(r))
d2 = pd.DataFrame(data=rows,columns=cols).set_index('gene')
print("Final number of rows in dataframe:",d2.shape[0])

Initial number of rows in dataframe: 17309
Splitting multigene rows...
Final number of rows in dataframe: 17731


In [12]:
# Map cell line names to IDs
sample_info = get_data('sample_info')
ccle_name2id = {}
for i, r in sample_info.iterrows():
    ccle_name2id[r['CCLE Name']] = i 

cell_line_names = list(d2.columns)

# Rename columns to use CCLE IDs and rows to use NCBI gene IDs
if isinstance(list(d2.index)[0],str):
    d2.columns = [ccle_name2id.get(i) or i for i in cell_line_names]
    d2.index = ncbi_gene_ids(list(d2.index))
print(d2.shape[0],"genes")
print(d2.shape[1],"cell lines")
d2.head()

17731 genes
712 cell lines


Unnamed: 0,ACH-001270,ACH-001000,ACH-001001,ACH-002319,ACH-001827,ACH-000956,ACH-000948,ACH-002320,ACH-000070,ACH-000411,...,ACH-000899,ACH-000765,ACH-000534,ACH-000762,ACH-000630,ACH-000570,ACH-001249,ACH-000097,ACH-000828,ACH-002331
1,,,0.146042,-0.190388,0.907063,-0.019331,-0.016734,0.09158,0.035023,-0.122046,...,-0.088267,0.002171,,0.120294,0.01254,0.11153,,-0.079313,-0.141559,0.214268
10,,,0.102854,0.384106,0.403192,0.001925,-0.153933,-0.317969,0.012341,-0.205077,...,-0.003747,-0.321445,,-0.003256,-0.220472,0.07346,,-0.130921,0.127358,-0.405974
100,,,0.168839,-0.1207,0.004394,-0.1887,-0.060818,-0.755058,0.12977,0.076273,...,-0.014085,0.039679,,0.076521,0.106995,0.227977,,-0.134479,0.083506,-0.404291
1000,-0.194962,-0.028171,0.063047,-0.237251,-0.017059,-0.103047,-0.04946,0.130107,0.146864,-0.126198,...,-0.073435,-0.140041,-0.154436,-0.040308,-0.078707,0.000769,-0.139126,0.047022,-0.097644,-0.062622
10000,-0.256108,0.100751,-0.008077,0.060267,-0.094749,-0.066591,0.166029,0.149969,-0.053022,0.092426,...,0.028714,-0.054628,0.450581,0.002932,0.129679,-0.072564,0.017161,0.123615,0.046846,0.125711


In [13]:
report('''
## Ingest Achilles Crispr gene dependency data
CERES data with principle components strongly related to known batch effects removed, then shifted and scaled per cell line so the median nonessential KO effect is 0 and the median essential KO effect is -1.

<b>source data:</b> Achilles_gene_effect.csv 

<b>Citation:</b> DepMap, Broad (2019): DepMap 19Q3 Public. figshare. Dataset doi:10.6084/m9.figshare.9201770.v1.
<br>
Robin M. Meyers, Jordan G. Bryan, James M. McFarland, Barbara A. Weir, ... David E. Root, William C. Hahn, Aviad Tsherniak. Computational correction of copy number effect improves specificity of CRISPR-Cas9 essentiality screens in cancer cells. Nature Genetics 2017 October 49:1779–1784. doi:10.1038/ng.3984

* Translate gene names (column labels) to NCBI IDS
* Transpose rows and columns so each cell line is a column label with vertivally stacked gene data
''')


## Ingest Achilles Crispr gene dependency data
CERES data with principle components strongly related to known batch effects removed, then shifted and scaled per cell line so the median nonessential KO effect is 0 and the median essential KO effect is -1.

<b>source data:</b> Achilles_gene_effect.csv 

<b>Citation:</b> DepMap, Broad (2019): DepMap 19Q3 Public. figshare. Dataset doi:10.6084/m9.figshare.9201770.v1.
<br>
Robin M. Meyers, Jordan G. Bryan, James M. McFarland, Barbara A. Weir, ... David E. Root, William C. Hahn, Aviad Tsherniak. Computational correction of copy number effect improves specificity of CRISPR-Cas9 essentiality screens in cancer cells. Nature Genetics 2017 October 49:1779–1784. doi:10.1038/ng.3984

* Translate gene names (column labels) to NCBI IDS
* Transpose rows and columns so each cell line is a column label with vertivally stacked gene data


In [14]:
achilles = get_data('achilles_crispr').T
genes = list(achilles.index)
achilles.index = ncbi_gene_ids(genes)
print(achilles.shape[0],"genes")
print(achilles.shape[1],"cell lines")
achilles.head()

18333 genes
625 cell lines


Unnamed: 0,ACH-000004,ACH-000005,ACH-000007,ACH-000009,ACH-000011,ACH-000012,ACH-000013,ACH-000014,ACH-000015,ACH-000017,...,ACH-001736,ACH-001737,ACH-001740,ACH-001745,ACH-001750,ACH-001765,ACH-001814,ACH-001838,ACH-001956,ACH-001957
1,0.168684,-0.068759,0.053893,0.059874,0.277165,0.008073,0.062131,0.143078,-0.09089,0.178427,...,0.154567,-0.050307,0.005125,0.208843,0.044674,0.136364,0.216507,-0.086149,-0.076893,0.05575
29974,0.089128,0.218792,0.081444,-0.011153,0.085354,0.167177,0.038687,-0.035837,0.007894,0.106952,...,0.019334,0.189813,0.349099,0.153637,0.126563,0.021261,-0.172366,0.082798,0.109464,0.004545
2,-0.196966,0.178252,-0.06017,-0.054367,0.007972,0.088705,-0.043841,0.010997,-0.18569,-0.068145,...,-0.124875,-0.07922,-0.194522,-0.134906,-0.0821,-0.107147,-0.265359,-0.147978,0.021325,0.06799
144568,-0.02126,0.15839,0.153435,0.060886,0.445843,0.307599,0.200285,0.182625,0.111529,0.109807,...,0.051671,0.100741,0.217812,0.167583,0.132673,0.076223,0.045942,0.256595,0.204297,0.199098
127550,0.038541,-0.193862,0.087362,0.039767,-0.036717,0.01544,-0.070484,-0.034048,-0.033507,-0.195903,...,-0.196632,0.164481,-0.052438,-0.130067,-0.17235,-0.116583,0.123916,-0.054596,-0.080814,-0.042784


### How many cell lines and genes are shared between D2 and Achilles gene dependency data sets?

In [15]:
d2_cells = set(d2.columns)
d2_genes = set(d2.index)
ac_cells = set(achilles.columns)
ac_genes = set(achilles.index)
report(str(len(d2_cells.intersection(ac_cells)))+" cell lines are shared")
print(str(len(d2_genes.intersection(ac_genes)))+" genes are shared")


423 cell lines are shared

16052 genes are shared


In [16]:
pi = get_pathway_info()
pi['10002']

['R-HSA-383280',
 'https://reactome.org/PathwayBrowser/#/R-HSA-383280',
 'Nuclear Receptor transcription pathway']

In [17]:
cn = get_data('copy_number').T
gd = get_gene_descriptions()
gd.head()

Unnamed: 0,primaryIdentifier,symbol,briefDescription,description,proteins.uniprotAccession
0,2632,GBE1,"1,4-alpha-glucan branching enzyme 1",The protein encoded by this gene is a glycogen...,Q04446
1,3248,HPGD,15-hydroxyprostaglandin dehydrogenase,This gene encodes a member of the short-chain ...,E9PBZ2
2,3248,HPGD,15-hydroxyprostaglandin dehydrogenase,This gene encodes a member of the short-chain ...,P15428
3,3248,HPGD,15-hydroxyprostaglandin dehydrogenase,This gene encodes a member of the short-chain ...,P15428
4,3248,HPGD,15-hydroxyprostaglandin dehydrogenase,This gene encodes a member of the short-chain ...,P15428


In [18]:
df = get_data('mfr')

df.head()
genes = set()
for p in df['gene.pair']:
    my_genes = p.split('_')
    for g in my_genes:
        genes.add(g)
len(genes)
    

15155