In [1]:
#| echo: false
#| output: false

%load_ext autoreload
%autoreload 2

In [7]:
import geneinfo.utils as utils

Load the gene lists from a CSV export of the Google spreadsheet:

In [9]:
gene_lists = utils.GeneListCollection('google_sheet.csv')

Individual gene lists behave like normal lists, but when displayed they render in columns to make them easier to read:

In [10]:
ech90_regions = gene_lists.get('ech90_regions')
ech90_regions

ABCB7        CNKSR2       FRMD8P1      LOC101928627 MIR500A      PGAM4        TRPC5        
ACTRT1       COX7B        FTX          LOC729609    MIR500B      PHF8         TSIX         
AKAP4        CYBB         FUNDC2       LRCH2        MIR501       PRRG1        UPRT         
ALG13        DCX          GAB3         MAGT1        MIR502       RAB39B       USP27X       
ARHGAP36     DKC1         GPC3         MAP7D2       MIR660       RAP2C        USP27X-DT    
ATP7A        DYNLT3       H2AB1        MBNL3        MIR664B      RAP2C-AS1    VBP1         
ATRX         ENOX2        HTR2C        MIR1184-1    MPP1         RLIM         WNK3         
BCLAF3       ENOX2-AS1    IL13RA2      MIR188       MSN          SERTM2       XIST         
BRCC3        EZHIP        JPX          MIR23C       MTCP1        SH3KBP1      XK           
CAPN6        F8           LANCL3       MIR362       NEXMIF       SMIM9        ZC3H12B      
CCNB3        F8A1         LAS1L        MIR374B      NUDT10       SNORA35B       

In [44]:
import os, glob
import pandas as pd
from pandas.api.types import is_object_dtype
import numpy as np
from math import isclose, floor, log10
import matplotlib.pyplot as plt
from IPython.display import Markdown, display
from pandas.api.types import is_object_dtype
from math import sqrt
from collections.abc import Callable
from typing import Any, TypeVar, List, Tuple, Dict, Union
from itertools import zip_longest
from matplotlib.patches import Rectangle
import seaborn as sns
import requests
from collections.abc import Callable
from typing import Any, TypeVar, List, Tuple, Dict, Union
import warnings
from itertools import chain
import shelve
from pathlib import Path

# Maps an abbreviated amplicon label to the individual gene names it stands for.
# GeneListCollection.expand_amplicon_abbrev looks a name up here after stripping
# its last '_'-separated component, so an entry is only matched by names of the
# form '<key>_<suffix>'.
AMPL_ABBREV_MAP = {
 'amplicon_chrX_CPXCR1': ['CPXCR1'],
 'amplicon_chrX_CSAG1/2/3': ['CSAG1', 'CSAG2', 'CSAG3'],
 # NOTE(review): this key contains '//' and the values skip CT45A4/CT45A5 --
 # confirm against the spreadsheet whether that is intentional.
 'amplicon_chrX_CT45A1/2/3//6/7/8/9/10': ['CT45A1', 'CT45A2', 'CT45A3', 'CT45A6', 'CT45A7', 'CT45A8', 'CT45A9', 'CT45A10'],
 'amplicon_chrX_CT47A1/2/3/4/5/6/7/8/9/10/11/12/B1': ['CT47A1', 'CT47A2', 'CT47A3', 'CT47A4', 'CT47A5', 'CT47A6', 'CT47A7', 'CT47A8', 'CT47A9', 'CT47A10', 'CT47A11', 'CT47A12', 'CT47B1'],
 'amplicon_chrX_CT55': ['CT55'],
 'amplicon_chrX_CT83': ['CT83'],
 'amplicon_chrX_CTAG1A/1B/2': ['CTAG1A', 'CTAG1B', 'CTAG2'],
 'amplicon_chrX_CXorf49/B': ['CXorf49', 'CXorf49B'],
 'amplicon_chrX_CXorf51A/B': ['CXorf51A', 'CXorf51B'],
 'amplicon_chrX_DDX53': ['DDX53'],
 'amplicon_chrX_DMRTC1/B/FAM236A/B/C/D': ['DMRTC1', 'DMRTC1B', 'FAM236A', 'FAM236B', 'FAM236C', 'FAM236D'],
 'amplicon_chrX_EOLA1/2/HSFX3/4': ['EOLA1', 'EOLA2', 'HSFX3', 'HSFX4'],
 'amplicon_chrX_ETD1/B/ZNF75D': ['ETD1', 'ETD1B', 'ZNF75D'],
 'amplicon_chrX_F8/F8A1/2/3/H2AB1/2/3': ['F8', 'F8A1', 'F8A2', 'F8A3', 'H2AB1', 'H2AB2', 'H2AB3'],
 'amplicon_chrX_FAM156A/B': ['FAM156A', 'FAM156B'],
 'amplicon_chrX_FAM47A/B/C': ['FAM47A', 'FAM47B', 'FAM47C'],
 'amplicon_chrX_G6PD/IKBKG': ['G6PD', 'IKBKG'],
 'amplicon_chrX_GAGE10/1/2A/13/12B/12C/12D/12E/12F/12G/12H/12J': ['GAGE10', 'GAGE1', 'GAGE2A', 'GAGE13', 'GAGE12B', 'GAGE12C', 'GAGE12D', 'GAGE12E', 'GAGE12F', 'GAGE12G', 'GAGE12H', 'GAGE12J'],
 'amplicon_chrX_HSFX1/2': ['HSFX1', 'HSFX2'],
 'amplicon_chrX_IL3RA/P2RY8/SLC25A6': ['IL3RA', 'P2RY8', 'SLC25A6'],
 'amplicon_chrX_MAGEA4': ['MAGEA4'],
 'amplicon_chrX_MAGEA12/A2/A2B/A3/A6': ['MAGEA12', 'MAGEA2', 'MAGEA2B', 'MAGEA3', 'MAGEA6'],
 'amplicon_chrX_MAGEA9/9B': ['MAGEA9', 'MAGEA9B'],
 'amplicon_chrX_MAGEB6': ['MAGEB6'],
 'amplicon_chrX_MAGEC1': ['MAGEC1'],
 'amplicon_chrX_MBTPS2/YY2': ['MBTPS2', 'YY2'],
 'amplicon_chrX_NSDHL': ['NSDHL'],
 'amplicon_chrX_NUDT10/11': ['NUDT10', 'NUDT11'],
 'amplicon_chrX_NXF2/2B/5': ['NXF2', 'NXF2B', 'NXF5'],
 'amplicon_chrX_PABPC1L2A/B': ['PABPC1L2A', 'PABPC1L2B'],
 'amplicon_chrX_PAGE2/2B/5': ['PAGE2', 'PAGE2B', 'PAGE5'],
 'amplicon_chrX_RHOXF2/B': ['RHOXF2', 'RHOXF2B'],
 'amplicon_chrX_SPACA5/B': ['SPACA5', 'SPACA5B'],
 'amplicon_chrX_SPANXA1/A2/N1/N2/N3/N4/N5/B1/C/D': ['SPANXA1', 'SPANXA2', 'SPANXN1', 'SPANXN2', 'SPANXN3', 'SPANXN4', 'SPANXN5', 'SPANXB1', 'SPANXC', 'SPANXD'],
 # NOTE(review): the '344B' segment and the 'SS344B' value do not follow the
 # pattern of the other entries; possibly meant as SSX3/SSX4/SSX4B -- TODO confirm.
 'amplicon_chrX_SSX1/2/2B/344B/5/7': ['SSX1', 'SSX2', 'SSX2B', 'SS344B', 'SSX5', 'SSX7'],
 'amplicon_chrX_SUPT20HL1/2': ['SUPT20HL1', 'SUPT20HL2'],
 'amplicon_chrX_TCEAL2/3/4/5/6': ['TCEAL2', 'TCEAL3', 'TCEAL4', 'TCEAL5', 'TCEAL6'],
 'amplicon_chrX_TCP11X1/2': ['TCP11X1', 'TCP11X2'],
 'amplicon_chrX_TEX28': ['TEX28'],
 'amplicon_chrX_TMEM185A': ['TMEM185A'],
 'amplicon_chrX_VCX/2/3A/3B': ['VCX', 'VCX2', 'VCX3A', 'VCX3B'],
 'amplicon_chrX_XAGE1A/B': ['XAGE1A', 'XAGE1B'],
 'amplicon_chrX_XAGE3': ['XAGE3'],
 'amplicon_chrX_XAGE5': ['XAGE5'],
}

class GeneListCollection(object):
    """A set of named gene lists read from a CSV file/URL or a Google Sheet.

    The table holds one gene list per column: the first row is a free-text
    description of each list, the second row is the list label, and the
    remaining rows are the gene names. Columns whose label is empty (pandas
    names them ``Unnamed: N``) are ignored.

    Attributes:
        df: DataFrame with one column per labelled gene list.
        names: list labels, in column order.
        desc: descriptions, aligned one-to-one with ``names``.

    Iterating over the collection yields the list labels.
    """

    def __init__(self, url:str=None, google_sheet:str=None, tab='Sheet1'):
        """Load the table.

        Args:
            url: path or URL of a CSV file. Takes precedence over
                ``google_sheet`` when both are given.
            google_sheet: id of a Google Sheet, read through its CSV export.
            tab: name of the sheet tab to read when ``google_sheet`` is used.

        Raises:
            AssertionError: if neither ``url`` nor ``google_sheet`` is given.
        """
        assert url or google_sheet, 'Either file/url or google_sheet id must be provided.'

        if not url:
            url = f'https://docs.google.com/spreadsheets/d/{google_sheet}/gviz/tq?tqx=out:csv&sheet={tab}'

        # Only the first row is needed for the descriptions, so do not parse
        # the whole table a second time just to read it.
        desc_row = pd.read_csv(url, header=None, nrows=1, low_memory=False).iloc[0].tolist()
        table = pd.read_csv(url, header=1, low_memory=False)

        keep = [not str(col).startswith('Unnamed') for col in table.columns]

        # Pick descriptions by column position for the kept columns only, so
        # that `desc` stays aligned with `names` even when an unlabelled column
        # sits between two labelled ones.
        self.desc = []
        for pos, wanted in enumerate(keep):
            if not wanted:
                continue
            raw = desc_row[pos] if pos < len(desc_row) else None
            if pd.isnull(raw):
                self.desc.append('')
            else:
                # str() guards against descriptions parsed as numbers.
                self.desc.append(str(raw).replace('\n', ' '))

        self.df = table.loc[:, keep]
        self.names = self.df.columns.tolist()

    def all_genes(self):
        """Return the sorted, de-duplicated gene names across all lists."""
        unique = set()
        for label in self:
            # Convert to plain str: Gene overrides __eq__, so Gene objects are
            # not reliably usable as set members.
            unique.update(map(str, self.get(label)))
        return sorted(unique)

    def cache_coord(self, max_genes=3000, assembly='hg38'):
        """Call ``gi.gene_coord`` for every list.

        Empty lists are skipped and lists longer than ``max_genes`` are
        truncated to their first ``max_genes`` entries.

        Args:
            max_genes: maximum number of genes looked up per list.
            assembly: genome assembly passed on to ``gi.gene_coord``.
        """
        for label in self:
            names = self.get(label)
            if not names:
                continue
            if len(names) > max_genes:
                # Slice the underlying list directly rather than relying on
                # GeneList.__getitem__ handling slices.
                names = GeneList(names.data[:max_genes])
            # NOTE(review): `gi` is not defined in the visible part of this
            # file; it must be available as a module-level name -- confirm.
            gi.gene_coord(names, assembly=assembly, pos_list=True)

    def expand_amplicon_abbrev(self, old_list):
        """Replace abbreviated amplicon names by the genes they stand for.

        A name is looked up in ``AMPL_ABBREV_MAP`` after removing its last
        ``_``-separated component; names without a match are kept unchanged.

        Args:
            old_list: iterable of gene-name strings.

        Returns:
            Sorted list of unique names after expansion.
        """
        new_list = []
        for gene_name in old_list:
            abbrev = gene_name.rsplit('_', 1)[0]
            if abbrev in AMPL_ABBREV_MAP:
                new_list.extend(AMPL_ABBREV_MAP[abbrev])
            else:
                new_list.append(gene_name)
        return sorted(set(new_list))

    def get(self, name):
        """Return the gene list labelled ``name`` as a sorted ``GeneList``.

        Empty cells are dropped and amplicon abbreviations are expanded.

        Raises:
            KeyError: if there is no list with that label.
        """
        values = self.df[name].dropna().tolist()
        # expand_amplicon_abbrev already returns a sorted list.
        return GeneList(self.expand_amplicon_abbrev(values))

    def _markdown_table(self):
        """Build the label/description overview as a Markdown table string."""
        def cell(value):
            # A literal '|' would otherwise be read as a column separator.
            return '' if pd.isnull(value) else str(value).replace('|', '\\|')

        out = ['| label | description |', '|:---|:---|']
        for name, desc in zip(self.names, self.desc):
            out.append(f"| **{cell(name)}** | {cell(desc)} |")
        return '\n'.join(out)

    def _repr_html_(self):
        # Renders the overview as a side effect and returns None (no HTML);
        # together with the empty __repr__ this makes the Markdown table the
        # only thing shown for the object.
        display(Markdown(self._markdown_table()))

    def __repr__(self):
        # Intentionally empty: the Markdown table replaces the plain-text repr.
        return ""

    def __iter__(self):
        """Iterate over the list labels."""
        yield from self.names


from collections import UserList, UserString

class Gene(str):
    """A gene name that compares equal to any name sharing one of its aliases.

    The name may list several aliases separated by '/', e.g. ``'AFF2 / FOO'``.
    Two genes are equal when their alias lists overlap. Plain strings are
    compared the same way, after being split on '/'.

    Attributes:
        name: the value the gene was created from.
        aliases: the '/'-separated parts of ``name`` with whitespace stripped.

    Note:
        Alias-overlap equality is not transitive ('A / B' equals 'B / C' and
        'B / C' equals 'C / D', but 'A / B' does not equal 'C / D'). Hashing
        uses the full string, so sets and dict keys distinguish genes by their
        exact text, not by alias overlap.
    """

    def __init__(self, name):
        # The string value itself is set by str.__new__; only attributes are
        # added here.
        self.name = name
        self.aliases = [x.strip() for x in name.split('/')]

    def add_aliases(self):
        """Placeholder; not implemented."""
        ...

    @staticmethod
    def _aliases_of(other):
        """Return the alias set of ``other``, or None if it has no aliases."""
        aliases = getattr(other, 'aliases', None)
        if aliases is not None:
            return set(aliases)
        if isinstance(other, str):
            # Plain strings have no `aliases` attribute; derive them the same
            # way __init__ does.
            return {part.strip() for part in other.split('/')}
        return None

    def __eq__(self, other):
        other_aliases = self._aliases_of(other)
        if other_aliases is None:
            # Let Python fall back to its default handling for foreign types.
            return NotImplemented
        return not other_aliases.isdisjoint(self.aliases)

    def __ne__(self, other):
        # str defines its own __ne__, which would otherwise be inherited and
        # disagree with the alias-based __eq__.
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def __hash__(self):
        # Overriding __eq__ removes the inherited hash; restore the string
        # hash so genes remain usable in sets and as dict keys.
        return str.__hash__(self)


class GeneList(UserList):
    """A list of gene names that displays itself in aligned columns.

    Indexing with an integer returns the entry wrapped in ``Gene``; indexing
    with a slice returns a new ``GeneList``.
    """

    def __getitem__(self, i):
        if isinstance(i, slice):
            # A slice selects several entries; wrapping the resulting list in
            # Gene would fail, so return a list of the same type instead.
            return self.__class__(self.data[i])
        return Gene(self.data[i])

    def __repr__(self):
        """Return the names laid out column by column, one text line per row.

        An empty list gives an empty string.
        """
        n = len(self.data)
        if n == 0:
            return ''
        col_width = max(map(len, self.data)) + 1
        ncols = min(max(100//col_width, 1+sqrt(n/col_width)), 150//col_width)
        # 150//col_width is 0 for names longer than 150 characters; always
        # keep at least one column to avoid dividing by zero.
        ncols = max(ncols, 1)
        nrows = int(n/ncols) + 1
        # Consecutive chunks of `nrows` names form the columns.
        columns = [self.data[start:start+nrows] for start in range(0, n, nrows)]
        lines = []
        # Transpose the columns into rows, padding short columns with ''.
        for row in zip_longest(*columns, fillvalue=''):
            lines.append(''.join(gene.ljust(col_width) for gene in row))
        return '\n'.join(lines)

    def __str__(self):
        return repr(self)

# Two entries that share the alias 'FOO' compare equal under Gene.__eq__.
gl = GeneList(['AFF2 / FOO', 'DYNLT3 / FOO'])
gl[0] == gl[1]

True

In [24]:
gl[1].aliases

AttributeError: 'str' object has no attribute 'aliases'