In [1]:
%env http_proxy=http://proxy-default:3128
%env https_proxy=http://proxy-default:3128

env: http_proxy=http://proxy-default:3128
env: https_proxy=http://proxy-default:3128


In [1]:
import random, os
import pandas as pd
import matplotlib
from matplotlib.axes import Axes
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('ticks')
%config InlineBackend.figure_format = 'retina'
# %config InlineBackend.figure_format = 'svg'

user_settings = '~/.jupyter/lab/user-settings/@jupyterlab/apputils-extension/themes.jupyterlab-settings'
if 'JupyterLab Dark' in open(os.path.expanduser(user_settings)).read() or '__vsc_ipynb_file__' in globals():
    style = {'axes.facecolor': '#111111',
     'axes.edgecolor': '.99',
     'axes.labelcolor': '.99',
     'figure.facecolor': '#111111',
     'grid.color': '.0',
     'text.color': '.99',
     'xtick.color': '.99',
     'ytick.color': '.99',
     'patch.edgecolor': 'w',
    }
    sns.set_style(style)
    sns.set_palette('pastel')

import geneinfo.utils as utils
from geneinfo.utils import GeneList as glist
import geneinfo.information as gi

def stars(p):
    return '*' * int((-np.log10(p / 5) - 1))

gene_lists = utils.GeneListCollection(google_sheet='1JSjSLuto3jqdEnnG7JqzeC_1pUZw76n7XueVAYrUOpk')
gene_lists    

| label | description |
|:---|:---|
| **all_npx** | all NPX genes |
| **neuron_genome_proteome** | Neuron proteome (whole genome) |
| **neuron_npx_proteome** | NPX neuron proteome |
| **matos_neuron** | Matos neuron genes |
| **matos_common** | Matos neuron genes expressed in spermatids |
| **hpa_brain_prot** | Brain genes form Human protein atlas |
| **meritxell_spermatid_expr** |  |
| **mult_copy** | Multi copy genes |
| **primate_ampl_multi** | Ampliconic or multicopy in some primates |
| **gametologs** | Gametologs in some primates |
| **old_cDEG** | Old cDEG |
| **cDEG** | The rerun cDEGs |
| **cDEG_alt** |  |
| **Old nDEG** | old nDEG |
| **nDEG** | nDEG |
| **xi** | Subject to somatic X inactivation (XI) |
| **xi_escape** | Escaping somatic X inactivation (XI) |
| **xi_uncertain** | cDEG and Xi escape |
| **xi_any_evidence** | allDEG and Xi escape |
| **expr_mod_xi_copynr_fibrobl** | Xi escapers in either ECH,  hama or hum-nean |
| **expr_mod_xi_copynr_lcl** | Xi escapers in ECH or hum-nean |
| **expr_mod_xi_copynr** | Xi escapers  in hama or hum-nean |
| **pure_hama** | Xi escapers  in hum-nean |
| **hum_nean_admix** | Xi escapers  in ECH90 |
| **ari_relate_EUR** | Xi escapers  in hama |
| **ari_relate_ASIA** | Xi escaping (primate)  gametologs |
| **ari_relate_AFR** | Xi escaping (primate)  gametologs in hum-nean |
| **ari_nonPUR** | Xi escaping (primate)  gametologs in hama |
| **ari_relate_PUR** | Xi escape in ari_nonPUR |
| **ari_all** | Xi escape in accel_reg_simiiformes_br |
| **candidates** | X genes uncertain XI status |
| **ech75_regions** | Union of certain  and uncertain  XI status |
| **ech90_regions** | Expression  modulated by Xi copy number (Fibroblast) |
| **accel_reg_simiiformes_br** | Expression  modulated by Xi copy number (LCL) |
| **my_primate_codeml** | Expression  modulated by Xi copy number (Fibroblast or LCL) |
| **reg_sa_pheno** |  |
| **sfari_all_conf** | Genes in 95% pure hamadryas regions in Gog |
| **intel_seiz_lang** | Human-Neanderthal introgression |
| **intelect_disabil** |  |
| **Xbrain** |  |



## Relate overlap to Sfari genes

In [21]:
gene_lists.get('sfari_all_conf') << glist(chrX_signif.gene)

0,1,2,3,4,5,6,7,8,9
AFF2,BRWD3,DDX3X,GLRA2,IL1RAPL2,MSL3,PCDH19,RLIM,SYN1,UPF3B
AGTR2,CACNA1F,DDX53,GPC4,IQSEC2,NEXMIF,PHF8,RPL10,SYP,USP9X
AP1S2,CASK,DMD,GRIA3,KDM5C,NLGN3,PJA1,RPS6KA3,TAF1,VSIG4
AR,CD99L2,FAM47A,HCFC1,KDM6A,NLGN4X,PLXNA3,SLC6A8,TBL1X,WNK3
ARHGEF9,CDKL5,FGF13,HDAC8,LAS1L,OCRL,PTCHD1,SLC7A3,TBX22,ZNF711
ARX,CHM,FMR1,HNRNPH2,MAOA,OFD1,PTCHD1-AS,SLC9A6,TFE3,
ATRX,CLCN4,FRMPD4,HUWE1,MAOB,OPHN1,RAB39B,SMC1A,TMLHE,
BCORL1,CNKSR2,GABRA3,IL1RAPL1,MECP2,PCDH11X,RHOXF1,SYAP1,TSPAN7,


In [22]:
p = gene_lists.get('sfari_all_conf').fisher(glist(chrX_signif.gene), background=all_genes_chrX)
print(f'{p:.2e} {stars(p)}')

1.41e-04 ***


In [23]:
for population, region in pop_reg_map.items():
    p = gene_lists.get('sfari_all_conf').fisher(chrX_signif_by_pop[population], background=all_genes_chrX)
    print(f"{region:<20} {population}  {p:.5f} {stars(p)}")

Africa               ASW  0.15130 
Africa               ESN  0.22544 
Africa               GWD  0.02573 *
Africa               LWK  0.01648 *
Africa               MSL  0.00274 **
Africa               YRI  0.00150 **
EastAsian            CDX  0.00225 **
EastAsian            CHB  0.00426 **
EastAsian            CHS  0.00685 *
EastAsian            JPT  0.01118 *
EastAsian            KHV  0.01274 *
SouthAsia            PJL  0.03745 *
SouthAsia            BEB  0.01110 *
SouthAsia            GIH  0.01249 *
SouthAsia            STU  0.01249 *
SouthAsia            ITU  0.01249 *
Europe               GBR  0.01323 *
Europe               FIN  0.01083 *
Europe               IBS  0.00190 **
Europe               TSI  0.00320 **
CentralSouthAmerica  MXL  0.00462 **
CentralSouthAmerica  CLM  0.00379 **
CentralSouthAmerica  PEL  0.00070 **
Caribia              PUR  0.00014 ***
Caribia              ACB  0.00014 ***


In [24]:
gene_lists.get('sfari_all_conf') << glist(chrX_signif.gene) << gene_lists.get('ech90_regions')

0,1,2,3,4,5,6,7,8,9
AFF2,BRWD3,DDX3X,GLRA2,IL1RAPL2,MSL3,PCDH19,RLIM,SYN1,UPF3B
AGTR2,CACNA1F,DDX53,GPC4,IQSEC2,NEXMIF,PHF8,RPL10,SYP,USP9X
AP1S2,CASK,DMD,GRIA3,KDM5C,NLGN3,PJA1,RPS6KA3,TAF1,VSIG4
AR,CD99L2,FAM47A,HCFC1,KDM6A,NLGN4X,PLXNA3,SLC6A8,TBL1X,WNK3
ARHGEF9,CDKL5,FGF13,HDAC8,LAS1L,OCRL,PTCHD1,SLC7A3,TBX22,ZNF711
ARX,CHM,FMR1,HNRNPH2,MAOA,OFD1,PTCHD1-AS,SLC9A6,TFE3,
ATRX,CLCN4,FRMPD4,HUWE1,MAOB,OPHN1,RAB39B,SMC1A,TMLHE,
BCORL1,CNKSR2,GABRA3,IL1RAPL1,MECP2,PCDH11X,RHOXF1,SYAP1,TSPAN7,


In [25]:
relate_or_ech90 = glist(chrX_signif.gene) | gene_lists.get('ech90_regions')

In [26]:
p, counts = gene_lists.get('sfari_all_conf').fisher(
    relate_or_ech90,
    background=all_genes_chrX,
    return_counts=True)

In [27]:
gene_lists.get('sfari_all_conf') << relate_or_ech90 << gene_lists.get('meritxell_spermatid_expr')

0,1,2,3,4,5,6,7,8,9
AFF2,BRWD3,DDX3X,GLRA2,IL1RAPL2,MSL3,PCDH19,RLIM,SYN1,UPF3B
AGTR2,CACNA1F,DDX53,GPC4,IQSEC2,NEXMIF,PHF8,RPL10,SYP,USP9X
AP1S2,CASK,DMD,GRIA3,KDM5C,NLGN3,PJA1,RPS6KA3,TAF1,VSIG4
AR,CD99L2,FAM47A,HCFC1,KDM6A,NLGN4X,PLXNA3,SLC6A8,TBL1X,WNK3
ARHGEF9,CDKL5,FGF13,HDAC8,LAS1L,OCRL,PTCHD1,SLC7A3,TBX22,ZNF711
ARX,CHM,FMR1,HNRNPH2,MAOA,OFD1,PTCHD1-AS,SLC9A6,TFE3,
ATRX,CLCN4,FRMPD4,HUWE1,MAOB,OPHN1,RAB39B,SMC1A,TMLHE,
BCORL1,CNKSR2,GABRA3,IL1RAPL1,MECP2,PCDH11X,RHOXF1,SYAP1,TSPAN7,


### considering only spermatid expressed, sfari genes are even more strongly enriched

In [28]:
(
    (gene_lists.get('sfari_all_conf') & gene_lists.get('meritxell_spermatid_expr'))
    .fisher(
        (relate_or_ech90 & gene_lists.get('meritxell_spermatid_expr')),
    background=gene_lists.get('meritxell_spermatid_expr'),
    return_counts=True)
)
print(f'{p:.2e} {stars(p)}')

5.82e-05 ***


## Also enriched among pure hama genes

In [29]:
gene_lists.get('sfari_all_conf').fisher(
    (gene_lists.get('pure_hama')),
    background=all_genes_chrX,
    return_counts=True)

(0.0016555738972285853, [[10, 55], [67, 1316]])

## Also among human_nean_genes

In [30]:
gene_lists.get('sfari_all_conf').fisher(
    (gene_lists.get('hum_nean_admix')),
    background=all_genes_chrX,
    return_counts=True)

(0.0018677807804493786, [[10, 56], [67, 1315]])