In [1]:
from goatools.anno.gaf_reader import GafReader
from goatools.obo_parser import OBOReader, GODag
import torch
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary

import numpy as np

## Reading gaf (annotations) and obo (terms)

In [2]:
ogaf = GafReader("../goa_human.gaf")

HMS:0:00:08.197932 606,840 annotations READ: ../goa_human.gaf 


In [3]:
obon = OBOReader("../go.obo")

In [4]:
i=0
for rec in obon:
    print(rec)
    
    i += 1
    
    if i==10:
        break

GO:0000001	mitochondrion inheritance [biological_process]
GO:0000002	mitochondrial genome maintenance [biological_process]
GO:0000003	reproduction [biological_process]
GO:0000005	obsolete ribosomal chaperone activity [molecular_function]obsolete
GO:0000006	high-affinity zinc transmembrane transporter activity [molecular_function]
GO:0000007	low-affinity zinc ion transmembrane transporter activity [molecular_function]
GO:0000008	obsolete thioredoxin [molecular_function]obsolete
GO:0000009	alpha-1,6-mannosyltransferase activity [molecular_function]
GO:0000010	trans-hexaprenyltranstransferase activity [molecular_function]
GO:0000011	vacuole inheritance [biological_process]


In [5]:
godag = GODag("../go.obo", optional_attrs={'consider', 'replaced_by'}, load_obsolete=True)

../go.obo: fmt(1.2) rel(2021-02-01) 50,515 GO Terms; optional_attrs(consider replaced_by)


In [6]:
godag['GO:0000001'].get_all_parents()

{'GO:0006996',
 'GO:0007005',
 'GO:0008150',
 'GO:0009987',
 'GO:0016043',
 'GO:0048308',
 'GO:0048311',
 'GO:0051179',
 'GO:0051640',
 'GO:0051641',
 'GO:0051646',
 'GO:0071840'}

In [7]:
godag['GO:0008150'].get_all_parents()

set()

In [8]:
godag['GO:0000001']

GOTerm('GO:0000001'):
  id:GO:0000001
  item_id:GO:0000001
  name:mitochondrion inheritance
  namespace:biological_process
  _parents: 2 items
    GO:0048311
    GO:0048308
  parents: 2 items
    GO:0048311	level-05	depth-06	mitochondrion distribution [biological_process]
    GO:0048308	level-05	depth-05	organelle inheritance [biological_process]
  children: 0 items
  level:6
  depth:7
  is_obsolete:False
  alt_ids: 0 items
  consider: 0 items
  replaced_by:

In [9]:
k=3

for i, term in godag.items():
    if term.level==k and not term.is_obsolete and term.namespace in ['biological_process', 'molecular_function']:
        print(term)

GO:0000031	level-03	depth-03	mannosylphosphate transferase activity [molecular_function]
GO:0000036	level-03	depth-04	acyl carrier activity [molecular_function]
GO:0000062	level-03	depth-09	fatty-acyl-CoA binding [molecular_function]
GO:0000149	level-03	depth-03	SNARE binding [molecular_function]
GO:0000150	level-03	depth-03	recombinase activity [molecular_function]
GO:0000166	level-03	depth-04	nucleotide binding [molecular_function]
GO:0000170	level-03	depth-03	sphingosine hydroxylase activity [molecular_function]
GO:0000212	level-03	depth-08	meiotic spindle organization [biological_process]
GO:0000226	level-03	depth-06	microtubule cytoskeleton organization [biological_process]
GO:0000278	level-03	depth-03	mitotic cell cycle [biological_process]
GO:0000279	level-03	depth-03	M phase [biological_process]
GO:0000320	level-03	depth-03	re-entry into mitotic cell cycle [biological_process]
GO:0000332	level-03	depth-06	template for synthesis of G-rich strand of telomere DNA activity [molecul

## Selecting unobsolete terms from level 3 in the right namespaces

In [10]:
def select_terms(godag, namespaces, level, take_obsolete):
    selected = {}
    
    for k, term in godag.items():
        if term.level == level and (take_obsolete or not term.is_obsolete) and term.namespace in namespaces:
            selected[k]=term
    
    return(selected)

selected_terms = select_terms(godag, ['biological_process', 'molecular_function'], 3, False)

In [11]:
selected_terms.keys()

dict_keys(['GO:0000031', 'GO:0000036', 'GO:0000062', 'GO:0000149', 'GO:0000150', 'GO:0000166', 'GO:0000170', 'GO:0000212', 'GO:0000226', 'GO:0000278', 'GO:0000279', 'GO:0000320', 'GO:0000332', 'GO:0000384', 'GO:0000386', 'GO:0000706', 'GO:0000709', 'GO:0000746', 'GO:0000747', 'GO:0000753', 'GO:0000755', 'GO:0000756', 'GO:0000758', 'GO:0000761', 'GO:0000768', 'GO:0000769', 'GO:0000900', 'GO:0000901', 'GO:0000902', 'GO:0000910', 'GO:0000913', 'GO:0000919', 'GO:0000981', 'GO:0001092', 'GO:0001094', 'GO:0001095', 'GO:0001096', 'GO:0001097', 'GO:0001098', 'GO:0001101', 'GO:0001216', 'GO:0001217', 'GO:0001402', 'GO:0001502', 'GO:0001505', 'GO:0001512', 'GO:0001525', 'GO:0001530', 'GO:0001539', 'GO:0001542', 'GO:0001543', 'GO:0001544', 'GO:0001547', 'GO:0001548', 'GO:0001549', 'GO:0001551', 'GO:0001552', 'GO:0001553', 'GO:0001554', 'GO:0001555', 'GO:0001556', 'GO:0001562', 'GO:0001565', 'GO:0001568', 'GO:0001618', 'GO:0001653', 'GO:0001659', 'GO:0001661', 'GO:0001662', 'GO:0001666', 'GO:00016

## Loading RNA-seq data and little cleaning

In [12]:
import pandas as pd

In [13]:
df = pd.read_csv('../data_RNA-seq/FPKM.txt', delimiter='\t')

In [14]:
df

Unnamed: 0,combi.5miR.R1,combi.5miR.R2,combi.5miR.R3,combi.5miR.R4,miR.A.R1,miR.A.R2,miR.A.R3,miR.A.R4,miR.B.R1,miR.B.R2,...,miR.E.R1,miR.E.R2,miR.E.R3,miR.E.R4,miR.Neg.R1,miR.Neg.R2,miR.Neg.R3,miR.Neg.R4,UHR,Gene
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.053521,0.000000,...,0.000000,0.000000,0.000000,0.037997,0.000000,0.000000,0.000000,0.000000,0.000000,3.8-1.4
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.8-1.5
2,0.934004,1.068055,0.652681,0.629475,0.767691,0.632476,0.519308,0.786472,0.609850,0.487538,...,1.014120,1.534630,0.634869,0.601324,0.828966,0.790009,0.900515,0.891860,7.304470,A1BG
3,4.544203,4.594169,5.422509,4.586262,4.341056,5.096817,4.009487,3.877099,6.897029,4.957077,...,5.037288,5.699663,4.915528,4.373311,4.759663,4.352712,5.079500,4.461291,3.355772,A1BG-AS1
4,0.000000,0.005769,0.000000,0.000000,0.000000,0.000000,0.000000,0.005664,0.000000,0.000000,...,0.012723,0.011478,0.000000,0.000000,0.000000,0.000000,0.000000,0.005420,1.611937,A1CF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31190,0.105058,0.090102,0.169902,0.136551,0.142743,0.101096,0.112652,0.126376,0.000000,0.132201,...,0.141930,0.102432,0.110177,0.083484,0.095907,0.057125,0.012603,0.084643,1.863472,ZYG11A
31191,20.462076,21.320759,19.229276,19.606094,19.516275,18.925830,17.611627,19.482250,14.331371,16.776734,...,20.399502,20.423338,20.480384,22.301466,17.224581,18.482725,15.623726,16.606289,5.618102,ZYG11B
31192,52.838730,54.408419,63.899677,51.293766,52.506885,56.304345,54.019570,57.847245,90.233705,78.800758,...,52.442046,51.397280,45.808345,35.744158,89.516294,75.222222,97.042169,78.620001,80.373700,ZYX
31193,4.961195,5.189275,5.385604,5.648418,7.584934,6.850248,8.337649,8.107763,7.228431,6.722117,...,4.651285,5.134639,4.483953,4.344500,5.848417,6.448634,6.228980,7.619931,6.166176,ZZEF1


In [15]:
df.columns

Index(['combi.5miR.R1', 'combi.5miR.R2', 'combi.5miR.R3', 'combi.5miR.R4',
       'miR.A.R1', 'miR.A.R2', 'miR.A.R3', 'miR.A.R4', 'miR.B.R1', 'miR.B.R2',
       'miR.B.R3', 'miR.B.R4', 'miR.C.R1', 'miR.C.R2', 'miR.C.R3', 'miR.C.R4',
       'miR.D.R1', 'miR.D.R2', 'miR.D.R3', 'miR.D.R4', 'miR.E.R1', 'miR.E.R2',
       'miR.E.R3', 'miR.E.R4', 'miR.Neg.R1', 'miR.Neg.R2', 'miR.Neg.R3',
       'miR.Neg.R4', 'UHR', 'Gene'],
      dtype='object')

In [16]:
df = df.drop(index=[0,1])

In [17]:
selected_genes = df['Gene'].unique()

In [18]:
ogaf.get_associations()[:14]

[ntgafobj(DB='UniProtKB', DB_ID='A0A024RBG1', DB_Symbol='NUDT4B', Qualifier=set(), GO_ID='GO:0003723', DB_Reference={'GO_REF:0000043'}, Evidence_Code='IEA', With_From={'UniProtKB-KW:KW-0694'}, NS='MF', DB_Name={'Diphosphoinositol polyphosphate phosphohydrolase NUDT4B'}, DB_Synonym={'NUDT4B'}, DB_Type='protein', Taxon=[9606], Date=datetime.date(2020, 11, 28), Assigned_By='UniProt', Extension=None, Gene_Product_Form_ID=set()),
 ntgafobj(DB='UniProtKB', DB_ID='A0A024RBG1', DB_Symbol='NUDT4B', Qualifier=set(), GO_ID='GO:0005829', DB_Reference={'GO_REF:0000052'}, Evidence_Code='IDA', With_From=set(), NS='CC', DB_Name={'Diphosphoinositol polyphosphate phosphohydrolase NUDT4B'}, DB_Synonym={'NUDT4B'}, DB_Type='protein', Taxon=[9606], Date=datetime.date(2016, 12, 4), Assigned_By='HPA', Extension=None, Gene_Product_Form_ID=set()),
 ntgafobj(DB='UniProtKB', DB_ID='A0A024RBG1', DB_Symbol='NUDT4B', Qualifier=set(), GO_ID='GO:0046872', DB_Reference={'GO_REF:0000043'}, Evidence_Code='IEA', With_From

## Dicts with final genes-go associations, and the selected terms and genes id

In [19]:
genes_go = dict()
all_go = set()
all_genes = set()

for a in ogaf.get_associations():
    if a.DB_Symbol in selected_genes and a.GO_ID in selected_terms.keys():
        if a.DB_Symbol not in genes_go:
            genes_go[a.DB_Symbol] = {a.GO_ID}
        else:
            genes_go[a.DB_Symbol].add(a.GO_ID)
    
        all_go.add(a.GO_ID)
        all_genes.add(a.DB_Symbol)

In [20]:
genes_go

{'HLA-DRA': {'GO:0002250',
  'GO:0002504',
  'GO:0016032',
  'GO:0030247',
  'GO:0042605',
  'GO:0042608'},
 'OR2A2': {'GO:0007186'},
 'OR2A25': {'GO:0007186'},
 'OR2A1': {'GO:0007186'},
 'OR2A42': {'GO:0007186'},
 'OR2A14': {'GO:0007186'},
 'GLRA4': {'GO:0004888'},
 'TTC26': {'GO:0120170', 'GO:1905198'},
 'E2F8': {'GO:0000981', 'GO:0001217', 'GO:0042802'},
 'UBA6': {'GO:0007612', 'GO:0021764', 'GO:0021766', 'GO:0060996'},
 'ESYT2': {'GO:0006897', 'GO:0031210', 'GO:0042802'},
 'UHRF1BP1L': {'GO:0062069'},
 'DAPL1': {'GO:0030154', 'GO:0097190'},
 'FEZF1': {'GO:0021772', 'GO:0043697'},
 'TMEM120B': {'GO:0034220'},
 'IRGM': {'GO:0006952', 'GO:0045087', 'GO:0061635'},
 'ANO9': {'GO:0034220'},
 'ARHGEF37': {'GO:0050790'},
 'PXDNL': {'GO:0006979', 'GO:0098869'},
 'ILVBL': {'GO:0030976'},
 'SYCE3': {'GO:0007131', 'GO:0007283'},
 'PLEKHG3': {'GO:0050790'},
 'SSC5D': {'GO:0001968',
  'GO:0006897',
  'GO:0006952',
  'GO:0043236',
  'GO:0045087'},
 'SH3PXD2B': {'GO:0030154', 'GO:0050790'},
 'MACR

In [21]:
all_genes

{'ITGB1BP2',
 'OR7G3',
 'FOXO1',
 'PLAG1',
 'ARHGAP24',
 'PTPRN',
 'TLR4',
 'CHRNA10',
 'PIGZ',
 'DESI1',
 'ANXA11',
 'GPR157',
 'ATP4B',
 'NPM3',
 'FABP3',
 'IRF2BP1',
 'FKBP6',
 'CRTC3',
 'FDPS',
 'PHF14',
 'SIGLEC10',
 'OR52D1',
 'SYCP3',
 'TAS2R16',
 'GLIS2',
 'RILPL1',
 'FANCD2',
 'SRY',
 'ALOXE3',
 'GPC2',
 'OR5V1',
 'RPS5',
 'GABRA2',
 'DRD4',
 'TAS2R39',
 'MC1R',
 'GBA',
 'CCNB1',
 'TNFRSF10C',
 'CLASP2',
 'GRM2',
 'RPTOR',
 'PSEN1',
 'SDC1',
 'NFIB',
 'HRH3',
 'ZIC5',
 'SPESP1',
 'ZNF621',
 'ZCCHC10',
 'CDK16',
 'BMP5',
 'HOXB5',
 'MCM8',
 'LIN7A',
 'RFWD3',
 'CCDC141',
 'ZNF69',
 'PRRX1',
 'NRBF2',
 'MICAL2',
 'GNG5',
 'GADD45GIP1',
 'GABRG2',
 'TRIM36',
 'NAPSA',
 'MORF4L2',
 'CYP27C1',
 'OPN4',
 'MMRN1',
 'RDH13',
 'GDF11',
 'UBE2V1',
 'C1QC',
 'CSRP1',
 'CDC42EP4',
 'MSANTD3',
 'HSFY1',
 'HABP2',
 'ERC1',
 'VPS18',
 'OR2T3',
 'CD302',
 'SBDS',
 'KIF2C',
 'M1AP',
 'SSTR4',
 'KPNA7',
 'SH3KBP1',
 'CRYZ',
 'HSD3B2',
 'ANKHD1',
 'RNF31',
 'MRC1',
 'WNT2B',
 'CXCL14',
 'SLC30A7

In [22]:
all_genes = sorted(list(all_genes))
all_go = sorted(list(all_go))

In [23]:
N_genes = len(all_genes)
N_go = len(all_go)

In [24]:
for t in genes_go['TFEB']:
    print(godag[t])

GO:0019899	level-03	depth-03	enzyme binding [molecular_function]
GO:0006914	level-03	depth-04	autophagy [biological_process]
GO:0002250	level-03	depth-03	adaptive immune response [biological_process]
GO:0006959	level-03	depth-03	humoral immune response [biological_process]
GO:0000981	level-03	depth-03	DNA-binding transcription factor activity, RNA polymerase II-specific [molecular_function]
GO:0001892	level-03	depth-05	embryonic placenta development [biological_process]
GO:0046983	level-03	depth-03	protein dimerization activity [molecular_function]


## Number of terms and genes retained

In [25]:
N_go

1434

In [26]:
N_genes

11160

## Building the autoencoder's mask

In [27]:
genes_to_index = {k:v for v,k in enumerate(all_genes)}
go_to_index = {k:v for v,k in enumerate(all_go)}

In [28]:
go_to_index

{'GO:0000036': 0,
 'GO:0000062': 1,
 'GO:0000149': 2,
 'GO:0000150': 3,
 'GO:0000166': 4,
 'GO:0000170': 5,
 'GO:0000212': 6,
 'GO:0000226': 7,
 'GO:0000278': 8,
 'GO:0000320': 9,
 'GO:0000386': 10,
 'GO:0000706': 11,
 'GO:0000768': 12,
 'GO:0000900': 13,
 'GO:0000902': 14,
 'GO:0000981': 15,
 'GO:0001094': 16,
 'GO:0001096': 17,
 'GO:0001097': 18,
 'GO:0001216': 19,
 'GO:0001217': 20,
 'GO:0001502': 21,
 'GO:0001505': 22,
 'GO:0001512': 23,
 'GO:0001525': 24,
 'GO:0001530': 25,
 'GO:0001539': 26,
 'GO:0001542': 27,
 'GO:0001543': 28,
 'GO:0001544': 29,
 'GO:0001547': 30,
 'GO:0001552': 31,
 'GO:0001553': 32,
 'GO:0001554': 33,
 'GO:0001555': 34,
 'GO:0001556': 35,
 'GO:0001568': 36,
 'GO:0001618': 37,
 'GO:0001653': 38,
 'GO:0001659': 39,
 'GO:0001661': 40,
 'GO:0001662': 41,
 'GO:0001666': 42,
 'GO:0001675': 43,
 'GO:0001704': 44,
 'GO:0001708': 45,
 'GO:0001709': 46,
 'GO:0001756': 47,
 'GO:0001759': 48,
 'GO:0001780': 49,
 'GO:0001786': 50,
 'GO:0001824': 51,
 'GO:0001825': 52,
 'G

In [29]:
mask = torch.zeros(N_genes, N_go, dtype=bool)

In [30]:
for gene, terms in genes_go.items():
    for t in terms:
        print(gene, t)
        mask[genes_to_index[gene], go_to_index[t]] = True

HLA-DRA GO:0002250
HLA-DRA GO:0042605
HLA-DRA GO:0002504
HLA-DRA GO:0030247
HLA-DRA GO:0042608
HLA-DRA GO:0016032
OR2A2 GO:0007186
OR2A25 GO:0007186
OR2A1 GO:0007186
OR2A42 GO:0007186
OR2A14 GO:0007186
GLRA4 GO:0004888
TTC26 GO:0120170
TTC26 GO:1905198
E2F8 GO:0042802
E2F8 GO:0000981
E2F8 GO:0001217
UBA6 GO:0021766
UBA6 GO:0007612
UBA6 GO:0060996
UBA6 GO:0021764
ESYT2 GO:0042802
ESYT2 GO:0006897
ESYT2 GO:0031210
UHRF1BP1L GO:0062069
DAPL1 GO:0030154
DAPL1 GO:0097190
FEZF1 GO:0043697
FEZF1 GO:0021772
TMEM120B GO:0034220
IRGM GO:0006952
IRGM GO:0061635
IRGM GO:0045087
ANO9 GO:0034220
ARHGEF37 GO:0050790
PXDNL GO:0006979
PXDNL GO:0098869
ILVBL GO:0030976
SYCE3 GO:0007283
SYCE3 GO:0007131
PLEKHG3 GO:0050790
SSC5D GO:0001968
SSC5D GO:0043236
SSC5D GO:0006952
SSC5D GO:0006897
SSC5D GO:0045087
SH3PXD2B GO:0050790
SH3PXD2B GO:0030154
MACROD2 GO:0019213
MACROD2 GO:0016798
FRMD3 GO:0008092
HFM1 GO:0003678
NBAS GO:0000149
TESPA1 GO:0005102
HMX2 GO:0000981
HMX2 GO:0030154
DENND3 GO:0008333
DENND3 

CLDN3 GO:0001666
FFAR2 GO:0019915
FFAR2 GO:0007186
MEFV GO:0042802
MEFV GO:0045087
KCNN4 GO:0006952
KCNN4 GO:0005516
KCNN4 GO:0006884
KCNN4 GO:0046541
DHX15 GO:0003724
DHX15 GO:0009636
TET3 GO:0070989
TET3 GO:0080111
FLRT2 GO:0061343
PLXNB1 GO:0004888
PLXNB1 GO:0035556
PLXNB1 GO:0043931
PLXNB1 GO:0016477
PLXNB1 GO:0008360
PLXNB1 GO:0007186
PLXNB1 GO:0032794
PJA2 GO:0045087
SIPA1L1 GO:0048167
CYB5B GO:0050790
CYB5B GO:0009055
CYB5B GO:0006805
CYB5B GO:0008047
ST8SIA3 GO:0033691
ST8SIA3 GO:0042802
CYP26A1 GO:0006766
CYP26A1 GO:0019825
CYP26A1 GO:0004497
CYP26A1 GO:0006805
PHGDH GO:0009055
PHGDH GO:0070314
NDUFS4 GO:0072593
CRX GO:0009887
CRX GO:0000981
CRX GO:0001216
CRX GO:0030154
IRAK2 GO:0035556
IRAK2 GO:0004672
PHF1 GO:0042802
MLNR GO:0007186
GPR39 GO:0007186
DYNC1LI2 GO:0042802
DYNC1LI2 GO:0007018
DYNC1LI2 GO:0045504
DYNC1LI2 GO:0051642
DYNC1LI2 GO:0000226
PSMD3 GO:0050790
HOXC11 GO:0000981
RBFOX2 GO:0008134
RBFOX2 GO:0003714
SIAH2 GO:0042752
SIAH2 GO:0003714
SPINT1 GO:0060670
SPINT

ZMPSTE24 GO:0008360
ZMPSTE24 GO:0044255
ZMPSTE24 GO:0061337
ZMPSTE24 GO:0001942
SC5D GO:0006629
CEACAM4 GO:0006909
IDH1 GO:0042802
IDH1 GO:0006979
IDH1 GO:0006099
IDH1 GO:0006740
SCO1 GO:0016531
CYP7B1 GO:0060740
ATRN GO:0009887
ATRN GO:0016477
ATRN GO:0021549
ATRN GO:0006979
ATRN GO:0009888
STAM2 GO:0016197
ALDH1L1 GO:0006730
TUSC2 GO:0006909
TUSC2 GO:0048469
GABBR2 GO:0150099
GABBR2 GO:0004888
GABBR2 GO:0007186
RASSF9 GO:0016197
RASSF9 GO:0046907
DGAT1 GO:0019915
DGAT1 GO:0042802
DGAT1 GO:0016746
SOAT2 GO:0000062
SOAT2 GO:0034383
SOAT2 GO:0016746
DHRS3 GO:0003151
DHRS3 GO:0060021
DHRS3 GO:0000166
DHRS3 GO:0009055
DHRS3 GO:0060411
DGKI GO:0035556
DGKI GO:0030168
PAK3 GO:0016358
PAK3 GO:0031295
RGS9 GO:0035556
RGS9 GO:0007186
RGS9 GO:0007212
DYSF GO:0005543
PIAS1 GO:0008022
PIAS1 GO:0019899
PIAS1 GO:0003714
PIAS1 GO:0008542
PIAS1 GO:0007283
PIAS1 GO:0019904
PIAS2 GO:0008134
DCTN3 GO:0000278
BBOX1 GO:0042802
RAD17 GO:0003689
LYZL6 GO:0007342
LYZL6 GO:0003796
CABYR GO:0048240
DNAJB5 GO:0

RAF1 GO:0035019
RAF1 GO:0030168
RAF1 GO:0004672
RAF1 GO:0034220
RAF1 GO:0001666
DNTT GO:0003912
PLA2G1B GO:0035556
PLA2G1B GO:0005102
GBA GO:0005102
GBA GO:0006914
GBA GO:0030259
GBA GO:0021694
GBA GO:0048469
GBA GO:0036473
GBA GO:0019915
GBA GO:0009268
PROC GO:0007596
ALDOA GO:0042802
ALDOA GO:0007339
ALDOA GO:0008360
ALDOA GO:0046716
ALDOA GO:0008092
ALDOA GO:0006754
CSTB GO:0008344
ANXA1 GO:0042802
ANXA1 GO:0005102
ANXA1 GO:0005543
ANXA1 GO:0042493
ANXA1 GO:0044849
ANXA1 GO:0007166
ANXA1 GO:0002250
ANXA1 GO:0031018
ANXA1 GO:0008360
ANXA1 GO:0007186
ANXA1 GO:0098609
ANXA1 GO:0006909
ANXA1 GO:0001780
ANXA1 GO:0048306
ANXA1 GO:0045087
PDGFA GO:0009887
PDGFA GO:0030036
PDGFA GO:0009611
PDGFA GO:0005518
PDGFA GO:0001525
PDGFA GO:0048286
PDGFA GO:0001942
RLN2 GO:0050790
RLN2 GO:0007565
RLN2 GO:0007186
APOB GO:0034383
APOB GO:0005543
APOB GO:0030317
APOB GO:0009615
APOB GO:0008201
APOB GO:0007283
APOB GO:0034447
CLPS GO:0032094
CLPS GO:0009617
CLPS GO:0044241
CLPS GO:0008047
CLPS GO:005079

RNASE2 GO:0003676
RNASE2 GO:0004540
RNASE2 GO:0051607
RNASE2 GO:0001530
COX8A GO:0006091
MYB GO:0000278
MYB GO:0000981
MYB GO:0001666
MYBL1 GO:0007283
MYBL1 GO:0000981
MYBL1 GO:0000278
MYBL1 GO:0030154
GAA GO:0009888
GAA GO:0043181
GAA GO:0046716
GAA GO:0002026
AR GO:0008013
AR GO:0060748
AR GO:0005102
AR GO:0019899
AR GO:0033327
AR GO:0007338
AR GO:0008134
AR GO:0060740
AR GO:0030522
AR GO:0060736
AR GO:0060749
AR GO:0000981
AR GO:0048645
AR GO:0060742
AR GO:0005497
AR GO:0007283
AR GO:0005496
RARA GO:0019899
RARA GO:0005102
RARA GO:0009755
RARA GO:0031490
RARA GO:0008134
RARA GO:0030154
RARA GO:0000981
RARA GO:0051018
RARA GO:0019904
RRAS GO:0060325
HLA-C GO:0016032
HLA-C GO:0002250
HLA-C GO:0042605
HLA-C GO:0046977
ACR GO:0007341
ACR GO:0007339
ACR GO:0007338
BCL2 GO:0042802
BCL2 GO:0072593
BCL2 GO:0042493
BCL2 GO:0003014
BCL2 GO:0006959
BCL2 GO:0007565
BCL2 GO:0031069
BCL2 GO:0002931
BCL2 GO:0006808
BCL2 GO:0031647
BCL2 GO:0035094
BCL2 GO:0001662
BCL2 GO:0006582
BCL2 GO:0009314
BCL

ERCC2 GO:0001666
ERCC2 GO:0003678
BMP7 GO:0016358
BMP7 GO:0030902
BMP7 GO:0021502
BMP7 GO:0008201
BMP7 GO:0060411
BMP7 GO:0009880
BMP7 GO:0060395
BMP7 GO:0003272
RPL35A GO:0006413
ITGB5 GO:0033627
ITGB5 GO:0005178
ITGB5 GO:0016477
ITGB5 GO:0001618
ARF4 GO:0060996
ARF4 GO:0007612
ARF4 GO:0016477
ADRA2B GO:0007565
ADRA2B GO:0030168
ADRA2B GO:0007186
RPL7 GO:0042802
RPL7 GO:0006413
EGR1 GO:0044849
EGR1 GO:1990841
EGR1 GO:0002931
EGR1 GO:0060086
EGR1 GO:0000981
EGR1 GO:0045475
EGR1 GO:0001666
VCL GO:0045294
VCL GO:0002162
VCL GO:0008013
GPX2 GO:0009055
GPX2 GO:0098869
GPX2 GO:0004602
SRD5A1 GO:0021766
SRD5A1 GO:0042493
SRD5A1 GO:0021987
SRD5A1 GO:0021854
SRD5A1 GO:0009055
SRD5A1 GO:0030154
SRD5A1 GO:0014850
SRD5A1 GO:0007530
SRD5A1 GO:0021794
LBP GO:0005102
LBP GO:0070891
LBP GO:0008228
LBP GO:0071723
LBP GO:0045087
LBP GO:0001530
NAT1 GO:0006805
GABRB1 GO:0050877
GABRB1 GO:0042391
GABRB1 GO:0030594
GABRB1 GO:0034220
GABRB1 GO:0009636
ADCYAP1 GO:0005102
ADCYAP1 GO:0007565
ADCYAP1 GO:000718

PIK3R1 GO:0007186
PIK3R1 GO:0043559
PIK3R1 GO:0030168
PIK3R1 GO:0043548
PIK3R1 GO:0043560
PIK3R1 GO:0016032
PIK3R1 GO:0005158
ITPKB GO:0005516
ITPKB GO:0035726
ITPKB GO:0007166
PSMB8 GO:0016032
PSMB9 GO:0016032
HLA-DMA GO:0002250
HLA-DMB GO:0002250
POU1F1 GO:0000981
POU1F1 GO:0021984
PSMB4 GO:0016032
PSMB4 GO:0001530
PSMB6 GO:0016032
PSMB5 GO:0016032
PSMB5 GO:0008233
PSMB5 GO:0006979
GSTM2 GO:0019899
GSTM2 GO:0005102
GSTM2 GO:0043295
GSTM2 GO:0005504
GSTM2 GO:0098869
GSTM2 GO:0070458
GSTM2 GO:0018916
GSTM2 GO:0004602
HTR1D GO:0051378
HTR1D GO:0050795
HTR1D GO:0040012
HTR1D GO:0007186
HTR1D GO:0030594
HTR1B GO:0051378
HTR1B GO:0046849
HTR1B GO:0050795
HTR1B GO:0007186
HTR1B GO:0042756
HTR1B GO:0030594
HTR2A GO:0001659
HTR2A GO:0042802
HTR2A GO:0007613
HTR2A GO:0051378
HTR2A GO:0001965
HTR2A GO:0042493
HTR2A GO:0007210
HTR2A GO:0001618
HTR2A GO:0071886
HTR2A GO:0007186
HTR2A GO:0030594
HTR2A GO:0048148
ABCD3 GO:0043621
ABCD3 GO:0042493
ABCD3 GO:0005324
TMOD1 GO:0051015
TMOD1 GO:0008344
L

LIFR GO:0007166
LRPPRC GO:0051015
RPL35 GO:0006413
WAS GO:0042802
WAS GO:0007596
WAS GO:0030048
WAS GO:0006952
WAS GO:0050790
WAS GO:0016197
CDKN2A GO:0007050
CDKN2B GO:0007050
CDKN2B GO:0031668
CDKN2C GO:0007050
PRCP GO:0007597
PRCP GO:0097009
CXCL5 GO:0042802
CXCL5 GO:0006935
CXCL5 GO:0007186
NSG1 GO:0016197
NSG1 GO:0099003
NSG1 GO:0005102
NSG1 GO:0007212
HTT GO:0042802
HTT GO:0002039
HTT GO:0005522
HTT GO:0045505
HTT GO:0031072
HTT GO:0044325
ECE1 GO:0016486
ECE1 GO:0042447
ECE1 GO:0017046
MTHFR GO:0042493
MTHFR GO:0001666
MTHFR GO:0046500
SLC1A1 GO:0042802
SLC1A1 GO:0015183
SLC1A1 GO:0140010
SLC1A1 GO:0001662
SLC1A1 GO:0010842
SLC1A4 GO:0015183
GDF5 GO:0009612
GDF5 GO:0060395
GDF5 GO:0042802
GDF5 GO:0043932
PAFAH1B1 GO:0042802
PAFAH1B1 GO:0030036
PAFAH1B1 GO:0001675
PAFAH1B1 GO:0021766
PAFAH1B1 GO:0021987
PAFAH1B1 GO:0021819
PAFAH1B1 GO:0008201
PAFAH1B1 GO:0031023
PAFAH1B1 GO:0051219
PAFAH1B1 GO:0000226
PAFAH1B1 GO:0019226
PAFAH1B1 GO:0007281
PAFAH1B1 GO:0070840
PAFAH1B1 GO:0008344

JAK3 GO:0002250
JAK3 GO:0045087
JAK3 GO:0035556
DGKE GO:0035556
DGKE GO:0030168
POLR2H GO:0035019
POLR2J GO:0035019
POLR2J GO:0046983
MAP2K6 GO:0007050
MAP2K6 GO:0002931
MAP2K6 GO:0042493
ARHGDIB GO:0071461
AGFG1 GO:0007283
AGFG1 GO:0030154
STAT2 GO:0042802
STAT2 GO:0006952
STAT2 GO:0000981
STAT2 GO:0051607
STAT2 GO:0016032
GTF2A1 GO:0008134
GTF2A2 GO:0016032
GTF2A2 GO:0008134
MSH6 GO:0016032
MSH6 GO:0019899
KIF11 GO:0007051
KIF11 GO:0000278
KIF11 GO:0007018
VAV2 GO:0001525
VAV2 GO:0030168
VAV2 GO:0016477
VAV2 GO:0007186
CHN2 GO:0035556
HK2 GO:0002931
HK2 GO:0001666
EFNB2 GO:0001618
STC1 GO:0042802
STC1 GO:0046697
STC1 GO:0035988
DGKQ GO:0035556
DGKQ GO:0030168
DGKQ GO:0007186
NDST1 GO:0030900
NDST1 GO:0019213
NDST1 GO:0030901
NDST1 GO:0003279
THOP1 GO:0042277
AKR1C2 GO:0044597
AKR1C2 GO:0016229
AKR1C2 GO:0007186
AKR1C2 GO:0044598
CAPZA1 GO:0051015
CAPZA1 GO:0030036
CAPZA1 GO:0007596
CAPZA1 GO:0045087
HMGA2 GO:0003906
HMGA2 GO:0040008
HMGA2 GO:0003714
HMGA2 GO:0003131
HMGA2 GO:0008134


PRKDC GO:0045087
PRKDC GO:0008134
PRKDC GO:0004672
PRKDC GO:0001756
PRKDC GO:0035234
PRKDC GO:0019904
PRKDC GO:0042752
ADAM17 GO:0033627
ADAM17 GO:0042493
ADAM17 GO:0005178
ADAM17 GO:0002467
ADAM17 GO:0008233
ADAM17 GO:0001666
SRPX GO:0006914
ARG2 GO:0002250
ARG2 GO:0045087
BTG2 GO:0009612
BTG2 GO:0021542
BTG2 GO:0051602
BTG2 GO:0003714
ELF3 GO:0000981
ELF3 GO:0060056
ELF3 GO:0030154
ELF3 GO:0001824
NTHL1 GO:0019104
NTHL1 GO:0003906
IL13RA1 GO:0019955
IL13RA1 GO:0007166
CCL20 GO:0006935
CCL20 GO:0007186
MAP1A GO:0008093
MAP1A GO:0007613
MAP1A GO:0048167
MAP1A GO:0070050
MAP1A GO:0000226
MAP1A GO:0016358
CRADD GO:0097190
PHEX GO:0019637
PHEX GO:0030282
ADARB1 GO:0045087
ADARB1 GO:0051607
CCL8 GO:0006887
CCL8 GO:0006935
CCL8 GO:0009615
CCL8 GO:0007186
CCL8 GO:0008201
CCL8 GO:0004672
CCL7 GO:0008201
CCL7 GO:0006935
CCL7 GO:0008360
CCL7 GO:0007186
GPLD1 GO:0017080
CXCL6 GO:0008201
CXCL6 GO:0006935
CXCL6 GO:0007186
LCN2 GO:0019730
LCN2 GO:0042802
LCN2 GO:0097577
LCN2 GO:0045087
IFI35 GO:004

TBX2 GO:0036302
TBX2 GO:0060560
TBX2 GO:0007569
ERCC8 GO:0006979
ERCC8 GO:0003678
DNAJC3 GO:0051787
DNAJC3 GO:0051087
DNAJC3 GO:0034975
DNAJC3 GO:0051607
PAPPA GO:0007565
GRIN2B GO:0048167
GPS2 GO:0003713
GPS2 GO:0016032
GPS2 GO:0030332
GPS2 GO:0003714
CHIT1 GO:0044245
CHIT1 GO:0008061
CHIT1 GO:0009617
MAP3K1 GO:0004672
PRKG2 GO:0042802
PRKG2 GO:0004672
KLRD1 GO:0004888
KLRD1 GO:1990405
KLRD1 GO:0045087
KLRD1 GO:0007166
SRSF9 GO:0009636
SRSF9 GO:0019904
NOG GO:0003151
NOG GO:0035019
NOG GO:0003149
NOG GO:0019955
NOG GO:0060325
GRM1 GO:0007186
MAD2L1 GO:0042802
MAD2L1 GO:0008022
PTGDR GO:0007186
TRIM28 GO:0003714
TRIM28 GO:1990841
TRIM28 GO:0060669
TRIM28 GO:0004672
TRIM28 GO:0003713
TRIM28 GO:0016032
TRIM28 GO:0045087
DHRS2 GO:0009636
SEMA3F GO:0021675
G3BP1 GO:0033677
G3BP1 GO:0003724
G3BP1 GO:0032606
G3BP1 GO:0051607
G3BP1 GO:0016032
G3BP1 GO:0045087
G3BP1 GO:0003678
NR5A1 GO:0019899
NR5A1 GO:0005543
NR5A1 GO:0001553
NR5A1 GO:0030522
NR5A1 GO:0030154
NR5A1 GO:0007530
NR5A1 GO:0000981

BAK1 GO:0002352
AANAT GO:0071889
NTRK2 GO:0007612
NTRK2 GO:0021987
NTRK2 GO:0005030
NFE2 GO:0047485
NFE2 GO:0000981
NFE2 GO:0007596
STX1A GO:0042802
STX1A GO:0006887
STX1A GO:0009629
STX1A GO:0000149
STX1A GO:0005484
STX1A GO:0043008
STX1A GO:0047485
STX1A GO:0048306
STX1A GO:0032940
STX1A GO:0019904
STX1A GO:0044325
OCLN GO:0019904
MEA1 GO:0007283
MEA1 GO:0030154
SRSF7 GO:0019904
CPSF6 GO:1990448
POU2AF1 GO:0003713
POU2AF1 GO:0006959
SMN1 GO:0042802
DBN1 GO:0010644
DBN1 GO:0061351
DBN1 GO:0005522
DBN1 GO:0010643
DBN1 GO:0032507
DBN1 GO:0051015
PTGIS GO:0004497
PTGIS GO:0097190
PTGIS GO:0016705
NFIL3 GO:0000981
TBR1 GO:0042802
TBR1 GO:0001661
TBR1 GO:0021987
TBR1 GO:0021764
TBR1 GO:0001708
TBR1 GO:0030902
TBR1 GO:0000981
MOG GO:0005102
MOG GO:0001618
PDK4 GO:0072593
PDK4 GO:0004672
PDK4 GO:0045124
PDK4 GO:0042594
FSCN1 GO:0030036
FSCN1 GO:0051015
FSCN1 GO:0016477
GUCA2B GO:0007588
CCL15 GO:0008201
CCL15 GO:0006935
CCL15 GO:0005102
CCL15 GO:0007186
HIF1A GO:0003151
HIF1A GO:0002534
HIF1

FGD6 GO:0008360
ESPNL GO:0051015
TOM1L2 GO:0030276
TTLL10 GO:0070735
PAQR9 GO:0005496
ASB18 GO:0035556
SYDE1 GO:0016477
MICALCL GO:0030036
MICALCL GO:0007283
MICALCL GO:0030154
BBS12 GO:0042755
NEK10 GO:0004672
CYP4V2 GO:0004497
TMTC3 GO:0004169
XIRP1 GO:0051015
USP49 GO:0042393
USP51 GO:0042393
USP45 GO:0016477
USP45 GO:0003407
UNC13D GO:0002432
UNC13D GO:0006909
UNC13D GO:0002467
UNC13D GO:0051607
CREB3L2 GO:0030968
CREB3L2 GO:0009611
CREB3L2 GO:0000981
ZNF365 GO:0021687
PREX2 GO:0007186
PREX2 GO:0008344
HTR3D GO:0004888
HTR3D GO:0007210
HTR3D GO:0050877
HTR3D GO:0042391
HTR3D GO:0030594
HTR3D GO:0034220
KCTD1 GO:0042802
KCTD1 GO:0008134
KCTD1 GO:0003714
ILDR2 GO:0030154
SLC24A5 GO:0034220
MED25 GO:0008134
TUBA1A GO:0042802
TUBA1A GO:0000278
TUBA1A GO:0019904
TUBA1A GO:0000226
RPS27L GO:0008494
CBLL1 GO:0042802
CBLL1 GO:0098609
PGAP1 GO:0009880
PGAP1 GO:0016788
ANO5 GO:0034220
ANO5 GO:0046983
UTS2B GO:0008217
UTS2B GO:0007186
SSH2 GO:0030036
SSH2 GO:0004721
ADAMTS13 GO:0005178
ADAMTS

GRPEL2 GO:0050790
GRPEL2 GO:0051082
GRPEL2 GO:0051087
SCAMP5 GO:0006887
GADD45GIP1 GO:0016032
WDR48 GO:0016032
WDR48 GO:0007283
WDR48 GO:0042769
WDR48 GO:0007338
VSTM2A GO:0042802
VSTM2A GO:0030154
EXOC6 GO:0006887
EXOC6 GO:0006904
OLIG1 GO:0000981
OLIG1 GO:0046983
ERMN GO:0051015
ERMN GO:0008360
SMARCC2 GO:0003713
SMARCC2 GO:0042393
UHMK1 GO:0043021
UHMK1 GO:0007050
FGFBP3 GO:0008201
FGFBP3 GO:0019838
NEIL3 GO:0019104
NEIL3 GO:0003906
NEIL3 GO:1904931
NKX2-3 GO:0000981
NKX2-3 GO:0030154
CYP2W1 GO:0006805
CYP2W1 GO:0006082
STOML3 GO:0034220
OSR1 GO:0060021
OSR1 GO:0072111
OSR1 GO:0030154
OSR1 GO:0000981
OSR1 GO:0009790
CMTM2 GO:0006935
SPATA20 GO:0007283
SPATA20 GO:0005975
SPATA20 GO:0030154
RIN3 GO:0006897
DEPTOR GO:0035556
PRR7 GO:0002250
NDNF GO:0008201
NDNF GO:0001525
NDNF GO:0005539
NDNF GO:0002931
GOLGA5 GO:0048193
LNX1 GO:0042802
SHKBP1 GO:0042802
UBA3 GO:0042802
CNIH3 GO:0042391
NANP GO:0005975
FEZF2 GO:0016358
FEZF2 GO:0043697
FEZF2 GO:0021542
ZCCHC10 GO:0003676
DNAJB14 GO:001

PLD4 GO:0006909
PLD4 GO:0045087
ZBTB9 GO:0042802
DHX58 GO:0003724
DHX58 GO:0009617
DHX58 GO:0009615
DHX58 GO:0051607
DHX58 GO:0016032
DHX58 GO:0045087
FGGY GO:0070050
GALM GO:0005975
SYTL4 GO:0006887
SYTL4 GO:0071985
SYTL4 GO:0005543
ULK4 GO:0000226
ROPN1L GO:0042802
ROPN1L GO:0048240
ROPN1L GO:0030317
DCPS GO:0042802
BIRC7 GO:0002088
BIRC7 GO:0019899
RHBDF1 GO:0016477
RHBDF1 GO:0019838
PPCDC GO:0042802
CHMP4C GO:0016197
TIFA GO:0045087
CTHRC1 GO:0017147
CTHRC1 GO:0016477
CTHRC1 GO:0043932
GPR146 GO:0007186
ZNF653 GO:0008134
ZNF653 GO:0003714
GRAMD1A GO:0006914
TLCD1 GO:0097035
HAUS1 GO:0007098
OPTN GO:0042802
OPTN GO:0008022
OPTN GO:0006914
OPTN GO:0016032
OPTN GO:0045087
AP2M1 GO:0006897
AP2M1 GO:0034383
AP2M1 GO:0044325
TUBGCP3 GO:0000278
TUBGCP3 GO:0007338
KCTD12 GO:0042802
RCN3 GO:0032964
RASD2 GO:0031681
RASD2 GO:0043548
ORAI1 GO:0005516
ORAI1 GO:0002250
ORAI1 GO:0042802
HAVCR1 GO:0001786
HAVCR1 GO:0001618
R3HDM4 GO:0003676
HVCN1 GO:0042802
HVCN1 GO:0009268
HVCN1 GO:0045454
EID2B

FAIM2 GO:0021549
FAIM2 GO:0021680
FAIM2 GO:0021681
FAIM2 GO:0002931
CHID1 GO:0070492
CHID1 GO:0005975
CHID1 GO:0045087
CHID1 GO:0008061
SPATA9 GO:0007283
SPATA9 GO:0030154
CDADC1 GO:0061676
APOL5 GO:0006629
GATA5 GO:0045165
GATA5 GO:0000981
GATA5 GO:0007596
GTPBP2 GO:0042802
RBM24 GO:0030154
BRIP1 GO:0051026
BRIP1 GO:0003724
BRIP1 GO:1990918
BRIP1 GO:0007284
BRIP1 GO:0009636
BRIP1 GO:0003678
SORBS1 GO:0005158
SORBS1 GO:0008092
JAM3 GO:0005178
JAM3 GO:0002250
JAM3 GO:0002523
JAM3 GO:0030010
JAM3 GO:0098609
JAM3 GO:0001525
JAM3 GO:0001780
JAM3 GO:0019226
HINT2 GO:0000166
TM2D1 GO:0007186
TM2D1 GO:0097190
STRA6 GO:0060322
STRA6 GO:0001568
STRA6 GO:0061038
STRA6 GO:0061029
STRA6 GO:0060323
STRA6 GO:0060900
STRA6 GO:0034632
STRA6 GO:0007612
STRA6 GO:0060325
STRA6 GO:0061205
STRA6 GO:0048286
TRPM6 GO:0009636
PLVAP GO:0042802
SUCNR1 GO:0007186
TSSK6 GO:0035092
TSSK6 GO:0035556
LGR4 GO:0004888
LGR4 GO:0046849
LGR4 GO:0009755
LGR4 GO:0030282
LGR4 GO:0007283
LGR4 GO:0016500
LGR4 GO:0045087
LGR4 

TLR8 GO:0004888
TLR8 GO:0038187
TLR8 GO:0009615
TLR8 GO:0051607
TLR8 GO:0045087
MXRA5 GO:0071559
SPHK2 GO:0032635
SPHK2 GO:0032616
SPHK2 GO:0002367
PDGFC GO:0009887
SLC17A5 GO:0009617
EIF4ENIF1 GO:0005049
EIF4ENIF1 GO:0031047
CHST12 GO:0050656
SPTBN5 GO:0042802
SPTBN5 GO:0030036
SPTBN5 GO:0008022
SPTBN5 GO:0007041
SPTBN5 GO:0045505
SPTBN5 GO:0002046
SPTBN5 GO:0043621
SPTBN5 GO:0030507
SPTBN5 GO:0051015
FBXO8 GO:0050790
PICK1 GO:0042802
PICK1 GO:0008022
PICK1 GO:0005102
PICK1 GO:0019899
PICK1 GO:0005543
PICK1 GO:0140090
PICK1 GO:0043046
PICK1 GO:0071933
PICK1 GO:0051015
PICK1 GO:0019904
DUOX2 GO:0042446
DUOX2 GO:0042335
DUOX2 GO:0009615
DUOX2 GO:0006952
DUOX2 GO:0006979
DUOX2 GO:0098869
DUOX1 GO:0042446
DUOX1 GO:0042335
DUOX1 GO:0006952
DUOX1 GO:0006979
DUOX1 GO:0098869
TSHZ2 GO:0000981
SH2B1 GO:0035556
SH2B1 GO:0007596
CTPS2 GO:0042802
POLE3 GO:0031490
POLE3 GO:0032201
PRTFDC1 GO:0000166
SMYD2 GO:0002039
AAAS GO:0016032
AAAS GO:0007612
DISC1 GO:0051602
DISC1 GO:0000226
CCL28 GO:0006935

CLEC4E GO:0002221
CLEC4E GO:0045087
APLN GO:0042802
APLN GO:0005102
APLN GO:0007186
APLN GO:0042756
APLN GO:0001525
APLN GO:0016032
APLN GO:0002026
STAP1 GO:0035591
STAP1 GO:0005543
PYCARD GO:0042802
PYCARD GO:0002218
PYCARD GO:0019899
PYCARD GO:0031647
PYCARD GO:0046983
PYCARD GO:0051607
PYCARD GO:0045087
PYCARD GO:0044325
MMP17 GO:0050790
MMP17 GO:0030574
MMP17 GO:0008047
PADI4 GO:0042802
PADI4 GO:0045087
HPCAL4 GO:0008022
HPCAL4 GO:0019904
EPDR1 GO:0042802
EPDR1 GO:0005543
HHLA2 GO:0031295
HHLA2 GO:0005102
NOTCH3 GO:0042802
NOTCH3 GO:0050793
MYO6 GO:0005516
MYO6 GO:0030048
MYO6 GO:0006897
MYO6 GO:0051015
PLAGL1 GO:0007050
ALK GO:0042802
ALK GO:0004704
ALK GO:0021766
ALK GO:0097009
ALK GO:0090648
ALK GO:0036269
SPATA2 GO:0007283
COL17A1 GO:0030020
ICAM5 GO:0098609
ICAM5 GO:0006909
ICAM5 GO:0005178
BARX2 GO:0000981
BARX2 GO:0001502
CAPN11 GO:0008233
TBX20 GO:0035922
TBX20 GO:0003279
TBX20 GO:0001708
TBX20 GO:0036306
TBX20 GO:0000981
TBX20 GO:0003207
TBX20 GO:0003272
PPT2 GO:0008474
PP

MAP3K6 GO:0033554
MAP3K6 GO:0004672
POTEKP GO:0098973
TMEFF1 GO:0016358
TMEFF1 GO:0009887
TMEFF1 GO:0009888
TMEFF1 GO:0005102
CRYBB1 GO:0002088
MT1F GO:0010273
SAMD7 GO:0042393
PPP1R1C GO:0035556
BBS7 GO:0008104
BBS7 GO:0046907
ZNF713 GO:0000981
WFDC11 GO:0045087
GYG1 GO:0016757
ZNF133 GO:0000981
ZNF777 GO:0000981
ZNF75D GO:0000981
ZNF425 GO:0000981
NPEPPS GO:0042277
GLIS3 GO:0000981
WDR5B GO:0042393
ZNF566 GO:0000981
GNB2 GO:0031682
GNB2 GO:0007186
ZNF826P GO:0000981
ZNF675 GO:0000981
ZNF575 GO:0000981
TUBA3D GO:0000278
TUBA3D GO:0000226
CC2D1B GO:0000981
SLC25A22 GO:0015183
SLC25A22 GO:0006810
SERP2 GO:0030968
NUDT17 GO:0006742
AMOTL2 GO:0030036
AMOTL2 GO:0001525
DNLZ GO:0051087
LAMB1 GO:0009887
LAMB1 GO:0005178
LAMB1 GO:0009888
LAMB1 GO:0016477
ZNF765 GO:0000981
FOXI3 GO:0000981
FOXI3 GO:0030154
ATP9B GO:0006897
OTOF GO:0016082
OTOF GO:0035612
TNNC2 GO:0048306
TNNC2 GO:0051015
COQ10B GO:0048039
ZRANB2 GO:0001530
ZSCAN5C GO:0000981
CAMKK1 GO:0005516
CAMKK1 GO:0035556
OR5L1 GO:0007186

## Cleaning the dataframe

In [31]:
df

Unnamed: 0,combi.5miR.R1,combi.5miR.R2,combi.5miR.R3,combi.5miR.R4,miR.A.R1,miR.A.R2,miR.A.R3,miR.A.R4,miR.B.R1,miR.B.R2,...,miR.E.R1,miR.E.R2,miR.E.R3,miR.E.R4,miR.Neg.R1,miR.Neg.R2,miR.Neg.R3,miR.Neg.R4,UHR,Gene
2,0.934004,1.068055,0.652681,0.629475,0.767691,0.632476,0.519308,0.786472,0.609850,0.487538,...,1.014120,1.534630,0.634869,0.601324,0.828966,0.790009,0.900515,0.891860,7.304470,A1BG
3,4.544203,4.594169,5.422509,4.586262,4.341056,5.096817,4.009487,3.877099,6.897029,4.957077,...,5.037288,5.699663,4.915528,4.373311,4.759663,4.352712,5.079500,4.461291,3.355772,A1BG-AS1
4,0.000000,0.005769,0.000000,0.000000,0.000000,0.000000,0.000000,0.005664,0.000000,0.000000,...,0.012723,0.011478,0.000000,0.000000,0.000000,0.000000,0.000000,0.005420,1.611937,A1CF
5,53.892231,49.422126,60.042042,50.773251,69.956050,54.829392,52.503341,51.063678,142.750765,156.441116,...,83.703422,86.594910,94.077634,80.480998,168.341917,137.629276,121.028086,138.916813,186.235958,A2M
6,7.268809,7.516039,8.129149,7.244547,6.225721,6.035746,5.472482,5.725775,4.736538,5.292596,...,5.294046,5.776157,5.459804,5.278308,4.261017,4.936515,4.627150,4.675615,0.950360,A2M-AS1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31190,0.105058,0.090102,0.169902,0.136551,0.142743,0.101096,0.112652,0.126376,0.000000,0.132201,...,0.141930,0.102432,0.110177,0.083484,0.095907,0.057125,0.012603,0.084643,1.863472,ZYG11A
31191,20.462076,21.320759,19.229276,19.606094,19.516275,18.925830,17.611627,19.482250,14.331371,16.776734,...,20.399502,20.423338,20.480384,22.301466,17.224581,18.482725,15.623726,16.606289,5.618102,ZYG11B
31192,52.838730,54.408419,63.899677,51.293766,52.506885,56.304345,54.019570,57.847245,90.233705,78.800758,...,52.442046,51.397280,45.808345,35.744158,89.516294,75.222222,97.042169,78.620001,80.373700,ZYX
31193,4.961195,5.189275,5.385604,5.648418,7.584934,6.850248,8.337649,8.107763,7.228431,6.722117,...,4.651285,5.134639,4.483953,4.344500,5.848417,6.448634,6.228980,7.619931,6.166176,ZZEF1


In [32]:
df = df.loc[df["Gene"].isin(all_genes)]

In [33]:
all_genes[-10:]

['ZSCAN4',
 'ZSCAN5A',
 'ZSCAN5B',
 'ZSCAN5C',
 'ZSCAN9',
 'ZSWIM2',
 'ZWINT',
 'ZXDA',
 'ZXDC',
 'ZYX']

In [34]:
mask = torch.cat((mask, torch.ones(N_genes,100, dtype=bool)), 1)

# Autoencoder

In [35]:
class MaskedLinear(nn.Module):
    def __init__(self, n_input, n_output, mask, has_bias=False, activation='identity'):
        super().__init__()
        self.n_input = n_input
        self.n_output = n_output
        
        self.W = torch.nn.Parameter(torch.randn(n_input, n_output))
        self.mask = mask
        
        activ_dict = {'sigmoid': torch.nn.Sigmoid, 'identity': torch.nn.Identity, 'tanh': torch.nn.Tanh}
        self.activation = activ_dict[activation]()

    def forward(self, X):
        prod = torch.matmul(X, self.W*self.mask)
        #prod = self.W*self.mask
        
        return(self.activation(prod))


In [36]:
clf = MaskedLinear(10,10,torch.rand(10,10)<0.5)

In [37]:
clf.forward(torch.rand(1,30,10))

tensor([[[-1.9208e-01,  0.0000e+00, -3.7600e-01, -1.5855e+00, -7.6853e-01,
          -3.1818e-03, -3.4554e-01,  6.2871e-01, -3.0355e-01,  1.6808e+00],
         [-1.5202e-01,  0.0000e+00, -2.0552e-02, -2.6534e+00,  3.0847e-02,
           6.9758e-01, -5.5151e-01,  1.1481e+00,  4.5833e-02,  1.9180e+00],
         [-8.0046e-02,  0.0000e+00,  4.2811e-01, -2.8545e+00,  2.0163e+00,
           2.6832e-01, -1.5659e+00,  8.6116e-01,  5.9222e-01,  1.7637e+00],
         [-3.4132e-01,  0.0000e+00,  5.7969e-01, -1.7040e+00,  1.2340e+00,
          -1.1017e+00, -2.1042e+00,  4.1037e-01, -6.2394e-01,  2.2046e+00],
         [-9.3146e-02,  0.0000e+00, -1.8324e-01, -1.2103e+00,  1.8269e+00,
          -1.5304e+00, -1.2466e+00,  8.0692e-01, -4.3516e-01,  2.4828e+00],
         [ 1.7106e-01,  0.0000e+00,  9.1507e-01, -3.5462e+00, -6.9104e-02,
          -4.8959e-02, -1.5741e+00,  6.4660e-01,  1.1923e+00,  1.6364e+00],
         [ 2.1481e-01,  0.0000e+00,  5.9202e-01, -2.1493e+00,  1.4023e-01,
          -8.1345e-

In [38]:
class GeneAutoEncoder(nn.Module):
    def __init__(self, n_genes, n_dense, mask, activation='tanh'):
        super().__init__()
        
        self.mask = mask
        self.mask_t = torch.transpose(mask, 1, 0)
        
        self.N0 = mask.size()[0]
        self.N1 = mask.size()[1]
        self.N2 = n_dense
        
        self.encoder = nn.Sequential(MaskedLinear(self.N0, self.N1, self.mask, activation='tanh'), nn.Linear(self.N1, self.N2), nn.Tanh())
        
        self.decoder = nn.Sequential(nn.Linear(self.N2, self.N1), nn.Tanh(), MaskedLinear(self.N1, self.N0, self.mask_t, activation='tanh'))
        
    def forward(self, features):
        encoded = self.encoder(features)
        
        decoded = self.decoder(encoded)
        
        return(decoded)


In [39]:
ae = GeneAutoEncoder(N_genes, 100, mask, activation='tanh')

In [40]:
summary(ae, (1,N_genes))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
              Tanh-1              [-1, 1, 1534]               0
      MaskedLinear-2              [-1, 1, 1534]               0
            Linear-3               [-1, 1, 100]         153,500
              Tanh-4               [-1, 1, 100]               0
            Linear-5              [-1, 1, 1534]         154,934
              Tanh-6              [-1, 1, 1534]               0
              Tanh-7             [-1, 1, 11160]               0
      MaskedLinear-8             [-1, 1, 11160]               0
Total params: 308,434
Trainable params: 308,434
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.04
Forward/backward pass size (MB): 0.22
Params size (MB): 1.18
Estimated Total Size (MB): 1.44
----------------------------------------------------------------


In [41]:
list(clf.parameters())

[Parameter containing:
 tensor([[-3.8902e-01,  1.4854e+00, -6.0528e-01,  5.2005e-02,  1.0165e+00,
          -1.1599e+00, -5.5541e-01,  7.6744e-01, -6.3769e-01,  1.2528e+00],
         [ 8.2514e-01,  5.5579e-01,  4.0729e-01, -1.9706e-02,  5.2028e-01,
          -1.5383e+00, -9.1392e-01,  2.1302e-01, -6.2448e-04,  1.7143e+00],
         [-3.9015e-01, -2.1614e+00, -6.7376e-01, -9.2964e-01,  4.2225e-01,
           5.0513e-01, -5.2302e-01,  2.4482e-01, -2.0183e-01,  2.4078e-01],
         [-2.9757e-01, -1.8941e-01, -5.4971e-02,  1.2648e+00,  1.6282e-01,
          -1.1547e+00,  2.5168e-01, -4.6007e-01, -1.6889e+00,  6.4817e-01],
         [ 3.2505e-01,  1.4087e+00,  5.2855e-04, -1.6998e+00, -1.8489e+00,
           4.0029e-01, -1.8292e+00,  2.1315e-02,  6.2375e-01, -9.7332e-01],
         [-1.0045e+00, -5.4223e-01,  1.7183e+00, -2.3051e+00,  9.5010e-01,
           1.9181e+00, -2.2673e+00,  9.8840e-01,  1.4757e+00,  1.0913e+00],
         [-8.7790e-01,  1.1317e-01,  1.6095e-01,  1.0476e+00, -8.3537e-

In [42]:
nn.Linear(10,10).parameters

<bound method Module.parameters of Linear(in_features=10, out_features=10, bias=True)>

In [45]:
data_numpy = df.transpose().drop(index="Gene").to_numpy(dtype=np.float64)
data_tensor = torch.Tensor(data_numpy)

In [50]:
input_number = data_numpy.shape[0]

In [51]:
input_number

29

In [46]:
train = torch.utils.data.TensorDataset(data_tensor, data_tensor)
train_loader = torch.utils.data.DataLoader(train, batch_size=1, shuffle=False)

In [52]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

optimizer = optim.Adam(ae.parameters(), lr=1e-3)

# mean-squared error loss
criterion = nn.MSELoss()

n_epochs = 10

for epoch in range(n_epochs):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, _ = data
            
        inputs.to(device)
      
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = ae(inputs)
        
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()

    print('[%d, %5d] loss: %.5f' %
                      (epoch + 1, i + 1, running_loss / input_number))

print('Finished Training')


[1,    29] loss: 6453.43011
[2,    29] loss: 6449.68981
[3,    29] loss: 6445.92420
[4,    29] loss: 6442.79381
[5,    29] loss: 6441.22496
[6,    29] loss: 6440.30270
[7,    29] loss: 6439.85429
[8,    29] loss: 6439.57388
[9,    29] loss: 6439.33144
[10,    29] loss: 6439.25997
Finished Training


In [47]:
df.transpose().drop(index="Gene")

Unnamed: 0,5,9,11,13,14,16,23,25,26,31,...,31169,31170,31171,31172,31173,31175,31186,31187,31189,31192
combi.5miR.R1,53.8922,0.125318,0.0,36.9528,20.1601,0.0,3.75609,67.7051,0.0306261,0.0,...,0.0285398,1.72397,0.0,0.0,7.85543,0.0,27.6225,1.67518,5.35151,52.8387
combi.5miR.R2,49.4221,0.0,0.0,34.5353,19.7343,0.0,3.88431,73.6778,0.0787986,0.0,...,0.0,1.76978,0.0,0.0,8.68344,0.0,24.3831,1.45784,5.63437,54.4084
combi.5miR.R3,60.042,0.0,0.0,34.9592,21.0531,0.0,3.92603,79.0903,0.202244,0.0443335,...,0.0,1.82349,0.0,0.117877,8.3658,0.0,24.3584,1.90569,5.58476,63.8997
combi.5miR.R4,50.7733,0.0,0.0,40.6637,20.3805,0.0,4.29406,73.4964,0.139324,0.035631,...,0.0,1.78242,0.0560885,0.0,8.23247,0.0,26.3146,1.34483,5.8061,51.2938
miR.A.R1,69.9561,0.0,0.0,40.6825,17.9472,0.0,7.0465,64.015,0.0970944,0.0372468,...,0.0,2.04958,0.029316,0.0,7.88635,0.0,45.0913,1.26914,5.42255,52.5069
miR.A.R2,54.8294,0.0,0.0,42.8793,19.3258,0.0,6.67455,60.9962,0.0,0.0,...,0.0,2.53871,0.0,0.0,9.17183,0.0,47.9292,1.33939,5.15361,56.3043
miR.A.R3,52.5033,0.11758,0.0,41.4154,20.2341,0.0,7.12277,69.4851,0.28735,0.17637,...,0.0,2.00965,0.0694083,0.0,8.84446,0.0,44.1374,1.49084,5.60394,54.0196
miR.A.R4,51.0637,0.105524,0.0,33.2902,19.0024,0.0,8.23612,60.7372,0.0,0.0395713,...,0.0,2.08951,0.0,0.0,7.20259,0.0,42.3325,1.31723,5.81909,57.8472
miR.B.R1,142.751,0.0,0.0,43.2322,17.9912,0.0,6.56592,67.4997,0.389942,0.184107,...,0.0,2.48155,0.0,0.0,9.02728,0.0,11.4119,1.32703,6.23551,90.2337
miR.B.R2,156.441,0.0551935,0.0,36.2076,15.638,0.0,6.62747,57.6262,0.323725,0.0827903,...,0.0,2.25484,0.0325811,0.0,11.1942,0.0,10.5919,1.21519,5.53292,78.8008


In [2]:
import h5py

In [18]:
filename = "../data_tisch/Glioma_GSE103224_expression.h5"
h5 = h5py.File(filename,'r')

In [19]:
h5.keys()

<KeysViewHDF5 ['matrix']>

In [22]:
h5['matrix']

<HDF5 group "/matrix" (6 members)>

In [25]:
h5['matrix'].keys()

<KeysViewHDF5 ['barcodes', 'data', 'features', 'indices', 'indptr', 'shape']>

In [63]:
import numpy as np

np.array(h5['matrix']['indices']).max()

28908

In [51]:
h5['matrix']['indices']

<HDF5 dataset "indices": shape (28973062,), type "<i4">

In [66]:
barcodes = h5['matrix']['barcodes']

print("Nombre de cellules: ", barcodes.shape[0])
print("Premiers exemples de barcodes")

barcodes[:50]

Nombre de cellules:  17185
Premiers exemples de barcodes


array([b'PJ016_2', b'PJ016_4', b'PJ016_5', b'PJ016_6', b'PJ016_7',
       b'PJ016_10', b'PJ016_11', b'PJ016_14', b'PJ016_17', b'PJ016_19',
       b'PJ016_20', b'PJ016_24', b'PJ016_25', b'PJ016_26', b'PJ016_27',
       b'PJ016_28', b'PJ016_29', b'PJ016_30', b'PJ016_33', b'PJ016_35',
       b'PJ016_37', b'PJ016_40', b'PJ016_42', b'PJ016_43', b'PJ016_44',
       b'PJ016_45', b'PJ016_47', b'PJ016_49', b'PJ016_51', b'PJ016_52',
       b'PJ016_54', b'PJ016_55', b'PJ016_60', b'PJ016_62', b'PJ016_64',
       b'PJ016_65', b'PJ016_66', b'PJ016_67', b'PJ016_68', b'PJ016_69',
       b'PJ016_72', b'PJ016_73', b'PJ016_74', b'PJ016_75', b'PJ016_77',
       b'PJ016_78', b'PJ016_79', b'PJ016_80', b'PJ016_81', b'PJ016_82'],
      dtype='|S200')

In [46]:
h5['matrix']['features'].keys()

<KeysViewHDF5 ['_all_tag_keys', 'feature_type', 'genome', 'id', 'name']>

In [76]:
h5['matrix']['features']['name'][:100]

array([b'SLC7A10', b'AC007228.11', b'AC092667.2', b'ZNF367', b'SULT1B1',
       b'TRIM63', b'AL590762.11', b'HDHD2', b'MORF4L2-AS1',
       b'RP11-384C12.1', b'RP11-298I3.4', b'TMEM53', b'CTA-796E4.4',
       b'EIF4HP2', b'RP11-517B11.7', b'HRAT92', b'RN7SL504P',
       b'AC108039.1', b'RP1-249H1.3', b'PCDHB19P', b'ZBTB12', b'SLC25A14',
       b'FAAP100', b'PRORSD1P', b'DNM1', b'DHFRL1', b'CCP110', b'STARD8',
       b'RP11-314C16.1', b'ANKRD20A8P', b'TICRR', b'AC074367.1', b'SNX11',
       b'CNTLN', b'SKA2', b'CIB1', b'CAPN6', b'KB-1125A3.11', b'PCNPP3',
       b'NDUFB9', b'SUMO2P7', b'RP1-266L20.9', b'SH3TC2', b'YBX1P1',
       b'MTRR', b'RP11-3D4.2', b'GS1-166A23.2', b'RP11-6N17.10',
       b'CCDC186', b'ZNF784', b'CYCSP24', b'AL139319.1', b'PIAS1',
       b'RTN1', b'HOXA11', b'ABHD12', b'SLC16A5', b'RHOG', b'KATNBL1P6',
       b'KDELR2', b'FLT4', b'CTC-510F12.4', b'RP11-270L13.1', b'ZSCAN9',
       b'RP11-136F16.2', b'RP11-831F12.2', b'RP4-590F24.2',
       b'RP13-514E23.1', b'CTA-3

In [75]:
np.unique(np.array(h5['matrix']['features']['name'])).shape

(28909,)

In [82]:
np.array(h5['matrix']['indices']).max()

28908

In [85]:
np.array(h5['matrix']['indptr']).shape

(17186,)