In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch_geometric
from torch_geometric.data import Data, Dataset
from tqdm import tqdm

In [2]:
# Root directory for every path used by this notebook.
DATA_FOLDER = "/gpfs/data/rsingh47/hzaki1/data/"

# Sub-directories are derived from DATA_FOLDER (instead of repeating the
# prefix as a literal) so that relocating the data only requires editing
# the single line above. The resulting strings are unchanged.
RESOURCES_FOLDER = os.path.join(DATA_FOLDER, "resources")
DATABASE_FOLDER = os.path.join(DATA_FOLDER, "databases")

# Files located in the resources folder.
METADATA_FNAME = os.path.join(RESOURCES_FOLDER, 'metadata.txt')
MOTIF_ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDER, "motifs-v9-nr.mgi-m0.001-o0.0.tbl")
MM_TFS_FNAME = os.path.join(RESOURCES_FOLDER, 'mm_mgi_tfs.txt')
SC_EXP_FNAME = os.path.join(RESOURCES_FOLDER, "GSE60361_C1-3005-Expression.txt")

# Glob pattern (not a single file) matching the feather files in the
# databases folder.
DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "mm9-*.mc9nr.feather")

# Files located directly under the data root.
REGULONS_FNAME = os.path.join(DATA_FOLDER, "regulons.p")
MOTIFS_FNAME = os.path.join(DATA_FOLDER, "motifs.csv")

In [3]:
# Build the per-cell metadata table from the first nine rows of the
# tab-separated metadata file. The file's second column becomes the index,
# the 'Unnamed: 0' column is discarded, and the table is transposed so that
# each former column turns into one row; the former column labels end up in
# a regular column named 'index'.
metadata = (
    pd.read_csv(METADATA_FNAME, sep='\t', index_col=1, nrows=9)
    .drop(columns=['Unnamed: 0'])
    .T
    .reset_index()
    # Blank out the label of the columns axis.
    .rename_axis(columns='')
    # Every column is read back as generic objects after the transpose;
    # 'age' is cast to int explicitly.
    .assign(age=lambda frame: frame['age'].astype(int))
)

In [4]:
# Preview the last five metadata rows as a quick sanity check of the load.
metadata.tail(n=5)

Unnamed: 0,index,group #,total mRNA mol,well,sex,age,diameter,cell_id,level1class,level2class
3000,ca1hippocampus.1311,9,4585,26,-1,23,9.85,1772067059_B04,endothelial-mural,Peric
3001,ca1hippocampus.1312,9,2559,28,-1,26,11.0,1772066097_D04,endothelial-mural,Vsmc
3002,sscortex.1689,9,4015,4,1,26,8.63,1772063068_D01,endothelial-mural,Vsmc
3003,ca1hippocampus.1313,9,2896,89,-1,26,9.23,1772066098_A12,endothelial-mural,Vsmc
3004,sscortex.1690,9,4460,22,1,26,10.4,1772058148_F03,endothelial-mural,Vsmc


In [5]:
# List the distinct labels in the 'level1class' column, in order of first
# appearance.
metadata.loc[:, 'level1class'].unique()

array(['interneurons', 'pyramidal SS', 'pyramidal CA1',
       'oligodendrocytes', 'microglia', 'endothelial-mural',
       'astrocytes_ependymal'], dtype=object)

In [6]:
# Read the tab-separated expression table (first row is the header, first
# column is the index) and transpose it, which puts the file's column labels
# on the rows and its row labels on the columns.
ex_matrix = pd.read_csv(
    SC_EXP_FNAME,
    sep='\t',
    header=0,
    index_col=0,
).transpose()

# Show the first five rows of the transposed table.
ex_matrix.head(5)

cell_id,Tspan12,Tshz1,Fnbp1l,Adamts15,Cldn12,Rxfp1,2310042E22Rik,Sema3c,Jam2,Apbb1ip,...,Gm20826_loc1,Gm20826_loc2,Gm20877_loc2,Gm20877_loc1,Gm20865_loc4,Gm20738_loc4,Gm20738_loc6,Gm21943_loc1,Gm21943_loc3,Gm20738_loc3
1772071015_C02,0,3,3,0,1,0,0,11,1,0,...,0,0,0,0,0,0,0,0,0,0
1772071017_G12,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1772071017_A05,0,0,6,0,1,0,2,25,1,0,...,0,0,0,0,0,0,0,0,0,0
1772071014_B06,3,2,4,0,0,0,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1772067065_H06,0,2,1,0,0,0,0,10,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Replace every value v of the expression matrix by log(1 + v).
# NOTE: this overwrites ex_matrix, so re-running the cell without re-running
# the load cell applies the transform a second time.
ex_matrix = np.log1p(ex_matrix)

In [8]:
# Largest single value anywhere in the expression matrix; used as the
# divisor by normalize().
maxi = np.max(ex_matrix.to_numpy())

In [9]:
def normalize(input, scale=None):
    """Divide ``input`` by a scale factor.

    Parameters
    ----------
    input
        Any object supporting true division by a scalar (a number, a
        pandas Series / DataFrame, a numpy array, ...). The name shadows
        the ``input`` builtin; it is kept so existing callers are unaffected.
    scale
        Divisor to use. When omitted (``None``) the module-level ``maxi``
        is looked up at call time, which is the original behaviour.

    Returns
    -------
    The result of ``input / scale``.
    """
    # Fall back to the notebook-level maximum only when no explicit divisor
    # is supplied, so the function can also be used and tested on its own.
    divisor = maxi if scale is None else scale
    return input / divisor

In [10]:
# Display the expression matrix with normalize() applied to each column.
# NOTE(review): the result is only displayed, not assigned — ex_matrix itself
# still holds the un-normalized log1p values after this cell. Confirm whether
# later cells expect the normalized values.
ex_matrix.apply(normalize, axis=0)

cell_id,Tspan12,Tshz1,Fnbp1l,Adamts15,Cldn12,Rxfp1,2310042E22Rik,Sema3c,Jam2,Apbb1ip,...,Gm20826_loc1,Gm20826_loc2,Gm20877_loc2,Gm20877_loc1,Gm20865_loc4,Gm20738_loc4,Gm20738_loc6,Gm21943_loc1,Gm21943_loc3,Gm20738_loc3
1772071015_C02,0.000000,0.149359,0.149359,0.0,0.074679,0.0,0.000000,0.267723,0.074679,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1772071017_G12,0.000000,0.074679,0.074679,0.0,0.074679,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1772071017_A05,0.000000,0.000000,0.209652,0.0,0.074679,0.0,0.118364,0.351026,0.074679,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1772071014_B06,0.149359,0.118364,0.173400,0.0,0.000000,0.0,0.149359,0.074679,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1772067065_H06,0.000000,0.118364,0.074679,0.0,0.000000,0.0,0.000000,0.258348,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1772067059_B04,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1772066097_D04,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.074679,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1772063068_D01,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1772066098_A12,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.074679,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
