In [1]:
import sqlalchemy as sa
from sqlalchemy import update
from sqlalchemy.dialects.mysql import INTEGER
from sqlalchemy.dialects.mysql import SMALLINT
from collections import OrderedDict
from itertools import product 


from __init__ import *
from snmcseq_utils import cd 
from snmcseq_utils import get_mouse_chromosomes
from snmcseq_utils import compute_global_mC 
from snmcseq_utils import create_logger
from CEMBA_run_tsne import run_tsne_CEMBA
from CEMBA_update_mysql import insert_into
from CEMBA_update_mysql import insert_into_worker
from CEMBA_update_mysql import gene_id_to_table_name 
from CEMBA_update_mysql import connect_sql 

In [2]:
log = create_logger()

In [57]:
# An engine connects to a MySQL database.
# NOTE(review): the saved output of a later cell prints `database` as 'CEMBA_snATAC',
# so this value appears to have been edited after the other cells ran -- confirm the target.
database = 'human_snmcseq'
# SECURITY: a plaintext connection URL (user name and password) was previously pasted in a
# comment on the next line. It has been removed; credentials must not live in the notebook.
engine = connect_sql(database)


# Creating tables
# "genes" table (gene annotation)

In [5]:
def define_genes_table(metadata):
    """Declare the gene-annotation table ``genes`` on ``metadata``.

    Only the table definition is registered here; the notebook creates it
    afterwards through ``metadata.create_all()``.

    Args:
        metadata: the ``sa.MetaData`` object the table is attached to.

    Returns:
        The ``sa.Table`` describing ``genes``.
    """
    # gene_id is a string key, so it is a primary key WITHOUT auto increment
    gene_columns = [
        sa.Column('gene_id', sa.String(50), primary_key=True, autoincrement=False),
        sa.Column('gene_name', sa.String(100), nullable=False),
    ]
    # all remaining annotation fields are nullable
    gene_columns += [
        sa.Column(col_name, col_type, nullable=True)
        for col_name, col_type in (
            ('chr', sa.String(5)),
            ('start', sa.Integer),
            ('end', sa.Integer),
            ('strand', sa.CHAR(1)),
            ('gene_type', sa.String(100)),
        )
    ]
    return sa.Table('genes', metadata, *gene_columns)

In [6]:
# Create the genes table: bind a fresh MetaData to the engine, register the
# table definition on it, then create whatever is registered.
# NOTE(review): `sa.MetaData(engine)` with an argument-less `create_all()` is the
# bound-metadata style of older SQLAlchemy releases -- confirm the installed version.
metadata = sa.MetaData(engine)
genes_table = define_genes_table(metadata)
metadata.create_all()

In [9]:
# Populate the genes table from the gene-body annotation file.
table_name = 'genes'

# NOTE(review): PATH_GENEBODY_ANNOTATION and `pd` are not defined in this notebook;
# presumably they arrive through `from __init__ import *` -- confirm.
path_genebody = PATH_GENEBODY_ANNOTATION
df_genes = pd.read_table(path_genebody, index_col='gene_id')

# Turn the gene_id index back into an ordinary column before handing the frame over.
df_sql = df_genes.reset_index()
# Use the declared table_name instead of repeating the literal, as the other upload cells do.
insert_into(engine, table_name, df_sql, ignore=False, verbose=True)

03/19/2018 11:58:05 AM                 gene_id      gene_name   chr    start      end strand  \
0  ENSMUSG00000102693.1  4933401J01Rik  chr1  3073253  3074322      +   
1  ENSMUSG00000064842.1        Gm26206  chr1  3102016  3102125      +   
2  ENSMUSG00000051951.5           Xkr4  chr1  3205901  3671498      -   
3  ENSMUSG00000102851.1        Gm18956  chr1  3252757  3253236      +   
4  ENSMUSG00000103377.1        Gm37180  chr1  3365731  3368549      -   

              gene_type  
0                   TEC  
1                 snRNA  
2        protein_coding  
3  processed_pseudogene  
4                   TEC  


<sqlalchemy.engine.result.ResultProxy at 0x7fccaca4ad68>

# "ensembles" table

In [10]:
def define_enss_table(metadata):
    """Declare the ``ensembles`` table on ``metadata``.

    Args:
        metadata: the ``sa.MetaData`` object the table is attached to.

    Returns:
        The ``sa.Table`` describing ``ensembles``.
    """
    # ensemble_id is supplied by the caller: primary key, NOT auto increment
    id_col = sa.Column('ensemble_id', sa.Integer, primary_key=True, autoincrement=False)
    name_col = sa.Column('ensemble_name', sa.String(255), nullable=False, unique=True)
    access_col = sa.Column('public_access', sa.Boolean, nullable=False, default=False)
    # the notebook stores the member datasets here as one comma-joined string
    datasets_col = sa.Column('datasets', sa.String(4000), nullable=False)
    return sa.Table('ensembles', metadata, id_col, name_col, access_col, datasets_col)

In [11]:
# Create the ensembles table: fresh MetaData bound to the engine, register
# the definition, then create whatever is registered.
metadata = sa.MetaData(engine)
enss_table = define_enss_table(metadata)
metadata.create_all()

In [12]:
# update to enss table

# "cells" table

In [13]:
def define_cells_table(metadata):
    """Declare the ``cells`` table on ``metadata``.

    Args:
        metadata: the ``sa.MetaData`` object the table is attached to.

    Returns:
        The ``sa.Table`` describing ``cells``.
    """
    active_columns = (
        # auto increment and not null are implicitly defined
        sa.Column('cell_id', sa.Integer, primary_key=True, autoincrement=True),
        # unique but not primary key
        sa.Column('cell_name', sa.String(255), nullable=False, unique=True),
        # not in mapping summary
        sa.Column('dataset', sa.String(40), nullable=False),
    )

    # Columns that are currently disabled (kept for reference):
    #   sa.Column('cell_type', sa.String(20), nullable=True), # not in mapping summary
    #
    #   sa.Column('global_mCH', sa.Float, nullable=True),
    #   sa.Column('global_mCG', sa.Float, nullable=True),
    #   sa.Column('global_mCA', sa.Float, nullable=True), # not in mapping summary
    #   sa.Column('global_mCCC', sa.Float, nullable=True),
    #   sa.Column('estimated_mCH', sa.Float, nullable=True),
    #   sa.Column('estimated_mCG', sa.Float, nullable=True),
    #   sa.Column('percent_genome_covered', sa.Float, nullable=True),
    #
    #   sa.Column('total_reads', sa.Integer, nullable=True),
    #   sa.Column('mapped_reads', sa.Integer, nullable=True),
    #   sa.Column('mapping_rate', sa.Float, nullable=True),
    #   sa.Column('nonclonal_reads', sa.Integer, nullable=True),
    #   sa.Column('percent_nonclonal_rate', sa.Float, nullable=True),
    #   sa.Column('filtered_reads', sa.Integer, nullable=True),
    #   sa.Column('filtered_rate', sa.Float, nullable=True),
    #   sa.Column('lambda_mC', sa.Float, nullable=True),
    return sa.Table('cells', metadata, *active_columns)


In [15]:
# Create the cells table: fresh MetaData bound to the engine, register the
# definition, then create whatever is registered.
metadata = sa.MetaData(engine)
cells_table = define_cells_table(metadata)
metadata.create_all()

In [16]:
# update to cells table (dataset level)

# Individual gene tables ("gene_ensemble_id")

In [21]:
def define_gene_table(metadata, gene_id):
    """Declare the per-gene table for ``gene_id`` on ``metadata``.

    The table name is derived from ``gene_id`` by ``gene_id_to_table_name``.

    Args:
        metadata: the ``sa.MetaData`` object the table is attached to.
        gene_id: gene identifier that is translated into the table name.

    Returns:
        The ``sa.Table`` with columns ``cell_id`` and ``normalized_counts``.
    """
    gene_table_name = gene_id_to_table_name(gene_id)
    # cell_id is the primary key (NOT auto increment) and references cells.cell_id
    cell_ref = sa.Column(
        'cell_id',
        sa.Integer,
        sa.ForeignKey('cells.cell_id'),
        primary_key=True,
        autoincrement=False,
    )
    counts = sa.Column('normalized_counts', sa.Float, nullable=True)
    return sa.Table(gene_table_name, metadata, cell_ref, counts)

In [22]:
# create individual gene tables (takes time)
# The saved run of this cell reports 53379 tables and roughly 4600 seconds.
ti = time.time()

metadata = sa.MetaData(engine)
# load specific tables needed to be referenced as foreign key
sa.Table('cells', metadata, autoload=True)
# load all genes 
sql = 'SELECT * FROM genes'
df_genes = pd.read_sql(sql, engine, index_col='gene_id')
# define gene tables (one table definition per gene_id; only registered on metadata here)
# NOTE(review): the enumerate counter `i` is not used inside the loop.
for i, gene_id in enumerate(df_genes.index):
    gene_table = define_gene_table(metadata, gene_id)
# create them
logging.info("Creating gene tables... ({} in total)".format(df_genes.index.shape[0]))
metadata.create_all(engine)
# elapsed wall-clock seconds since the start of the cell
logging.info(time.time() - ti)

03/19/2018 12:18:24 PM Creating gene tables... (53379 in total)
03/19/2018 01:34:49 PM 4603.019817352295


In [23]:
# update to gene tables (dataset level)

# Individual ensemble tables ("Ens1")

In [110]:
def define_ens_table(metadata, ens, tsne_types=None, cluster_types=None):
    """Declare one per-ensemble table (for example ``'Ens0'``) on ``metadata``.

    Columns, in order:
      * ``cell_id`` -- primary key (not auto increment), foreign key to ``cells.cell_id``
      * ``tsne_x_<type>`` for every tSNE type, then ``tsne_y_<type>`` for every tSNE type
      * ``cluster_<type>`` for every cluster type, then ``annotation_<type>`` for
        every cluster type

    Callers in this notebook load the ``cells`` table into ``metadata`` before
    calling this function, so the foreign key can be resolved.

    Args:
        metadata: the ``sa.MetaData`` object the table is attached to.
        ens: name of the table to declare.
        tsne_types: iterable of suffixes for the tSNE coordinate columns.
            Defaults to ``['ATAC']``.
        cluster_types: iterable of suffixes for the cluster / annotation columns.
            Defaults to ``['ATAC']``.

    Returns:
        The ``sa.Table`` describing the ensemble table.

    Note:
        Earlier drafts built the suffixes as
        ``'{}_ndim2_perp{}'.format(context, p)`` and
        ``'{}_lv_npc50_k{}'.format(context, k)`` over products of contexts with
        perplexities / k values. Such lists can now be passed in explicitly;
        the function no longer reads the module-level PERPLEXITIES or K_NN,
        which it never used.
    """
    # Materialise as lists: each sequence is iterated twice below.
    tsne_types = ['ATAC'] if tsne_types is None else list(tsne_types)
    cluster_types = ['ATAC'] if cluster_types is None else list(cluster_types)

    # tSNE coordinates: all x columns first, then all y columns.
    tsne_columns = [
        sa.Column('tsne_{}_{}'.format(axis, tsne_type), sa.Float, nullable=True)
        for axis in ('x', 'y')
        for tsne_type in tsne_types
    ]

    # Clustering: all cluster ids first, then all annotations.
    cluster_columns = [
        sa.Column('cluster_{}'.format(cluster_type), SMALLINT(unsigned=True), nullable=True)
        for cluster_type in cluster_types
    ]
    annotation_columns = [
        sa.Column('annotation_{}'.format(cluster_type), sa.String(20), nullable=True)
        for cluster_type in cluster_types
    ]

    return sa.Table(
        ens,
        metadata,
        # NOT auto increment, not null primary key
        sa.Column('cell_id', sa.Integer, sa.ForeignKey('cells.cell_id'),
                  primary_key=True, autoincrement=False),
        *(tsne_columns + cluster_columns + annotation_columns)
    )

In [111]:
# create individual ensemble tables (ensemble level)
# demo here
ens = 'Ens0'

metadata = sa.MetaData(engine)
# load specific tables needed to be referenced as foreign key
sa.Table('cells', metadata, autoload=True)
# define and create 
# (define_ens_table only registers the table on metadata; create_all creates it)
ens_table = define_ens_table(metadata, ens)
metadata.create_all(engine)

In [None]:
# update to ensemble tables (ensemble level)

# Upload to mysql
### Dataset level
- upload to cells table
- upload to gene tables (bottleneck)

### Ensemble level
- upload to enss table
- create ens table
- upload to ens table

In [70]:
def gene_id_to_name(gene_id, df_genes):
    """Look up the gene name stored for ``gene_id``.

    Args:
        gene_id: label to look up in the index of ``df_genes``.
        df_genes: DataFrame indexed by gene id with a ``gene_name`` column.

    Returns:
        The ``gene_name`` entry found at ``gene_id``.
    """
    gene_names = df_genes['gene_name']
    return gene_names.loc[gene_id]

def gene_name_to_id(gene_name, df_genes):
    """Return the first gene id whose ``gene_name`` equals ``gene_name``.

    Args:
        gene_name: name to search for in the ``gene_name`` column.
        df_genes: DataFrame indexed by gene id with a ``gene_name`` column.

    Returns:
        The index label of the first matching row.

    Raises:
        IndexError: if no row has that gene name.
    """
    is_match = (df_genes['gene_name'] == gene_name).values
    matching_ids = df_genes.index.values[is_match]
    return matching_ids[0]

gene_name_to_id('Xkr4', df_genes)

'ENSMUSG00000051951.5'

In [77]:
# Reconnect and read the whole genes table back, indexed by gene_id.
# This rebinds the notebook-level `engine` and `df_genes` used by later cells.
engine = connect_sql(database)
sql = """SELECT * FROM genes"""
df_genes = pd.read_sql(sql, engine, index_col='gene_id')
df_genes.head()

Unnamed: 0_level_0,gene_name,chr,start,end,strand,gene_type
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSMUSG00000000001.4,Gnai3,chr3,108107280,108146146,-,protein_coding
ENSMUSG00000000003.15,Pbsn,chrX,77837901,77853623,-,protein_coding
ENSMUSG00000000028.14,Cdc45,chr16,18780447,18811987,-,protein_coding
ENSMUSG00000000031.16,H19,chr7,142575529,142578143,-,lincRNA
ENSMUSG00000000037.16,Scml2,chrX,161117193,161258213,+,protein_coding


In [121]:
# get gene*cell matrix
# `dataset` is later prepended to every cell barcode to form the cell_name.
dataset = 'CEMBA_3C_171207'
# NOTE(review): absolute, machine-specific path; the three files are read relative to it.
with cd('/cndd/Public_Datasets/CEMBA/snATACSeq/MiniBrain_fromRongxin'):
    # header-less tables: barcode -> cluster label, and barcode -> tSNE coordinates
    df_clst = pd.read_table('XW46_cluster.txt', header=None, names=['sample', 'cluster_ID'], index_col='sample')
    df_tsne = pd.read_table('XW46_tsne.txt', header=None, names=['sample', 'tsne_x', 'tsne_y'], index_col='sample')
    df = pd.read_table('XW46_gene.txt.gz') 

# Transpose; in the saved output the result has gene names as rows and barcodes as columns.
df = df.T
    
print(df_clst.shape)
print(df_tsne.shape)
print(df.shape)
df.head()

(9621, 1)
(9621, 2)
(24848, 9621)


Unnamed: 0,AGCGATAGAACCAGGTAACGAACGCCTATCCT,AGCGATAGAACCAGGTAAGAGATGTAATCTTA,AGCGATAGAACCAGGTAATGACGTCCTATCCT,AGCGATAGAACCAGGTAGGATAACATAGAGGC,AGCGATAGAACCAGGTAGGATAACCCTATCCT,AGCGATAGAACCAGGTAGGATAACTATAGCCT,AGCGATAGAACCAGGTATAGCCTTATAGAGGC,AGCGATAGAACCAGGTATAGCCTTCCTATCCT,AGCGATAGAACCAGGTATTCGTTGATAGAGGC,AGCGATAGAACCAGGTATTCGTTGGTACTGAC,...,TCTCGCGCTTAGCCTCAAGAGATGTATAGCCT,TCTCGCGCTTAGCCTCATAGCCTTCAGGACGT,TCTCGCGCTTAGCCTCATTCGTTGGTACTGAC,TCTCGCGCTTCCATCCAGGATAACAGGCGAAG,TCTCGCGCTTCCATCCATTCGTTGCAGGACGT,TCTCGCGCTTCCATCCGGTTAGACCAGGACGT,TCTCGCGCTTCCATCCTAAGATCCAGGCGAAG,TCTCGCGCTTCCATCCTTACGACCCAGGACGT,TCTCGCGCTTCCATCCTTCATCCAAGGCGAAG,TCTCGCGCTTCCATCCTTGGAAGTTAATCTTA
Xkr4,1.2309,2.2035,1.4904,2.7199,1.9671,0.6435,1.1516,1.2949,1.0551,1.3511,...,1.5819,2.5104,1.1699,0.8651,1.8328,0.8675,0.6887,0.1175,0.5993,0.7535
Ncaph2,0.6129,0.8373,0.3274,1.0381,0.6996,0.4499,0.3647,0.3174,0.7224,0.4432,...,0.7459,0.1373,0.2042,0.4358,0.9827,0.9145,0.9568,0.3386,0.4358,0.3914
Eif4enif1,0.2243,0.5336,0.555,0.3044,0.374,0.9099,0.3129,0.2693,0.4159,0.3219,...,0.5529,0.1557,0.0879,0.5844,0.7681,0.2178,1.4043,0.8182,0.5844,0.1908
Rgr,0.1747,0.6393,0.397,0.1009,0.1228,0.0726,0.2763,0.0862,0.2224,0.2941,...,0.402,0.6971,0.3574,0.1004,0.2802,0.0441,0.0,0.0,0.1004,0.0
Mapkapk2,1.7354,0.9501,1.4129,0.5049,1.0696,0.5063,0.4804,1.0927,0.8737,1.0037,...,2.1037,0.8516,0.4215,1.1965,1.7008,1.2285,1.848,0.292,1.1965,0.476


In [122]:
# Re-key the matrix from gene name to gene_id: join the gene_name column of df_genes
# (indexed by gene_id) against the row labels of df. No `how` is given, so pandas'
# default join type applies; the saved output shows fewer rows after the merge than
# before (24266 vs 24848).
# NOTE(review): a gene name shared by several gene_ids would presumably produce one
# row per gene_id -- confirm this is intended.
df_res = pd.merge(df_genes[['gene_name']], df, left_on='gene_name', right_index=True)
# the name column was only needed for the join
df_res = df_res.drop('gene_name', axis=1)
print(df_res.shape)
df_res.head()

(24266, 9621)


Unnamed: 0_level_0,AGCGATAGAACCAGGTAACGAACGCCTATCCT,AGCGATAGAACCAGGTAAGAGATGTAATCTTA,AGCGATAGAACCAGGTAATGACGTCCTATCCT,AGCGATAGAACCAGGTAGGATAACATAGAGGC,AGCGATAGAACCAGGTAGGATAACCCTATCCT,AGCGATAGAACCAGGTAGGATAACTATAGCCT,AGCGATAGAACCAGGTATAGCCTTATAGAGGC,AGCGATAGAACCAGGTATAGCCTTCCTATCCT,AGCGATAGAACCAGGTATTCGTTGATAGAGGC,AGCGATAGAACCAGGTATTCGTTGGTACTGAC,...,TCTCGCGCTTAGCCTCAAGAGATGTATAGCCT,TCTCGCGCTTAGCCTCATAGCCTTCAGGACGT,TCTCGCGCTTAGCCTCATTCGTTGGTACTGAC,TCTCGCGCTTCCATCCAGGATAACAGGCGAAG,TCTCGCGCTTCCATCCATTCGTTGCAGGACGT,TCTCGCGCTTCCATCCGGTTAGACCAGGACGT,TCTCGCGCTTCCATCCTAAGATCCAGGCGAAG,TCTCGCGCTTCCATCCTTACGACCCAGGACGT,TCTCGCGCTTCCATCCTTCATCCAAGGCGAAG,TCTCGCGCTTCCATCCTTGGAAGTTAATCTTA
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000000001.4,1.4893,2.6315,1.4147,4.2155,2.3922,1.953,1.1681,2.036,1.6638,1.4493,...,3.5976,1.2706,0.6035,1.8261,3.3573,3.308,2.0872,0.7567,1.8261,1.2383
ENSMUSG00000000003.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000028.14,0.3212,0.8244,0.2193,0.0973,0.403,0.139,0.7995,0.2772,0.2051,0.2167,...,0.249,0.5776,0.0,0.2935,0.3072,0.4671,0.5941,0.0,0.2232,0.437
ENSMUSG00000000031.16,0.1613,0.2949,0.1389,0.083,0.1155,0.0,0.1784,0.0,0.0868,0.3343,...,0.4269,0.3581,0.1957,0.2701,0.6216,0.9398,0.0,0.0,0.1998,0.1974
ENSMUSG00000000037.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Upload to cells table

In [129]:
# Build the rows for the cells table: one row per barcode in df_clst.
# cell_id is not set here; the cells table declares it as an auto-increment key.
df_cells = pd.DataFrame()
# cell_name = "<dataset>_<barcode>"
df_cells['cell_name'] = dataset + '_' + df_clst.index.values
df_cells['dataset'] = dataset 

print(df_cells.shape)
df_cells.head()

(9621, 2)


Unnamed: 0,cell_name,dataset
0,CEMBA_3C_171207_AGCGATAGAACCAGGTAACGAACGCCTATCCT,CEMBA_3C_171207
1,CEMBA_3C_171207_AGCGATAGAACCAGGTAAGAGATGTAATCTTA,CEMBA_3C_171207
2,CEMBA_3C_171207_AGCGATAGAACCAGGTAATGACGTCCTATCCT,CEMBA_3C_171207
3,CEMBA_3C_171207_AGCGATAGAACCAGGTAGGATAACATAGAGGC,CEMBA_3C_171207
4,CEMBA_3C_171207_AGCGATAGAACCAGGTAGGATAACCCTATCCT,CEMBA_3C_171207


In [130]:
# Insert the prepared rows into the cells table.
# `verbose` is a real boolean here (it used to be the string 'True', which only worked
# because any non-empty string is truthy), matching every other insert_into call.
insert_into(engine, 'cells', df_cells, ignore=False, verbose=True)
logging.info("Upload complete.")

03/20/2018 09:52:14 PM                                           cell_name          dataset
0  CEMBA_3C_171207_AGCGATAGAACCAGGTAACGAACGCCTATCCT  CEMBA_3C_171207
1  CEMBA_3C_171207_AGCGATAGAACCAGGTAAGAGATGTAATCTTA  CEMBA_3C_171207
2  CEMBA_3C_171207_AGCGATAGAACCAGGTAATGACGTCCTATCCT  CEMBA_3C_171207
3  CEMBA_3C_171207_AGCGATAGAACCAGGTAGGATAACATAGAGGC  CEMBA_3C_171207
4  CEMBA_3C_171207_AGCGATAGAACCAGGTAGGATAACCCTATCCT  CEMBA_3C_171207
03/20/2018 09:52:15 PM Upload complete.


# Upload to gene tables

In [144]:
# Fetch the cells table so cell names can be translated into cell_id values.
sql = """SELECT * FROM cells"""
df_cells = pd.read_sql(sql, engine)

# NOTE(review): `metadata` is not used by any active line in this cell.
metadata = sa.MetaData(engine)

# Dry run over the gene x cell matrix (one gene per row): only the first gene is
# turned into a frame and printed; nothing is inserted by the active code.
# NOTE(review): every remaining row is still iterated although only its table
# name is computed.
for i, (gene_id, row) in enumerate(df_res.iterrows()):
    gene_table_name = gene_id_to_table_name(gene_id)
#     if (i%10==0):
#         logging.info("Progress on genes: {} {}".format(i+1, gene_table_name))
    if i == 0:
        # one row per cell, holding this gene's value
        df_gene = row.to_frame('normalized_counts')
        # prefix barcodes with the dataset name, the same way cell_name was built for upload
        df_gene.index = dataset + '_' + df_gene.index.values
        # left join: every cell of the matrix is kept, matched to the cells table by name
        df_gene = pd.merge(df_gene, df_cells, left_index=True, right_on='cell_name', how='left')
        df_gene = df_gene[['cell_id', 'normalized_counts']]
        print(df_gene.head())

#     if i == 0:
#         df_gene = row.to_frame('normalized_counts')
#         df_gene = pd.merge(df_gene, df_cells, left_index=True, right_on='cell_name', how='left')
#         df_gene = df_gene[['cell_id', 'normalized_counts']]
#         print(df_gene.head())
#         insert_into(engine, gene_table_name, df_gene, ignore=True, verbose=True)


      cell_id  normalized_counts
8567     8568             1.4893
8568     8569             2.6315
8569     8570             1.4147
8570     8571             4.2155
8571     8572             2.3922


In [131]:
# Upload to ensembles table
# `ens` is also the name of the per-ensemble table created further down.
ens = 'Ens2'
ens_name = 'CEMBA_3C_171207'
ens_datasets=['CEMBA_3C_171207']


table_name = 'ensembles'
# numeric id = whatever follows the 'Ens' prefix of the table name
ens_id = int(ens[len('Ens'):])
# a single row; the member datasets are stored as one comma-joined string
dict_list = [{'ensemble_id': ens_id, 'ensemble_name': ens_name, 'public_access': False, 'datasets': ','.join(ens_datasets)}]
insert_into_worker(engine, table_name, dict_list, ignore=False)


<sqlalchemy.engine.result.ResultProxy at 0x7fcc89f1aa58>

In [132]:
# create and upload to ensemble table (2 ensembles)

def create_and_upload_to_ens(ens, df_ens, database):
    """Create the per-ensemble table ``ens`` and insert ``df_ens`` into it.

    A single engine is opened for ``database`` and used for both steps
    (the table creation and the insert).

    Args:
        ens: name of the ensemble table to create and fill (e.g. ``'Ens2'``).
        df_ens: DataFrame handed to ``insert_into``; presumably its columns must
            match the columns declared by ``define_ens_table`` -- TODO confirm.
        database: database name passed to ``connect_sql``.

    Returns:
        None.
    """
    logging.info("Creating {} table...".format(ens))
    engine = connect_sql(database)

    metadata = sa.MetaData(engine)
    # load the specific table that is referenced as a foreign key
    sa.Table('cells', metadata, autoload=True)
    # registering the definition on metadata is enough; the Table object itself is not needed
    define_ens_table(metadata, ens)
    metadata.create_all()
    logging.info("Done creating {} table!".format(ens))

    # upload to the ens table, reusing the engine opened above
    logging.info("Uploading to {} table...".format(ens))
    insert_into(engine, ens, df_ens, ignore=False, verbose=True)
    logging.info("Done uploading to {} table!".format(ens))


In [140]:
# Combine cluster labels and tSNE coordinates per barcode (joined on the index).
df_ens = pd.merge(df_clst, df_tsne, left_index=True, right_index=True)
# rename to the column names declared for the ensemble table
df_ens.columns = ['cluster_ATAC', 'tsne_x_ATAC', 'tsne_y_ATAC']
# prefix barcodes with the dataset name so they match cells.cell_name
df_ens.index = dataset + '_' + df_ens.index.values
# strip 'C' characters from both ends of each cluster label and convert it to int
df_ens['cluster_ATAC'] = [int(cluster.strip('C')) for cluster in df_ens['cluster_ATAC']]

# get cells
sql = """SELECT * FROM cells"""
df_cells = pd.read_sql(sql, engine)[['cell_id', 'cell_name']]

# get cell_id
# right join: every row of df_ens is kept and matched to the cells table by cell_name
df_ens = pd.merge(df_cells, df_ens, left_on='cell_name', right_index=True, how='right')
df_ens = df_ens.drop('cell_name', axis=1)

# NOTE(review): `ens` is not assigned in this cell; it is taken from kernel state set by an earlier cell.
print(ens)
print(df_ens.head())
print(database)
create_and_upload_to_ens(ens, df_ens, database)

03/20/2018 09:59:55 PM Creating Ens2 table...


Ens2
      cell_id  cluster_ATAC  tsne_x_ATAC  tsne_y_ATAC
8567     8568            13    -9.351734    30.328056
8568     8569            19   -32.812104     4.672624
8569     8570             8     3.887879    -2.134793
8570     8571            17    32.098704    19.152496
8571     8572            20    -1.280459    20.562413
CEMBA_snATAC


03/20/2018 09:59:56 PM Done creating Ens2 table!
03/20/2018 09:59:56 PM Uploading to Ens2 table...
03/20/2018 09:59:56 PM       cell_id  cluster_ATAC  tsne_x_ATAC  tsne_y_ATAC
8567     8568            13    -9.351734    30.328056
8568     8569            19   -32.812104     4.672624
8569     8570             8     3.887879    -2.134793
8570     8571            17    32.098704    19.152496
8571     8572            20    -1.280459    20.562413
03/20/2018 09:59:57 PM Done uploading to Ens2 table!


In [128]:
# # update a table
# sql = """SELECT * FROM cells"""
# df_cells = pd.read_sql(sql, engine)
# df_cells['cell_name'] = df_cells['dataset'].values[0] + '_' + df_cells['cell_name']

# metadata = sa.MetaData(engine)
# table = sa.Table('cells', metadata, autoload=True)
# for i, row in df_cells.iterrows():
#     updater = update(table).where(table.c.cell_id==row.cell_id).values(cell_name=row.cell_name)
#     engine.execute(updater)



# delete a table
#     gene_table = sa.Table(gene_table_name, metadata, autoload=True)
#     d = gene_table.delete()
#     d.execute()