In [1]:
import sqlalchemy as sa
from sqlalchemy import update
from sqlalchemy.dialects.mysql import INTEGER
from sqlalchemy.dialects.mysql import SMALLINT
from collections import OrderedDict
from itertools import product 
import re


from __init__ import *
from snmcseq_utils import cd 
from snmcseq_utils import get_mouse_chromosomes
from snmcseq_utils import compute_global_mC 
from snmcseq_utils import create_logger
from CEMBA_run_tsne import run_tsne_CEMBA
from CEMBA_update_mysql import insert_into
from CEMBA_update_mysql import insert_into_worker
from CEMBA_update_mysql import gene_id_to_table_name 
from CEMBA_update_mysql import connect_sql 

In [2]:
# Create the logger through the project helper `create_logger`.
# NOTE(review): later cells log through the `logging` module rather than `log` —
# presumably `create_logger` also configures the root logger; confirm.
log = create_logger()

In [3]:
# An engine connects to a mysql database
# `database` is reused by later cells (e.g. passed to `upload_to_datasets`);
# `engine` is whatever the project helper `connect_sql` returns for that database.
database = 'CEMBA_ATAC'
engine = connect_sql(database) 

ImportError: libmariadbclient.so.18: cannot open shared object file: No such file or directory

# Creating tables
# "genes" table (gene annotation)

In [9]:
def define_genes_table(metadata):
    """Declare the ``genes`` (gene annotation) table on ``metadata``.

    The table is only registered on the given ``MetaData`` object; the caller
    decides when to create it (e.g. with ``metadata.create_all()``).

    Args:
        metadata: ``sqlalchemy.MetaData`` the table is attached to.

    Returns:
        The ``sqlalchemy.Table`` describing ``genes``.
    """
    # gene_id is a caller-supplied string key: primary key, NOT auto increment
    key_column = sa.Column('gene_id', sa.String(50), primary_key=True, autoincrement=False)

    # gene_name is mandatory; every other annotation field may be NULL
    annotation_columns = [
        sa.Column('gene_name', sa.String(100), nullable=False),
        sa.Column('chr', sa.String(5), nullable=True),
        sa.Column('start', sa.Integer, nullable=True),
        sa.Column('end', sa.Integer, nullable=True),
        sa.Column('strand', sa.CHAR(1), nullable=True),
        sa.Column('gene_type', sa.String(100), nullable=True),
    ]

    return sa.Table('genes', metadata, key_column, *annotation_columns)

In [10]:
# create genes table
# Register the `genes` definition on a MetaData bound to `engine`, then create every
# registered table that does not exist yet.
# NOTE(review): bound metadata (`MetaData(engine)` + argument-less `create_all()`) is
# deprecated in SQLAlchemy 1.4 and removed in 2.0 — confirm the pinned version.
metadata = sa.MetaData(engine)
genes_table = define_genes_table(metadata)
metadata.create_all()

In [11]:
# update to genes table
# NOTE(review): `table_name` is assigned here but the insert below passes the literal 'genes'.
table_name = 'genes'

# PATH_GENEBODY_ANNOTATION is not defined in this notebook; presumably it is provided by
# `from __init__ import *` — confirm.
path_genebody = PATH_GENEBODY_ANNOTATION
df_genes = pd.read_table(path_genebody, index_col='gene_id')

# Turn the gene_id index back into a regular column so it is part of the frame handed
# to `insert_into`; `df_genes` itself stays indexed by gene_id.
df_sql = df_genes.reset_index()
insert_into(engine, 'genes', df_sql, ignore=False, verbose=True)

08/06/2018 04:41:23 PM                 gene_id      gene_name   chr    start      end strand  \
0  ENSMUSG00000102693.1  4933401J01Rik  chr1  3073253  3074322      +   
1  ENSMUSG00000064842.1        Gm26206  chr1  3102016  3102125      +   
2  ENSMUSG00000051951.5           Xkr4  chr1  3205901  3671498      -   
3  ENSMUSG00000102851.1        Gm18956  chr1  3252757  3253236      +   
4  ENSMUSG00000103377.1        Gm37180  chr1  3365731  3368549      -   

              gene_type  
0                   TEC  
1                 snRNA  
2        protein_coding  
3  processed_pseudogene  
4                   TEC  


<sqlalchemy.engine.result.ResultProxy at 0x7f7735161eb8>

# "ensembles" table

In [12]:
def define_enss_table(metadata):
    """Declare the ``ensembles`` table on ``metadata``.

    One row describes one ensemble (a named collection of datasets). The table is
    only registered on ``metadata``; the caller creates it (e.g. with
    ``metadata.create_all()``).

    Args:
        metadata: ``sqlalchemy.MetaData`` the table is attached to.

    Returns:
        The ``sqlalchemy.Table`` describing ``ensembles``.
    """
    table = sa.Table('ensembles', metadata,
                        # NOT auto increment not null primary key
                        sa.Column('ensemble_id', sa.Integer, primary_key=True, autoincrement=False),
                        sa.Column('ensemble_name', sa.String(255), nullable=False, unique=True),
                        sa.Column('public_access', sa.Boolean, nullable=False, default=False),
                        # comma-separated list of dataset names
                        sa.Column('datasets', sa.String(4000), nullable=False),
                        sa.Column('description', sa.String(255), nullable=True),
                        # Rows inserted further down in this notebook carry a
                        # 'snmc_ensemble_id' value, so the schema has to declare the column.
                        # It is nullable so inserts that omit it keep working.
                        # NOTE(review): presumably the id of the matching ensemble in the
                        # 'CEMBA' database — confirm.
                        sa.Column('snmc_ensemble_id', sa.Integer, nullable=True),
    )
    return table

In [13]:
# create enss table
# Register the `ensembles` definition on a fresh engine-bound MetaData and create the
# table if it does not exist yet.
metadata = sa.MetaData(engine)
enss_table = define_enss_table(metadata)
metadata.create_all()

In [12]:
# update to enss table

# "cells" table

In [14]:
def define_cells_table(metadata):
    """Declare the ``cells`` table on ``metadata``.

    Only ``cell_id``, ``cell_name`` and ``dataset`` are active; a set of per-cell
    statistics columns is kept below as disabled (commented-out) definitions.

    Args:
        metadata: ``sqlalchemy.MetaData`` the table is attached to.

    Returns:
        The ``sqlalchemy.Table`` describing ``cells``.
    """
    active_columns = (
        # auto increment and not null are implicitly defined
        sa.Column('cell_id', sa.Integer, primary_key=True, autoincrement=True),
        # unique but not primary key
        sa.Column('cell_name', sa.String(255), nullable=False, unique=True),
        # not in mapping summary
        sa.Column('dataset', sa.String(40), nullable=False),
    )

    # Disabled columns, kept for reference:
    #   sa.Column('cell_type', sa.String(20), nullable=True), # not in mapping summary
    #
    #   sa.Column('global_mCH', sa.Float, nullable=True),
    #   sa.Column('global_mCG', sa.Float, nullable=True),
    #   sa.Column('global_mCA', sa.Float, nullable=True), # not in mapping summary
    #   sa.Column('global_mCCC', sa.Float, nullable=True),
    #   sa.Column('estimated_mCH', sa.Float, nullable=True),
    #   sa.Column('estimated_mCG', sa.Float, nullable=True),
    #   sa.Column('percent_genome_covered', sa.Float, nullable=True),
    #
    #   sa.Column('total_reads', sa.Integer, nullable=True),
    #   sa.Column('mapped_reads', sa.Integer, nullable=True),
    #   sa.Column('mapping_rate', sa.Float, nullable=True),
    #   sa.Column('nonclonal_reads', sa.Integer, nullable=True),
    #   sa.Column('percent_nonclonal_rate', sa.Float, nullable=True),
    #   sa.Column('filtered_reads', sa.Integer, nullable=True),
    #   sa.Column('filtered_rate', sa.Float, nullable=True),
    #   sa.Column('lambda_mC', sa.Float, nullable=True),

    return sa.Table('cells', metadata, *active_columns)


In [15]:
# create cells table
# Register the `cells` definition on a fresh engine-bound MetaData and create the
# table if it does not exist yet.
metadata = sa.MetaData(engine)
cells_table = define_cells_table(metadata)
metadata.create_all()

In [16]:
# update to cells table (dataset level)

# Individual gene tables ("gene_ensemble_id")

In [16]:
def define_gene_table(metadata, gene_id):
    """Declare the per-gene table for ``gene_id`` on ``metadata``.

    The table name is derived from the gene id by ``gene_id_to_table_name``. Each row
    is keyed by ``cell_id`` (a foreign key to ``cells.cell_id``) and stores one
    nullable float, ``normalized_counts``.

    Args:
        metadata: ``sqlalchemy.MetaData`` the table is attached to. The ``cells``
            table must be known to it for the foreign key to resolve.
        gene_id: gene identifier used to derive the table name.

    Returns:
        The ``sqlalchemy.Table`` for this gene.
    """
    name = gene_id_to_table_name(gene_id)

    # NOT auto increment not null primary key, pointing at cells.cell_id
    cell_key = sa.Column(
        'cell_id',
        sa.Integer,
        sa.ForeignKey('cells.cell_id'),
        primary_key=True,
        autoincrement=False,
    )
    counts = sa.Column('normalized_counts', sa.Float, nullable=True)

    return sa.Table(name, metadata, cell_key, counts)

In [None]:
# create individual gene tables (takes time)
# NOTE(review): `time`, `logging` and `pd` are not imported explicitly in this notebook;
# presumably they come from `from __init__ import *` — confirm.
ti = time.time()

metadata = sa.MetaData(engine)
# load specific tables needed to be referenced as foreign key
sa.Table('cells', metadata, autoload=True)
# load all genes 
sql = 'SELECT * FROM genes'
df_genes = pd.read_sql(sql, engine, index_col='gene_id')
# define gene tables 
# (each call registers one table on `metadata`; `i` and `gene_table` are not used afterwards)
for i, gene_id in enumerate(df_genes.index):
    gene_table = define_gene_table(metadata, gene_id)
# create them
logging.info("Creating gene tables... ({} in total)".format(df_genes.index.shape[0]))
metadata.create_all(engine)
# log the elapsed wall-clock time (seconds)
logging.info(time.time() - ti)

08/06/2018 04:43:00 PM Creating gene tables... (53379 in total)


In [23]:
# update to gene tables (dataset level)

# Individual ensemble tables ("Ens1")

In [26]:
def define_ens_table(metadata, ens):
    """Declare the per-ensemble table named ``ens`` on ``metadata``.

    Resulting columns, in order:

    * ``cell_id`` - integer primary key, foreign key to ``cells.cell_id``
    * ``tsne_x_ATAC``, ``tsne_y_ATAC`` - nullable floats
    * ``cluster_ATAC`` - nullable unsigned SMALLINT
    * ``annotation_ATAC`` - nullable ``String(20)``

    Args:
        metadata: ``sqlalchemy.MetaData`` the table is attached to. The ``cells``
            table must be known to it for the foreign key to resolve.
        ens: table name, e.g. ``'Ens0'``.

    Returns:
        The ``sqlalchemy.Table`` for this ensemble.
    """
    # One (x, y) column pair per entry. Only a single 'ATAC' embedding is stored;
    # a disabled variant generated one name per (context, perplexity):
    #     ['{}_ndim2_perp{}'.format(context, p) for (context, p) in product(contexts, perps)]
    tsne_types = ['ATAC']

    # One (cluster, annotation) column pair per entry; the disabled variant was:
    #     ['{}_lv_npc50_k{}'.format(context, k) for (context, k) in product(contexts, ks)]
    cluster_types = ['ATAC']

    # all x columns first, then all y columns
    args_tsne = (
        [sa.Column('tsne_x_{}'.format(tsne_type), sa.Float, nullable=True)
         for tsne_type in tsne_types]
        + [sa.Column('tsne_y_{}'.format(tsne_type), sa.Float, nullable=True)
           for tsne_type in tsne_types]
    )

    # all cluster columns first, then all annotation columns
    args_cluster = (
        [sa.Column('cluster_{}'.format(cluster_type), SMALLINT(unsigned=True), nullable=True)
         for cluster_type in cluster_types]
        + [sa.Column('annotation_{}'.format(cluster_type), sa.String(20), nullable=True)
           for cluster_type in cluster_types]
    )

    args = args_tsne + args_cluster
    table = sa.Table(ens, metadata,
                        # NOT auto increment not null primary key
                        sa.Column('cell_id', sa.Integer, sa.ForeignKey('cells.cell_id'), primary_key=True, autoincrement=False),
                        *args
    )
    return table

In [111]:
# create individual ensemble tables (ensemble level)
# demo here
ens = 'Ens0'

metadata = sa.MetaData(engine)
# load specific tables needed to be referenced as foreign key
sa.Table('cells', metadata, autoload=True)
# define and create 
ens_table = define_ens_table(metadata, ens)
metadata.create_all(engine)

In [None]:
# update to ensemble tables (ensemble level)

# Upload to MySQL

### Dataset level

- upload to datasets table
- upload to cells table
- upload to gene tables (bottleneck)

### Ensemble level

- upload to enss table
- create ens table && upload to ens table

In [70]:
def gene_id_to_name(gene_id, df_genes):
    """Look up the gene name for a gene id.

    Args:
        gene_id: label to look up in the index of ``df_genes``.
        df_genes: DataFrame indexed by gene id with a ``gene_name`` column.

    Returns:
        The ``gene_name`` value stored for ``gene_id``.

    Raises:
        KeyError: if ``gene_id`` is not in the index.
    """
    gene_names = df_genes['gene_name']
    return gene_names.loc[gene_id]

def gene_name_to_id(gene_name, df_genes):
    """Look up the gene id for a gene name.

    When several rows share the name, the id of the first matching row is returned.

    Args:
        gene_name: value to search for in the ``gene_name`` column.
        df_genes: DataFrame indexed by gene id with a ``gene_name`` column.

    Returns:
        The index label (gene id) of the first row whose ``gene_name`` matches.

    Raises:
        IndexError: if no row matches.
    """
    is_match = (df_genes['gene_name'] == gene_name).values
    matching_ids = df_genes.index.values[is_match]
    return matching_ids[0]

# Quick check of the name -> id lookup; relies on `df_genes` (indexed by gene_id)
# being left over from an earlier cell.
gene_name_to_id('Xkr4', df_genes)

'ENSMUSG00000051951.5'

In [77]:
# Reconnect and reload the gene annotation from the database, indexed by gene_id.
engine = connect_sql(database)
sql = """SELECT * FROM genes"""
df_genes = pd.read_sql(sql, engine, index_col='gene_id')
df_genes.head()

Unnamed: 0_level_0,gene_name,chr,start,end,strand,gene_type
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSMUSG00000000001.4,Gnai3,chr3,108107280,108146146,-,protein_coding
ENSMUSG00000000003.15,Pbsn,chrX,77837901,77853623,-,protein_coding
ENSMUSG00000000028.14,Cdc45,chr16,18780447,18811987,-,protein_coding
ENSMUSG00000000031.16,H19,chr7,142575529,142578143,-,lincRNA
ENSMUSG00000000037.16,Scml2,chrX,161117193,161258213,+,protein_coding


In [50]:
# get gene*cell matrix

# dataset = 'CEMBA_3C_171207'
# with cd('/cndd/Public_Datasets/CEMBA/snATACSeq/MiniBrain_fromRongxin'):
#     df_clst = pd.read_table('XW46_cluster.txt', header=None, names=['sample', 'cluster_ID'], index_col='sample')
#     df_tsne = pd.read_table('XW46_tsne.txt', header=None, names=['sample', 'tsne_x', 'tsne_y'], index_col='sample')
#     df = pd.read_table('XW46_gene.txt.gz') 

# df = df.T
    
# print(df_clst.shape)
# print(df_tsne.shape)
# print(df.shape)
# df.head()
from scipy import sparse
from collections import namedtuple


# Input files for one dataset: the sparse count matrix plus its row and column labels.
fdir = '/cndd/Public_Datasets/CEMBA/snATACSeq/Datasets/1B/CEMBA180118_1B_rep1/counts/genebody/'
fdata = fdir + 'CEMBA180118_1B_rep1.genebody.count_matrix.npz'
frow = fdir + 'CEMBA180118_1B_rep1.genebody.count_matrix.row.index' # gene
fcol = fdir + 'CEMBA180118_1B_rep1.genebody.count_matrix.col.index' # cell
# Dataset name; used as a suffix for cell names when the matrix is loaded.
dataset = 'CEMBA_1B_180118'

# Lightweight container bundling the gene labels, the cell labels and the matrix.
GC_matrix = namedtuple('GC_matrix', ['gene', 'cell', 'data'])

In [106]:
# Row labels (genes) and column labels (cells): headerless files, first column only.
gene_ids = pd.read_table(frow, header=None)[0].values
cells = pd.read_table(fcol, header=None)[0].values
# Suffix every cell label with the dataset name
# (presumably to keep `cell_name` unique in the `cells` table — confirm).
cells = cells + '_' + dataset 
data = sparse.load_npz(fdata)

gc_matrix_raw = GC_matrix(
    gene_ids, 
    cells, 
    data,
)

# Sanity check: matrix rows correspond to genes, columns to cells.
assert gc_matrix_raw.data.shape[0] == len(gc_matrix_raw.gene)
assert gc_matrix_raw.data.shape[1] == len(gc_matrix_raw.cell)

In [118]:
# logcpm

def sparse_logcpm(gc_matrix):
    """Convert a sparse gene-by-cell count matrix to log10(CPM + 1).

    Every column (cell) is scaled by ``1e6 / column_sum``; ``log10(x + 1)`` is then
    applied to the stored entries only, so implicit zeros stay zero and the result
    stays sparse.

    Cells whose counts sum to zero get a scale factor of 0 rather than ``inf``. This
    avoids the divide-by-zero warning and keeps explicitly stored zeros from turning
    into NaN.

    Args:
        gc_matrix: ``GC_matrix``-like tuple with ``gene``, ``cell`` and ``data``
            fields, where ``data`` is a scipy sparse matrix with genes as rows and
            cells as columns.

    Returns:
        A ``GC_matrix`` with the same ``gene`` and ``cell`` labels and the
        normalized matrix (COO format) as ``data``.
    """
    # total counts per cell (column sums), flattened to a 1-D float array
    lib_size = np.ravel(gc_matrix.data.sum(axis=0)).astype(float)

    # 1 / lib_size, leaving 0 where the cell has no counts at all
    lib_size_inv = np.zeros_like(lib_size)
    np.divide(1.0, lib_size, out=lib_size_inv, where=(lib_size != 0))

    # right-multiplying by a diagonal matrix rescales each column
    scale = sparse.diags(lib_size_inv) * 1e6
    logcpm = (gc_matrix.data).dot(scale).tocoo()
    logcpm.data = np.log10(logcpm.data + 1)

    gc_logcpm = GC_matrix(
        gc_matrix.gene,
        gc_matrix.cell,
        logcpm,
    )

    return gc_logcpm
    
# Normalize the raw counts; `gc_logcpm` is what the gene-table upload cell iterates over.
gc_logcpm = sparse_logcpm(gc_matrix_raw)



## Upload to dataset table

In [46]:
# Register `dataset` in the `datasets` table of `database`.
# NOTE(review): with `strict=False` the saved log output only warns that the dataset
# "May not be a valid dataset"; presumably strict mode would reject it — confirm in
# CEMBA_update_mysql.
from CEMBA_update_mysql import upload_to_datasets
upload_to_datasets(dataset, database=database, strict=False)




08/07/2018 01:51:47 PM Update dataset: CEMBA_1B_180118 to datasets table in CEMBA_ATAC database
08/07/2018 01:51:47 PM May not be a valid dataset: CEMBA_1B_180118 (not found in snmCSeq datasets directory)


## Upload to cells table

In [40]:
# Build the rows for the `cells` table: one row per cell (column) of the loaded matrix.
df_cells = pd.DataFrame()
# Cell labels come from `gc_matrix_raw`, the tuple built when the count matrix was loaded.
df_cells['cell_name'] = gc_matrix_raw.cell
df_cells['dataset'] = dataset

print(df_cells.shape)
df_cells.head()

(9063, 2)


Unnamed: 0,cell_name,dataset
0,AGCGATAGAACCAGGTAACGAACGCAGGACGT_CEMBA_1B_180118,CEMBA_1B_180118
1,AGCGATAGAACCAGGTAACGAACGGGCTCTGA_CEMBA_1B_180118,CEMBA_1B_180118
2,AGCGATAGAACCAGGTAAGAGATGCAGGACGT_CEMBA_1B_180118,CEMBA_1B_180118
3,AGCGATAGAACCAGGTATAGCCTTCAGGACGT_CEMBA_1B_180118,CEMBA_1B_180118
4,AGCGATAGAACCAGGTCGAATTCCCAGGACGT_CEMBA_1B_180118,CEMBA_1B_180118


In [44]:
# insert into
# `verbose` is a boolean flag, passed as True like in the other `insert_into` calls.
insert_into(engine, 'cells', df_cells, ignore=False, verbose=True)
logging.info("Upload complete.")

08/07/2018 01:33:07 PM                                           cell_name          dataset
0  AGCGATAGAACCAGGTAACGAACGCAGGACGT_CEMBA_1B_180118  CEMBA_1B_180118
1  AGCGATAGAACCAGGTAACGAACGGGCTCTGA_CEMBA_1B_180118  CEMBA_1B_180118
2  AGCGATAGAACCAGGTAAGAGATGCAGGACGT_CEMBA_1B_180118  CEMBA_1B_180118
3  AGCGATAGAACCAGGTATAGCCTTCAGGACGT_CEMBA_1B_180118  CEMBA_1B_180118
4  AGCGATAGAACCAGGTCGAATTCCCAGGACGT_CEMBA_1B_180118  CEMBA_1B_180118
08/07/2018 01:33:10 PM Upload complete.


## Upload to gene tables

In [120]:
# Preview of the per-gene upload: builds the (cell_id, normalized_counts) rows for the
# first 11 genes and prints them. The actual insert is left commented out.
sql = """SELECT * FROM cells"""
df_cells = pd.read_sql(sql, engine)

metadata = sa.MetaData(engine)

for i, gene in enumerate(gc_logcpm.gene):
    # preview limit
    if i > 10:
        break

    # Resolve the table name before logging so the progress message refers to the
    # current gene (and the name exists on the first iteration).
    gene_table_name = gene_id_to_table_name(gene)
    if (i%1000 == 0):
        logging.info("Progress on genes: {} {}".format(i+1, gene_table_name))

    # stored (non-zero) entries of this gene's row; `.col` holds their cell positions
    data_gene = gc_logcpm.data.getrow(i).tocoo()

    df_gene = pd.DataFrame()
    df_gene['cell_name'] = [gc_logcpm.cell[col] for col in data_gene.col]
    df_gene['normalized_counts'] = data_gene.data

    # attach cell_id by matching on cell_name; cells missing from `cells` get NaN ids
    df_gene = pd.merge(df_gene, df_cells, on='cell_name', how='left')
    df_gene = df_gene[['cell_id', 'normalized_counts']]
    print(gene, df_gene)

#     insert_into(engine, gene_table_name, df_gene, ignore=True, verbose=True)
        
    

08/07/2018 03:17:42 PM Progress on genes: 1 gene_ENSMUSG00000046334_4


ENSMUSG00000102693.1    cell_id  normalized_counts
0     4951           2.513277
ENSMUSG00000064842.1    cell_id  normalized_counts
0     6774           1.896742
ENSMUSG00000051951.5       cell_id  normalized_counts
0        9061           3.609613
1        9060           3.211215
2        9058           3.158629
3        9056           2.886287
4        9048           3.213160
5        9040           3.979891
6        9035           2.673257
7        9030           3.333683
8        9024           3.643308
9        9022           2.952043
10       9016           3.282972
11       9009           3.006551
12       9006           3.370012
13       8979           3.384755
14       8975           2.904327
15       8957           3.180250
16       8955           3.115255
17       8954           3.018372
18       8946           3.269855
19       8939           3.226106
20       8933           3.071205
21       8932           3.155671
22       8931           3.131090
23       8927           3

ENSMUSG00000103201.1     cell_id  normalized_counts
0      8361           2.308151
1      8337           2.732578
2      7357           2.613048
3      7043           1.950322
4      6822           2.496441
5      6708           2.920076
6      5796           2.675198
7      5058           2.883463
8      4676           2.109956
9      4535           1.611768
10     4471           2.918368
11     4454           2.296869
12     4184           2.326002
13     4148           1.988660
14     3734           2.450580
15     3362           2.367892
16     2945           2.754636
17     2329           1.681193
18     1749           1.925660
19      347           2.526822
20      344           2.463990
21       39           2.248930
22       22           2.610132
ENSMUSG00000103147.1     cell_id  normalized_counts
0      8592           2.236930
1      8243           2.379499
2      7507           3.005672
3      6903           2.613225
4      6857           2.645436
5      6202           2.9036

In [144]:
# NOTE(review): `df_res` is not defined anywhere in this notebook (hidden kernel state).
# Presumably a gene-by-cell DataFrame with gene ids as the index — confirm before re-running.
sql = """SELECT * FROM cells"""
df_cells = pd.read_sql(sql, engine)

metadata = sa.MetaData(engine)

for i, (gene_id, row) in enumerate(df_res.iterrows()):
    gene_table_name = gene_id_to_table_name(gene_id)
#     if (i%10==0):
#         logging.info("Progress on genes: {} {}".format(i+1, gene_table_name))
    # Only the first gene is processed, as a preview; nothing is inserted.
    if i == 0:
        df_gene = row.to_frame('normalized_counts')
        # NOTE(review): the dataset name is used as a PREFIX here, whereas the
        # matrix-loading cell builds cell names with the dataset as a suffix.
        df_gene.index = dataset + '_' + df_gene.index.values
        # attach cell_id by matching the row labels against cell_name
        df_gene = pd.merge(df_gene, df_cells, left_index=True, right_on='cell_name', how='left')
        df_gene = df_gene[['cell_id', 'normalized_counts']]
        print(df_gene.head())

#     if i == 0:
#         df_gene = row.to_frame('normalized_counts')
#         df_gene = pd.merge(df_gene, df_cells, left_index=True, right_on='cell_name', how='left')
#         df_gene = df_gene[['cell_id', 'normalized_counts']]
#         print(df_gene.head())
#         insert_into(engine, gene_table_name, df_gene, ignore=True, verbose=True)


      cell_id  normalized_counts
8567     8568             1.4893
8568     8569             2.6315
8569     8570             1.4147
8570     8571             4.2155
8571     8572             2.3922


## Upload to ensembles table

In [131]:
# Upload to ensembles table
ens = 'Ens2'
ens_name = 'CEMBA_3C_171207'
ens_datasets=['CEMBA_3C_171207']


table_name = 'ensembles'
# the numeric id is whatever follows the 'Ens' prefix
ens_id = int(ens[len('Ens'):])
# member datasets are stored as a single comma-separated string
dict_list = [{'ensemble_id': ens_id, 'ensemble_name': ens_name, 'public_access': False, 'datasets': ','.join(ens_datasets)}]
insert_into_worker(engine, table_name, dict_list, ignore=False)


<sqlalchemy.engine.result.ResultProxy at 0x7fcc89f1aa58>

In [24]:
# create and upload to ensemble table (2 ensembles)

def create_and_upload_to_ens(ens, df_ens, database):
    """Create the per-ensemble table ``ens`` and insert ``df_ens`` into it.

    A single engine is opened for ``database`` and used for both steps.

    Args:
        ens: name of the ensemble table, e.g. ``'Ens2'``.
        df_ens: DataFrame handed to ``insert_into``. Its columns are expected to
            match those declared by ``define_ens_table`` (``cell_id`` plus the
            tsne/cluster/annotation columns).
        database: database name passed to ``connect_sql``.

    Returns:
        None.
    """
    logging.info("Creating {} table...".format(ens))
    engine = connect_sql(database)

    metadata = sa.MetaData(engine)
    # load specific tables needed to be referenced as foreign key
    sa.Table('cells', metadata, autoload=True)
    # registering the ensemble table on `metadata` is all that is needed here;
    # the returned Table object is not used
    define_ens_table(metadata, ens)
    metadata.create_all()
    logging.info("Done creating {} table!".format(ens))

    logging.info("Uploading to {} table...".format(ens))
    # reuse the engine opened above instead of connecting a second time
    insert_into(engine, ens, df_ens, ignore=False, verbose=True)
    logging.info("Done uploading to {} table!".format(ens))


In [140]:
# NOTE(review): `df_clst` and `df_tsne` are only created in commented-out code earlier in
# this notebook (hidden kernel state); `ens` comes from the "Upload to ensembles" cell.
df_ens = pd.merge(df_clst, df_tsne, left_index=True, right_index=True)
df_ens.columns = ['cluster_ATAC', 'tsne_x_ATAC', 'tsne_y_ATAC']
# NOTE(review): dataset name used as a prefix here; the matrix-loading cell uses it as a suffix.
df_ens.index = dataset + '_' + df_ens.index.values
# strip surrounding 'C' characters and convert to int (labels presumably look like 'C13')
df_ens['cluster_ATAC'] = [int(cluster.strip('C')) for cluster in df_ens['cluster_ATAC']]

# get cells
sql = """SELECT * FROM cells"""
df_cells = pd.read_sql(sql, engine)[['cell_id', 'cell_name']]

# get cell_id
# (right join: every row of df_ens is kept, even if its name has no match in `cells`)
df_ens = pd.merge(df_cells, df_ens, left_on='cell_name', right_index=True, how='right')
df_ens = df_ens.drop('cell_name', axis=1)

print(ens)
print(df_ens.head())
print(database)
create_and_upload_to_ens(ens, df_ens, database)

03/20/2018 09:59:55 PM Creating Ens2 table...


Ens2
      cell_id  cluster_ATAC  tsne_x_ATAC  tsne_y_ATAC
8567     8568            13    -9.351734    30.328056
8568     8569            19   -32.812104     4.672624
8569     8570             8     3.887879    -2.134793
8570     8571            17    32.098704    19.152496
8571     8572            20    -1.280459    20.562413
CEMBA_snATAC


03/20/2018 09:59:56 PM Done creating Ens2 table!
03/20/2018 09:59:56 PM Uploading to Ens2 table...
03/20/2018 09:59:56 PM       cell_id  cluster_ATAC  tsne_x_ATAC  tsne_y_ATAC
8567     8568            13    -9.351734    30.328056
8568     8569            19   -32.812104     4.672624
8569     8570             8     3.887879    -2.134793
8570     8571            17    32.098704    19.152496
8571     8572            20    -1.280459    20.562413
03/20/2018 09:59:57 PM Done uploading to Ens2 table!


In [128]:
# # update a table
# sql = """SELECT * FROM cells"""
# df_cells = pd.read_sql(sql, engine)
# df_cells['cell_name'] = df_cells['dataset'].values[0] + '_' + df_cells['cell_name']

# metadata = sa.MetaData(engine)
# table = sa.Table('cells', metadata, autoload=True)
# for i, row in df_cells.iterrows():
#     updater = update(table).where(table.c.cell_id==row.cell_id).values(cell_name=row.cell_name)
#     engine.execute(updater)



# delete a table
#     gene_table = sa.Table(gene_table_name, metadata, autoload=True)
#     d = gene_table.delete()
#     d.execute()

# Delete zero rows (in gene table)

In [None]:
# Remove the rows whose normalized_counts equals 0 from every per-gene table.
sql = "SELECT * from genes"
df_genes = pd.read_sql(sql, engine)
gene_tables = df_genes['gene_id'].apply(gene_id_to_table_name)


metadata = sa.MetaData(engine)
for i, gene_table in enumerate(gene_tables):
    if i % 1000 == 0:
        logging.info('Progress: {}'.format(i+1))
        
    # reflect the table, then run DELETE ... WHERE normalized_counts = 0 through the
    # engine bound to `metadata`
    table = sa.Table(gene_table, metadata, autoload=True)
    d = table.delete(table.c.normalized_counts == 0)
    d.execute()


04/20/2018 11:37:15 AM Progress: 1
04/20/2018 11:42:02 AM Progress: 1001
04/20/2018 11:49:09 AM Progress: 2001
04/20/2018 11:55:28 AM Progress: 3001
04/20/2018 12:02:12 PM Progress: 4001
04/20/2018 12:09:01 PM Progress: 5001
04/20/2018 12:15:35 PM Progress: 6001
04/20/2018 12:21:51 PM Progress: 7001
04/20/2018 12:28:24 PM Progress: 8001
04/20/2018 12:35:53 PM Progress: 9001
04/20/2018 12:42:28 PM Progress: 10001
04/20/2018 12:49:07 PM Progress: 11001
04/20/2018 12:55:29 PM Progress: 12001
04/20/2018 01:02:18 PM Progress: 13001


## Delete from a table

In [122]:
# WARNING: destructive. Issues an unconditional DELETE (no WHERE clause) against every
# per-gene table, removing all rows; the tables themselves are kept.
sql = "SELECT * from genes"
df_genes = pd.read_sql(sql, engine)
gene_tables = df_genes['gene_id'].apply(gene_id_to_table_name)

metadata = sa.MetaData(engine)
for i, gene_table in enumerate(gene_tables):
    if i % 1000 == 0:
        logging.info('Progress: {}'.format(i+1))
        
    # reflect the table and delete every row through the engine bound to `metadata`
    table = sa.Table(gene_table, metadata, autoload=True)
    d = table.delete()
    d.execute()

08/07/2018 04:00:39 PM Progress: 1
08/07/2018 04:01:01 PM Progress: 1001
08/07/2018 04:01:19 PM Progress: 2001
08/07/2018 04:01:36 PM Progress: 3001
08/07/2018 04:01:52 PM Progress: 4001
08/07/2018 04:02:08 PM Progress: 5001
08/07/2018 04:02:23 PM Progress: 6001
08/07/2018 04:02:40 PM Progress: 7001
08/07/2018 04:02:55 PM Progress: 8001
08/07/2018 04:03:12 PM Progress: 9001
08/07/2018 04:03:27 PM Progress: 10001
08/07/2018 04:03:45 PM Progress: 11001
08/07/2018 04:04:00 PM Progress: 12001
08/07/2018 04:04:12 PM Progress: 13001
08/07/2018 04:04:24 PM Progress: 14001
08/07/2018 04:04:36 PM Progress: 15001
08/07/2018 04:04:48 PM Progress: 16001
08/07/2018 04:04:59 PM Progress: 17001
08/07/2018 04:05:11 PM Progress: 18001
08/07/2018 04:05:22 PM Progress: 19001
08/07/2018 04:05:34 PM Progress: 20001
08/07/2018 04:05:45 PM Progress: 21001
08/07/2018 04:05:56 PM Progress: 22001
08/07/2018 04:06:07 PM Progress: 23001
08/07/2018 04:06:18 PM Progress: 24001
08/07/2018 04:06:29 PM Progress: 25001

## Update ensemble (old)

- update to enss
- create and update Ens 

In [3]:
# Read the ensembles registered in the older 'CEMBA_snATAC' database.
engine_old = connect_sql('CEMBA_snATAC')

sql = 'SELECT * from ensembles'
df = pd.read_sql(sql, engine_old)
df.head()

Unnamed: 0,ensemble_id,ensemble_name,public_access,datasets,description,snmc_ensemble_id
0,1,CEMBA_3C_171206,0,CEMBA_3C_171206,,1
1,2,CEMBA_3C_171207,0,CEMBA_3C_171207,,2
2,100,Multimodal_test,0,"CEMBA_3C_171206,CEMBA_3C_171207,CEMBA_4B_17121...",Test multimodal visualization,100


In [133]:
# Insert the rows read from the old database (`df`, loaded in the preceding cell) into
# the `ensembles` table reachable through `engine`.
# NOTE(review): which database `engine` points at depends on the cells run before this one.
insert_into(engine, 'ensembles', df, ignore=False, verbose=True)

08/08/2018 02:28:23 PM    ensemble_id    ensemble_name  public_access  \
0            1  CEMBA_3C_171206              0   
1            2  CEMBA_3C_171207              0   
2          100  Multimodal_test              0   

                                            datasets  \
0                                    CEMBA_3C_171206   
1                                    CEMBA_3C_171207   
2  CEMBA_3C_171206,CEMBA_3C_171207,CEMBA_4B_17121...   

                     description  snmc_ensemble_id  
0                           None                 1  
1                           None                 2  
2  Test multimodal visualization               100  


<sqlalchemy.engine.result.ResultProxy at 0x7f7727e7e0b8>

In [196]:
# Copy the Ens2 ensemble table from the old database into the new one, re-keying the
# rows on the new database's cell ids.
engine_old = connect_sql('CEMBA_snATAC')
engine= connect_sql('CEMBA_ATAC')

# Ens2 rows from the old database together with their cell names
sql = 'SELECT * FROM Ens2 JOIN cells ON cells.cell_id = Ens2.cell_id'
df = pd.read_sql(sql, engine_old).drop(['cell_id', 'dataset'], axis=1)
# Move the first 15 characters to the end: '<15 chars>_<rest>' -> '<rest>_<15 chars>'.
# NOTE(review): 15/16 are magic numbers, presumably the length of a dataset name such as
# 'CEMBA_3C_171207' (15 characters) plus the '_' separator — confirm.
df['cell_name'] = [cell_name[16:]+'_'+cell_name[:15] for cell_name in df['cell_name']]

# Map the converted names onto the new database's cell ids
# (default inner join: cells without a match are dropped).
sql = 'SELECT * FROM cells'
df2 = pd.read_sql(sql, engine) 
df = pd.merge(df, df2, on='cell_name').drop(['cell_name', 'dataset'], axis=1)
# not displayed: only the last expression of a cell is rendered
df.head()

create_and_upload_to_ens('Ens2', df, 'CEMBA_ATAC')

08/10/2018 09:58:01 AM Creating Ens2 table...
08/10/2018 09:58:01 AM Done creating Ens2 table!
08/10/2018 09:58:01 AM Uploading to Ens2 table...
08/10/2018 09:58:01 AM    tsne_x_ATAC  tsne_y_ATAC  cluster_ATAC annotation_ATAC  cell_id
0     -9.35173     30.32810            13            None    92411
1    -32.81210      4.67262            19            None    92412
2      3.88788     -2.13479             8            None    92413
3     32.09870     19.15250            17            None    92415
4     -1.28046     20.56240            20            None    92416
08/10/2018 09:58:02 AM Done uploading to Ens2 table!


## Update ensemble (new)

- update to enss
- create and update Ens 

In [31]:

# Collect ensemble ids from the file names starting with 'tsne' in the ATAC directory:
# take the third '_'-separated field and drop its first 3 and last 4 characters
# (e.g. 'tsne_atac_Ens51.tsv' -> 'Ens51.tsv' -> 51).
# NOTE(review): `os` is not imported explicitly in this notebook; presumably it comes
# from `from __init__ import *` — confirm.
ens_ids = sorted([int(item.split('_')[2][3:-4]) for item in 
 filter(lambda x: (re.search(r'^tsne', x)), os.listdir('/cndd/fangming/CEMBA/ATAC'))])
ens_ids

# Keep only the ids that are not yet present in the CEMBA_ATAC `ensembles` table.
engine = connect_sql('CEMBA_ATAC')
sql = 'SELECT * FROM ensembles'
ensemble_info = pd.read_sql(sql, engine)
ens_ids = [ens_id for ens_id in ens_ids if ens_id not in ensemble_info['ensemble_id'].values]
ens_ids

[51]

In [30]:


# ens_ids = [
#     3, 
#     4, 
#     5, 
#     6, 
#     7, 
#     8, 
#     9, 
#     10, 
# ]

# Look up name and member datasets of the new ensemble ids (`ens_ids`, computed in
# another cell) in the 'CEMBA' database.
engine = connect_sql('CEMBA')
sql = 'SELECT * FROM ensembles'
ensemble_info = pd.read_sql(sql, engine)
ensemble_info = ensemble_info.set_index('ensemble_id').loc[ens_ids, :].reset_index()
ensemble_info['ens'] = ensemble_info['ensemble_id'].apply(lambda x: 'Ens{}'.format(x)) 
# the id the ensemble has in the 'CEMBA' database is recorded as snmc_ensemble_id
ensemble_info['snmc_ensemble_id'] = ensemble_info['ensemble_id'] 
# the column order chosen here fixes the tuple-unpacking order of the loop below
ensemble_info = ensemble_info[['ens', 'ensemble_name', 'datasets', 'snmc_ensemble_id']]
ensemble_info



for idx, (ens, ens_name, ens_datasets, snmc_ens_id) in ensemble_info.iterrows():

    # update to ensembles
    table_name = 'ensembles'
    ens_id = int(ens[len('Ens'):])
    dict_list = [{'ensemble_id': ens_id, 
                  'ensemble_name': ens_name, 
                  'public_access': False, 
                  'datasets': ens_datasets, 
                  'snmc_ensemble_id': snmc_ens_id, 
                 }]
    # NOTE: rebinds the notebook-level `engine` to 'CEMBA_ATAC' on every iteration
    engine= connect_sql('CEMBA_ATAC')
    insert_into_worker(engine, table_name, dict_list, ignore=False)

    
    # create and update Ens
    # per-ensemble t-SNE coordinates and cluster labels, joined on the 'sample' column
    dirc = '/cndd/fangming/CEMBA/ATAC/'
    f1 = dirc + 'tsne_atac_{}.tsv'.format(ens)
    df1 = pd.read_table(f1)
    f2 = dirc + 'cluster_atac_{}.tsv'.format(ens)
    df2 = pd.read_table(f2)
    df = pd.merge(df1, df2, on='sample').rename(columns={'sample': 'cell_name'})


    # `df2` is reused: from here on it holds the `cells` table, re-read on every iteration
    sql = 'SELECT * FROM cells'
    df2 = pd.read_sql(sql, engine) 
    # default inner join: samples without a matching cell_name are dropped
    df = (pd.merge(df, df2, on='cell_name')
          .drop(['cell_name', 'dataset'], axis=1)
          .rename(columns={
              'tsne_x': 'tsne_x_ATAC', 
              'tsne_y': 'tsne_y_ATAC', 
              'cluster_ID': 'cluster_ATAC', 
          }))
    # drop the 'cluster_' prefix and store the remainder as an integer
    df['cluster_ATAC'] = df['cluster_ATAC'].apply(lambda x: int(x[len('cluster_'):]))

    create_and_upload_to_ens(ens, df, 'CEMBA_ATAC')

In [230]:
# TEMPORARY!!!!!!!!

# f = '/cndd/fangming/CEMBA/annoj_browser/multimodal_v2/cell_info.tsv'
# ens = 'Ens100'
# df = pd.read_table(f)
# df = df[df['modality']=='atac']
# df = df.drop('modality', axis=1).rename(columns={'tsne_x': 'tsne_x_ATAC', 
#                                             'tsne_y': 'tsne_y_ATAC',
#                                             'cluster_ID': 'cluster_ATAC',
#                                             'sample': 'cell_name',
#                                            })
# sql = 'SELECT * FROM cells'
# df2 = pd.read_sql(sql, engine)
# df = pd.merge(df, df2, on='cell_name').drop(['dataset', 'cell_name'], axis=1)

# create_and_upload_to_ens(ens, df, 'CEMBA_ATAC')



Unnamed: 0,cluster_ATAC,tsne_x_ATAC,tsne_y_ATAC,cell_id
0,cluster_15,-10.438904,-34.038421,82477
1,cluster_2,23.816641,19.874356,82479
2,cluster_13,23.799659,4.783634,82480
3,cluster_1,-36.239155,5.543818,82484
4,cluster_10,-19.669854,-25.038445,82485
