In [30]:
import mygene
from neo4j import GraphDatabase
from py2neo import Graph
import pandas as pd

In [2]:
def get_gene_name(gene_id, client=None):
    """Map Ensembl gene IDs to gene symbols via a MyGene.info ``querymany`` call.

    Parameters
    ----------
    gene_id : str or list of str
        Ensembl gene identifier(s); forwarded to ``querymany`` unchanged.
    client : object, optional
        Object exposing a ``querymany`` method compatible with
        ``mygene.MyGeneInfo``. When omitted, a new ``mygene.MyGeneInfo()``
        is created for this call.

    Returns
    -------
    pandas.DataFrame or None
        Frame with the columns ``ensembl_id`` and ``symbol`` containing only
        the rows whose symbol is not null, or ``None`` when the response
        frame has no ``symbol`` column.
    """
    if client is None:
        client = mygene.MyGeneInfo()
    response = client.querymany(gene_id, scopes='ensembl.gene', fields="symbol",
                                species='human', returnall=True, as_dataframe=True)
    hits = response['out']
    if 'symbol' not in hits.columns:
        return None
    # The queried IDs are the frame's index. Copy them into a regular column and
    # discard the index itself; this avoids `del hits.index.name`, which raises
    # AttributeError on current pandas, and leaves the response frame untouched.
    mapping = hits[['symbol']].assign(ensembl_id=hits.index)
    mapping = mapping.reset_index(drop=True)[['ensembl_id', 'symbol']]
    return mapping[mapping['symbol'].notnull()]

In [3]:
def _ids(tx, query):
    result = tx.run(query)
    return [record["id"] for record in result]

In [6]:
# Connect to the Neo4j instance on localhost over Bolt; no auth argument is passed.
driver = GraphDatabase.driver("bolt://localhost:7687")
# Session used by the read_transaction cell below.
# NOTE(review): neither the session nor the driver is closed in the visible cells.
neo4j_db = driver.session()

In [7]:
# Cypher query returning the ensembl_id property of every Gene node under the
# alias "id", which is the key _ids reads from each record.
cypher = "match (g: Gene) return g.ensembl_id as id"

In [8]:
# Execute the query through the _ids transaction function; later cells iterate
# over `result` as a collection of id strings.
# NOTE(review): recent neo4j drivers deprecate read_transaction in favour of
# execute_read; confirm against the installed driver version.
result = neo4j_db.read_transaction(_ids, cypher)

In [12]:
# Number of gene IDs that do not contain the substring "CAT".
sum(1 for ensembl_id in result if "CAT" not in ensembl_id)

39458

In [13]:
# Only IDs without the substring "CAT" are sent to the symbol lookup.
non_cat_ids = [ensembl_id for ensembl_id in result if "CAT" not in ensembl_id]
names = get_gene_name(non_cat_ids)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35000...done.
queryin

In [14]:
# Preview the first rows of the ensembl_id -> symbol mapping.
names.head()

Unnamed: 0,ensembl_id,symbol
0,ENSG00000116251,RPL22
1,ENSG00000125944,HNRNPR
2,ENSG00000116675,DNAJC6
3,ENSG00000213625,LEPROT
4,ENSG00000122482,ZNF644


In [16]:
# py2neo handle created with default connection settings (no arguments passed).
# NOTE(review): presumably this targets the same local Neo4j instance as `driver`; confirm.
graph = Graph()

In [29]:
# Write gene symbols back to the graph: every row is matched to a Gene node by
# ensembl_id, and the row's symbol is stored in that node's `name` property.
statement = """
          UNWIND $rows as row
          MATCH (g:Gene) WHERE g.ensembl_id=row.ensembl_id
          SET g.name = row.symbol 
"""
tx = graph.begin(autocommit=True)
# Build the parameter list in a single pass. The previous
# `params = params + [...]` inside iterrows() copied the whole list on every
# iteration, which is quadratic in the number of genes.
params = names[['ensembl_id', 'symbol']].to_dict('records')
tx.evaluate(statement, rows=params)

  


In [31]:
# Load the ChIP-seq table and the biosample metadata.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-type DtypeWarning.
# NOTE(review): absolute, user-specific paths; consider a configurable data directory.
chip = pd.read_csv("/data/mazurovev/all_marks/ChiP-Seq_for_all_marks_all_replicas.tsv", sep="\t", low_memory=False)
data_origin = pd.read_csv("/data/mazurovev/all_marks/metadata_many_marks_hg38.tsv", sep="\t", low_memory=False)
# Only the biosample id -> name columns are used from the metadata.
data_origin = data_origin[['Biosample term id', 'Biosample term name']]

# d maps each histone mark to a tuple:
#   (number of distinct biosample ids with an experiment targeting "<mark>-human",
#    array of the distinct biosample names recorded for those ids).
d = {}
for mark in ["H3K27ac", "H3K27me3", "H3K36me3", "H3K4me1", "H3K4me2", "H3K4me3", "H3K9ac", "H3K9me3", "H3K79me2", "H4K20me1"]:
    biosample_ids = chip[chip['Experiment target'] == mark + "-human"]['Biosample term id'].unique()
    biosample_names = data_origin[data_origin['Biosample term id'].isin(biosample_ids)]['Biosample term name'].unique()
    d[mark] = (len(biosample_ids), biosample_names)

  interactivity=interactivity, compiler=compiler, result=result)


In [41]:
# Name recorded on the first metadata row whose biosample id is CL:0002324.
data_origin.loc[data_origin['Biosample term id'] == 'CL:0002324', 'Biosample term name'].iloc[0]

'myoepithelial cell of mammary gland'

In [44]:
# One {"id", "name"} record per distinct biosample id, taking the name from the
# first row in which that id appears (order of first appearance is preserved).
# drop_duplicates replaces the per-id re-filtering of the whole frame and the
# repeated list concatenation, both of which were quadratic. Rows with a
# missing id are skipped; they could never match on equality and previously
# raised IndexError.
first_row_per_id = (
    data_origin
    .dropna(subset=['Biosample term id'])
    .drop_duplicates(subset='Biosample term id')
)
params = [
    {"id": term_id, "name": term_name}
    for term_id, term_name in zip(first_row_per_id['Biosample term id'],
                                  first_row_per_id['Biosample term name'])
]

In [45]:
# Display the full list of {"id", "name"} records built above.
params

[{'id': 'CL:0002324', 'name': 'myoepithelial cell of mammary gland'},
 {'id': 'UBERON:0000320', 'name': 'duodenal mucosa'},
 {'id': 'UBERON:0012488', 'name': 'muscle layer of duodenum'},
 {'id': 'EFO:0002067', 'name': 'K562'},
 {'id': 'UBERON:0000948', 'name': 'heart'},
 {'id': 'EFO:0001086', 'name': 'A549'},
 {'id': 'NTR:0003830', 'name': 'mid-neurogenesis radial glial cells'},
 {'id': 'EFO:0002713', 'name': 'Panc1'},
 {'id': 'CL:0002326', 'name': 'luminal epithelial cell of mammary gland'},
 {'id': 'EFO:0001260', 'name': 'WI38'},
 {'id': 'UBERON:0014455', 'name': 'subcutaneous abdominal adipose tissue'},
 {'id': 'UBERON:0002106', 'name': 'spleen'},
 {'id': 'UBERON:0001134', 'name': 'skeletal muscle tissue'},
 {'id': 'EFO:0002779', 'name': 'BJ'},
 {'id': 'CL:0000594', 'name': 'skeletal muscle satellite cell'},
 {'id': 'CL:0000625', 'name': 'CD8-positive, alpha-beta T cell'},
 {'id': 'UBERON:0002107', 'name': 'liver'},
 {'id': 'EFO:0007105', 'name': 'iPS-18a'},
 {'id': 'UBERON:0012489'

In [46]:
# Cypher statement: for every {id, name} row, match the Tissue node whose
# encode_id equals row.id and store row.name in that node's `name` property.
statement = """
          UNWIND $rows as row
          MATCH (t:Tissue) WHERE t.encode_id=row.id
          SET t.name = row.name
"""
# Transaction requested with autocommit=True; the statement is evaluated once
# with the whole `params` list bound to $rows.
tx = graph.begin(autocommit=True)
tx.evaluate(statement, rows=params)

  
