In [22]:
import pandas as pd
import numpy as np
import mygene
import pickle
import os
from subprocess import Popen, PIPE
from collections import defaultdict
import h5py

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
#1)The valid read pairs that were mapped to genomic locations within 200 kb of each other were defined as 
#proximal interactions, which were excluded from our analysis. 
def preprocessMARGiData():
    data = pd.read_csv("../MARGI/GSM3478206_HFF_iMARGI.bedpe", sep="\t", header=None)
    data['is_proximal'] = np.abs(data[1] - data[5]) <= 200000
    data = data[data['is_proximal'] == False][[0, 1, 2, 3, 4, 5 ,6 ,7 ,8, 9]]
    data = data.rename({0: "chrom1", 1: "start1", 2: "end1", 3: "chrom2", 4: "start2", 5: "end2", 
                        6: "name", 7:"score", 8: "strand1", 9:"strand2"}, axis=1)
    
    data['name1'] = ['.']*data.shape[0]
    data['score1'] = ['.']*data.shape[0]
    
    data = data[(data['chrom1'] != 'chrM') & (data['chrom2'] != 'chrM')]
    
    data = data[['chrom1', 'start1', 'end1', 'name1', 'score1', 'strand1', 'chrom2', 'start2', 'end2', 'strand2']]
    data.to_csv("../MARGI/GSM3478206_HFF_iMARGI_not_proximal.bedpe", sep="\t", header=None, index=None)

In [None]:
#2)Надо аннотировать RNA конец, не забыть, что:
#According to iMARGI library construction design, the RNA end (Read 1) is reverse strand specific. 
#It means that when you annotate the RNA end with gene annotations, you need to reverse the strand, 
#i.e., "+" -> "-" and "-" -> "+". The DNA end (Read 2) is not strand specific.

In [None]:
def processGencodeAnnotation():
    callWithNiceOutput("gff2bed < ~/annotation/gencode.v31.annotation.gff3 > ~/annotation/gencode.v31.annotation.bed")
    callWithNiceOutput("cat ~/annotation/gencode.v31.annotation.bed |  cut -f 1,2,3,4,5,6 > ~/annotation/gencode.v31.annotation.small.bed")
    
    ann = pd.read_csv("~/annotation/gencode.v31.annotation.small.bed", sep="\t", header=None)
    ann[ann[3].str.startswith("ENSG")].to_csv("../annotation/gencode.v31.annotation.only_genes.bed", sep="\t", header=None, index=None)

In [19]:
def processFantomAnnotation():
    fantom_anno = pd.read_csv("../annotation/F6_CAT.transcript.gtf", sep="\t", header=None)
    fantom_anno[8] = [s.split(";")[0][9:-1] for s in fantom_anno[8]]
    fantom_anno = fantom_anno.sort_values([8, 3, 4])
    
    strand = fantom_anno.groupby([8]).first()[6]
    
    counts = fantom_anno.groupby([8]).count()[0].tolist()
    start_coords = [0] + list(np.cumsum(counts))
    last_start_coord = start_coords.pop()
    end_coords = [i - 1 for i in start_coords][1:] + [last_start_coord - 1]
    
    fantom_anno = pd.DataFrame.from_dict({"0": fantom_anno.iloc[start_coords][0].tolist(), 
                                      "1": fantom_anno.iloc[start_coords][3].tolist(), 
                                      "2": fantom_anno.iloc[end_coords][4].tolist(), 
                                      "3": fantom_anno.iloc[start_coords][8].tolist(),
                                      "4": ['.']*len(fantom_anno.iloc[start_coords][3].tolist()),
                                      "5": strand.tolist()})
    return fantom_anno

In [21]:
processFantomAnnotation().to_csv("../annotation/fantom_gene_annotation_with_strand.bed", header=None, index=None, sep="\t")

In [27]:
def processMARGiData(intersect, output):
    #callWithNiceOutput("bedtools intersect -a ../MARGI/GSM3478206_HFF_iMARGI_not_proximal.bedpe -b ../annotation/gencode.v31.annotation.only_genes.bed -wao -S > ../MARGI/output.bed")
    data = pd.read_csv(intersect, sep="\t", header=None)
    data = data[data[10] != '.'][[13, 6, 7, 8, 9]]
    
    #data.to_csv("../MARGI/data_with_rna_names.bed", sep="\t", header=None, index=None)
    data = data.rename({13: 'ensembl_id'}, axis=1)
    
    data['ensembl_id'] = data['ensembl_id'].apply(lambda x: x.split('.')[0])
    
    genes = [g for g in data['ensembl_id'].unique()]
    print("genes count: " + str(len(genes)))
    
    mg = mygene.MyGeneInfo()
    out = mg.querymany(genes, scopes='ensembl.gene', fields='symbol', species='human', 
                       returnall=True, as_dataframe=True)
    with_symbol = out['out']
    del with_symbol.index.name
    with_symbol['ensembl_id'] = with_symbol.index
    with_symbol = with_symbol.reset_index()[['ensembl_id', 'symbol']]
    t = pd.merge(data, with_symbol, on='ensembl_id', how='left')
    t = t[['ensembl_id', 'symbol', 6, 7, 8, 9]]
    t.to_csv(output, sep="\t", header=None, index=None)

In [None]:
processMARGiData("../MARGI/output.bed", "../MARGI/data_with_rna_names.bed")

In [28]:
processMARGiData("../MARGI/output_fantom.bed", "../MARGI/data_with_fantom_rna_names.bed")

genes count: 56879
querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35

In [1]:
def callWithNiceOutput(cmd):
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True, bufsize=-1)
    
    for line in proc.stdout:
        print("output: ")
        print(line)
        sys.stdout.flush()
    
    for line in proc.stderr:
        print("error: ")
        print(line)
        sys.stdout.flush()

In [45]:
def makeMARGiRegions(target, is_fantom_annotation=False):
    print("make margi regions for " + target)
    
    if is_fantom_annotation:
        our_lncRNAs_names_file = "our_fantom_fantom_aso_genes_association_pvalues.tsv"
        margi_file = "data_with_fantom_rna_names.bed"
        common_rnas_folder = "fantom_common_rnas"
    else:
        our_lncRNAs_names_file = "our_fantom_genes_association_pvalues.tsv"
        margi_file = "data_with_rna_names.bed"
        common_rnas_folder = "common_rnas"
        
    lncRNAsNames = pd.read_csv("../all_marks/" + target + "/" + our_lncRNAs_names_file, sep="\t")['lncRNAId']
    margi = pd.read_csv("../MARGI/" + margi_file, sep="\t", header=None)
    margi_lncRNAs = list(margi[0].unique())
    
    commonRNAs = [l for l in lncRNAsNames if l.split('_')[0] in margi_lncRNAs]
    print("common rnas count " + str(len(commonRNAs)))
    
    if not os.path.exists("../MARGI/" + target + "/" + common_rnas_folder + "/"):
        os.makedirs("../MARGI/" + target + "/" + common_rnas_folder + "/")
    
    for rna in commonRNAs:
        df = margi[margi[0] == rna][[2, 3, 4]]
        df.to_csv("../MARGI/" + target + "/" + common_rnas_folder + "/" + rna + ".bed", sep="\t", header=None, index=None)

In [46]:
def makePeaksRegions(target, methylation=False, is_fantom_annotation=False):
    print("make peak regions for " + target)
    if is_fantom_annotation:
        our_lncRNAs_names_file = "our_fantom_fantom_aso_genes_association_pvalues.tsv"
        margi_file = "data_with_fantom_rna_names.bed"
        output = "fantom_margi"
    else:
        our_lncRNAs_names_file = "our_fantom_genes_association_pvalues.tsv"
        margi_file = "data_with_rna_names.bed"
        output = "margi"
    
    peaks = pd.read_csv("../all_marks/" + target + "/merged_peaks_first_in_biosample.bed", sep="\t", header=None)
    margi_lncRNAs = set(pd.read_csv("../MARGI/" + margi_file, sep="\t", header=None)[0].unique())
    
    def part(corr, lncRNAsNames):

        if not os.path.exists("../all_marks/" + target + "/" + output):
            os.makedirs("../all_marks/" + target + "/" + output)

        for index, rna in lncRNAsNames:
            corrs = corr[index, :]
            peaks_names = ["peak_" + str(i) for i in np.nonzero(corrs)[0]]
            lnc_peaks = peaks[peaks[3].isin(peaks_names)]
            lnc_peaks.to_csv("../all_marks/" + target + "/" + output + "/" + rna + ".bed", sep="\t", header=None, index=None)
            
    
    if(methylation):
        for i in range(0, 19):
            with h5py.File("../all_marks/" + target + "/lncRNA_Peaks_corrs/lncRNA_Peaks_Correlations_corrected_non_zero_" + str(i) + ".hdf5", 'r') as f:
                lncRNAsNames = pd.read_csv("../all_marks/" + target + "/" + our_lncRNAs_names_file, sep="\t")['lncRNAId']
                commonRNAs = [(i, l) for i, l in enumerate(lncRNAsNames) if l.split('_')[0] in margi_lncRNAs]
                print("common rnas count " + str(len(commonRNAs)))
                part(f['corrs_matrix'], commonRNAs)
    else:
         with h5py.File("../all_marks/" + target + "/lncRNA_Peaks_corrs/lncRNA_Peaks_Correlations_corrected_non_zero.hdf5", 'r') as f:
                lncRNAsNames = pd.read_csv("../all_marks/" + target + "/" + our_lncRNAs_names_file, sep="\t")['lncRNAId']
                commonRNAs = [(i, l) for i, l in enumerate(lncRNAsNames) if l.split('_')[0] in margi_lncRNAs]
                print("common rnas count " + str(len(commonRNAs)))
                part(f['corrs_matrix'], commonRNAs)

In [47]:
#TODO!!!: ("H3K27ac", "_narrow")
#("H3K27ac", ""), 
targets = [("H3K27ac", ""), ("H3K27me3", ""), ("H3K36me3", ""), ("H3K4me1", "_narrow"), 
           ("H3K4me2", "_narrow"), ("H3K4me3", "_narrow"), ("H3K79me2", ""), ("H3K9ac", "_narrow"), 
           ("H3K9me3", ""), ("H4K20me1", ""), ("methylation", "")]

In [48]:
for target in targets:
    print(target[0])
    makeMARGiRegions(target[0], True)
    makePeaksRegions(target[0], methylation=(target[0] == "methylation"), is_fantom_annotation=True)

H3K27ac
make margi regions for H3K27ac
common rnas count 36
make peak regions for H3K27ac
common rnas count 36
H3K27me3
make margi regions for H3K27me3
common rnas count 12
make peak regions for H3K27me3
common rnas count 12
H3K36me3
make margi regions for H3K36me3
common rnas count 15
make peak regions for H3K36me3
common rnas count 15
H3K4me1
make margi regions for H3K4me1
common rnas count 28
make peak regions for H3K4me1
common rnas count 28
H3K4me2
make margi regions for H3K4me2
common rnas count 29
make peak regions for H3K4me2
common rnas count 29
H3K4me3
make margi regions for H3K4me3
common rnas count 35
make peak regions for H3K4me3
common rnas count 35
H3K79me2
make margi regions for H3K79me2
common rnas count 11
make peak regions for H3K79me2
common rnas count 11
H3K9ac
make margi regions for H3K9ac
common rnas count 14
make peak regions for H3K9ac
common rnas count 14
H3K9me3
make margi regions for H3K9me3
common rnas count 11
make peak regions for H3K9me3
common rnas coun

KeyboardInterrupt: 

In [None]:
#Смотрим на результаты генометрика

In [11]:
d = {}
for t in targets:
    df = pd.read_csv("../all_marks/" + t[0] + "/genometric_result_all_rnas.tsv", sep="\t")
    d[t[0]] = len(df['lnc'].unique())

In [12]:
d

{'H3K27ac': 855,
 'H3K27me3': 516,
 'H3K36me3': 539,
 'H3K4me1': 828,
 'H3K4me2': 562,
 'H3K4me3': 770,
 'H3K79me2': 394,
 'H3K9ac': 630,
 'H3K9me3': 468,
 'H4K20me1': 456}

In [9]:
def makeMatrixFromGenometricResults(target):
    frames = []
    for f in os.listdir("../all_marks/" + target + "/genometric_margi_results/"):
        if f.endswith('tsv'):
            lnc_name = f.split(".")[0]
            frame = pd.read_csv("../all_marks/" + target + "/genometric_margi_results/" + f, sep='\t')
            frame['lnc'] = [lnc_name]*frame.shape[0]
            frames.append(frame)
    
    df = pd.concat(frames)
    #df = df.set_index(['lnc', 'chr.names'])
    df.to_csv("../all_marks/" + target + "/genometric_margi_results/all_rnas.csv", sep='\t')
    one_chr_rnas = set(df['lnc'].unique()) - set(df[df['chr.names'] == 'awhole']['lnc'])
    df = df[(df['chr.names'] == 'awhole') | (df['lnc'].isin(one_chr_rnas))]
    
    df = df.applymap(lambda x: 0 if x == '<0.001' else x)
    
    df['relative.distances.ecdf.deviation.area.p.value'] = df['relative.distances.ecdf.deviation.area.p.value'].astype(float)
    df['jaccard.measure.p.value'] = df['jaccard.measure.p.value'].astype(float)
    df['scaled.absolute.min.distance.sum.p.value'] = df['scaled.absolute.min.distance.sum.p.value'].astype(float)
    
    df = df[(df['relative.distances.ks.p.value'] < 0.01) | 
            (df['relative.distances.ecdf.deviation.area.p.value'] < 0.01) | 
            ((df['scaled.absolute.min.distance.sum.p.value'] < 0.01)&
             (df['scaled.absolute.min.distance.sum.lower.tail'] == False)) | 
            ((df['jaccard.measure.p.value'] < 0.01)&(df['jaccard.measure.lower.tail'] == False)) | 
            ((df['projection.test.p.value'] < 0.01)&(df['projection.test.lower.tail'] == False))]
    
    df.to_csv("../all_marks/" + target + "/genometric_result_all_rnas.tsv", sep="\t", index=None)
    
    

In [10]:
makeMatrixFromGenometricResults("methylation")

In [14]:
df = pd.read_csv("../all_marks/" + "methylation" + "/our_fantom_genes_association_pvalues.tsv", sep="\t")
df[df['lncRNAId'].isin(pd.read_csv("../all_marks/" + "methylation" + "/genometric_result_all_rnas.tsv", sep="\t")['lnc'])]

Unnamed: 0,lncRNAId,lncRNAName,pvalue,mm_pvalue,mp_pvalue,pm_pvalue,pp_pvalue
0,ENSG00000203706,SERTAD4-AS1,7.140878e-06,2.124192e-09,0.029237,0.4358948,1.0
1,ENSG00000229847,EMX2OS,6.701828e-13,3.2057170000000003e-23,0.905064,0.7275849,1.0
2,ENSG00000229043,AC091729.9,0.270762,1.0,0.001649,0.107629,0.184658
3,ENSG00000231187,RP11-38L15.3,0.2512517,0.01753837,0.949268,0.0757132,0.184658
4,ENSG00000230630,DNM3OS,5.394249e-06,0.7826021,0.937126,5.212354e-13,0.942134
5,ENSG00000230844,ZNF674-AS1,0.003432298,7.71782e-06,0.836597,0.59827,0.754626
8,ENSG00000233901,RP11-65J3.1,0.1050915,0.3173809,1.0,3.118661e-05,0.107049
9,ENSG00000246067,RAB30-AS1,0.0003867063,0.001400992,0.073806,5.971964e-08,0.805791
10,ENSG00000271270,TMCC1-AS1,9.927673000000001e-18,2.6102700000000004e-29,0.073806,8.45422e-07,0.784685


In [17]:
t = pd.read_csv("~/all_marks/methylation/genometric_margi_results/ENSG00000230590.tsv", sep="\t")
t[t['chr.names'] == 'chrX']

Unnamed: 0,chr.names,query.population,reference.population,relative.distances.ks.p.value,relative.distances.ecdf.deviation.area,relative.distances.ecdf.area.correlation,relative.distances.ecdf.deviation.area.p.value,scaled.absolute.min.distance.sum.p.value,scaled.absolute.min.distance.sum.lower.tail,jaccard.measure.p.value,jaccard.measure.lower.tail,projection.test.p.value,projection.test.lower.tail,projection.test.obs.to.exp
22,chrX,1970,2308,0.373911,0.003579,-0.007955,0.399,0.013,False,0.045,False,0.028718,False,0.0


In [15]:
from collections import defaultdict
d = defaultdict(list)
t= {}
for target in targets:
    #makeMatrixFromGenometricResults(target[0])
    df = pd.read_csv("../all_marks/" + target[0] + "/genometric_result_all_rnas.tsv", sep="\t")
    fantom_pvalues = pd.read_csv("../all_marks/" + target[0] + "/our_fantom_genes_association_pvalues.tsv", sep="\t")
    filt = df[df['lnc'].isin(fantom_pvalues['lncRNAId'])]
    margi_rnas = filt['lnc'].tolist()
    
    pp_significant_lnc = fantom_pvalues[(fantom_pvalues['pp_pvalue'] < 0.05)&(fantom_pvalues['lncRNAId'].isin(margi_rnas))][['lncRNAId', 'lncRNAName']]
    mp_significant_lnc = fantom_pvalues[(fantom_pvalues['mp_pvalue'] < 0.05)&(fantom_pvalues['lncRNAId'].isin(margi_rnas))][['lncRNAId', 'lncRNAName']]
    pm_significant_lnc = fantom_pvalues[(fantom_pvalues['pm_pvalue'] < 0.05)&(fantom_pvalues['lncRNAId'].isin(margi_rnas))][['lncRNAId', 'lncRNAName']]
    mm_significant_lnc = fantom_pvalues[(fantom_pvalues['mm_pvalue'] < 0.05)&(fantom_pvalues['lncRNAId'].isin(margi_rnas))][['lncRNAId', 'lncRNAName']]
    significant_lnc = fantom_pvalues[(fantom_pvalues['pvalue'] < 0.05)&(fantom_pvalues['lncRNAId'].isin(margi_rnas))][['lncRNAId', 'lncRNAName']]
    
    for l in pp_significant_lnc['lncRNAName']:
        d[l].append(target[0] + "_pp")
    for l in pm_significant_lnc['lncRNAName']:
        d[l].append(target[0] + "_pm")
    for l in mp_significant_lnc['lncRNAName']:
        d[l].append(target[0] + "_mp")
    for l in mm_significant_lnc['lncRNAName']:
        d[l].append(target[0] + "_mm")
        
    t[target[0]] = len(pp_significant_lnc) + len(pm_significant_lnc) + len(mp_significant_lnc) + len(mm_significant_lnc)
    
    print(target[0])
    print("pp: " + str(len(pp_significant_lnc)) + ", pm: " + str(len(pm_significant_lnc)) + ", mp: " + str(len(mp_significant_lnc)) + ", mm: " + str(len(mm_significant_lnc)))
    print("pp: " + ", ".join(pp_significant_lnc['lncRNAName'].tolist()))
    print("pm: " + ", ".join(pm_significant_lnc['lncRNAName'].tolist()))
    print("mp: " + ", ".join(mp_significant_lnc['lncRNAName'].tolist()))
    print("mm: " + ", ".join(mm_significant_lnc['lncRNAName'].tolist()))
    
    
    

H3K27ac
pp: 0, pm: 8, mp: 5, mm: 6
pp: 
pm: LINC00702, TMCC1-AS1, RP11-395B7.4, MAGI2-AS3, DNM3OS, LINC00968, MEG3, LINC00511
mp: LINC00702, LINC00657, DNM3OS, LINC00968, MEG3
mm: LINC00702, RP11-395B7.4, MAPKAPK5-AS1, DNM3OS, LINC00968, LINC00511
H3K27me3
pp: 0, pm: 0, mp: 0, mm: 1
pp: 
pm: 
mp: 
mm: LINC00968
H3K36me3
pp: 0, pm: 3, mp: 0, mm: 0
pp: 
pm: RP11-395B7.4, CTD-2587H24.5, RP6-99M1.2
mp: 
mm: 
H3K4me1
pp: 0, pm: 3, mp: 0, mm: 3
pp: 
pm: FGD5-AS1, DNM3OS, MEG3
mp: 
mm: TMCC1-AS1, RP11-660L16.2, MEG3
H3K4me2
pp: 1, pm: 3, mp: 2, mm: 3
pp: RAB30-AS1
pm: RAB30-AS1, RP6-99M1.2, LINC00862
mp: NR2F1-AS1, RAB30-AS1
mm: BOLA3-AS1, RAB30-AS1, RP11-539L10.3
H3K4me3
pp: 0, pm: 4, mp: 0, mm: 2
pp: 
pm: LINC00702, DNM3OS, RP6-109B7.3, MEG3
mp: 
mm: SBF2-AS1, A1BG-AS1
H3K79me2
pp: 0, pm: 3, mp: 0, mm: 1
pp: 
pm: LINC00968, CD27-AS1, MEG3
mp: 
mm: CD27-AS1
H3K9ac
pp: 0, pm: 4, mp: 0, mm: 5
pp: 
pm: LINC00657, FGD5-AS1, DNM3OS, RP11-137L10.6
mp: 
mm: LINC00511, LINC00963, SBF2-AS1, FGD5-AS1,

In [16]:
t

{'H3K27ac': 19,
 'H3K27me3': 1,
 'H3K36me3': 3,
 'H3K4me1': 6,
 'H3K4me2': 9,
 'H3K4me3': 6,
 'H3K79me2': 4,
 'H3K9ac': 9,
 'H3K9me3': 2,
 'H4K20me1': 2}

In [8]:
from itertools import product
marks = ["H3K27ac", "H3K27me3", "H3K36me3", "H3K4me1", "H3K4me2", "H3K4me3", "H3K79me2", "H3K9ac", "H3K9me3", "H4K20me1"]
types = ["pp", "mp", "pm", "mm"]

In [9]:
d_df = defaultdict(list)
for k in d.keys():
    l = [(i.split('_')[0], i.split('_')[1]) for i in d[k]]
    print(l)
    for p in list(product(marks, types)):
        print(p)
        if(p in l):
            d_df[k].append(1)
        else:
            d_df[k].append(0)

[('H3K27ac', 'pm'), ('H3K27ac', 'mp'), ('H3K27ac', 'mm'), ('H3K4me3', 'pm'), ('H3K9me3', 'mm')]
('H3K27ac', 'pp')
('H3K27ac', 'mp')
('H3K27ac', 'pm')
('H3K27ac', 'mm')
('H3K27me3', 'pp')
('H3K27me3', 'mp')
('H3K27me3', 'pm')
('H3K27me3', 'mm')
('H3K36me3', 'pp')
('H3K36me3', 'mp')
('H3K36me3', 'pm')
('H3K36me3', 'mm')
('H3K4me1', 'pp')
('H3K4me1', 'mp')
('H3K4me1', 'pm')
('H3K4me1', 'mm')
('H3K4me2', 'pp')
('H3K4me2', 'mp')
('H3K4me2', 'pm')
('H3K4me2', 'mm')
('H3K4me3', 'pp')
('H3K4me3', 'mp')
('H3K4me3', 'pm')
('H3K4me3', 'mm')
('H3K79me2', 'pp')
('H3K79me2', 'mp')
('H3K79me2', 'pm')
('H3K79me2', 'mm')
('H3K9ac', 'pp')
('H3K9ac', 'mp')
('H3K9ac', 'pm')
('H3K9ac', 'mm')
('H3K9me3', 'pp')
('H3K9me3', 'mp')
('H3K9me3', 'pm')
('H3K9me3', 'mm')
('H4K20me1', 'pp')
('H4K20me1', 'mp')
('H4K20me1', 'pm')
('H4K20me1', 'mm')
[('H3K27ac', 'pm'), ('H3K4me1', 'mm')]
('H3K27ac', 'pp')
('H3K27ac', 'mp')
('H3K27ac', 'pm')
('H3K27ac', 'mm')
('H3K27me3', 'pp')
('H3K27me3', 'mp')
('H3K27me3', 'pm')
('H3

In [10]:
df = pd.DataFrame.from_dict(d_df, orient="index")
df.columns = pd.MultiIndex.from_product([marks, types], names=['target', 'type'])

In [11]:
df

target,H3K27ac,H3K27ac,H3K27ac,H3K27ac,H3K27me3,H3K27me3,H3K27me3,H3K27me3,H3K36me3,H3K36me3,H3K36me3,H3K36me3,H3K4me1,H3K4me1,H3K4me1,H3K4me1,H3K4me2,H3K4me2,H3K4me2,H3K4me2,H3K4me3,H3K4me3,H3K4me3,H3K4me3,H3K79me2,H3K79me2,H3K79me2,H3K79me2,H3K9ac,H3K9ac,H3K9ac,H3K9ac,H3K9me3,H3K9me3,H3K9me3,H3K9me3,H4K20me1,H4K20me1,H4K20me1,H4K20me1
type,pp,mp,pm,mm,pp,mp,pm,mm,pp,mp,pm,mm,pp,mp,pm,mm,pp,mp,pm,mm,pp,mp,pm,mm,pp,mp,pm,mm,pp,mp,pm,mm,pp,mp,pm,mm,pp,mp,pm,mm
LINC00702,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
TMCC1-AS1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
RP11-395B7.4,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
MAGI2-AS3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
DNM3OS,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
LINC00968,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
MEG3,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
LINC00511,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
LINC00657,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
MAPKAPK5-AS1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
rna_target = df.index.to_frame()
rna_target = rna_target.reset_index()[[0]]
rna_target.columns = ['lncRNAName']

for i in range(0, 10):
    rna_target[marks[i]] = list(df.iloc[:, i*4:(i*4 + 4)].apply(lambda x : np.max(x), axis=1))
    
rna_target = rna_target.set_index('lncRNAName')
del rna_target.index.name

In [13]:
rna_target

Unnamed: 0,H3K27ac,H3K27me3,H3K36me3,H3K4me1,H3K4me2,H3K4me3,H3K79me2,H3K9ac,H3K9me3,H4K20me1
LINC00702,1,0,0,0,0,1,0,0,1,0
TMCC1-AS1,1,0,0,1,0,0,0,0,0,0
RP11-395B7.4,1,0,1,0,0,0,0,0,0,0
MAGI2-AS3,1,0,0,0,0,0,0,0,0,1
DNM3OS,1,0,0,1,0,1,0,1,0,0
LINC00968,1,1,0,0,0,0,1,0,0,0
MEG3,1,0,0,1,0,1,1,0,0,0
LINC00511,1,0,0,0,0,0,0,1,0,0
LINC00657,1,0,0,0,0,0,0,1,0,0
MAPKAPK5-AS1,1,0,0,0,0,0,0,0,0,0


In [18]:
def color_negative_red(val):
        try:
            color = 'red' if val == 1 else 'black'
        except Exception:
            color = 'black'
        return 'color: %s' % color

In [19]:
makeHtmlTable(rna_target.style.applymap(color_negative_red), "/home/mazurovev/iMARGI_results.html")

In [16]:
def makeHtmlTable(s, path):
    with open(path, 'w') as f:
        for index, item in enumerate(s.render().split("\n")):
            if(index == 0):
                f.write('<style  type="text/css" >\n')
                f.write('\n')
                f.write('table, th, td {\n')
                f.write('border: 1px solid black;\n')
                f.write('border-collapse: collapse;\n')
                f.write('font-size: 11px;\n')
                f.write('}\n')
                f.write('\n')
                f.write('th,\n')
                f.write('td {\n')
                f.write('border: 1px solid black;\n')
                f.write('width: 100px;\n')
                f.write('height: 25px;\n')
                f.write('text-align:center;\n')
                f.write('font-family: Montserrat;\n')
                f.write('overflow: hidden;\n')
                f.write('}\n')
                f.write('\n')
                f.write('tr:nth-child(even) {\n')
                f.write('background-color: #ffe6e6\n')
                f.write('}\n')
                f.write('\n')
            else:
                f.write("%s\n" % item)

In [14]:
rna_target.apply(lambda x: sum(x))

H3K27ac     10
H3K27me3     1
H3K36me3     3
H3K4me1      5
H3K4me2      6
H3K4me3      6
H3K79me2     3
H3K9ac       8
H3K9me3      2
H4K20me1     2
dtype: int64

In [15]:
rna_target.apply(lambda x: sum(x), axis=1)

LINC00702        3
TMCC1-AS1        2
RP11-395B7.4     2
MAGI2-AS3        2
DNM3OS           4
LINC00968        3
MEG3             4
LINC00511        2
LINC00657        2
MAPKAPK5-AS1     1
CTD-2587H24.5    1
RP6-99M1.2       2
FGD5-AS1         2
RP11-660L16.2    1
RAB30-AS1        1
LINC00862        1
NR2F1-AS1        1
BOLA3-AS1        1
RP11-539L10.3    1
RP6-109B7.3      1
SBF2-AS1         2
A1BG-AS1         1
CD27-AS1         1
RP11-137L10.6    1
LINC00963        1
AC016747.3       1
AC005592.2       1
RP11-65J3.1      1
dtype: int64