In [1]:
import pandas as pd
import subprocess
from Bio import SeqIO
import pickle
import os
from pybedtools import BedTool
from multiprocessing import Pool
from itertools import chain
import h5py

In [61]:
rnas = pd.read_csv("../all_marks/H3K27ac/our_fantom_genes_association_pvalues.tsv", sep="\t")

In [3]:
rnas_with_prefixes = []
for i, row in rnas.iterrows():
    if(row['pvalue'] < 0.05):
        rnas_with_prefixes.append((row['lncRNAId'], '', ''))
    if(row['mm_pvalue'] < 0.05):
        rnas_with_prefixes.append((row['lncRNAId'], '', '_m'))
        rnas_with_prefixes.append((row['lncRNAId'], '_m', 'm'))
    if(row['mp_pvalue'] < 0.05):
        rnas_with_prefixes.append((row['lncRNAId'], '', '_p'))
        rnas_with_prefixes.append((row['lncRNAId'], '_m', 'p'))
    if(row['pm_pvalue'] < 0.05):
        rnas_with_prefixes.append((row['lncRNAId'], '', '_m'))
        rnas_with_prefixes.append((row['lncRNAId'], '_p', 'm'))
    if(row['pp_pvalue'] < 0.05):
        rnas_with_prefixes.append((row['lncRNAId'], '', '_p'))
        rnas_with_prefixes.append((row['lncRNAId'], '_p', 'p'))
rnas_with_prefixes = set(rnas_with_prefixes)

In [36]:
len(rnas_with_prefixes)

92

In [2]:
def generateTranscriptFiles():
    fasta_sequences = SeqIO.parse(open("/home/ymedvedeva/FANTOM6/data/private/F6_CAT.DMFB.target.all_transcript.fa"), 'fasta')
    for fasta in fasta_sequences:
        print(fasta.id.split('|')[1])
        with open("../fantom6/transcripts/" + fasta.id.split('|')[1] + ".fa", "w") as output_handle:
            SeqIO.write(fasta, output_handle, "fasta")

In [None]:
#generateFilesForTests(['ENSG00000231312', 'ENSG00000214548', 'ENSG00000246273', 'ENSG00000260032'])

In [26]:
#generate proms and background files for promoter and region tests
def generateFilesForTests(target, rnas):
    fantom = pd.read_csv("../fantom6/oligo_DE_Summary_gene_filtered.tsv", sep="\t")
    
    #В качестве бэкграунда берем все деги(а не просто все фантомовские гены)
    fantom_all_genes = set(fantom['geneID'].unique())
    
    #Уже проверено, что гены из общего списка
    with open("../all_marks/" + target + "/our_fantom_genes_association.pickle", 'rb') as f:
        our_fantom_genes = pickle.load(f)
        
    if not os.path.exists("../all_marks/" + target + "/fantom_tdf/"):
        os.makedirs("../all_marks/" + target + "/fantom_tdf/")
        
    fantom_ann = pd.read_csv("/home/ymedvedeva/FANTOM6/data/public/FANTOM5-CAT/F6_CAT.gene.info.tsv", sep="\t")
    fantom_ann = fantom_ann[['cntg', 'geneStart', 'geneEnd', 'geneID', 'geneName', 'strnd']]
    
    for rna in rnas:
        #Все деги
        genes = set(our_fantom_genes[rna]['fantom'])
        #Все деги c положительным fc
        genes_p = set(our_fantom_genes[rna]['fantom_plus'])
        #Все деги c положительным fc
        genes_m = set(our_fantom_genes[rna]['fantom_minus'])
        #Все деги c положительным fc и положительной корреляцией
        genes_pp = set(our_fantom_genes[rna]['fantom_plus']).intersection(set(our_fantom_genes[rna]['our_plus']))
        #Все деги c положительным fc и отрицательной корреляцией
        genes_pm = set(our_fantom_genes[rna]['fantom_minus']).intersection(set(our_fantom_genes[rna]['our_plus']))
        #Все деги c отрицательным fc и положительной корреляцией
        genes_mp = set(our_fantom_genes[rna]['fantom_plus']).intersection(set(our_fantom_genes[rna]['our_minus']))
        #Все деги c отрицательным fc и отрицательной корреляцией
        genes_mm = set(our_fantom_genes[rna]['fantom_minus']).intersection(set(our_fantom_genes[rna]['our_minus']))

        background_genes = fantom_all_genes - genes
        background_genes_p = fantom_all_genes - genes_p
        background_genes_m = fantom_all_genes - genes_m
        background_genes_pp = fantom_all_genes - genes_pp
        background_genes_pm = fantom_all_genes - genes_pm
        background_genes_mp = fantom_all_genes - genes_mp
        background_genes_mm = fantom_all_genes - genes_mm

        genes_bed = fantom_ann[fantom_ann['geneID'].isin(genes)]
        background_genes_bed = fantom_ann[fantom_ann['geneID'].isin(background_genes)]

        genes_bed_p = fantom_ann[fantom_ann['geneID'].isin(genes_p)]
        background_genes_bed_p = fantom_ann[fantom_ann['geneID'].isin(background_genes_p)]
        
        genes_bed_m = fantom_ann[fantom_ann['geneID'].isin(genes_m)]
        background_genes_bed_m = fantom_ann[fantom_ann['geneID'].isin(background_genes_m)]

        genes_bed_pp = fantom_ann[fantom_ann['geneID'].isin(genes_pp)]
        background_genes_bed_pp = fantom_ann[fantom_ann['geneID'].isin(background_genes_pp)]
        
        genes_bed_pm = fantom_ann[fantom_ann['geneID'].isin(genes_pm)]
        background_genes_bed_pm = fantom_ann[fantom_ann['geneID'].isin(background_genes_pm)]
        
        genes_bed_mp = fantom_ann[fantom_ann['geneID'].isin(genes_mp)]
        background_genes_bed_mp = fantom_ann[fantom_ann['geneID'].isin(background_genes_mp)]
        
        genes_bed_mm = fantom_ann[fantom_ann['geneID'].isin(genes_mm)]
        background_genes_bed_mm = fantom_ann[fantom_ann['geneID'].isin(background_genes_mm)]

        proms = getProms(genes_bed)
        background_proms = getProms(background_genes_bed)

        proms.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_genes_proms.bed", sep="\t", header=None, index=None)
        background_proms.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_proms.bed", sep="\t", header=None, index=None)

        proms_p = getProms(genes_bed_p)
        background_proms_p = getProms(background_genes_bed_p)

        proms_p.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_genes_p_proms.bed", sep="\t", header=None, index=None)
        background_proms_p.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_p_proms.bed", sep="\t", header=None, index=None)
        
        proms_m = getProms(genes_bed_m)
        background_proms_m = getProms(background_genes_bed_m)

        proms_m.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_genes_m_proms.bed", sep="\t", header=None, index=None)
        background_proms_m.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_m_proms.bed", sep="\t", header=None, index=None)

        proms_pp = getProms(genes_bed_pp)
        background_proms_pp = getProms(background_genes_bed_pp)

        proms_pp.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_genes_pp_proms.bed", sep="\t", header=None, index=None)
        background_proms_pp.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_pp_proms.bed", sep="\t", header=None, index=None)
        
        proms_pm = getProms(genes_bed_pm)
        background_proms_pm = getProms(background_genes_bed_pm)

        proms_pm.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_genes_pm_proms.bed", sep="\t", header=None, index=None)
        background_proms_pm.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_pm_proms.bed", sep="\t", header=None, index=None)
        
        proms_mp = getProms(genes_bed_mp)
        background_proms_mp = getProms(background_genes_bed_mp)

        proms_mp.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_genes_mp_proms.bed", sep="\t", header=None, index=None)
        background_proms_mp.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_mp_proms.bed", sep="\t", header=None, index=None)
        
        proms_mm = getProms(genes_bed_mm)
        background_proms_mm = getProms(background_genes_bed_mm)

        proms_mm.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_genes_mm_proms.bed", sep="\t", header=None, index=None)
        background_proms_mm.to_csv("../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_mm_proms.bed", sep="\t", header=None, index=None)

        #for region test
        generateBackgroundForRegionTest(target, rna, "../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_proms.bed")
        generateBackgroundForRegionTest(target, rna + "_p", "../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_p_proms.bed")
        generateBackgroundForRegionTest(target, rna + "_m", "../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_m_proms.bed")
        generateBackgroundForRegionTest(target, rna + "_pp", "../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_pp_proms.bed")
        generateBackgroundForRegionTest(target, rna + "_pm", "../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_pm_proms.bed")
        generateBackgroundForRegionTest(target, rna + "_mp", "../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_mp_proms.bed")
        generateBackgroundForRegionTest(target, rna + "_mm", "../all_marks/" + target + "/fantom_tdf/" + rna + "_background_genes_mm_proms.bed")

In [27]:
generateFilesForTests("H3K27ac", [i[0] for i in rnas_with_prefixes])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [6]:
def getProms(bed):
    bed['startProm'] = bed['geneStart'] - 500
    bed['startProm'] = bed['startProm'].apply(lambda x: 0 if x < 0 else x)
    bed['endProm'] = bed['geneStart'] + 500
    
    return bed[['cntg', 'startProm', 'endProm', 'geneID', 'geneName', 'strnd']]

In [24]:
def runTDFpromoterTest(rna, prefix):
    cmd = "rgt-TDF promotertest -rm 2 -r ../fantom6/transcripts/" + rna + ".fa -rn " + rna + prefix + " -bed ../H3K27me3/fantom_tdf/" + rna + "_genes" + prefix + "_proms.bed -bg ../H3K27me3/fantom_tdf/" + rna + "_background_genes" + prefix + "_proms.bed -o ../H3K27me3/fantom_tdf/promoter_test/ -organism hg19 -l 12 -ccf 20"
    print(cmd)
    return subprocess.check_output(cmd, shell=True)

In [34]:
def runTDFregionTest(rna, prefix_our, prefix_fantom, target="H3K27ac"):
    cmd = "rgt-TDF regiontest -r ../fantom6/transcripts/" + rna + ".fa -rn " + rna + prefix_our + prefix_fantom + " -bed ../all_marks/" + target + "/fantom_tdf/" + rna + "_genes" + prefix_our + prefix_fantom + "_proms.bed -f ../all_marks/" + target + "/fantom_tdf/bg_" + rna + prefix_our + prefix_fantom + ".bed -o ../all_marks/" + target + "/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n " + str(1000)
    print(cmd)
    return subprocess.check_output(cmd, shell=True)

In [14]:
#bedtools subtract -a A.bed -b B.bed
#A - B = substract
#геном - промоторы нетаргетных дегов, тогда в качестве бэкграунда будут браться только промоторы нетаргетных дегов(т.е. то, что вычли)
def generateBackgroundForRegionTest(target, rna, nontarget_promoters):
    proms = BedTool(nontarget_promoters)
    hg19 = BedTool('../hg38/chrom.sizes')
    hg19.subtract(proms, output='../all_marks/' + target + '/fantom_tdf/bg_' + rna + '.bed')

In [28]:
def runTDF(rna_prefix):
    print(runTDFregionTest(rna_prefix[0], rna_prefix[1], rna_prefix[2]))

In [35]:
pool = Pool(processes=5)
res = pool.map(runTDF, rnas_with_prefixes)

pool.close()
pool.join()

rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000204054.fa -rn ENSG00000204054 -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000204054_genes_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000204054.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000271270.fa -rn ENSG00000271270 -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000271270_genes_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000271270.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000268895.fa -rn ENSG00000268895_mm -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000268895_genes_mm_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000268895_mm.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
rgt-TDF regiontest -r ../fantom6/transcr

rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000214548.fa -rn ENSG00000214548 -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000214548_genes_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000214548.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000246430.fa -rn ENSG00000246430_m -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000246430_genes_m_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000246430_m.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
b'\n*************** Genomic Region Test ***************\n*** Input RNA sequence: /home/mazurovev/fantom6/transcripts/ENSG00000246430.fa\n*** Input regions in BED: ENSG00000246430_genes_m_proms.bed\n*** Number of randomization: 1000\n*** Output directory: ENSG00000246430_m\nStep 1: Calculate the triplex forming sites on RNA and DNA.\n\tRunning time: 0:00:05\n

rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000225733.fa -rn ENSG00000225733_mm -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000225733_genes_mm_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000225733_mm.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
b'\n*************** Genomic Region Test ***************\n*** Input RNA sequence: /home/mazurovev/fantom6/transcripts/ENSG00000225733.fa\n*** Input regions in BED: ENSG00000225733_genes_mm_proms.bed\n*** Number of randomization: 1000\n*** Output directory: ENSG00000225733_mm\nStep 1: Calculate the triplex forming sites on RNA and DNA.\n\tRunning time: 0:00:00\nStep 2: Permutation by randomization the target regions for 1000 times.\n*** Find no DBD having DBS with cutoff = 50\n'
rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000223485.fa -rn ENSG00000223485_m -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000223485_genes_m_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/

rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000223485.fa -rn ENSG00000223485 -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000223485_genes_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000223485.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000256268.fa -rn ENSG00000256268_m -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000256268_genes_m_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000256268_m.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
b'\n*************** Genomic Region Test ***************\n*** Input RNA sequence: /home/mazurovev/fantom6/transcripts/ENSG00000256268.fa\n*** Input regions in BED: ENSG00000256268_genes_m_proms.bed\n*** Number of randomization: 1000\n*** Output directory: ENSG00000256268_m\nStep 1: Calculate the triplex forming sites on RNA and DNA.\n\tRunning time: 0:00:02\n

b'\n*************** Genomic Region Test ***************\n*** Input RNA sequence: /home/mazurovev/fantom6/transcripts/ENSG00000233117.fa\n*** Input regions in BED: ENSG00000233117_genes_mm_proms.bed\n*** Number of randomization: 1000\n*** Output directory: ENSG00000233117_mm\nStep 1: Calculate the triplex forming sites on RNA and DNA.\n\tRunning time: 0:00:00\nStep 2: Permutation by randomization the target regions for 1000 times.\n*** Find no DBD having DBS with cutoff = 50\n'
rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000225733.fa -rn ENSG00000225733_pm -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000225733_genes_pm_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000225733_pm.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
b'\n*************** Genomic Region Test ***************\n*** Input RNA sequence: /home/mazurovev/fantom6/transcripts/ENSG00000225733.fa\n*** Input regions in BED: ENSG00000225733_genes_pm_prom

rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000223485.fa -rn ENSG00000223485_mm -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000223485_genes_mm_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000223485_mm.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
b'\n*************** Genomic Region Test ***************\n*** Input RNA sequence: /home/mazurovev/fantom6/transcripts/ENSG00000223485.fa\n*** Input regions in BED: ENSG00000223485_genes_mm_proms.bed\n*** Number of randomization: 1000\n*** Output directory: ENSG00000223485_mm\nStep 1: Calculate the triplex forming sites on RNA and DNA.\n\tRunning time: 0:00:00\nStep 2: Permutation by randomization the target regions for 1000 times.\n*** Find no DBD having DBS with cutoff = 50\n'
rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000223485.fa -rn ENSG00000223485_pp -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000223485_genes_pp_proms.bed -f ../all_marks/H3K27ac/fantom_td

rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000229647.fa -rn ENSG00000229647 -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000229647_genes_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000229647.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
b'\n*************** Genomic Region Test ***************\n*** Input RNA sequence: /home/mazurovev/fantom6/transcripts/ENSG00000229647.fa\n*** Input regions in BED: ENSG00000229647_genes_proms.bed\n*** Number of randomization: 1000\n*** Output directory: ENSG00000229647\nStep 1: Calculate the triplex forming sites on RNA and DNA.\n\tRunning time: 0:00:02\nStep 2: Permutation by randomization the target regions for 1000 times.\n*** Find no DBD having DBS with cutoff = 50\n'
rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000268895.fa -rn ENSG00000268895_p -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000268895_genes_p_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000268

rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000229647.fa -rn ENSG00000229647_mm -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000229647_genes_mm_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000229647_mm.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
b'\n*************** Genomic Region Test ***************\n*** Input RNA sequence: /home/mazurovev/fantom6/transcripts/ENSG00000229647.fa\n*** Input regions in BED: ENSG00000229647_genes_mm_proms.bed\n*** Number of randomization: 1000\n*** Output directory: ENSG00000229647_mm\nStep 1: Calculate the triplex forming sites on RNA and DNA.\n\tRunning time: 0:00:00\nStep 2: Permutation by randomization the target regions for 1000 times.\n*** Find no DBD having DBS with cutoff = 50\n'
rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000234608.fa -rn ENSG00000234608_mm -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000234608_genes_mm_proms.bed -f ../all_marks/H3K27ac/fantom_td

rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000229852.fa -rn ENSG00000229852 -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000229852_genes_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000229852.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000246430.fa -rn ENSG00000246430_mp -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000246430_genes_mp_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000246430_mp.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000
b'\n*************** Genomic Region Test ***************\n*** Input RNA sequence: /home/mazurovev/fantom6/transcripts/ENSG00000246430.fa\n*** Input regions in BED: ENSG00000246430_genes_mp_proms.bed\n*** Number of randomization: 1000\n*** Output directory: ENSG00000246430_mp\nStep 1: Calculate the triplex forming sites on RNA and DNA.\n\tRunning time: 0:00



CalledProcessError: Command 'rgt-TDF regiontest -r ../fantom6/transcripts/ENSG00000253161.fa -rn ENSG00000253161_pm -bed ../all_marks/H3K27ac/fantom_tdf/ENSG00000253161_genes_pm_proms.bed -f ../all_marks/H3K27ac/fantom_tdf/bg_ENSG00000253161_pm.bed -o ../all_marks/H3K27ac/fantom_tdf/region_test/ -organism hg38 -l 12 -e 10 -obed -rt -mp 6 -ccf 50 -n1000' returned non-zero exit status 1.

In [37]:
def tdfDataToHtml(target):
    import os
    folders = os.listdir("../all_marks/" + target + "/fantom_tdf/region_test")

    for path in folders:
        print(path)
        try:
            #os.chmod("../H3K27me3/fantom_tdf/region_test/" + path + "/data.tsv", 0o777)
            getHtmlTableWithTDFResults("../all_marks/" + target + "/fantom_tdf/region_test/" + path)
        except FileNotFoundError: 
            print('Oops, ' + path + '/data.tsv not found')

In [38]:
def getHtmlTableWithTDFResults(path):
    def color_negative_red(row):
        val = row[5]
        try:
            color = 'red' if val < 0.05 else 'black'
        except Exception:
            color = 'black'
        return ['color: black']*5 + ['color: %s' % color] + ['color: black']
    
    def processTable(df):
        new = df['DBD'].str.split("-", n = 1, expand = True) 
        new[0] = pd.to_numeric(new[0])
        new[1] = pd.to_numeric(new[1])
        
        df['startDBD'] = new[0]
        df['endDBD'] = new[1]
        
        df.sort_values(by=['startDBD'], ascending=True, inplace=True)
        df.drop(columns=['startDBD', 'endDBD'], inplace=True)
        df = df[df['p-value'] < 0.05]
        df = df.reset_index(drop=True)
        
        return df
    
    df = processTable(pd.read_csv(path + "/data.tsv", sep="\t"))
    #s = df.style.apply(color_negative_red, axis=1)
    s = df.style
    
    with open(path + "/data_table.html", 'w') as f:
        for index, item in enumerate(s.render().split("\n")):
            if(index == 0):
                f.write('<style  type="text/css" >\n')
                f.write('\n')
                f.write('table, th, td {\n')
                f.write('border: 1px solid black;\n')
                f.write('border-collapse: collapse;\n')
                f.write('font-size: 11px;\n')
                f.write('}\n')
                f.write('\n')
                f.write('th,\n')
                f.write('td {\n')
                f.write('border: 1px solid black;\n')
                f.write('width: 100px;\n')
                f.write('height: 25px;\n')
                f.write('text-align:center;\n')
                f.write('font-family: Montserrat;\n')
                f.write('overflow: hidden;\n')
                f.write('}\n')
                f.write('\n')
                f.write('tr:nth-child(even) {\n')
                f.write('background-color: #ffe6e6\n')
                f.write('}\n')
                f.write('\n')
            else:
                f.write("%s\n" % item)

In [39]:
def processTable(df):
        new = df['DBD'].str.split("-", n = 1, expand = True) 
        new[0] = pd.to_numeric(new[0])
        new[1] = pd.to_numeric(new[1])
        
        df['startDBD'] = new[0]
        df['endDBD'] = new[1]
        
        df.sort_values(by=['startDBD'], ascending=True, inplace=True)
        df.drop(columns=['startDBD', 'endDBD'], inplace=True)
        df = df[df['p-value'] < 0.05]
        df = df.reset_index(drop=True)
        
        return df

In [40]:
tdfDataToHtml("H3K27ac")

ENSG00000234608_m
Oops, ENSG00000234608_m/data.tsv not found
ENSG00000256268_m
Oops, ENSG00000256268_m/data.tsv not found
ENSG00000229647
Oops, ENSG00000229647/data.tsv not found
ENSG00000233117_mm
Oops, ENSG00000233117_mm/data.tsv not found
ENSG00000233117_p
ENSG00000268895_p
ENSG00000223485_p
ENSG00000212978_pm
Oops, ENSG00000212978_pm/data.tsv not found
ENSG00000229852
ENSG00000257219_m
Oops, ENSG00000257219_m/data.tsv not found
ENSG00000204054_pm
Oops, ENSG00000204054_pm/data.tsv not found
ENSG00000246430
ENSG00000240875_mm
Oops, ENSG00000240875_mm/data.tsv not found
ENSG00000268895_m
ENSG00000229852_m
ENSG00000227053
ENSG00000223485_mm
Oops, ENSG00000223485_mm/data.tsv not found
ENSG00000227053_m
Oops, ENSG00000227053_m/data.tsv not found
ENSG00000230630_mm
Oops, ENSG00000230630_mm/data.tsv not found
ENSG00000214548_mp
Oops, ENSG00000214548_mp/data.tsv not found
ENSG00000227053_mm
Oops, ENSG00000227053_mm/data.tsv not found
ENSG00000256268
Oops, ENSG00000256268/data.tsv not found


In [41]:
h3k27ac_ccf_50_res = ['ENSG00000233117_p','ENSG00000268895_p','ENSG00000223485_p','ENSG00000229852',
                      'ENSG00000246430','ENSG00000268895_m','ENSG00000229852_m','ENSG00000227053',
                      'ENSG00000233396_p','ENSG00000214548_p','ENSG00000212978','ENSG00000230630_p',
                      'ENSG00000225733_m','ENSG00000260032_p','ENSG00000268895','ENSG00000260032',
                      'ENSG00000212978_m','ENSG00000204054_m','ENSG00000233396','ENSG00000230630',
                      'ENSG00000223485_m','ENSG00000233117_m','ENSG00000225733','ENSG00000214548',
                      'ENSG00000233396_m','ENSG00000271270','ENSG00000234608','ENSG00000240875_m',
                      'ENSG00000204054','ENSG00000230630_m','ENSG00000233117','ENSG00000240875',
                      'ENSG00000223485','ENSG00000271270_m']

In [49]:
res = []
for f in h3k27ac_ccf_50_res:
    tmp = pd.read_csv("../all_marks/H3K27ac/fantom_tdf/region_test/" + f + "/data.tsv", sep="\t")
    tmp = tmp[tmp['p-value'] < 0.05]
    if(tmp.shape[0] > 0):
        tmp['lnc'] = [f]*tmp.shape[0]
        res.append(tmp)

In [56]:
res = pd.concat(res, ignore_index=True)

In [57]:
res.sort_values(['lnc'])

Unnamed: 0,DBD,tr_with DBS,tr_without DBS,ntr_with DBS(average),ntr_with DBS(std),p-value,z-score,lnc
21,2406-2425,68,882,45.1,6.29,0.000,3.63,ENSG00000212978
20,1537-1558,73,877,58.9,7.25,0.024,1.94,ENSG00000212978
19,1980-2011,289,661,250.2,13.40,0.001,2.88,ENSG00000212978
47,9-28,79,3084,63.2,7.09,0.012,2.24,ENSG00000214548
18,9-28,51,1883,40.9,5.86,0.031,1.72,ENSG00000214548_p
64,84-96,724,5848,496.3,15.80,0.000,14.40,ENSG00000223485
65,108-129,108,6464,85.2,6.75,0.000,3.39,ENSG00000223485
67,240-252,68,6504,43.0,4.81,0.000,5.20,ENSG00000223485
68,339-356,75,6497,56.0,5.56,0.000,3.42,ENSG00000223485
69,733-749,155,6417,113.0,7.79,0.000,5.40,ENSG00000223485


In [60]:
sorted(list(res['lnc'].unique()))

['ENSG00000212978',
 'ENSG00000214548',
 'ENSG00000214548_p',
 'ENSG00000223485',
 'ENSG00000223485_m',
 'ENSG00000223485_p',
 'ENSG00000225733',
 'ENSG00000229852',
 'ENSG00000229852_m',
 'ENSG00000230630',
 'ENSG00000230630_m',
 'ENSG00000233117',
 'ENSG00000233117_p',
 'ENSG00000233396',
 'ENSG00000233396_m',
 'ENSG00000233396_p',
 'ENSG00000240875',
 'ENSG00000260032',
 'ENSG00000268895',
 'ENSG00000268895_m',
 'ENSG00000268895_p',
 'ENSG00000271270',
 'ENSG00000271270_m']

In [64]:
fantom_lnc_tdf = ['ENSG00000212978', 'ENSG00000214548', 'ENSG00000223485', 'ENSG00000225733', 
                 'ENSG00000229852', 'ENSG00000230630', 'ENSG00000233117', 'ENSG00000233396',
                 'ENSG00000240875', 'ENSG00000260032', 'ENSG00000268895', 'ENSG00000271270']

In [65]:
rnas[rnas['lncRNAId'].isin(fantom_lnc_tdf)]

Unnamed: 0,lncRNAId,lncRNAName,pvalue,mm_pvalue,mp_pvalue,pm_pvalue,pp_pvalue
0,ENSG00000233117,LINC00702,4.692277e-08,8.878292e-28,5.876098e-07,6.352252e-06,1.0
1,ENSG00000212978,AC016747.3,7.310864e-05,0.432655,1.0,8.976846e-14,0.361277
2,ENSG00000260032,LINC00657,4.423346e-05,0.460041,6.213641e-06,1.0,0.173855
3,ENSG00000223485,RP11-417E7.1,1.468296e-11,2.535745e-31,4.4553329999999996e-20,1.157244e-37,5e-06
4,ENSG00000271270,TMCC1-AS1,0.004082303,0.21427,1.0,2.122146e-05,1.0
6,ENSG00000268895,A1BG-AS1,4.290017e-11,1.476438e-22,1.866638e-07,3.154033e-20,0.238645
8,ENSG00000225733,FGD5-AS1,3.378603e-09,0.002163507,1.0,4.037441e-16,1.0
9,ENSG00000240875,LINC00886,0.006833321,0.002163507,1.0,0.9296805,0.378362
13,ENSG00000230630,DNM3OS,1.0068e-15,6.105785e-36,3.586202e-05,0.002390112,0.0851
15,ENSG00000214548,MEG3,1.203857e-12,1.0,7.552614e-13,0.01639964,0.295756


In [None]:
#1)Посчитать отдельно с двумя буквами варианты, найти подходящий ccf
#2)Статистически проверить что отношение таргетов к общему становится уже с увеличением фильтрации
#3)Подбирать ccf динамически(пункт 1), возможно нужны параметры G и C для триплексатора