In [1]:
import pickle
import pandas as pd
from pybedtools import BedTool
import matplotlib.pyplot as plt
import seaborn as sb
from collections import defaultdict
import gffutils
from BCBio import GFF
import subprocess
import os
from multiprocessing import Pool

In [6]:
# Load the precomputed lncRNA/peak correlation results.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open("../H3K27me3/lncRNA_New_Peaks_Correlations_norm.pickle", 'rb') as pickle_file:
    corr = pickle.load(pickle_file)

In [10]:
# Flatten the nested correlation lists into one flat list of records,
# skipping the sublists that are empty.
corr_no_zero = [
    record
    for sublist in corr
    if len(sublist) != 0
    for record in sublist
]

In [2]:
# Headerless tab-separated peak table; columns are addressed by position
# (column 3 holds the peak id, see the preview below).
peaks = pd.read_csv(
    "../H3K27me3/new_peaks/merged_peaks_around_genes.bed",
    sep="\t",
    header=None,
)

In [4]:
# Tab-separated table associating lncRNAs, peaks and genes
# (columns: lncRNA, peak, gene).
peaks_gene = pd.read_csv(
    "../H3K27me3/new_peaks/lncRNA_peaks_gene_association.tsv",
    sep="\t",
)

In [5]:
# Preview the first five rows of the lncRNA/peak/gene association table.
peaks_gene.head(n=5)

Unnamed: 0,lncRNA,peak,gene
0,ENSG00000093100,X005284,ENSG00000117395
1,ENSG00000093100,X005284,ENSG00000243710
2,ENSG00000093100,X005408,ENSG00000230615
3,ENSG00000093100,X006402,ENSG00000203356
4,ENSG00000093100,X021525,ENSG00000273384


In [3]:
# Preview the first five rows of the peak table.
peaks.head(n=5)

Unnamed: 0,0,1,2,3
0,chr1,54316,55724,X000001
1,chr1,61835,62068,X000004
2,chr1,64456,64692,X000005
3,chr1,87039,88835,X000007
4,chr1,89695,90136,X000008


In [21]:
# Unique lncRNA ids that have at least one correlation record.
# The (id, peak, correlation) triples live in the flattened `corr_no_zero`;
# `corr` itself is a list of (possibly empty) sublists, so unpacking its
# elements as triples fails on a fresh kernel.
rnas = {g for g, p, c in corr_no_zero}

In [12]:
# Group the flat correlation records by their first field (the lncRNA id),
# keeping the remaining (peak, correlation) pair for each record.
gene_key_corrs = defaultdict(list)
for rna_id, peak_id, corr_value in corr_no_zero:
    gene_key_corrs[rna_id].append((peak_id, corr_value))

In [23]:
# For every lncRNA, count how many of its records have a positive and how many
# a negative first correlation component (c[0] -- presumably the coefficient).
corrs_with_sign = []
for rna in rnas:
    corrs = gene_key_corrs[rna]
    plus_corrs = sum(1 for _, c in corrs if c[0] > 0)
    minus_corrs = sum(1 for _, c in corrs if c[0] < 0)
    corrs_with_sign.append((rna, plus_corrs, minus_corrs))

In [24]:
# Two rankings of the same (rna, plus_count, minus_count) tuples, largest first:
# one by the positive count, one by the negative count.
corrs_with_sign_sorted_plus = list(corrs_with_sign)
corrs_with_sign_sorted_plus.sort(key=lambda entry: entry[1], reverse=True)
corrs_with_sign_sorted_minus = list(corrs_with_sign)
corrs_with_sign_sorted_minus.sort(key=lambda entry: entry[2], reverse=True)

In [25]:
# Union of the top 20 entries from each ranking, without duplicates.
corrs = list(set(corrs_with_sign_sorted_plus[:20]) | set(corrs_with_sign_sorted_minus[:20]))

In [None]:
def runTDF(corr):
    """Run ``rgt-TDF promotertest`` for one lncRNA and return the captured stdout.

    Parameters
    ----------
    corr : str or sequence
        Either the lncRNA id itself, or a record whose first element is the
        id (for example an ``(id, plus_count, minus_count)`` tuple).

    Returns
    -------
    bytes
        Standard output of the command, as returned by ``subprocess.check_output``.

    Raises
    ------
    subprocess.CalledProcessError
        If the command exits with a non-zero status.
    """
    # A bare id string must not be indexed: corr[0] would yield only its first character.
    g = corr if isinstance(corr, str) else corr[0]
    cmd = "rgt-TDF promotertest -rm 2 -r ../H3K27me3/tdf_lncRNA_gene_sequences/" + g + "_gene_lncRNA.fa -rn " + g + " -bed ../H3K27me3/peaks_for_tdf/plus_" + g + "_peaks.bed -bg ../H3K27me3/peaks_for_tdf/ne_" + g + "_peaks.bed -o ../H3K27me3/promoter_test/ -organism hg19 -l 12 -ccf 20"
    print(cmd)
    return subprocess.check_output(cmd, shell=True)

In [16]:
# lncRNA ids handed to runTDF below (presumably the runs that have not finished yet).
not_done = [
    "ENSG00000197291.4",
    "ENSG00000251209.3",
]

In [3]:
def getConvertIds():
    """Map the lncRNA ids of the expression matrix to the gene ids used in the GFF3.

    Ids are matched on their version-less part (the text before the first
    ``'.'``). The matrix index is read from
    ``../H3K27me3/lncRNA_matrix_filtered.csv`` and the gene features from
    ``../H3K27me3/gencode.v29lift37.long_noncoding_RNAs.gff3`` (autosomes only).

    Returns
    -------
    dict
        ``{matrix_id: gff_id}``. Matrix ids that have no counterpart in the
        GFF are omitted instead of being paired with an unrelated gene.
    """
    lncRNAMatrix = pd.read_csv("../H3K27me3/lncRNA_matrix_filtered.csv", sep="\t", index_col=0)

    # hg19 ids
    lncRNAGenes = list(lncRNAMatrix.index)

    in_file = "../H3K27me3/gencode.v29lift37.long_noncoding_RNAs.gff3"

    limit_info = dict(
        gff_id = ['chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 
     'chr19', 'chr2', 'chr20', 'chr21', 'chr22', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9'],
        gff_type = ["gene"])

    # GRCh38 ids: gene feature ids from the GFF, keyed by their version-less id.
    # The first occurrence wins if the same base id appears more than once.
    gffIdByBase = {}
    with open(in_file) as in_handle:
        for rec in GFF.parse(in_handle, limit_info=limit_info):
            for feature in rec.features:
                gffIdByBase.setdefault(feature.id.split('.')[0], feature.id)

    # The GFF carries the id names of the GRCh38 version, whereas the data uses hg19 ids.
    # Pair them explicitly by base id: zipping two independently sorted lists would
    # silently shift every pair as soon as one id is missing on either side.
    convertIds = {}
    for matrixId in lncRNAGenes:
        baseId = matrixId.split('.')[0]
        if baseId in gffIdByBase:
            convertIds[matrixId] = gffIdByBase[baseId]

    return convertIds

In [79]:
def generateFilesForTDF(corrs):
    """Write the rgt-TDF input files (peak BED files and lncRNA FASTA) for each lncRNA.

    For every lncRNA this writes three BED files under
    ``../H3K27me3/peaks_for_tdf/`` (``plus_``: peaks with positive correlation,
    ``minus_``: peaks with negative correlation, ``ne_``: peaks with no
    correlation record for this lncRNA), a GFF3 with the gene and its exons,
    and runs ``gffread`` to produce the FASTA next to it.

    Parameters
    ----------
    corrs : iterable of 3-tuples
        ``(lncRNA id, plus_count, minus_count)``; only the id is used.

    Notes
    -----
    Reads the notebook-level ``gene_key_corrs`` and ``peaks`` objects.
    """
    db = gffutils.FeatureDB('../H3K27me3/long_noncoding_RNAs.db', keep_order=True)
    convertIds = getConvertIds()

    for g, _plus_count, _minus_count in corrs:
        print("lncRNA: " + g)
        gene_corrs = gene_key_corrs[g]
        plus_corrs = [peak for peak, c in gene_corrs if c[0] > 0]
        minus_corrs = [peak for peak, c in gene_corrs if c[0] < 0]

        # Run TDF as positive versus negative
        gene_plus_peaks = peaks[peaks[3].isin(plus_corrs)]
        gene_plus_peaks.to_csv("../H3K27me3/peaks_for_tdf/plus_" + g + "_peaks.bed", sep="\t", index=False, header=None)

        # Select by the negative set here (it was previously filtered by plus_corrs).
        gene_minus_peaks = peaks[peaks[3].isin(minus_corrs)]
        gene_minus_peaks.to_csv("../H3K27me3/peaks_for_tdf/minus_" + g + "_peaks.bed", sep="\t", index=False, header=None)

        ne_gene_peaks = peaks[~peaks[3].isin([peak for peak, c in gene_corrs])]
        ne_gene_peaks.to_csv("../H3K27me3/peaks_for_tdf/ne_" + g + "_peaks.bed", sep="\t", index=False, header=None)

        # Build the genomic(!) sequence of the required lncRNA
        gene = db[convertIds[g]]
        # Overwrite instead of append so that re-running does not duplicate GFF records.
        with open("../H3K27me3/tdf_lncRNA_gene_sequences/" + g + ".gff3", "w") as myfile:
            myfile.write(str(gene) + "\n")
            for exon in db.children(gene, featuretype='exon', order_by='start'):
                myfile.write(str(exon) + "\n")

        cmd = 'gffread ../H3K27me3/tdf_lncRNA_gene_sequences/' + g + '.gff3 -g ../hg19/allChr.fa -w ../H3K27me3/tdf_lncRNA_gene_sequences/' + g + '_gene_lncRNA.fa' 
        print(cmd + "\n")
        proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        # communicate() drains the pipe; wait() alone can block if the child fills it.
        proc.communicate()
        print(proc.returncode)

In [5]:
# cell for FANTOM
# Writes a GFF3 (gene + exons) for each listed lncRNA and runs gffread to get its FASTA.
lncRNA_for_fantom = ['ENSG00000230074.1', 'ENSG00000268895.1', 'ENSG00000229852.2']

db = gffutils.FeatureDB('../H3K27me3/long_noncoding_RNAs.db', keep_order=True)
convertIds = getConvertIds()

for g in lncRNA_for_fantom:
    gene = db[convertIds[g]]
    print("old: " + g + ", new: " + str(gene))
    # Overwrite instead of append so that re-running the cell does not duplicate GFF records.
    with open("../H3K27me3/tdf_lncRNA_gene_sequences/" + g + ".gff3", "w") as myfile:
        myfile.write(str(gene) + "\n")
        for i in db.children(gene, featuretype='exon', order_by='start'):
            myfile.write(str(i) + "\n")

    cmd = 'gffread ../H3K27me3/tdf_lncRNA_gene_sequences/' + g + '.gff3 -g ../hg19/allChr.fa -w ../H3K27me3/tdf_lncRNA_gene_sequences/' + g + '_gene_lncRNA.fa' 
    print(cmd + "\n")
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    # communicate() drains the pipe; wait() alone can block if the child fills it.
    p.communicate()
    print(p.returncode)

old: ENSG00000230074.1, new: chr9	HAVANA	gene	34665662	34681295	.	+	.	ID=ENSG00000230074.1;gene_id=ENSG00000230074.1_5;gene_type=antisense;gene_name=AL162231.2;level=2;havana_gene=OTTHUMG00000000451.1_5;remap_status=full_contig;remap_num_mappings=1;remap_target_status=overlap
gffread ../H3K27me3/tdf_lncRNA_gene_sequences/ENSG00000230074.1.gff3 -g ../hg19/allChr.fa -w ../H3K27me3/tdf_lncRNA_gene_sequences/ENSG00000230074.1_gene_lncRNA.fa

0
old: ENSG00000268895.1, new: chr19	HAVANA	gene	58859117	58866549	.	+	.	ID=ENSG00000268895.5;gene_id=ENSG00000268895.5_5;gene_type=antisense;gene_name=A1BG-AS1;level=2;tag=overlapping_locus;havana_gene=OTTHUMG00000183508.1_5;remap_status=full_contig;remap_num_mappings=1;remap_target_status=overlap
gffread ../H3K27me3/tdf_lncRNA_gene_sequences/ENSG00000268895.1.gff3 -g ../hg19/allChr.fa -w ../H3K27me3/tdf_lncRNA_gene_sequences/ENSG00000268895.1_gene_lncRNA.fa

0
old: ENSG00000229852.2, new: chr6	HAVANA	gene	73972938	74011124	.	+	.	ID=ENSG00000229852.2;

In [6]:
# cell for FANTOM
# For each listed lncRNA, split the peak table into the peaks associated with it
# (written as plus_*) and all remaining peaks (written as ne_*).
lncRNA_for_fantom = ['ENSG00000230074.1', 'ENSG00000268895.1', 'ENSG00000229852.2']
for rna_id in lncRNA_for_fantom:
    # The association table stores lncRNA ids without the version suffix.
    base_id = rna_id.split('.')[0]
    associated_peaks = peaks_gene.loc[peaks_gene['lncRNA'] == base_id, 'peak'].tolist()
    is_associated = peaks[3].isin(associated_peaks)

    peaks[is_associated].to_csv("../H3K27me3/peaks_for_tdf/plus_" + rna_id + "_peaks.bed", sep="\t", index=False, header=None)
    peaks[~is_associated].to_csv("../H3K27me3/peaks_for_tdf/ne_" + rna_id + "_peaks.bed", sep="\t", index=False, header=None)

In [None]:
def generate():
    """Generate the TDF input files for the top-20 lncRNAs of both rankings.

    The top 20 by positive count are processed first, followed by those of
    the top 20 by negative count that were not already covered. Everything
    goes through a single ``generateFilesForTDF`` call so the feature
    database and the id mapping are loaded once rather than once per ranking.
    """
    top_plus = corrs_with_sign_sorted_plus[0:20]
    already_covered = set(top_plus)
    minus_only = [entry for entry in corrs_with_sign_sorted_minus[0:20] if entry not in already_covered]
    generateFilesForTDF(top_plus + minus_only)

In [None]:
# Run rgt-TDF for the remaining lncRNAs in a single worker process.
pool = Pool(processes=1)
try:
    # runTDF reads the id from the first element of its argument, so wrap each
    # bare id in a 1-tuple; indexing the string itself would yield only "E".
    r = pool.map(runTDF, [(rna_id,) for rna_id in not_done])
finally:
    # Always release the worker, even if one of the runs raised.
    pool.close()
    pool.join()