In [117]:
import pandas as pd
import os
import h5py
import scipy.stats as stats
import numpy as np
from collections import defaultdict
from statsmodels.stats.multitest import multipletests

from BCBio import GFF
from Bio.Seq import Seq
from Bio.SeqUtils import GC
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation

In [9]:
targets = [("H3K27ac", "_narrow"), ("H3K27me3", ""), ("H3K36me3", ""), 
           ("H3K4me1", "_narrow"), ("H3K4me2", "_narrow"), ("H3K4me3", "_narrow"), ("H3K79me2", ""), 
           ("H3K9ac", "_narrow"), ("H3K9me3", ""), ("H4K20me1", "")]
           #, ("methylation", "")]

In [10]:
def getLncRNAGeneSet():
    
    in_file = "../all_marks/gencode.v31.long_noncoding_RNAs.gff3"
    in_handle = open(in_file)

    limit_info = dict(
        gff_id = ['chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 
     'chr19', 'chr2', 'chr20', 'chr21', 'chr22', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chrX', 'chrY'],
        gff_type = ["gene"])

    lncRNAgenes = []
    for rec in GFF.parse(in_handle, limit_info=limit_info):
        lncRNAgenes.append((rec.id, rec.features)) 

    in_handle.close()
    
    lncRNAgenesStartPositions = {
    g.id.split(".")[0]:[i, g.location.start.position, g.location.end.position, g.strand] for i, j in lncRNAgenes for g in j}
    
    return [i for i in lncRNAgenesStartPositions.keys() if i.startswith("ENSG")]

In [11]:
#Генерим бэкграунд для конкретного эксперимента: пересекаем наш список lncRNA(из которого мы выбирали)
#с экспериментальным списком генов, которые в принципе экспрессировались
#Если передаем несколько бэкграундов, значит хотим их объеденить
def getCommonBackground(*exp_backgrounds):
    lncRNASet = set(getLncRNAGeneSet())
    exp_background = None
    
    for exp in exp_backgrounds:
        lineList = [line.rstrip('\n') for line in open(exp)]
        lineList = [g for g in lineList if g.startswith("ENSG")]
        if exp_background is not None:
            exp_background = exp_background.union(set([r for l in [g.split(',') for g in lineList] for r in l]))
        else:
            exp_background = set([r for l in [g.split(',') for g in lineList] for r in l])
    
    return lncRNASet.intersection(exp_background)

In [12]:
def getCommonGencodeBackground(target, cell):
    """
    В данном случае у эксперимента бэкграунд - вся gencode v31 аннотация, а у наших корреляций бэкграунд у каждой
    метки свой - кол-во lncRNA, у которых искали корреляциий. Т.к. этот список тоже из gencode v31, то
    в качестве бэкграунда у нас просто будут lncRNA из соответствующего метке lncRNA_matrix_filtered_norm.csv
    файла.
    
    + Решили фильтровать гены по экспресии в клетке: берем только те, что экспрессируются
    """

    l = pd.read_csv("/data/mazurovev/all_marks/" + target + "/lncRNA_matrix_filtered_norm.csv", sep="\t", index_col=0)
    cell_exp = set()
    for c in cell:
        cell_exp = cell_exp.union(set([g.split(".")[0] for g in pd.read_csv("../eCLIP/" + c + ".eCLIP.detectable.genes.tsv", header=None)[0].tolist()]))
    
    # print("background size = " + str(len(set(l.index).intersection(cell_exp))))
    
    return set(l.index).intersection(cell_exp)


In [322]:
t = pd.read_csv("/data/mazurovev/all_marks/" + "H3K27ac" + "/lncRNA_matrix_filtered_norm.csv", sep="\t", index_col=0)

In [323]:
len(set(t.index).intersection(set(s)))

3094

In [319]:
pd.read_csv("../eCLIP/" + "K562" + ".eCLIP.detectable.genes.tsv", header=None).shape

(22471, 1)

In [320]:
s = [g.split(".")[0] for g in pd.read_csv("../eCLIP/" + "K562" + ".c.genes.tsv", header=None)[0].tolist()]

In [321]:
len(s)

22471

In [13]:
#Возвращаем список мишеней для конкретного белка(в конкретной клетке), которые так же есть в бэкграунде
#Если белков несколько(могут быть из разных клеток), то сливаем их в один список
def getTargetsList(*protein_files):
    targets_list = None
    
    for p in protein_files:
        print("Get targets list for " + p)
        pretargets_list = pd.read_csv("../eCLIP/" + p, sep="\t")
        #pretargets_list.head()
        pretargets_list = [r.split(',') for r in pretargets_list[pretargets_list['Gene_ID'].str.startswith("ENSG")]['Gene_ID'].tolist()]
        #print(pretargets_list)
        if targets_list is not None:
            targets_list = targets_list.union(set([r.split(".")[0] for l in pretargets_list for r in l]))
        else:
            targets_list = set([r.split(".")[0] for l in pretargets_list for r in l])
     
    #res = targets_list.intersection(background)
    
    print("Targets list len " + str(len(targets_list)))
    return targets_list

In [18]:
def getCorrsLncRNAsLists(target):
    def getCorrsLncRNAsLists(lncRNAnames, corrs):
        plus_corrs_idx = list(np.where(np.sum(corrs > 0, axis=1) == 0)[0])
        plus_hm_lncRNAs = lncRNAnames[plus_corrs_idx]
        minus_corrs_idx = list(np.where(np.sum(corrs < 0, axis=1) == 0)[0])
        minus_hm_lncRNAs = lncRNAnames[minus_corrs_idx]
        #mixture_idx = [i for i in range(data.shape[0]) if i not in (plus_corrs_idx + minus_corrs_idx)]
        #mixture_hm_lncRNAs = set(lncRNAnames[mixture_idx]).intersection(background)
   
        return {"general": lncRNAnames, "only + corrs": plus_hm_lncRNAs, "only - corrs": minus_hm_lncRNAs}
    

    print("makes count for " + target)
    if(target == "methylation"):
        corrs_lncRNAs = defaultdict(list)
        for i in range(0, 19):
            with h5py.File("/data/mazurovev/all_marks/" + target + "/lncRNA_Peaks_corrs/lncRNA_Peaks_Correlations_corrected_non_zero_" + str(i) + ".hdf5", 'r') as f:
                tmp = getCorrsLncRNAsLists(f['lncRNAs_names'][:], f['corrs_matrix'][:])
                corrs_lncRNAs["general"].append(tmp["general"])
                corrs_lncRNAs["only + corrs"].append(tmp["only + corrs"])
                corrs_lncRNAs["only - corrs"].append(tmp["only - corrs"])
    else:
        with h5py.File("/data/mazurovev/all_marks/" + target + "/lncRNA_Peaks_corrs/lncRNA_Peaks_Correlations_corrected_non_zero.hdf5", 'r') as f:
            corrs_lncRNAs = getCorrsLncRNAsLists(f['lncRNAs_names'][:], f['corrs_matrix'][:])
            
    return corrs_lncRNAs

In [19]:
corrs_lncRNAs = {}
for target in targets:
    corrs_lncRNAs[target[0]] = getCorrsLncRNAsLists(target[0])

makes count for H3K27ac
makes count for H3K27me3
makes count for H3K36me3
makes count for H3K4me1
makes count for H3K4me2
makes count for H3K4me3
makes count for H3K79me2
makes count for H3K9ac
makes count for H3K9me3
makes count for H4K20me1


In [20]:
def makeCounts(eclip, eclip_targets, cell):

    res = []
    for target in targets:
        background = getCommonGencodeBackground(target[0], cell)
        for list_type in  corrs_lncRNAs[target[0]].keys():
            type_list = set(corrs_lncRNAs[target[0]][list_type]).intersection(background)
            res.append((target[0], list_type, 
                        len(type_list), 
                        eclip, len(eclip_targets.intersection(background)), 
                        len(type_list.intersection(set(eclip_targets))),
                       len(background)))
            
                
    df = pd.DataFrame(res, columns =['Modification', 'Modification lncRNAs list type', 'Modification lncRNAs list count', 'RBP', 'RBP lncRNAs count', 'TP', 'background'])
    df['TN'] = df['background'] - df['Modification lncRNAs list count'] - df['RBP lncRNAs count'] + df['TP']
    df['FP'] = df['Modification lncRNAs list count'] - df['TP']
    df['FN'] = df['RBP lncRNAs count'] - df['TP']
    
    # Уберем строки, где TP == 0 или какой-нибудь из каунтов == 0
    df = df[(df['TP'] != 0) & (df["Modification lncRNAs list count"] != 0) & (df["RBP lncRNAs count"] != 0)]
    
    return res, df

In [21]:
def count_p_values(df):
    pv_list = []
    oddsratio_list = []
    for index, row in df.iterrows():
        oddsratio, pv = stats.fisher_exact([[row['TP'], row['FP']], [row['FN'], row['TN']]], alternative='greater')
        pv_list.append(pv)
        oddsratio_list.append(oddsratio)
    
    df['p-value'] = pv_list
    df['oddsratio'] = oddsratio_list
    correct = multipletests(pv_list, alpha=0.05, method='fdr_bh')
    df['Correction'] = correct[0]
    
    return df

In [22]:
def getResults(proteins, proteins_names, cell):
    res = []

    for protein, name in zip(proteins, proteins_names):
        if(isinstance(protein, list)):
            targets = getTargetsList(*protein)
        else:
            targets = getTargetsList(protein)

        _, counts = makeCounts(name, targets, cell)
        if counts.shape[0] != 0:
            res.append((name, count_p_values(counts)))
    
    concat = []
    without_corr = []
    for name, df in res:
        # display(df)
        concat.append(df[df['Correction'] == True])
        without_corr.append(df[df['p-value'] < 0.05])
        
    return res, pd.concat(concat), pd.concat(without_corr)

In [337]:
proteins = [f for f in os.listdir("../eCLIP/") if "txt" in f and "K562" in f and "new" in f]
proteins_names = [os.path.splitext(p)[0] for p in proteins]

res_k562, k562, p_k562 = getResults(proteins, proteins_names, ["K562"])

Get targets list for KHDRBS1new_K562.txt
Targets list len 881
Get targets list for HNRNPLnew_K562.txt
Targets list len 1837
Get targets list for TAF15new_K562.txt
Targets list len 791
Get targets list for HNRNPUnew_K562.txt
Targets list len 1122
Get targets list for HNRNPUL1new_K562.txt
Targets list len 855
Get targets list for SAFB2new_K562.txt
Targets list len 2577
Get targets list for NONOnew_K562.txt
Targets list len 2448
Get targets list for SAFBnew_K562.txt
Targets list len 1779
Get targets list for HNRNPMnew_K562.txt
Targets list len 171


In [340]:
k562.to_csv("../eCLIP/K562_norm_bg_26.10.2020.tsv", sep="\t", index=None)

In [341]:
proteins = [f for f in os.listdir("../eCLIP/") if "txt" in f and "HepG2" in f and "new" in f]
proteins_names = [os.path.splitext(p)[0] for p in proteins]

res_hepG2, hepG2, p = getResults(proteins, proteins_names, ["HepG2"])

Get targets list for HNRNPUL1new_HepG2.txt
Targets list len 77
Get targets list for ILF3new_HepG2.txt
Targets list len 717
Get targets list for HNRNPLnew_HepG2.txt
Targets list len 814
Get targets list for QKInew_HepG2.txt
Targets list len 492
Get targets list for SLTMnew_HepG2.txt
Targets list len 777
Get targets list for NCBP2new_HepG2.txt
Targets list len 97
Get targets list for CSTF2Tnew_HepG2.txt
Targets list len 2597
Get targets list for SUGP2new_HepG2.txt
Targets list len 2264
Get targets list for MATR3new_HepG2.txt
Targets list len 1094
Get targets list for CSTF2new_HepG2.txt
Targets list len 494
Get targets list for DROSHAnew_HepG2.txt
Targets list len 94
Get targets list for NKRFnew_HepG2.txt
Targets list len 260
Get targets list for SFPQnew_HepG2.txt
Targets list len 1058
Get targets list for FUSnew_HepG2.txt
Targets list len 70
Get targets list for HNRNPUnew_HepG2.txt
Targets list len 1197
Get targets list for FAM120Anew_HepG2.txt
Targets list len 72
Get targets list for HN

In [346]:
hepG2.to_csv("../eCLIP/HepG2_norm_bg_26.10.2020.tsv", sep="\t", index=None)

In [351]:
proteins_names = ['HNRNPLnew', 'HNRNPUnew', 'HNRNPUL1new', 'SAFBnew', 'TAF15new']
proteins = []

for name in proteins_names:
    proteins.append([f for f in os.listdir("../eCLIP/") if "txt" in f and name == os.path.splitext(f)[0].split('_')[0]])
    
res_union, union, p = getResults(proteins, proteins_names, ["K562", "HepG2"])

Get targets list for HNRNPLnew_K562.txt
Get targets list for HNRNPLnew_HepG2.txt
Targets list len 2146
Get targets list for HNRNPUnew_K562.txt
Get targets list for HNRNPUnew_HepG2.txt
Targets list len 1799
Get targets list for HNRNPUL1new_HepG2.txt
Get targets list for HNRNPUL1new_K562.txt
Targets list len 885
Get targets list for SAFBnew_K562.txt
Get targets list for SAFBnew_HepG2.txt
Targets list len 2497
Get targets list for TAF15new_K562.txt
Get targets list for TAF15new_HepG2.txt
Targets list len 882


In [352]:
union

Unnamed: 0,Modification,Modification lncRNAs list type,Modification lncRNAs list count,RBP,RBP lncRNAs count,TP,background,TN,FP,FN,p-value,oddsratio,Correction
0,H3K27ac,general,1335,HNRNPLnew,413,172,3769,2193,1163,241,0.00320996,1.345768,True
9,H3K4me1,general,1290,HNRNPLnew,418,166,3840,2298,1124,252,0.003224187,1.346763,True
0,H3K27ac,general,1335,HNRNPUnew,559,253,3769,2128,1082,306,1.273295e-07,1.626086,True
3,H3K27me3,general,967,HNRNPUnew,562,172,3837,2480,795,390,0.0009935819,1.375778,True
9,H3K4me1,general,1290,HNRNPUnew,563,221,3840,2208,1069,342,0.001337199,1.334712,True
15,H3K4me3,general,1168,HNRNPUnew,572,201,3929,2390,967,371,0.001440612,1.33904,True
17,H3K4me3,only - corrs,268,HNRNPUnew,572,55,3929,3144,213,517,0.003740726,1.570273,True
0,H3K27ac,general,1335,HNRNPUL1new,186,91,3769,2339,1244,95,7.233552e-05,1.801058,True
3,H3K27me3,general,967,HNRNPUL1new,187,73,3837,2756,894,114,1.327233e-05,1.974057,True
6,H3K36me3,general,879,HNRNPUL1new,187,60,3824,2818,819,127,0.00217747,1.625566,True


In [92]:
pd.concat([k562, hepG2, union]).to_csv("../eCLIP/first_exp_result.tsv", sep="\t", index=None)

In [349]:
all_proteins_hep = pd.read_csv("../eCLIP/HepG2.Gencode.eCLIP.targets.all.lncRNA.prefer_fixed.tsv", sep="\t").columns
all_proteins_k562 = pd.read_csv("../eCLIP/K562.Gencode.eCLIP.targets.all.lncRNA.prefer_fixed.tsv", sep="\t").columns

In [350]:
set(all_proteins_hep).intersection(set(all_proteins_k562))

{'Gene.ID', 'HNRNPL', 'HNRNPU', 'HNRNPUL1', 'SAFB', 'TAF15'}

In [331]:
all_proteins_k562.columns

Index(['Gene_ID', 'AARS', 'BOP1', 'CPEB4', 'DDX21', 'EXOSC5', 'FASTKD2',
       'HNRNPA1', 'HNRNPC', 'HNRNPL', 'HNRNPM', 'HNRNPU', 'HNRNPUL1', 'ILF3',
       'KHDRBS1', 'KHSRP', 'LARP4', 'LARP7', 'MATR3', 'NIPBL', 'NPM1', 'NSUN2',
       'PPIL4', 'PUS1', 'QKI', 'SAFB2', 'SAFB', 'SBDS', 'SSB', 'SUPV3L1',
       'TAF15', 'U2AF1', 'UTP18', 'WDR3', 'WDR43', 'WRN', 'XRN2', 'YWHAG',
       'ZC3H8'],
      dtype='object')

In [335]:
all_proteins_k562 = pd.read_csv("../eCLIP/K562.Gencode.eCLIP.targets.all.lncRNA.prefer_fixed.tsv", sep="\t")
all_proteins_k562 = all_proteins_k562.rename(columns={'Gene.ID': "Gene_ID"})

In [336]:
for p in all_proteins_k562.columns[1:]:
    p_df = all_proteins_k562[["Gene_ID", p]]
    p_df[p_df[p].notna()].to_csv("../eCLIP/" + p + "new_K562.txt", sep="\t", index=None)

In [None]:
#==================================================================================================================

In [None]:
#Есть список генов с  сайтами посадки SAFB в K562
#Есть список генов с пиками конкретной метки, скоррелированными со списком конкретных РНК
#Есть список РНК с которыми вяжется SAFB
#1)Аннотировать сайты посадки SAFB - нашей аннотацией, потом все равно пересекать ее
#2)Сделать бэкграунд: пересечь все транскрибирующиеся гены в K562(по идее список, которым надо аннотировать сайты) с
#нашим списком(которым аннотировали пики)
#3)Сделать список lncRNA: пересечь список K562 SAFB с нашим списком lncRNA
#4)Есть список генов с сайтом, делаем список генов с пиком конкретной метки, скоррелированный со списком из
#предыдущего пункта
#)Пересекая эти списки делаем таблицу, потом считаем тест: по тесту на метку

In [23]:
def makeCommonGeneSet():
    in_file = "../annotation/gencode.v31.annotation.gff3"
    in_handle = open(in_file)

    limit_info = dict(
        gff_id = ['chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 
     'chr19', 'chr2', 'chr20', 'chr21', 'chr22', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chrX', 'chrY'],
        gff_type = ["gene"])

    gencode31_genes = []
    for rec in GFF.parse(in_handle, limit_info=limit_info):
        gencode31_genes.append(rec.features) 

    in_handle.close()
    
    gencode31_genes = [g.qualifiers['gene_id'][0].split('.')[0] for cr in gencode31_genes for g in cr]
    
    return gencode31_genes

In [7]:
all_res = []

In [56]:
#TODO: А пересекаются ли они в принципе с генами из чипсека?
degs = defaultdict(list)

In [77]:
common_genes = [g.split(".")[0] for g in pd.read_csv("../eCLIP/" + "K562" + ".eCLIP.detectable.genes.tsv", header=None)[0].tolist()]
deg = pd.read_csv("../eCLIP/" + "K562" + ".shRNA.diff.exp.genes.tsv", sep="\t")
deg["Genes"] = [g.split(".")[0] for g in deg["Genes"]]
for protein in deg.columns:
    if protein == "Genes":
        continue
    print(protein + " " + str(len(set([g for g in deg[deg[protein].notna()]["Genes"].tolist()]).intersection(common_genes))))

TAF15 1129
NONO 60
HNRNPU 26
KHDRBS1 0
SAFB2 0
HNRNPL 10
HNRNPUL1 1
HNRNPM 1356
SAFB 12


In [89]:
common_genes = [g.split(".")[0] for g in pd.read_csv("../eCLIP/" + "HepG2" + ".eCLIP.detectable.genes.tsv", header=None)[0].tolist()]
deg = pd.read_csv("../eCLIP/" + "HepG2" + ".shRNA.diff.exp.genes.tsv", sep="\t")
deg["Genes"] = [g.split(".")[0] for g in deg["Genes"]]
for protein in deg.columns:
    if protein == "Genes":
        continue
    print(protein + " " + str(len(set([g for g in deg[deg[protein].notna()]["Genes"].tolist()]).intersection(common_genes))))

MATR3 343
NCBP2 285
QKI 342
TAF15 1934
CSTF2T 195
FUS 15
SLTM 870
FAM120A 0
CSTF2 37
HNRNPA1 136
HNRNPC 193
HNRNPU 25
ILF3 3
HNRNPL 20
HNRNPUL1 13
SFPQ 16
NKRF 173
SUGP2 188
DROSHA 21


In [66]:
deg = pd.read_csv("../eCLIP/" + "K562" + ".shRNA.diff.exp.genes.tsv", sep="\t")
deg["Genes"] = [g.split(".")[0] for g in deg["Genes"]]
for protein in ["TAF15", "NONO", "SAFB2", "HNRNPL", "HNRNPUL1", "SAFB"]:
    protein_anno = pd.read_csv("../eCLIP/ChIPseq_for_lncRNA_RBPs/" + "K562" + "/" + protein + "_opt_1_not_sorted" + "_anno.csv", sep="\t")
    protein_genes = set([f.split('.')[0] for f in protein_anno['feature']])
    protein_deg = [g for g in deg[deg[protein].notna()]["Genes"].tolist()]
    
    print(protein + ": " + str(len(set(protein_genes).intersection(set(protein_deg)))))

TAF15: 41
NONO: 18
SAFB2: 0
HNRNPL: 2
HNRNPUL1: 0
SAFB: 2


In [70]:
deg = pd.read_csv("../eCLIP/" + "HepG2" + ".shRNA.diff.exp.genes.tsv", sep="\t")
deg["Genes"] = [g.split(".")[0] for g in deg["Genes"]]
common_genes = [g.split(".")[0] for g in pd.read_csv("../eCLIP/" + "HepG2" + ".eCLIP.detectable.genes.tsv", header=None)[0].tolist()]
for protein in ["SFPQ", "HNRNPUL1"]:
    protein_anno = pd.read_csv("../eCLIP/ChIPseq_for_lncRNA_RBPs/" + "HepG2" + "/" + protein + "_opt_1_not_sorted" + "_anno.csv", sep="\t")
    protein_genes = set([f.split('.')[0] for f in protein_anno['feature']]) 
    protein_deg = [g for g in deg[deg[protein].notna()]["Genes"].tolist()]
    
    print(protein + ": " + str(len(set(protein_genes).intersection(set(protein_deg)))))

SFPQ: 0
HNRNPUL1: 1


In [None]:
#Сделать мапку с cell->HM->(protein, anno_name)

In [81]:
m = {"K562": {"H3K27ac": ["TAF15", "HNRNPUL1", "SAFB", "SAFB2"], 
          "H3K27me3": ["TAF15", "HNRNPUL1", "NONO"], 
          "H3K4me1": ["TAF15", "HNRNPUL1", "SAFB", "SAFB2"], 
          "H3K4me3": ["HNRNPUL1", "SAFB", "SAFB2"], 
          "H3K36me3": ["HNRNPUL1"]
         },
"HepG2": {"H3K27ac": ["SFPQ"],
          "H3K27me3": ["SFPQ"],
          "H3K4me1":["SFPQ", "HNRNPUL1"],
          "H3K4me3":["HNRNPUL1", "SFPQ"],
          "H3K4me2": ["HNRNPUL1"]
         }
}

In [82]:
all_res = []
all_res_deg = []

In [83]:
for cell in m.keys():
    deg = pd.read_csv("../eCLIP/" + cell + ".shRNA.diff.exp.genes.tsv", sep="\t")
    deg["Genes"] = [g.split(".")[0] for g in deg["Genes"]]
    common_genes = [g.split(".")[0] for g in pd.read_csv("../eCLIP/" + cell + ".eCLIP.detectable.genes.tsv", header=None)[0].tolist()]
    for target in m[cell].keys():
        anno = pd.read_csv("/data/mazurovev/all_marks/" + target + "/peaks_anno.csv", sep="\t") 
        common_lncRNAs = getCommonGencodeBackground(target, [cell])
        for protein in m[cell][target]:
            if protein == "NONO":
                for anno_name in [protein + "_opt_1_not_sorted", protein + "_opt_2_not_sorted"]:
                    protein_anno = pd.read_csv("../eCLIP/ChIPseq_for_lncRNA_RBPs/" + cell + "/" + anno_name + "_anno.csv", sep="\t")
                    protein_genes = set([f.split('.')[0] for f in protein_anno['feature']]).intersection(common_genes)
                    cell_protein_rnas = getTargetsList(protein + "new_" + cell + ".txt")

                    res, tp_gene_lists = makeFisher(target, common_genes, common_lncRNAs, cell_protein_rnas, protein_genes)
                    df = pd.DataFrame.from_records(res, columns = ["lncRNA", "lncRNA_genes count", "protein_genes_count", "TP", "TN", 'FP', "FN", "oddsratio", "p-value", "correction"])
                    df["protein"] = [protein]*df.shape[0]
                    df["tissue"] = [cell]*df.shape[0]
                    df["HM"] = [target]*df.shape[0]
                    df["lncRNA_name"] = get_gene_name(df["lncRNA"])['symbol']
                    df = df[df["correction"] == True]
                    all_res.append(df)

                    if protein in deg.columns:
                        deg_res, deg_tp_gene_lists = makeFisher(target, common_genes, common_lncRNAs, cell_protein_rnas, [g for g in deg[deg[protein].notna()]["Genes"].tolist()])
                        df = pd.DataFrame.from_records(deg_res, columns = ["lncRNA", "lncRNA_genes count", "deg_genes_count", "TP", "TN", 'FP', "FN", "oddsratio", "p-value", "correction"])
                        df["protein"] = [protein]*df.shape[0]
                        df["tissue"] = [cell]*df.shape[0]
                        df["HM"] = [target]*df.shape[0]
                        df["lncRNA_name"] = get_gene_name(df["lncRNA"])['symbol']
                        df = df[df["correction"] == True]
                        all_res_deg.append(df)
            else:
                anno_name = protein + "_opt_1_not_sorted"
                protein_anno = pd.read_csv("../eCLIP/ChIPseq_for_lncRNA_RBPs/" + cell + "/" + anno_name + "_anno.csv", sep="\t")
                protein_genes = set([f.split('.')[0] for f in protein_anno['feature']]).intersection(common_genes)
                cell_protein_rnas = getTargetsList(protein + "new_" + cell + ".txt")

                res, tp_gene_lists = makeFisher(target, common_genes, common_lncRNAs, cell_protein_rnas, protein_genes)
                df = pd.DataFrame.from_records(res, columns = ["lncRNA", "lncRNA_genes count", "protein_genes_count", "TP", "TN", 'FP', "FN", "oddsratio", "p-value", "correction"])
                df["protein"] = [protein]*df.shape[0]
                df["tissue"] = [cell]*df.shape[0]
                df["HM"] = [target]*df.shape[0]
                df["lncRNA_name"] = get_gene_name(df["lncRNA"])['symbol']
                df = df[df["correction"] == True]
                all_res.append(df)

                if protein in deg.columns:
                    deg_res, deg_tp_gene_lists = makeFisher(target, common_genes, common_lncRNAs, cell_protein_rnas, [g for g in deg[deg[protein].notna()]["Genes"].tolist()])
                    df = pd.DataFrame.from_records(deg_res, columns = ["lncRNA", "lncRNA_genes count", "deg_genes_count", "TP", "TN", 'FP', "FN", "oddsratio", "p-value", "correction"])
                    df["protein"] = [protein]*df.shape[0]
                    df["tissue"] = [cell]*df.shape[0]
                    df["HM"] = [target]*df.shape[0]
                    df["lncRNA_name"] = get_gene_name(df["lncRNA"])['symbol']
                    df = df[df["correction"] == True]
                    all_res_deg.append(df)
                                     
                                     

Get targets list for TAF15new_K562.txt
Targets list len 791
trolololo
corrs and RBP lncRNAs for H3K27ac: 104
[[510, 87], [17842, 4032]]
lalala
querying 1-93...done.
Finished.
1 input query terms found no hit:
	['ENSG00000224959']
trolololo
corrs and RBP lncRNAs for H3K27ac: 104
[[1074, 327], [17278, 3792]]
lalala
querying 1-93...done.
Finished.
1 input query terms found no hit:
	['ENSG00000224959']
Get targets list for HNRNPUL1new_K562.txt
Targets list len 855
trolololo
corrs and RBP lncRNAs for H3K27ac: 81
[[631, 130], [17501, 4209]]
lalala
querying 1-73...done.
Finished.
trolololo
corrs and RBP lncRNAs for H3K27ac: 81
[[1, 0], [18131, 4339]]
lalala
querying 1-73...done.
Finished.
Get targets list for SAFBnew_K562.txt
Targets list len 1779
trolololo
corrs and RBP lncRNAs for H3K27ac: 144
[[4646, 496], [13543, 3786]]
lalala
querying 1-130...done.
Finished.
1 input query terms found no hit:
	['ENSG00000279191']
trolololo
corrs and RBP lncRNAs for H3K27ac: 144
[[11, 2], [18178, 4280]]
la

In [97]:
fantom = set(['A1BG-AS1',
 'AC005592.2',
 'AC007246.3',
 'AC007879.7',
 'AC016747.3',
 'AC092295.7',
 'BOLA3-AS1',
 'CD27-AS1',
 'CTD-2587H24.5',
 'DNM3OS',
 'EMX2OS',
 'FGD5-AS1',
 'FTX',
 'JPX',
 'LINC00511',
 'LINC00654',
 'LINC00702',
 'LINC00707',
 'LINC00862',
 'LINC00886',
 'LINC00938',
 'LINC00963',
 'MAPKAPK5-AS1',
 'MEG3',
 'RAB30-AS1',
 'RP11-115C21.2',
 'RP11-137L10.6',
 'RP11-150O12.1',
 'RP11-221N13.3',
 'RP11-38L15.3',
 'RP11-395B7.4',
 'RP11-398K22.12',
 'RP11-417E7.1',
 'RP11-458D21.1',
 'RP11-539L10.3',
 'RP11-545E17.3',
 'RP11-54A9.1',
 'RP11-65J3.1',
 'RP11-660L16.2',
 'RP11-834C11.4',
 'RP13-463N16.6',
 'RP3-510D11.2',
 'RP6-109B7.3',
 'SBF2-AS1',
 'SERTAD4-AS1',
 'TMCC1-AS1',
 'WDFY3-AS2',
 'ZNF674-AS1'])

In [98]:
eclip = set(pd.concat(all_res)['lncRNA_name'])

In [99]:
fantom.intersection(eclip)

{'LINC00511', 'MEG3'}

In [100]:
len(fantom)

48

In [None]:
deg = pd.read_csv("../eCLIP/K562.shRNA.diff.exp.genes.tsv", sep="\t")
deg["Genes"] = [g.split(".")[0] for g in deg["Genes"]]

In [34]:
deg.head()

Unnamed: 0,Genes,TAF15,NONO,HNRNPU,KHDRBS1,SAFB2,HNRNPL,HNRNPUL1,HNRNPM,SAFB
0,ENSG00000000419,,,,,,,,,
1,ENSG00000000457,,,,,,,,,
2,ENSG00000000460,,,,,,,,,
3,ENSG00000000971,,,,,,,,,
4,ENSG00000001036,,,,,,,,,


In [44]:
safb_deg = [g for g in deg[deg["SAFB"].notna()]["Genes"].tolist()]

In [45]:
safb_deg

['ENSG00000104228',
 'ENSG00000104679',
 'ENSG00000106246',
 'ENSG00000116521',
 'ENSG00000118515',
 'ENSG00000130513',
 'ENSG00000143436',
 'ENSG00000151239',
 'ENSG00000160679',
 'ENSG00000163113',
 'ENSG00000163704',
 'ENSG00000165813',
 'ENSG00000225830']

In [38]:
for rna in df["lncRNA"]:
    print(len(tp_gene_lists[rna].intersection(safb_deg)))

0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [88]:
pd.concat(all_res_deg).to_csv("../eCLIP/result_with_ChIP_and_degs_09.11.2020.tsv", sep="\t", index=None)

In [408]:
pd.concat(all_res).to_csv("../eCLIP/result_with_ChIP_26.10.2020.tsv", sep="\t", index=None)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [254]:
# Брать в качестве генов только те, в которых есть пики, сколько их?
#anno = pd.read_csv("/data/mazurovev/all_marks/" + "H3K27me3" + "/peaks_anno.csv", sep="\t")

In [256]:
#Общие гены все(не только нкРНК)
# common_genes = [g.split(".")[0] for g in anno['feature'].unique()]

#В качестве общих генов берем экспрессирующиеся
#common_genes = [g.split(".")[0] for g in pd.read_csv("../eCLIP/" + "K562" + ".eCLIP.detectable.genes.tsv", header=None)[0].tolist()]

In [269]:
#lncRNAs background
#common_lncRNAs = getCommonGencodeBackground("H3K27ac", ["K562"])

In [261]:
#Аннотация чипсека safb для K562
#safb_anno = pd.read_csv("../eCLIP/SAFB_anno.csv", sep="\t")

In [262]:
#Список генов, которые входят в общий с нами список генов и в чипсик белка
#safb_genes = set([f.split('.')[0] for f in safb_anno['feature']]).intersection(common_genes)

In [264]:
#нкРНК мишени у SAFB в K562(и которые могут быть в корреляциях)
#k562_SAFB_rnas = getTargetsList("SAFBnew_K562.txt")

Get targets list for SAFBnew_K562.txt
Targets list len 1779


In [25]:
def makeFisher(target, common_genes, common_lncRNAs, protein_cell_rnas, protein_genes):
    tp_gene_lists = {}
    print("trolololo")
    #нкРНК с корреляциями у H3K27ac(и которые могут быть в K562)
    target_rnas = set(corrs_lncRNAs[target]['general']).intersection(common_lncRNAs)
    
    #нкРНК, которые и имеют корреляции с меткой и есть в мишенях белка
    lncRNAs = protein_cell_rnas.intersection(target_rnas)
    print("corrs and RBP lncRNAs for " + target + ": " + str(len(lncRNAs)))
    
    pg_association = pd.read_csv("/data/mazurovev/all_marks/" + target + "/lncRNA_peaks_gene_association.tsv", sep="\t")
    res_per_lncRNA = []
    all_lncRNAs_hm_genes = set([g.split('.')[0] for g in pg_association[pg_association['lncRNA'].isin(lncRNAs)]['gene'].unique()]).intersection(common_genes)
    for lncRNA in lncRNAs:
        #Список генов(общий список с генами белка), которые имеют корреляцию с меткой через нкРНК, которые так же мишени белка
        hm_genes = set([g.split('.')[0] for g in pg_association[pg_association['lncRNA'] == lncRNA]['gene'].unique()]).intersection(common_genes)
        if len(hm_genes) > 0:
            TP = len(hm_genes.intersection(protein_genes))
            tp_gene_lists[lncRNA] = hm_genes.intersection(protein_genes)
            TN = len(common_genes) - len(protein_genes) - len(hm_genes) + TP
            FP = len(protein_genes) - TP
            FN = len(hm_genes) - TP
            r, pv = stats.fisher_exact([[TP, FP], [FN, TN]], alternative='greater')
            res_per_lncRNA.append([lncRNA, len(hm_genes), len(protein_genes), TP, TN, FP, FN, r, pv])
    
    TP = len(all_lncRNAs_hm_genes.intersection(protein_genes))
    TN = len(common_genes) - len(protein_genes) - len(all_lncRNAs_hm_genes) + TP
    FP = len(protein_genes) - TP
    FN = len(all_lncRNAs_hm_genes) - TP
    print([[TP, FP], [FN, TN]])
    print("lalala")
    r, pv = stats.fisher_exact([[TP, FP], [FN, TN]], alternative='greater')
    # res_per_lncRNA.append(["All", len(all_lncRNAs_hm_genes), len(protein_genes), TP, TN, FP, FN, r, pv])
    
    pv_list = [r[8] for r in res_per_lncRNA]
    correct = multipletests(pv_list, alpha=0.05, method='fdr_bh')
    for i, c in enumerate(correct[0]):
        res_per_lncRNA[i].append(c)
    
    return res_per_lncRNA, tp_gene_lists

In [293]:
safb_deg = deg[["Genes", "SAFB"]]
safb_deg = [g.split(".")[0] for g in safb_deg[safb_deg["SAFB"].notna()]["Genes"].tolist()]

In [294]:
len(safb_deg)

13

In [296]:
set(tp_gene_lists['ENSG00000197536']).intersection(set(safb_deg))

{'ENSG00000116521'}

In [27]:
import mygene
def get_gene_name(gene_id):
    mg = mygene.MyGeneInfo()
    out = mg.querymany(gene_id, scopes='ensembl.gene', fields="symbol", species='human', 
                           returnall=True, as_dataframe=True)
    with_symbol = out['out']
    del with_symbol.index.name
    with_symbol['ensembl_id'] = with_symbol.index
    if 'symbol' in with_symbol.columns:
        with_symbol = with_symbol.reset_index()[['ensembl_id', 'symbol']]
        with_symbol = with_symbol[with_symbol['symbol'].notnull()]
        return with_symbol
    else:
        return None