In [44]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
from PIL import Image
import numpy as np

In [15]:
# Root folder under which every Venn diagram and member list in this notebook is written.
out_dir = Path("/rsrch4/home/mol_cgenesis/EMC_BIC_rsrch4/nkdang/Splicing_Projects/results/viz/venn")
# Create the folder together with any missing parents; an existing folder is left as is.
out_dir.mkdir(exist_ok=True, parents=True)

def draw_venn(sets, labels=('set1', 'set2', 'set3'), title='Untitle', out_path=None):
    """Draw a 2- or 3-set Venn diagram and write the members of every region to disk.

    Files written into ``out_path``:

    * ``venn.png`` - the diagram, saved at 300 dpi.
    * ``onlyGenes_<label>.txt`` - members found in exactly one set.
    * ``sharedGenes_<label>_<label>[_<label>].txt`` - members found in exactly
      the named sets (and in none of the others).

    Each text file holds one member per line, sorted by its string form so
    repeated runs produce identical files.

    Parameters
    ----------
    sets : sequence of 2 or 3 iterables
        The collections to compare; each one is converted with ``set()``.
    labels : sequence of str
        One label per entry of ``sets``; used in the diagram and in file names.
    title : str
        Title placed on the diagram.
    out_path : pathlib.Path, str or None
        Output directory, created if missing. ``None`` means the working
        directory at call time.

    Returns
    -------
    None on success. The string ``"Wrong number of labels"`` when
    ``len(labels) != len(sets)``; in that case nothing is drawn or written.

    Raises
    ------
    ValueError
        If ``sets`` does not contain exactly 2 or 3 collections.
    """
    n_sets = len(sets)
    if n_sets not in (2, 3):
        raise ValueError(f"draw_venn expects 2 or 3 sets, got {n_sets}")
    # Validate before touching the filesystem so a bad call leaves no empty folder behind.
    if len(labels) != n_sets:
        return "Wrong number of labels"

    out_path = Path.cwd() if out_path is None else Path(out_path)
    out_path.mkdir(parents=True, exist_ok=True)

    members = [set(group) for group in sets]

    # Region codes follow the matplotlib_venn convention: character k is '1'
    # when the region lies inside set k (e.g. '110' = in sets 1 and 2, not in 3).
    regions = {}
    for mask in range(1, 2 ** n_sets):
        code = format(mask, f"0{n_sets}b")
        inside = [group for flag, group in zip(code, members) if flag == '1']
        outside = [group for flag, group in zip(code, members) if flag == '0']
        regions[code] = set.intersection(*inside).difference(*outside)

    venn = venn3 if n_sets == 3 else venn2
    try:
        venn(subsets={code: len(region) for code, region in regions.items()},
             set_labels=labels)
        plt.title(title)
        plt.savefig(out_path.joinpath("venn.png"), dpi=300)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close()

    for code, region in regions.items():
        region_labels = [str(label) for flag, label in zip(code, labels) if flag == '1']
        prefix = "onlyGenes" if len(region_labels) == 1 else "sharedGenes"
        file_name = prefix + "_" + "_".join(region_labels) + ".txt"
        with open(out_path.joinpath(file_name), 'w') as fp:
            # str() formatting tolerates non-string members; sorting gives a stable order.
            fp.writelines(f"{item}\n" for item in sorted(region, key=str))

In [12]:
# Venn diagrams of significant DE genes for each Yalong mutant:
# three pairwise panels plus one 3-set panel per mutant.
# NOTE(review): the input folder is built as Yalong_EV_WT_{mut}_Y112A, which resolves to
# "Yalong_EV_WT_Y112A_Y112A" when mut == 'Y112A' - confirm that this directory name is intended.
for mut in ['W377A','Y112A']:
    de_root = f"/rsrch4/home/mol_cgenesis/EMC_BIC_rsrch4/nkdang/Splicing_Projects/results/DE_featureCounts/Yalong/Yalong_EV_WT_{mut}_Y112A"
    contrasts = ["WT_EV", "EV_WT", f"{mut}_EV", f"EV_{mut}", f"{mut}_WT", f"WT_{mut}"]
    # Each contrast lives in <root>/<contrast>/<contrast>_sig.csv; only the row index is kept.
    WT_EV, EV_WT, mut_EV, EV_mut, mut_WT, WT_mut = (
        pd.read_csv(f"{de_root}/{name}/{name}_sig.csv", index_col=0).index
        for name in contrasts
    )

    panels = [
        ([WT_EV, mut_EV], ("WT_EV", f"{mut}_EV")),
        ([EV_mut, WT_mut], (f"EV_{mut}", f"WT_{mut}")),
        ([EV_WT, mut_WT], ("EV_WT", f"{mut}_WT")),
        ([WT_EV, mut_EV, WT_mut], ("WT_EV", f"{mut}_EV", f"WT_{mut}")),
    ]
    for data, labels in panels:
        joined = "-".join(labels)
        draw_venn(
            data,
            labels=labels,
            title=f"Venn diagram - significant genes of {joined}",
            out_path=out_dir / "DE" / "Yalong" / f"{mut}_WT" / joined,
        )


In [13]:
# Venn diagrams of significant DE genes for the Tanner KI/KO contrasts:
# one 2-set panel (KI_WT vs KO_WT) and one 3-set panel that adds KO_KI.
tanner_root = "/rsrch4/home/mol_cgenesis/EMC_BIC_rsrch4/nkdang/Splicing_Projects/results/DE_featureCounts/Tanner/Tanner_KIKO"
# Each contrast lives in <root>/<contrast>/<contrast>_sig.csv; only the row index is kept.
KI_WT, KO_WT, KO_KI = (
    pd.read_csv(f"{tanner_root}/{name}/{name}_sig.csv", index_col=0).index
    for name in ("KI_WT", "KO_WT", "KO_KI")
)

panels = [
    ([KI_WT, KO_WT], ("KI_WT", "KO_WT")),
    ([KI_WT, KO_WT, KO_KI], ("KI_WT", "KO_WT", "KO_KI")),
]
for data, labels in panels:
    joined = "-".join(labels)
    draw_venn(
        data,
        labels=labels,
        title=f"Venn diagram - significant genes of {joined}",
        out_path=out_dir / "DE" / "Tanner" / "KOKI" / joined,
    )

In [None]:
# Venn diagrams for rMATS data
# For every alternative-splicing event type and every mutant, compare the significant
# events (PValue < 0.05 and FDR < 0.05) of the EV-vs-WT, EV-vs-mutant and WT-vs-mutant
# rMATS tables. Events are matched across tables by an identifier built from the gene
# columns plus the event-type-specific coordinate columns.
rmats_root = "/rsrch4/home/mol_cgenesis/EMC_BIC_rsrch4/nkdang/Splicing_Projects/results/rMATS/human"

# Coordinate columns that complete the event identifier, per event type.
rmats_coord_cols = {
    "SE": ['exonStart_0base', 'exonEnd',
           'upstreamES', 'upstreamEE', 'downstreamES', 'downstreamEE'],
    "RI": ['riExonStart_0base', 'riExonEnd',
           'upstreamES', 'upstreamEE', 'downstreamES', 'downstreamEE'],
    "MXE": ['1stExonStart_0base', '1stExonEnd', '2ndExonStart_0base', '2ndExonEnd',
            'upstreamES', 'upstreamEE', 'downstreamES', 'downstreamEE'],
    "A3SS": ['longExonStart_0base', 'longExonEnd',
             'shortES', 'shortEE', 'flankingES', 'flankingEE'],
    "A5SS": ['longExonStart_0base', 'longExonEnd',
             'shortES', 'shortEE', 'flankingES', 'flankingEE'],
}


def _significant_rmats_events(table_path, coord_cols, alpha=0.05):
    """Load one rMATS table and return its significant rows with a ``unique_id`` column.

    Rows are kept when both ``PValue`` and ``FDR`` are below ``alpha``. ``unique_id``
    joins GeneID, geneSymbol, chr, strand and the given coordinate columns with "_".
    The filtered frame is copied before the column is added, so the assignment never
    targets a slice of the loaded table (no SettingWithCopyWarning).
    """
    events = pd.read_csv(table_path, sep='\t')
    significant = events[(events['PValue'] < alpha) & (events['FDR'] < alpha)].copy()
    unique_id = significant['GeneID']
    for col in ['geneSymbol', 'chr', 'strand']:
        unique_id = unique_id + "_" + significant[col]
    for col in coord_cols:
        unique_id = unique_id + "_" + significant[col].astype(str)
    significant['unique_id'] = unique_id
    return significant


for alter_splice in ["SE", "RI", 'MXE', 'A3SS', 'A5SS']:
    coord_cols = rmats_coord_cols[alter_splice]
    # The EV-vs-WT table does not depend on the mutant, so it is read once per event type.
    ids_EV_WT = _significant_rmats_events(
        f"{rmats_root}/Yalong_EV_WT/{alter_splice}.MATS.JC.txt", coord_cols)['unique_id']

    for mut in ['W377A', 'Y112A']:
        ids_EV_mut = _significant_rmats_events(
            f"{rmats_root}/Yalong_EV_{mut}/{alter_splice}.MATS.JC.txt", coord_cols)['unique_id']
        ids_WT_mut = _significant_rmats_events(
            f"{rmats_root}/Yalong_WT_{mut}/{alter_splice}.MATS.JC.txt", coord_cols)['unique_id']

        panels = [
            ([ids_EV_WT, ids_EV_mut], ("WT_EV", f"{mut}_EV")),
            ([ids_EV_mut, ids_WT_mut], (f"EV_{mut}", f"WT_{mut}")),
            ([ids_EV_WT, ids_WT_mut], ("EV_WT", f"{mut}_WT")),
            ([ids_EV_WT, ids_EV_mut, ids_WT_mut], ("WT_EV", f"{mut}_EV", f"{mut}_WT")),
        ]
        for data, labels in panels:
            joined = "-".join(labels)
            draw_venn(
                data,
                labels=labels,
                title=f"Venn diagram - {alter_splice} events of {joined}",
                out_path=out_dir.joinpath("rMATS", "Yalong", f"{mut}_WT", joined + "_" + alter_splice),
            )

### MERGE VENN DIAGRAMS 

In [47]:
# Stitch the per-comparison rMATS Venn PNGs of each mutant into one grid image:
# one row per splicing event type, one column per comparison.
# The tiles are stacked as raw pixel arrays, so every venn.png must have the same size
# and mode; np.hstack / np.vstack raise ValueError otherwise.
for mut in ['W377A','Y112A']:
    venn_root = Path(f"/rsrch4/home/mol_cgenesis/EMC_BIC_rsrch4/nkdang/Splicing_Projects/results/viz/venn/rMATS/Yalong/{mut}_WT")
    # Map each output folder name (e.g. "WT_EV-W377A_EV_SE") to the venn.png inside it.
    image_mapper = {i.parent.name: i for i in venn_root.glob("**/venn.png")}

    comparisons = [f'WT_EV-{mut}_EV', f'EV_WT-{mut}_WT', f'EV_{mut}-WT_{mut}', f'WT_EV-{mut}_EV-{mut}_WT']
    grid_rows = []
    for event_type in ['SE', 'RI', 'A5SS', 'A3SS', 'MXE']:
        tiles = []
        for comparison in comparisons:
            # A missing diagram surfaces as KeyError on the folder name.
            tile_path = image_mapper[f"{comparison}_{event_type}"]
            # Copy the pixels out while the file is open so the handle is closed right away.
            with Image.open(tile_path) as tile:
                tiles.append(np.array(tile))
        grid_rows.append(np.hstack(tiles))

    image_full = Image.fromarray(np.vstack(grid_rows))
    image_full.save(venn_root / "combined.png")