In [None]:
import networkx as nx
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.patches as patches
import matplotlib.font_manager
from scipy.stats import pearsonr
from scipy.stats import linregress
from matplotlib import pyplot as plt
import matplotlib as mpl
from pycirclize import Circos
# Register the Arial font file so that the 'Arial' family can be resolved by matplotlib.
matplotlib.font_manager.fontManager.addfont('/h/tianyi/TS_datasets_reversion/Cell_Press_plot/Arial.ttf')

# Notebook-wide figure defaults. `plt.rcParams` and `mpl.rcParams` refer to the
# same settings object, so everything is applied through one update call.
mpl.rcParams.update({
    # fonttype 42 embeds TrueType fonts, keeping text editable in the exported PDF/PS.
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'font.family': 'Arial',
    'font.size': 8,
    'axes.linewidth': 0.5,
    # Force every text element to pure black.
    'text.color': 'black',
    'axes.labelcolor': 'black',
    'xtick.color': 'black',
    'ytick.color': 'black',
    'axes.titlecolor': 'black',
    'legend.labelcolor': 'black',
})

In [None]:
## log10 score distribution and lorenz curve plot
# Per-mutation summary table. Columns read below: 'tissue_num', 'count_sum',
# 'max_log10' and 'tissue_name'.
driver_cancer_max_prevalance_plot_all_color = pd.read_csv(
    '/h/tianyi/TS_datasets_reversion/sci_paper_plot_stas/data_statistic/driver_mut_distribution_gini/driver_cancer_max_prevalance_plot_all_color.txt',
    sep='\t',
)

# Rows whose 'count_sum' is greater than 4.
driver_cancer_max_prevalance_plot_all_color_large4 = driver_cancer_max_prevalance_plot_all_color.loc[
    driver_cancer_max_prevalance_plot_all_color['count_sum'] > 4
]

# Rows with tissue_num == 1, ordered by ascending max_log10.
tissue_one = driver_cancer_max_prevalance_plot_all_color.loc[
    driver_cancer_max_prevalance_plot_all_color['tissue_num'] == 1
].sort_values('max_log10')

# The 50 rows with the largest tissue_num (ascending sort, reversed, first 50).
tissue_num = (
    driver_cancer_max_prevalance_plot_all_color
    .sort_values('tissue_num')
    .iloc[::-1]
    .head(50)
)

# Rows sitting at the minimum max_log10, ordered by tissue_num.
mut_one = driver_cancer_max_prevalance_plot_all_color.loc[
    driver_cancer_max_prevalance_plot_all_color['max_log10']
    == driver_cancer_max_prevalance_plot_all_color['max_log10'].min()
].sort_values('tissue_num')

# Tissue -> colour lookup used as the seaborn palette.
tissue_color = pd.read_csv(
    '/h/tianyi/TS_datasets_reversion/sci_paper_plot_stas/data_statistic/donor_number_circos/tissue_color.txt',
    sep='\t',
)
color_palette = (
    tissue_color[['tissue_name', 'tissue_cor_v2']]
    .drop_duplicates()
    .set_index('tissue_name')['tissue_cor_v2']
    .to_dict()
)

# Scatter of max log10 score against the number of driver cancer types, coloured by tissue.
fig, axes = plt.subplots(1, 1, figsize=(8.5/2, 3.55))
plt.rcParams['axes.linewidth'] = 0.5
sns.scatterplot(
    data=driver_cancer_max_prevalance_plot_all_color,
    x='tissue_num',
    y='max_log10',
    hue='tissue_name',
    palette=color_palette,
    legend=False,
    s=10,
    ax=axes,
)
axes.set_xlabel('Number of driver cancer types', fontsize=8, fontname='Arial')
axes.set_ylabel(r'Max $\log_{10}$(B score) in driver cancer types', fontsize=8, fontname='Arial')
axes.set_xlim(0, 22)
axes.set_xticks(range(0, 23, 5))
# Anchor the y axis at zero while keeping the automatically chosen upper limit.
axes.set_ylim(0, axes.get_ylim()[1])
axes.spines['right'].set_visible(False)
axes.spines['top'].set_visible(False)
axes.tick_params(axis='both', labelsize=8)
plt.savefig('driver_mut_distribution_gini_log10_v3.pdf', dpi=300, bbox_inches='tight')



# Inputs for the per-mutation Lorenz plots: per-tissue score table and the Gini table.
primary_bscore = pd.read_csv(
    '/h/tianyi/TS_datasets_reversion/primary/mutation_profile/merge_mut_profile/log10_tissue.txt',
    sep='\t',
)
driver_gini = pd.read_csv(
    '/h/tianyi/TS_datasets_reversion/primary/gini_count_cancer/gini_count4_driver_tissue_v2.txt',
    sep='\t',
)

# Attach the tissue display name / colour to every score row (inner join on the tissue code).
primary_bscore_color = primary_bscore.merge(
    tissue_color, left_on='Cancer_oncotree', right_on='tissue'
)

# One row per distinct (tissue name, colour) pair, with the two columns renamed.
tissue_all = (
    primary_bscore_color[['tissue_name', 'tissue_cor_v2']]
    .drop_duplicates()
    .rename(columns={'tissue_name': 'tissue_type', 'tissue_cor_v2': 'tissue_color'})
)

def lorenz_curve_plot(mut_id):
    """Draw the per-tissue score profile and Lorenz curve of one driver mutation.

    Reads the module-level frames ``driver_gini``, ``primary_bscore_color`` and
    ``tissue_all`` and the ``color_palette`` mapping. Produces a two-panel
    figure (left: bar plot of ``-log10_b_score`` per tissue; right: Lorenz
    curve of the same values) and saves it as ``{mut_id}_gini_index.pdf`` in
    the current working directory.

    Parameters
    ----------
    mut_id : str
        Mutation identifier, matched against ``driver_gini['Driver_Hgvsp']``
        and ``primary_bscore_color['hgvsp']`` (e.g. ``'KRAS:p.G12D'``).

    Raises
    ------
    ValueError
        If ``mut_id`` has no row in ``driver_gini``, or if all of its
        per-tissue scores are zero (the Lorenz curve is undefined then).
    """
    gini_values = driver_gini.loc[driver_gini['Driver_Hgvsp'] == mut_id, 'Gini']
    if gini_values.empty:
        raise ValueError(f'No Gini index found for mutation {mut_id!r}')
    # Take the scalar explicitly; float(Series) is deprecated in recent pandas.
    gini_index = round(float(gini_values.iloc[0]), 2)

    pri_count_mut = primary_bscore_color[primary_bscore_color['hgvsp'] == mut_id]
    # Left-join onto the full tissue list so that tissues without this mutation
    # are kept; their missing values (score included) become 0.
    tissue_profile = pd.merge(
        tissue_all, pri_count_mut,
        left_on='tissue_type', right_on='tissue_name', how='left',
    ).fillna(0)
    # .copy() so the column assignment below works on an independent frame
    # (avoids SettingWithCopyWarning).
    tissue_profile = tissue_profile[['tissue_type', 'hgvsp', '-log10_b_score', 'tissue_color']].copy()
    tissue_profile['-log10_b_score'] = tissue_profile['-log10_b_score'].astype(float)
    tissue_profile = tissue_profile.sort_values(['-log10_b_score'])
    sorted_scores = tissue_profile['-log10_b_score'].to_numpy()

    total_score = sorted_scores.sum()
    if total_score == 0:
        # Guard before creating the figure: dividing by zero would give a NaN curve.
        raise ValueError(f'All per-tissue scores are zero for mutation {mut_id!r}')

    fig, axes = plt.subplots(1, 2, figsize=((8.5/2), 1.3))
    plt.subplots_adjust(wspace=0.3)
    plt.rcParams['axes.linewidth'] = 0.5

    # --- Left panel: score per tissue, in ascending order ---
    sns.barplot(data=tissue_profile, x='tissue_type', y='-log10_b_score',
                palette=color_palette, ax=axes[0])
    axes[0].spines['top'].set_visible(False)
    axes[0].spines['right'].set_visible(False)
    axes[0].tick_params(labelrotation=90, axis='x', labelsize=8)
    axes[0].set_title('Tissue-specific mutations', fontsize=8, fontname='Arial')
    axes[0].set_ylabel('-log10(B score)', fontsize=8, fontname='Arial', labelpad=1)
    # Tissue names are hidden; only the axis label 'Tissue' is shown.
    axes[0].tick_params(bottom=False, labelbottom=False)
    axes[0].set_xlabel('Tissue', fontsize=8, fontname='Arial')
    axes[0].set_yticklabels(axes[0].get_yticks(), fontsize=8, fontname='Arial')

    # --- Right panel: Lorenz curve ---
    # y: cumulative share of the total score, with a leading 0 so the curve starts at the origin.
    cumulative_share = np.insert(sorted_scores.cumsum() / total_score, 0, 0)
    # x: cumulative fraction of tissue types (0 .. 1).
    tissue_fraction = np.arange(cumulative_share.size) / (cumulative_share.size - 1)
    # x/y must be keywords: seaborn >= 0.12 accepts only `data` positionally.
    sns.lineplot(x=tissue_fraction, y=cumulative_share, color='#E37869', ax=axes[1])
    # Line of perfect equality, then the bottom and right edges of the unit square.
    sns.lineplot(x=np.array([0, 1]), y=np.array([0, 1]), color=sns.color_palette()[0], ax=axes[1])
    sns.lineplot(x=np.array([0, 1]), y=np.array([0, 0]), color='black', ax=axes[1])
    sns.lineplot(x=np.array([1, 1]), y=np.array([0, 1]), color='black', ax=axes[1])
    axes[1].set_title(f'Gini index: {gini_index}', fontsize=8, fontname='Arial')
    axes[1].spines['top'].set_visible(False)
    axes[1].spines['right'].set_visible(False)
    # Labels follow the plotted data: x is the tissue fraction, y the score share.
    axes[1].set_xlabel('Percentage of tissue type', fontsize=8, fontname='Arial')
    axes[1].set_ylabel('Percentage of mutation rate', fontsize=8, fontname='Arial', labelpad=1)
    # Shade the triangle underneath the equality line.
    axes[1].fill_between([1, 0], [0, 0], [1, 0], color='#7DA4C2')
    axes[1].set_xlim(0, 1)
    axes[1].set_ylim(0, 1)
    axes[1].tick_params(axis='both', labelsize=8)  # tick label size for both axes
    plt.savefig(f'{mut_id}_gini_index.pdf', dpi=300, bbox_inches='tight')

# Render the Lorenz figures. `primary_met_plot` is not defined in this notebook
# and would raise NameError on a fresh kernel; the plotting function defined
# here is `lorenz_curve_plot`.
# NOTE(review): confirm that `lorenz_curve_plot` is the intended target.
for driver_mutation in ('KRAS:p.G12D', 'EGFR:p.L858R'):
    lorenz_curve_plot(driver_mutation)

# Scratch variable from interactive debugging; only the last assigned value is kept.
# NOTE(review): 'BRAF:p.V600E' was assigned here as well but never plotted.
mut_id = 'EGFR:p.L858R'

In [None]:
## mutation Gini distribution
# Kernel density of the per-mutation Gini index, with a dashed reference line at 0.8.
driver_gini_distri=pd.read_excel('/h/tianyi/TS_datasets_reversion/sci_paper_plot_stas/supplyment_figure/mutation_gini/Driver mutation Gini index v3.xlsx')
fig,axes=plt.subplots(1,1,figsize=(9.25,1.3))
plt.subplots_adjust(wspace=0.4)
plt.rcParams['axes.linewidth'] = 0.5
# `fill=True` replaces the deprecated `shade=True` (same filled-curve rendering).
sns.kdeplot(x=driver_gini_distri['Gini index (by adjusted background mutation frequency)'],fill=True,color="#EE7B6C",ax=axes)
axes.spines['top'].set_visible(False)
axes.spines['right'].set_visible(False)
axes.set_ylabel('Probability density',fontsize=8, fontname='Arial')
axes.set_xlabel('Gini index',fontsize=8, fontname='Arial')
axes.axvline(x=0.8, color='#80AACA', linestyle='--')
# Only the 0.6-1 range of the x axis is shown.
axes.set_xlim(0.6,1)
axes.tick_params(axis='both', labelsize=8)
plt.savefig('/h/tianyi/TS_datasets_reversion/sci_paper_plot_stas/data_statistic/driver_mut_distribution_gini/mut_kdeplot_v2.pdf',dpi=300,bbox_inches='tight')


In [None]:
## ts no ts mut exp num
library(dplyr)
library(paletteer)
library(ggplot2)
library(ggpubr)
library("ggsci")
library(stringr)

setwd('G:\\课题\\tissue_specificity_manuscript\\素材\\ts_no_ts_mut_exp_num')

# Read a tab-separated table with a header row. comment.char = "" disables
# comment parsing, so '#' characters inside the data are kept. The outer
# data.frame() call is kept deliberately: it uses its default
# check.names = TRUE, so column names are still passed through make.names().
read_ts_table <- function(path) {
  data.frame(read.table(path, sep = '\t', header = TRUE, comment.char = "", check.names = FALSE))
}

# Axis styling shared by both bar charts: no legend, no grid, thin axis lines
# and ticks, 8 pt black sans-serif tick labels.
ts_bar_theme <- theme(legend.position = "none",
        axis.line.y = element_line(size = (0.5/1.07)*0.5),
        axis.line.x = element_line(size = (0.5/1.07)*0.5),
        axis.ticks.y = element_line(size = (0.5/1.07)*0.5),
        axis.ticks.x = element_line(size = (0.5/1.07)*0.5),
        panel.grid = element_blank(),
        axis.text.x = element_text(size = 8, family = "sans", color = "black"),
        axis.text.y = element_text(size = 8, family = "sans", color = "black"))

## Stacked bar: ratio of TS vs. non-TS driver mutations per tissue
ts_mut_number=read_ts_table('mut_ts_num_ratio_v2.txt')
ts_mut_number_sort=ts_mut_number[order(ts_mut_number$Gene_counts,decreasing = F),]
# NOTE(review): the ordering is stored in the levels of `tissue`, but the plot
# maps `tissue_name` to y, so this factor does not control the order of the
# plotted bars unless the two columns are tied together elsewhere - confirm.
ts_mut_number_sort$tissue=factor(ts_mut_number_sort$tissue, levels = unique(ts_mut_number_sort$tissue))

pdf('mut_ts_num_nolegend_ratio_v3.pdf',width=8.5/2, height=11/4)
# Explicit print() so the plot reaches the PDF device even when the code is
# run through source() or inside a function (no auto-printing there).
print(
  ggplot(data = ts_mut_number_sort,aes(x=Gene_ratio,y=tissue_name ,group=classification  ))+
    geom_bar(stat = "identity" , position="stack", width =0.5 ,aes(fill=classification ) ) +  theme_minimal()+
    scale_fill_manual(values = c('TS mutations'='#80AACA','No-ts mutations'='#c8dcee'))+ # manual fill colours per class
    ts_bar_theme+
    xlab('Driver mutations')+ylab('')+
    font("xlab",size = 8, family = "sans", color = "black")+
    font("ylab",size = 8, family = "sans", color = "black")+
    scale_y_discrete(expand = c(0, 0)) +  # no padding around the discrete y axis
    scale_x_continuous(expand = c(0, 0))  # bars start exactly at the y axis
)
dev.off()

## Bar: number of tissue-specific expressed genes per tissue
ts_exp_number=read_ts_table('exp_ts_num.txt')
ts_exp_number_sort=ts_exp_number[order(ts_exp_number$Gene_counts,decreasing = F),]
ts_exp_number_sort$tissue=factor(ts_exp_number_sort$tissue, levels = unique(ts_exp_number_sort$tissue))

# Keep only the 'TS expressed gene' class.
ts_exp_number_sort=ts_exp_number_sort[which(ts_exp_number_sort$classification=='TS expressed gene'),]
pdf('exp_ts_num_nolegend_v2.pdf',width=8.5/2, height=11/4)
print(
  ggplot(data = ts_exp_number_sort,aes(x=Gene_counts,y=tissue_name ,group=classification  ))+
    geom_bar(stat = "identity" , position="stack", width =0.5 ,aes(fill=classification ) ) +  theme_minimal()+
    scale_fill_manual(values = c('TS expressed gene'='#EE7B6C'))+ # manual fill colour
    ts_bar_theme+
    xlab('Tissue-specific expressed genes')+ylab('')+
    font("xlab",size = 8, family = "sans", color = "black")+
    font("ylab",size = 8, family = "sans", color = "black")+
    scale_y_discrete(expand = c(0, 0)) +  # no padding around the discrete y axis
    scale_x_continuous(expand = c(0, 0))  # bars start exactly at the y axis
)
dev.off()

In [None]:
## top20 TS mutation
# Replace the colour column shipped with the top-20 table by the one from the
# shared tissue colour table, then write the refreshed table back to disk.
tissue_color = pd.read_csv(
    '/h/tianyi/TS_datasets_reversion/sci_paper_plot_stas/data_statistic/donor_number_circos/tissue_color.txt',
    sep='\t',
)
driver_gene_color = (
    pd.read_csv(
        '/h/tianyi/TS_datasets_reversion/sci_paper_plot_stas/data_statistic/top20_TS_mut/ts_mut_tissue_count_top20.txt',
        sep='\t',
    )
    .drop(columns=['tissue_cor_v2'])
    .merge(tissue_color, on='tissue_name')
)
driver_gene_color.to_csv(
    '/h/tianyi/TS_datasets_reversion/sci_paper_plot_stas/data_statistic/top20_TS_mut/ts_mut_tissue_count_top20_v2.txt',
    sep='\t',
    index=False,
)
# Spot check: rows for BRAF.
driver_gene_color.loc[driver_gene_color['gene'] == 'BRAF']
setwd('G:\\课题\\tissue_specificity_manuscript\\素材\\top20_TS_mut')

# comment.char = "" disables comment parsing so '#' characters in the data are
# kept. The outer data.frame() uses its default check.names = TRUE, so column
# names are passed through make.names(); the code below relies on that when it
# refers to `X.log10_B_Score`.
ts_mut_number=data.frame(read.table('driver_gene_color_all.txt',sep='\t',header=TRUE,comment.char = "", check.names = FALSE))

# Order genes on the y axis by their summed X.log10_B_Score (ascending).
# NOTE(review): bars show `count` while the order comes from the summed score
# (stored as `total_gene_number`) - confirm this is the intended sort key.
ts_mut_number_sum=ts_mut_number %>%
  group_by(gene) %>%
  summarise(total_gene_number = sum(X.log10_B_Score))
ts_mut_number_sort=ts_mut_number_sum[order(ts_mut_number_sum$total_gene_number,decreasing = F),]

ts_mut_number$gene=factor(ts_mut_number$gene, levels = unique(ts_mut_number_sort$gene))

# Named colour vector (tissue name -> colour) for the fill scale; keep only the
# first entry per tissue so that names are unique.
color_all=ts_mut_number$tissue_cor_v2
tissue_all=ts_mut_number$tissue_name
names(color_all)=tissue_all
color_all=color_all[!duplicated(names(color_all))]

pdf('ts_mut_gene_number_top20.pdf',width=8.5/2, height=11/3)
# Explicit print() so the plot reaches the PDF device even when the code is
# run through source() or inside a function (no auto-printing there).
print(
  ggplot(data = ts_mut_number,aes(x=count           ,y=gene,group=tissue_name  ))+
    geom_bar(stat = "identity" , position="stack", width =0.5 ,aes(fill=tissue_name ) ) +  theme_minimal()+
    scale_fill_manual(values = color_all)+ # manual fill colours per tissue
    theme(legend.position = "none",
          axis.line.y = element_line(size = (0.5/1.07)*0.5),
          axis.line.x = element_line(size = (0.5/1.07)*0.5) ,
          axis.ticks.y = element_line(size = (0.5/1.07)*0.5) ,
          axis.ticks.x = element_line(size = (0.5/1.07)*0.5) ,
          panel.grid=element_blank(),
          axis.text.x = element_text(size = 8, family = "sans", color = "black"),
          axis.text.y = element_text(size = 8, family = "sans", color = "black")
    )+
    xlab('The number of tissue-specific mutations')+ylab('The top 20 driver genes')+
    font("xlab",size = 8, family = "sans", color = "black")+
    font("ylab",size = 8, family = "sans", color = "black")+
    scale_y_discrete(expand = c(0, 0)) +  # no padding around the discrete y axis
    scale_x_continuous(expand = c(0, 0), limits = c(0, 160))  # bars start at the y axis; x range fixed to 0-160
)
dev.off()

In [None]:
## ts mut exp gene number
# Two stacked bar panels: tissue-specific mutation counts (top) and
# tissue-specific expressed gene counts (bottom), one bar per tissue.
mut_exp_num_path='/h/tianyi/TS_datasets_reversion/sci_paper_plot_stas/data_statistic/ts_mut_exp_gene_number/mutation_expressed_classification_number.txt'
# `index` is a to_csv argument; passing it to read_csv raises TypeError, so it is not used here.
mut_exp_num_df_all=pd.read_csv(mut_exp_num_path,sep='\t')
custom_palette = {"Tissue-specific genetic mutations": "#80AACA", "Tissue-specific expressed genes": "#EE7B6C"}  # You can customize colors here

mut_num_df_all=mut_exp_num_df_all[mut_exp_num_df_all['Group']=='Tissue-specific genetic mutations']
exp_num_df_all=mut_exp_num_df_all[mut_exp_num_df_all['Group']=='Tissue-specific expressed genes']
# NOTE(review): the top panel is ordered by 'Number' and the bottom one by
# 'Tissue', while the top panel hides its tick labels. If the figure is meant
# to be read with the bottom labels applying to both panels, the two orderings
# need to match - confirm.
mut_num_df_all_sort=mut_num_df_all.sort_values('Number')
exp_num_df_all_sort=exp_num_df_all.sort_values('Tissue')
fig,axes=plt.subplots(2,1,figsize=(8.5/2,11/4))
plt.rcParams['axes.linewidth'] = 0.5
sns.set_style("ticks")

# Top panel: tissue-specific mutations (tick labels hidden).
axes[0].spines['top'].set_visible(False)
axes[0].spines['right'].set_visible(False)
sns.barplot(x='Tissue', y='Number', data=mut_num_df_all_sort, color='#7A9FBB',ax=axes[0])
axes[0].set_ylabel('Number',fontsize=8, fontname='Arial')
axes[0].set_xlabel('')
axes[0].set_title('',fontsize=8, fontname='Arial')
axes[0].set_xticklabels(mut_num_df_all_sort['Tissue'].drop_duplicates().tolist(),fontsize=8, fontname='Arial')
axes[0].tick_params(bottom=False, labelbottom=False)

# Bottom panel: tissue-specific expressed genes (rotated tissue labels).
axes[1].spines['top'].set_visible(False)
axes[1].spines['right'].set_visible(False)
sns.barplot(x='Tissue', y='Number', data=exp_num_df_all_sort, color='#EE7B6C',ax=axes[1])
axes[1].set_ylabel('Number',fontsize=8, fontname='Arial')
axes[1].set_xlabel('')
axes[1].set_title('',fontsize=8, fontname='Arial')
axes[1].set_xticklabels(exp_num_df_all_sort['Tissue'].drop_duplicates().tolist(),fontsize=8, fontname='Arial')
axes[1].tick_params(labelrotation=90,axis='x')

# The original target '/ts_mut_exp_gene_number_primary.pdf' pointed at the
# filesystem root; the figure is written next to its input table instead.
# NOTE(review): adjust if a different output directory was intended.
plt.savefig(os.path.join(os.path.dirname(mut_exp_num_path),'ts_mut_exp_gene_number_primary.pdf'),dpi=300,bbox_inches='tight')