# Distribution of FIE counts per gene ("long-tail" Figure 3d)
plot_fie_count_gene_dist_long_tail.ipynb

Nov 2023 amended paths for submission

Sept 2024 revision

Nov 23 - use main yj07f tables, e.g. data/diversity_yj07f_full_table_gene.csv
Sept 24 - updated diversity

The script that generates the FIE count table is here:
`script/diversity_analysis/create_gene_fie_count_table.R`


In [57]:
import os
import sys
import math
import altair as alt
import pandas as pd

# pip3 install altair_saver
# https://github.com/altair-viz/altair_saver/
# from altair_saver import save
# issues with altair_saver on macOS - use vl-convert instead
# pip3 install vl-convert-python
# see: https://altair-viz.github.io/user_guide/saving_charts.html#png-svg-and-pdf-format
# https://github.com/altair-viz/altair/issues/2239

# Raise pandas' display limits so wide/long tables are shown without truncation
for option_name, option_value in {
    'display.max_rows': 150,
    'display.max_columns': 100,
}.items():
    pd.set_option(option_name, option_value)

### Load plot data 

In [67]:
# Input table written by script/diversity_analysis/create_gene_fie_count_table.R
# Alternative input without the "_unk" suffix (kept for reference):
#   ../script/diversity_analysis/working/fies_by_gene_tx_luad_lusc_pre_post.tsv
fie_count_file = '../script/diversity_analysis/working/fies_by_gene_tx_luad_lusc_pre_post_unk.tsv'

# FIE counts by gene, cancer type and TxP (GENE).
# All columns are requested as strings here; tot_muts is cast to int in the tidy step.
dt_fies_by_gene = pd.read_table(
    fie_count_file,
    keep_default_na=True,
    dtype='unicode',
)

# Latest HUGO names via UniProt Sept 2025 (use ST for TRACERx).
# Only the distinct (gene, SOURCE_HUGO_SYMBOL) pairs are kept as the lookup table.
hugo_name_table = pd.read_table('../data/resources/Supplemental_Table_ST2.tsv')
gene_to_hugo = hugo_name_table[['gene', 'SOURCE_HUGO_SYMBOL']].drop_duplicates()

# File name the figure is exported under
fig_export_name = 'fies_by_gene_all_pre_post_luad_lusc.png'

### tidy and sort

In [68]:
# Convert the FIE count column (tot_muts) from string to integer
dt_fies_by_gene['tot_muts'] = dt_fies_by_gene['tot_muts'].astype(int)

# The merge below is an inner join, so any gene whose SPECIES value has no
# SOURCE_HUGO_SYMBOL entry in gene_to_hugo would disappear from the figure
# without notice. Report such genes; the merged result itself is unchanged.
has_hugo_name = dt_fies_by_gene['SPECIES'].isin(gene_to_hugo['SOURCE_HUGO_SYMBOL'])
if not has_hugo_name.all():
    genes_without_hugo = (
        dt_fies_by_gene.loc[~has_hugo_name, 'SPECIES'].astype(str).unique().tolist()
    )
    print(
        f"WARNING: {len(genes_without_hugo)} gene(s) have no match in gene_to_hugo "
        f"and are dropped from the plot: {genes_without_hugo}"
    )

# Swap the source symbol held in SPECIES for the name in the lookup's 'gene'
# column, then order rows by count and gene name.
# NOTE: this overwrites dt_fies_by_gene (SPECIES now holds the new names), so
# re-run the load cell before re-running this cell.
dt_fies_by_gene = (
    pd.merge(
        dt_fies_by_gene,
        gene_to_hugo,
        left_on='SPECIES',
        right_on='SOURCE_HUGO_SYMBOL',
    )
    .drop('SOURCE_HUGO_SYMBOL', axis=1)
    .rename(columns={'SPECIES': 'old_name', 'gene': 'SPECIES'})
    .drop('old_name', axis=1)
    .sort_values(by=['tot_muts', 'SPECIES'])
)


In [66]:
# Inspect the tidied table (the value of this expression is the cell output).
# NOTE(review): this cell's execution count (66) precedes those of the load and
# tidy cells (67, 68), so the saved output may predate the current code -
# re-run the notebook top to bottom to refresh it.
dt_fies_by_gene

Unnamed: 0,SPECIES_TYPE,DATA_SOURCE,TXP,CANCER_TYPE,OVERALL_TIMING,tot_muts,SPECIES
22,FIEs,Tx,N,LUAD-LUSC,early,1,ACADM
95,FIEs,Tx,N,LUAD-LUSC,unknown,1,ACVR1B
104,FIEs,Tx,N,LUAD-LUSC,early,1,AGXT
23,FIEs,Tx,N,LUAD-LUSC,early,1,AKT1
10,FIEs,Tx,N,LUAD-LUSC,unknown,1,ANAPC10
102,FIEs,Tx,N,LUAD-LUSC,unknown,1,APBB1IP
11,FIEs,Tx,N,LUAD-LUSC,unknown,3,BRAF
24,FIEs,Tx,N,LUAD-LUSC,early,5,BRAF
81,FIEs,Tx,N,LUAD-LUSC,unknown,1,CACNB2
25,FIEs,Tx,N,LUAD-LUSC,early,1,CASQ2


### Plot distribution by gene

In [69]:
# GENE - all timings
dt_plot = dt_fies_by_gene
# altair_saver not working - using vl-convert which is only PNG or SVG,
# so the PDF name 'fies_by_gene_all_pre_post_luad_lusc.pdf' is not used.
# Alias fig_export_name (set when the data are loaded, and the name passed to
# chart_long.save) instead of repeating the literal, so the two cannot drift.
fig_name = fig_export_name

In [70]:
# Colour for each OVERALL_TIMING category; the dict order gives the scale's domain order
timing_colours = {
    'early': '#7FC45EFF',
    'late': '#912A58FF',
    'unknown': 'grey',
}

# Bar chart of tot_muts per gene, genes ordered by descending tot_muts and
# bars coloured by OVERALL_TIMING. (Alternative tried: .mark_bar(size=10))
chart_long = (
    alt.Chart(dt_plot)
    .mark_bar()
    .encode(
        x=alt.X(
            'SPECIES',
            axis=alt.Axis(title='Gene'),
            sort=alt.SortField(field='tot_muts', order='descending'),
        ),
        y=alt.Y('tot_muts', axis=alt.Axis(title='Num. FIEs')),
        color=alt.Color(
            'OVERALL_TIMING',
            scale=alt.Scale(
                domain=list(timing_colours.keys()),
                range=list(timing_colours.values()),
            ),
        ),
        # y-order of the bar colouring
        order=alt.Order('OVERALL_TIMING', sort='ascending'),
    )
    .properties(height=450)
    # larger axis text for all axes; gene names on the x axis in italics
    .configure_axis(labelFontSize=20, titleFontSize=20)
    .configure_axisX(labelFontStyle='italic')
)

# see plot!
chart_long

In [71]:
# Export the figure with the vl-convert engine (see the notes next to the imports);
# the plain chart_long.save(fig_name) call is left disabled.
chart_long.save(fp=fig_export_name, engine="vl-convert")
