# plot_fie_count_gene_dist_long_tail.ipynb
## Distribution of FIE counts per gene ("long-tail")
### Nov 2023 amended paths for submission 

Nov 23 - use the main yj07f tables, e.g. `data/diversity_yj07f_full_table_gene.csv`

The script that generates the FIE count table is here:

`script/diversity_analysis/create_gene_fie_count_table.R`

In [1]:
import os
import sys
import math
import altair as alt
import pandas as pd

# pip3 install altair_saver
# https://github.com/altair-viz/altair_saver/
# from altair_saver import save
# issues with altair_saver on mac os - run vl-convert instead
# pip3 install vl-convert-python
# see: https://altair-viz.github.io/user_guide/saving_charts.html#png-svg-and-pdf-format
# https://github.com/altair-viz/altair/issues/2239

# Display limits for tables rendered in this notebook:
# at most 150 rows and 100 columns are shown.
for option_name, option_value in (
    ('display.max_rows', 150),
    ('display.max_columns', 100),
):
    pd.set_option(option_name, option_value)

### Load plot data 

In [2]:
# Input table; see script/diversity_analysis/create_gene_fie_count_table.R
# for how it is produced.
# Earlier input (file name without the '_unk' suffix), kept for reference:
# fie_count_file = '../script/diversity_analysis/working/fies_by_gene_tx_luad_lusc_pre_post.tsv'
fie_count_file = '../script/diversity_analysis/working/fies_by_gene_tx_luad_lusc_pre_post_unk.tsv'

# FIE counts by gene, cancer type and TxP (GENE).
# The file is tab-separated and every column is read as text; numeric
# columns are converted in a later cell.
dt_fies_by_gene = pd.read_csv(
    fie_count_file,
    sep='\t',
    dtype='unicode',
    keep_default_na=True,
)

In [3]:
# dt_fies_by_gene.head(5)
# dt_fies_by_gene.info()


### Tidy and sort

In [4]:
# The table was loaded with every column as text, so tot_muts (the FIE count)
# has to be converted to an integer before it can be sorted or plotted.
missing_tot_muts = dt_fies_by_gene['tot_muts'].isna()
if missing_tot_muts.any():
    # Blank / NA cells are read as NaN (keep_default_na=True) and cannot be
    # cast to int; report them explicitly instead of failing inside astype().
    raise ValueError(
        f"{int(missing_tot_muts.sum())} row(s) of the FIE count table "
        "have a missing 'tot_muts' value"
    )

# Build a new frame rather than mutating the loaded one in place.
# To inspect the resulting dtypes, run: dt_fies_by_gene.info()
dt_fies_by_gene = (
    dt_fies_by_gene
    .astype({'tot_muts': int})
    # ascending FIE count, ties broken by the SPECIES column (plotted as the gene)
    .sort_values(by=['tot_muts', 'SPECIES'])
)


### Plot distribution by gene

In [5]:
# Output file for the figure. PNG is used because altair_saver was not
# working and the vl-convert route used instead only gives PNG or SVG.
# Original PDF target, kept for reference:
# fig_name = 'fies_by_gene_all_pre_post_luad_lusc.pdf'
fig_name = 'fies_by_gene_all_pre_post_luad_lusc.png'

# Plot input: the GENE table with all timings.
dt_plot = dt_fies_by_gene

In [6]:
# Long-tail bar chart of FIE counts per gene: one bar per SPECIES value
# (labelled "Gene"), bar height from tot_muts, segments coloured by
# OVERALL_TIMING.
# A fixed bar width was also tried: alt.Chart(dt_plot).mark_bar(size=10)

# Encoding types are spelled out (:N nominal, :Q quantitative) so the chart
# does not depend on Altair inferring them from the column dtypes.
timing_colour = alt.Color(
    'OVERALL_TIMING:N',
    scale=alt.Scale(
        domain=['early', 'late', 'unknown'],
        range=['#7FC45EFF', '#912A58FF', 'grey'],
    ),
)

# Order the genes by their summed FIE count, largest first. EncodingSortField
# is the sort definition for an encoding channel; the aggregate is stated
# explicitly because a gene can have several rows (one per timing).
genes_by_total_fies = alt.EncodingSortField(
    field='tot_muts',
    op='sum',
    order='descending',
)

chart_long = (
    alt.Chart(dt_plot)
    .mark_bar()
    .encode(
        alt.X('SPECIES:N', axis=alt.Axis(title='Gene'), sort=genes_by_total_fies),
        alt.Y('tot_muts:Q', axis=alt.Axis(title='Num. FIEs')),
        color=timing_colour,
        # y-order of the coloured segments within each bar
        order=alt.Order('OVERALL_TIMING:N', sort='ascending'),
    )
    .properties(height=450)
    .configure_axis(labelFontSize=20, titleFontSize=20)
)

# Last expression of the cell: renders the chart in the notebook.
chart_long

In [7]:
# Write the figure to fig_name through the vl-convert engine; the notes in
# the import cell explain why altair_saver is not used.
# Call without an explicit engine, kept for reference: chart_long.save(fig_name)
chart_long.save(
    fig_name,
    engine="vl-convert",
)