# Genes with variants in certain lineages

## Setup

In [80]:
import pandas as pd
import numbers

In [92]:
# Per-gene table of variant presence; later cells read its `gene_id`,
# `lineage` and `presence_vars` columns.
# NOTE(review): both paths are absolute and machine-specific — consider a
# configurable data directory so the notebook runs elsewhere.
vars_per_gene = pd.read_csv('/FastData/czirion/Crypto_Desjardins/fungal_pop/data/variants_per_gene/HIGH_VNI_VNII_VNBI_VNBII_no-poly_100_20_0.5_1.5_0.2.csv', header=0)

# Essentiality classification per gene (Billmyre table).
# Built as one chain so every step returns a fresh frame: assigning a column
# on a `df[[...]]` selection raises SettingWithCopyWarning, and renaming by
# column name is safer than overwriting `.columns` positionally.
essential_billmyre = (
    pd.read_csv('/FastData/czirion/Crypto_Desjardins/fungal_pop/data/media-1.csv', header=0)
    [['Gene', 'Essentiality Classification']]
    .rename(columns={'Gene': 'gene_id', 'Essentiality Classification': 'essentiality'})
    .assign(
        # Categorical with readable labels; codes missing from the mapping
        # are passed through unchanged by rename_categories.
        essentiality=lambda df: df['essentiality'].astype('category').cat.rename_categories({
            "ESS": "Essential",
            "NESS": "Non Essential",
            "UNK": "Unknown"
        })
    )
)

## Number of strains with variants per lineage per gene

In [82]:
# Long table with one row per (gene, lineage) pair holding the sum of
# `presence_vars` over the rows of that pair.
# NOTE(review): presumably `presence_vars` is a 0/1 flag per strain, which
# makes the sum a strain count — confirm against the input file.
genes_vars_per_lineage = (
    vars_per_gene
    .groupby(['gene_id', 'lineage'])
    .agg(num_strains_with_vars=('presence_vars', 'sum'))
    .reset_index()
)
genes_vars_per_lineage

Unnamed: 0,gene_id,lineage,num_strains_with_vars
0,CNAG_00001,VNBI,0
1,CNAG_00001,VNBII,0
2,CNAG_00001,VNI,0
3,CNAG_00002,VNI,0
4,CNAG_00003,VNI,0
...,...,...,...
27553,CNAG_08028,VNII,0
27554,CNAG_08029,VNBI,0
27555,CNAG_08029,VNBII,0
27556,CNAG_08029,VNI,2


## Lineages where no strains have variants, per gene

Pivot the table to show, for each gene (rows), the number of strains with variants in each lineage (columns).


In [83]:
# Wide table: one row per gene (index `gene_id`), one column per lineage,
# values = number of strains with variants. (gene, lineage) pairs that are
# absent from the long table become NaN.
# `pivot` already leaves `gene_id` as the index, so no reset/set round trip
# is needed; `rename_axis(columns=None)` drops the leftover 'lineage' label
# from the column axis without mutating in place.
genes_vars_per_lineage_wide = (
    genes_vars_per_lineage
    .pivot(index='gene_id', columns='lineage', values='num_strains_with_vars')
    .rename_axis(columns=None)
)
genes_vars_per_lineage_wide

Unnamed: 0_level_0,VNBI,VNBII,VNI,VNII
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CNAG_00001,0.0,0.0,0.0,
CNAG_00002,,,0.0,
CNAG_00003,,,0.0,
CNAG_00004,,,0.0,
CNAG_00005,,0.0,0.0,
...,...,...,...,...
CNAG_08025,0.0,0.0,2.0,0.0
CNAG_08026,0.0,0.0,0.0,0.0
CNAG_08027,0.0,0.0,0.0,0.0
CNAG_08028,0.0,11.0,3.0,0.0


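Replace each count with the name of the lineage when the gene has no variants in any strain of that lineage; all other cells are left empty.  
Genes that are absent from a lineage (missing values) are treated as if they had variants, so their cells are left empty too.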

In [None]:
# Despite its name, `genes_lins_with_vars` ends up holding the lineage name
# in every cell whose count is exactly 0, and None everywhere else.
# Step 1: keep cells that differ from 0 (NaN included) and write the column's
# lineage name into the cells that equal 0.
lineage_labels = genes_vars_per_lineage_wide.columns.to_series()
genes_lins_with_vars = genes_vars_per_lineage_wide.where(
    genes_vars_per_lineage_wide.ne(0),
    other=lineage_labels,
    axis=1,
)
# Step 2: anything still numeric (non-zero counts and NaN) becomes None, so
# only the lineage-name strings survive.
genes_lins_with_vars = genes_lins_with_vars.map(
    lambda cell: cell if not isinstance(cell, numbers.Number) else None
)
genes_lins_with_vars

Unnamed: 0_level_0,VNBI,VNBII,VNI,VNII
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CNAG_00001,VNBI,VNBII,VNI,
CNAG_00002,,,VNI,
CNAG_00003,,,VNI,
CNAG_00004,,,VNI,
CNAG_00005,,VNBII,VNI,
...,...,...,...,...
CNAG_08025,VNBI,VNBII,,VNII
CNAG_08026,VNBI,VNBII,VNI,VNII
CNAG_08027,VNBI,VNBII,VNI,VNII
CNAG_08028,VNBI,,,VNII


Join names of lineages where no strains have variants in each gene.

In [99]:
# Collapse each gene's row of lineage labels into a single sorted tuple of the
# lineages in which no strain has a variant (None cells are dropped).
# Building the two-column frame directly from the resulting Series gives a
# fresh DataFrame, so the column assignment below does not operate on a
# `df[[...]]` slice (which raises SettingWithCopyWarning), and the row pass
# happens once instead of three times.
genes_ess_group_lins = (
    genes_lins_with_vars
    .apply(lambda row: tuple(sorted(filter(None, row))), axis=1)
    .rename_axis('gene_id')  # guarantees the id column name after reset_index
    .reset_index(name='essential_in')
)

# Ordered categorical: lineage groups are ranked by how many lineages they
# contain. Groups of equal size keep their order of first appearance, because
# `unique()` preserves appearance order and `sorted` is stable.
lineage_group_order = sorted(genes_ess_group_lins['essential_in'].unique(), key=len)
genes_ess_group_lins['essential_in'] = pd.Categorical(
    genes_ess_group_lins['essential_in'],
    categories=lineage_group_order,
    ordered=True,
)
genes_ess_group_lins

Unnamed: 0,gene_id,essential_in
0,CNAG_00001,"(VNBI, VNBII, VNI)"
1,CNAG_00002,"(VNI,)"
2,CNAG_00003,"(VNI,)"
3,CNAG_00004,"(VNI,)"
4,CNAG_00005,"(VNBII, VNI)"
...,...,...
6957,CNAG_08025,"(VNBI, VNBII, VNII)"
6958,CNAG_08026,"(VNBI, VNBII, VNI, VNII)"
6959,CNAG_08027,"(VNBI, VNBII, VNI, VNII)"
6960,CNAG_08028,"(VNBI, VNII)"


Join with classification of essentiality from Billmyre.

In [100]:
# Left join on `gene_id`: every gene of the Billmyre table is kept, and genes
# with no row in `genes_ess_group_lins` get a missing `essential_in`.
essentiality = pd.merge(
    essential_billmyre,
    genes_ess_group_lins,
    how='left',
    on='gene_id',
)
essentiality

Unnamed: 0,gene_id,essentiality,essential_in
0,CNAG_00003,Non Essential,"(VNI,)"
1,CNAG_00004,Non Essential,"(VNI,)"
2,CNAG_00005,Non Essential,"(VNBII, VNI)"
3,CNAG_00011,Non Essential,"(VNBII, VNII)"
4,CNAG_00013,Non Essential,"(VNBII, VNII)"
...,...,...,...
6970,CNAG_09008,Essential,
6971,CNAG_09009,Unknown,
6972,CNAG_09010,Essential,
6973,CNAG_09011,Non Essential,


Count the number of genes for each combination of essentiality classification and group of lineages in which no strains have variants.

In [105]:
# Number of genes per (essentiality class, lineage group) combination.
# observed=False keeps combinations of categories that never occur, which
# show up with a count of 0.
ess_count = (
    essentiality
    .groupby(['essentiality', 'essential_in'], observed=False)['gene_id']
    .count()
    .reset_index()
)
ess_count

Unnamed: 0,essentiality,essential_in,gene_id
0,Essential,(),0
1,Essential,"(VNI,)",5
2,Essential,"(VNII,)",2
3,Essential,"(VNBII,)",0
4,Essential,"(VNBI,)",1
5,Essential,"(VNBII, VNI)",7
6,Essential,"(VNBII, VNII)",6
7,Essential,"(VNBI, VNBII)",4
8,Essential,"(VNI, VNII)",15
9,Essential,"(VNBI, VNI)",4
