# Evaluate isoform diversity for each gene

In [5]:
import sys
sys.path.append("..")
import pannzer_out_api as poa
import statistics as stats
import pandas as pd

## Loading data
We'll start with chromosome 1 (chr1) of the GENCODE human data.

In [88]:
# Load the PANNZER output for chromosome 1 through the project parser.
# NOTE(review): the second argument looks like a label for the parsed set — confirm
# against pannzer_out_api.parse_pannzer_annotation.
chr1_annotation_path = '../../data/pannzer_output/chr1.all.nr_off.out'
chr1 = poa.parse_pannzer_annotation(chr1_annotation_path, 'chr1')

Quick test: check that our functions work.

In [89]:
# Smoke test: print every metric for each gene, one value per line.
# The three similarity functions are passed to diversity_by_pair in the order
# Jaccard, Dice, overlap.
pairwise_similarities = (
    poa.jaccard_index,
    poa.dice_coefficient,
    poa.overlap_coefficient,
)

for gene_id in chr1:
    # Iterating chr1 yields keys; look up the gene object for each one.
    gene = chr1[gene_id]
    print(gene.id)
    print(gene.number_of_isoforms())
    for similarity in pairwise_similarities:
        print(gene.diversity_by_pair(similarity))
    print(gene.go_redundance_metric())
    print(gene.stdev_number_of_go_by_isoform())

ENSG00000186092
1
1.0
1.0
1.0
1.0
0.0
ENSG00000284733
1
1.0
1.0
1.0
1.0
0.0
ENSG00000284662
1
1.0
1.0
1.0
1.0
0.0
ENSG00000187634
11
0.819138755980861
0.8900363636363633
1.0
0.8578947368421053
12.988233708862536
ENSG00000188976
1
1.0
1.0
1.0
1.0
0.0
ENSG00000187961
2
0.0
0.0
1.0
0.0
33.0
ENSG00000187583
4
0.5655202821869488
0.6254038046818189
1.0
0.70679012345679
39.42318480285427
ENSG00000187642
3
1.0
1.0
1.0
1.0
0.0
ENSG00000188290
3
0.5888888888888889
0.7028112449799196
1.0
0.6916666666666667
17.441967269268172
ENSG00000187608
3
1.0
1.0
1.0
1.0
0.0
ENSG00000188157
4
0.42923099472347087
0.5268169124471052
0.9444444444444445
0.4425770308123249
136.3275008939869
ENSG00000237330
1
1.0
1.0
1.0
1.0
0.0
ENSG00000131591
13
0.41025641025641024
0.4273504273504274
1.0
0.25
1.475640468711606
ENSG00000162571
5
0.39344262295081966
0.39666666666666667
1.0
0.4918032786885246
29.566196914720027
ENSG00000186891
4
0.7108433734939759
0.7966101694915254
1.0
0.8072289156626505
20.784609690826528
ENSG0000

Let's create a dataframe with our results.

In [90]:
# Build one row per gene with its isoform-diversity metrics.
data = pd.DataFrame()
data['Gene'] = chr1.genes.keys()

# Resolve each gene ID to its gene object once and reuse the result for every
# metric column instead of repeating the lookup per column.
# NOTE(review): assumes chr1.get_gene is a side-effect-free lookup — confirm.
genes = data['Gene'].apply(chr1.get_gene)

data['Number of isoform'] = genes.apply(poa.Gene.number_of_isoforms)

# Pairwise diversity, once per similarity function.
data['Jaccard Index'] = genes.apply(poa.Gene.diversity_by_pair, similarity_function=poa.jaccard_index)
data['Dice coefficient'] = genes.apply(poa.Gene.diversity_by_pair, similarity_function=poa.dice_coefficient)
data['Overlap coefficient'] = genes.apply(poa.Gene.diversity_by_pair, similarity_function=poa.overlap_coefficient)

data['Redundance metric'] = genes.apply(poa.Gene.go_redundance_metric)
data['Stdev number GO term'] = genes.apply(poa.Gene.stdev_number_of_go_by_isoform)
data

Unnamed: 0,Gene,Number of isoform,Jaccard Index,Dice coefficient,Overlap coefficient,Redundance metric,Stdev number GO term
0,ENSG00000186092,1,1.000000,1.000000,1.0,1.000000,0.000000
1,ENSG00000284733,1,1.000000,1.000000,1.0,1.000000,0.000000
2,ENSG00000284662,1,1.000000,1.000000,1.0,1.000000,0.000000
3,ENSG00000187634,11,0.819139,0.890036,1.0,0.857895,12.988234
4,ENSG00000188976,1,1.000000,1.000000,1.0,1.000000,0.000000
...,...,...,...,...,...,...,...
2056,ENSG00000259823,1,1.000000,1.000000,1.0,1.000000,0.000000
2057,ENSG00000175137,1,1.000000,1.000000,1.0,1.000000,0.000000
2058,ENSG00000171161,3,0.061538,0.103896,1.0,0.092308,28.241026
2059,ENSG00000171163,14,0.634341,0.635740,1.0,0.143269,31.413389


Let's check the summary.

In [91]:
# Default pandas summary of the numeric columns: count, mean, std, min,
# quartiles (25%/50%/75%) and max.
data.describe()

Unnamed: 0,Number of isoform,Jaccard Index,Dice coefficient,Overlap coefficient,Redundance metric,Stdev number GO term
count,2061.0,2061.0,2061.0,2061.0,2061.0,2061.0
mean,4.781174,0.690646,0.715796,0.99531,0.698151,20.879427
std,5.618032,0.318958,0.305803,0.03485,0.331289,32.067409
min,1.0,0.0,0.0,0.107143,0.0,0.0
25%,2.0,0.4,0.43771,1.0,0.438596,0.0
50%,3.0,0.737288,0.817218,1.0,0.8,5.5
75%,6.0,1.0,1.0,1.0,1.0,31.796226
max,80.0,1.0,1.0,1.0,1.0,261.818289


This is not as detailed as I want, so let's build our own summary.

In [94]:
def precise_quantile(sample, n):
    """Return the n-th percentile of ``sample``.

    The sample is split into 100 equal-probability intervals with
    ``statistics.quantiles`` (using its default method) and the n-th of the
    99 resulting cut points is returned.

    Args:
        sample: Iterable of real-valued data points.
        n: Percentile to return, an integer from 1 to 99 inclusive.

    Returns:
        The n-th percentile cut point.

    Raises:
        ValueError: If ``n`` is outside 1..99. Without this check, ``n=0``
            would silently index from the end of the list and return the
            99th percentile, and ``n=100`` would fail with an IndexError.
        statistics.StatisticsError: Propagated from ``statistics.quantiles``
            when ``sample`` has too few data points.
    """
    if not 1 <= n <= 99:
        raise ValueError(f"n must be between 1 and 99 inclusive, got {n!r}")
    # quantiles(n=100) returns 99 cut points; the n-th one is at index n - 1.
    return stats.quantiles(sample, n=100)[n - 1]

In [95]:
# Build a per-metric summary table: one row per metric column of `data`
# (every column except the first, 'Gene').
summary = pd.DataFrame()
summary['Metrics'] = data.columns[1:]

# One row per metric, holding that metric's value for every gene. Computed once
# and reused for every statistic instead of being rebuilt for each column.
metric_values = summary['Metrics'].apply(data.get)

# (column label, statistic function, extra keyword arguments), in output order.
# NOTE: statistics.harmonic_mean returns 0 as soon as one value is 0, so metrics
# whose minimum is 0 show a harmonic mean of 0.0.
summary_statistics = [
    ('Mean', stats.fmean, {}),
    ('Harmonic Mean', stats.harmonic_mean, {}),
    ('Median', stats.median, {}),
    ('Q25', precise_quantile, {'n': 25}),
    ('Q75', precise_quantile, {'n': 75}),
    ('Sample Variance', stats.variance, {}),
    ('Sample Standard Deviation', stats.stdev, {}),
    ('Population Variance', stats.pvariance, {}),
    ('Population Standard Deviation', stats.pstdev, {}),
]

for label, statistic, extra_kwargs in summary_statistics:
    # axis=1 applies the statistic to each row, i.e. to each metric.
    summary[label] = metric_values.apply(statistic, axis=1, **extra_kwargs)
summary

Unnamed: 0,Metrics,Mean,Harmonic Mean,Median,Q25,Q75,Sample Variance,Sample Standard Deviation,Population Variance,Population Standard Deviation
0,Number of isoform,4.781174,2.239917,3.0,2.0,6.0,31.562286,5.618032,31.546972,5.616669
1,Jaccard Index,0.690646,0.0,0.737288,0.4,1.0,0.101734,0.318958,0.101685,0.31888
2,Dice coefficient,0.715796,0.0,0.817218,0.436637,1.0,0.093516,0.305803,0.09347,0.305729
3,Overlap coefficient,0.99531,0.990415,1.0,1.0,1.0,0.001214,0.03485,0.001214,0.034841
4,Redundance metric,0.698151,0.0,0.8,0.438264,1.0,0.109753,0.331289,0.109699,0.331209
5,Stdev number GO term,20.879427,0.0,5.5,0.0,31.821268,1028.318701,32.067409,1027.819759,32.059628
