# Content

- load and clean the table containing splicing scores of all the biocurated variants.
- count consequence occurrences grouped by pathomechanism (e.g. how many `splicing|3css|activated` lead to `3_CSS`)

In [1]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt

import numpy as np
import seaborn as sns

## Load and clean data

In [2]:
# Helpers from the project-local `curation` module; imported at the top of the
# cell so a missing module fails before any data is read.
from curation import group_consequence, group_pathomechanism

# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
fpath = "/home/ielis/data/threes-simulations/scaling/scorers-all.tsv"

# Columns holding the individual scorer outputs; a row with all of these
# missing carries no score at all.
SCORER_COLUMNS = ['CANONICAL_DONOR', 'CRYPTIC_DONOR', 'CRYPTIC_DONOR_IN_CANONICAL_POSITION',
                  'CANONICAL_ACCEPTOR', 'CRYPTIC_ACCEPTOR', 'CRYPTIC_ACCEPTOR_IN_CANONICAL_POSITION',
                  'SMS']

# Read the tab-separated table, remove the MAX_SCORE column and shorten the
# metadata column names.
df = (
    pd.read_csv(fpath, sep='\t')
    .drop("MAX_SCORE", axis=1)
    .rename(columns={"PHENOPACKET": "pp",
                     "VARIANT": "variant",
                     "TRANSCRIPT": "tx",
                     "VCLASS": "vc",
                     "PATHOMECHANISM": "pm",
                     "CONSEQUENCE": "cs"})
)

# remove rows where there is no score
empty_scores = df.loc[:, SCORER_COLUMNS].isna().all(axis=1)
# Take an explicit copy: the column assignments below would otherwise write to
# a filtered selection of the original frame and can raise SettingWithCopyWarning.
df = df.loc[~empty_scores, :].copy()

# create more convenient pathogenicity and consequence groups
# (both helpers are applied row-wise; presumably they read `pm` / `cs`, which
# are dropped afterwards -- confirm against the `curation` module)
df['pathogrp'] = df.apply(group_pathomechanism, axis=1)
df['csq'] = df.apply(group_consequence, axis=1)
df = df.drop(['pm', 'cs'], axis=1)

df.head()

Unnamed: 0,pp,variant,tx,vc,CANONICAL_DONOR,CRYPTIC_DONOR,CRYPTIC_DONOR_IN_CANONICAL_POSITION,CANONICAL_ACCEPTOR,CRYPTIC_ACCEPTOR,CRYPTIC_ACCEPTOR_IN_CANONICAL_POSITION,SMS,pathogrp,csq
1,Ito-2017-MYBPC3-VARIANT407-5C_T.json,11:47371668 G>A,NM_000256.3,splicing,,,,-0.895356,,-4.088544,,splicing|3ss|disrupted,EXON_SKIP
2,Fan-2013-TAZ-proband.json,X:153648055 A>G,NM_000116.3,splicing,,-1.05736,,,-18.085797,,-0.6807,splicing|5css|activated,5_CSS
3,Jin-1996-ITGB3-RS.json,17:45368454 G>A,NM_000212.2,splicing,3.054723,,-2.471658,,,,,splicing|5ss|disrupted,EXON_SKIP
4,Teraoka-1999-ATM-AT51LA.json,11:108115654 C>T,NM_000051.3,splicing,,-13.686307,,,-7.215609,,1.5714,splicing|5ss|disrupted,EXON_SKIP
5,Pousada-2017-BMPR2-VARIANT251G_T.json,2:203332245 G>T,NM_001204.6,splicing,,-13.980012,,,-9.065869,,1.0887,splicing|SRE,EXON_SKIP


## Counts of consequences grouped by pathomechanism

In [3]:
# Number of non-null `pp` entries for every (pathomechanism group, consequence)
# pair, flattened back into a regular frame with a `count` column.
csq_patho_counts = (
    df.groupby(['pathogrp', 'csq'])['pp']
    .count()
    .reset_index(name='count')
)
csq_patho_counts

Unnamed: 0,pathogrp,csq,count
0,coding,,35
1,splicing|3css|activated,3_CSS,33
2,splicing|3ss|disrupted,3_CSS,17
3,splicing|3ss|disrupted,EXON_SKIP,41
4,splicing|3ss|disrupted,Intron retention,1
5,splicing|5css|activated,5_CSS,61
6,splicing|5ss|disrupted,3_CSS,1
7,splicing|5ss|disrupted,5_CSS,41
8,splicing|5ss|disrupted,EXON_SKIP,126
9,splicing|5ss|disrupted,Increased inclusion of alternatively spliced exon,4


## How many `splicing|5css|activated` variants do we have?



In [4]:
# Identifying columns plus the three donor-site scorer columns.
donor_scorers = [
    "pp", "tx", "variant", "pathogrp",
    "CANONICAL_DONOR", "CRYPTIC_DONOR", "CRYPTIC_DONOR_IN_CANONICAL_POSITION",
]

# Rows whose pathomechanism group is cryptic 5' splice-site activation.
is_5css_activated = df["pathogrp"].eq("splicing|5css|activated")
df.loc[is_5css_activated, donor_scorers]

Unnamed: 0,pp,tx,variant,pathogrp,CANONICAL_DONOR,CRYPTIC_DONOR,CRYPTIC_DONOR_IN_CANONICAL_POSITION
2,Fan-2013-TAZ-proband.json,NM_000116.3,X:153648055 A>G,splicing|5css|activated,,-1.057360,
9,Auclair-2006-MSH2-EL022.json,NM_000251.2,2:47702319 C>T,splicing|5css|activated,,2.487615,
11,Yamaguchi-2017-MLH1-Patient.json,NM_000249.3,3:37050394 C>T,splicing|5css|activated,1.545326,,2.621940
13,Houdayer-2012-BRCA2-c.467A_G.json,NM_000059.3,13:32900279 A>G,splicing|5css|activated,,3.026131,
37,Costantini-2011-GCK-Italian_boy.json,NM_000162.3,7:44190579 A>C,splicing|5css|activated,,0.351361,
39,Ars-2000-NF1-96-284.json,NM_000267.3,17:29556397 G>A,splicing|5css|activated,,1.391313,
48,Hehr-2006-FLNA-male_patient.json,NM_001456.3,X:153592993 G>A,splicing|5css|activated,,3.951162,
54,Richards-2007-COL2A1-MS203.json,NM_001844.4,12:48377499 G>A,splicing|5css|activated,,-0.933062,
61,Richards-2010-COL2A1-MS122_MS300_MS287.json,NM_001844.4,12:48372413 G>A,splicing|5css|activated,,3.235414,
74,Sheikh-2013-MECP2-Patient_1.json,NM_004992.3,X:153363075 G>A,splicing|5css|activated,,3.962739,
