# PTPN11

PTPN11 encodes SHP-2, a protein tyrosine phosphatase involved in intracellular signaling pathways that regulate cell growth, differentiation, and survival. It plays a key role in the RAS/MAPK signaling pathway, which is crucial for normal development and is frequently dysregulated in oncogenesis. Variants in this gene are implicated in several diseases, including Noonan syndrome and juvenile myelomonocytic leukemia.

In [1]:
import gpsea
import hpotk

# Load a pinned HPO release through the hpotk ontology store.
ontology_store = hpotk.configure_ontology_store()
hpo = ontology_store.load_minimal_hpo(release="v2023-10-09")

# Report the ontology and gpsea versions used by this notebook.
print(f"Loaded HPO v{hpo.version}")
print(f"Using gpsea version {gpsea.__version__}")

Loaded HPO v2023-10-09
Using gpsea version 0.4.1.dev0


### Setup

##### According to Ensembl, the canonical transcript of PTPN11 is [ENST00000351677.7](https://useast.ensembl.org/Homo_sapiens/Transcript/Summary?db=core;g=ENSG00000179295;r=12:112418947-112509918;t=ENST00000351677), which corresponds to transcript [NM_002834.5](https://www.ncbi.nlm.nih.gov/nuccore/NM_002834.5) and protein [NP_002825.3](https://www.ncbi.nlm.nih.gov/protein/NP_002825.3).

In [2]:
# RefSeq transcript used throughout this notebook for variant annotation and display.
PTPN11_transcript_id = "NM_002834.5"
# NOTE(review): NP_002825.3 is believed to be the protein that corresponds to the
# canonical transcript above, but retrieving it from UniProt further below fails
# with "ValueError: Could not find an entry for NP_002825.3 in Uniprot response".
PTPN11_protein_id = "NP_002825.3"

# UniProt lists these RefSeq protein / transcript IDs for PTPN11:
#   NP_001317366.1  NM_001330437.1  [Q06124-1]
#   NP_002825.3     NM_002834.4     [Q06124-2]
#   NP_542168.1     NM_080601.2     [Q06124-3]
# Note that UniProt lists NM_002834.4, whereas this notebook uses NM_002834.5.
#
# Each of the three protein IDs above (NP_001317366.1, NP_002825.3, NP_542168.1)
# was tried as `PTPN11_protein_id`; none of them could be retrieved from UniProt.

### Load Phenopackets

We will load phenopackets from the *PTPN11* cohort of Phenopacket Store `0.1.19`.

In [3]:
from ppktstore.registry import configure_phenopacket_registry
from gpsea.preprocessing import configure_caching_cohort_creator, load_phenopackets

cohort_name = "PTPN11"
phenopacket_store_release = "0.1.19"

# Read every phenopacket of the cohort from the requested Phenopacket Store release.
ppkt_registry = configure_phenopacket_registry()
with ppkt_registry.open_phenopacket_store(phenopacket_store_release) as ppkt_store:
    phenopackets = tuple(ppkt_store.iter_cohort_phenopackets(cohort_name))

# `load_phenopackets` returns the cohort together with a validation object;
# the summary of the validation is printed at the end of the cell.
cohort_creator = configure_caching_cohort_creator(hpo)
cohort, validation = load_phenopackets(
    phenopackets=phenopackets,
    cohort_creator=cohort_creator,
)

# The raw phenopackets are not used again once the cohort has been created.
del phenopackets
validation.summarize()

Individuals Processed: 100%|██████████| 70/70 [00:00<00:00, 681.06individuals/s]
Validated under none policy


In [4]:
from gpsea.view import CohortViewable
from IPython.display import HTML, display

# Build the HTML cohort summary with respect to the PTPN11 transcript and show it inline.
cohort_viewer = CohortViewable(hpo)
cohort_html = cohort_viewer.process(cohort=cohort, transcript_id=PTPN11_transcript_id)
display(HTML(cohort_html))

HPO Term,ID,Seen in n individuals
Hypertelorism,HP:0000316,37
Multiple lentigines,HP:0001003,29
Short stature,HP:0004322,26
Pulmonic stenosis,HP:0001642,18
Low-set ears,HP:0000369,18
Ptosis,HP:0000508,15
Webbed neck,HP:0000465,15
Global developmental delay,HP:0001263,13
Multiple enchondromatosis,HP:0005701,12
Multiple exostoses,HP:0002762,12

Count,Variant key,Variant Name,Protein Variant,Variant Class
16,12_112473023_112473023_A_G,c.836A>G,p.Tyr279Cys,MISSENSE_VARIANT
11,12_112488466_112488466_C_T,c.1403C>T,p.Thr468Met,MISSENSE_VARIANT
6,12_112472981_112472981_G_A,c.794G>A,p.Arg265Gln,MISSENSE_VARIANT
5,12_112477719_112477719_A_G,c.922A>G,p.Asn308Asp,MISSENSE_VARIANT
5,12_112472968_112472968_C_T,c.781C>T,p.Leu261Phe,MISSENSE_VARIANT
4,12_112472972_112472972_T_G,c.785T>G,p.Leu262Arg,MISSENSE_VARIANT
3,12_112472969_112472969_T_A,c.782T>A,p.Leu261His,MISSENSE_VARIANT
1,12_112455984_112455988_TAGAA_T,c.680_683del,p.Glu227AlafsTer6,FRAMESHIFT_VARIANT
1,12_112473040_112473040_T_C,c.853T>C,p.Phe285Leu,"MISSENSE_VARIANT, SPLICE_REGION_VARIANT"
1,12_112453274_112453274_C_T,c.412C>T,p.Arg138Ter,STOP_GAINED

Disease Name,Disease ID,Annotation Count
LEOPARD syndrome 1,OMIM:151100,31
Noonan syndrome 1,OMIM:163950,27
Metachondromatosis,OMIM:156250,12

Variant effect,Annotation Count
MISSENSE_VARIANT,58
FRAMESHIFT_VARIANT,6
SPLICE_REGION_VARIANT,2
STOP_GAINED,3
SPLICE_ACCEPTOR_VARIANT,3


In [5]:
from gpsea.model.genome import GRCh38
from gpsea.preprocessing import VVMultiCoordinateService, \
    configure_default_protein_metadata_service

# Fetch the coordinates of the PTPN11 transcript for the GRCh38 genome build.
tx_coordinates = VVMultiCoordinateService(genome_build=GRCh38).fetch(PTPN11_transcript_id)

# Fetch the protein metadata using the default protein metadata service.
# NOTE: the recorded output of this cell shows this call raising
# "ValueError: Could not find an entry for NP_002825.3 in Uniprot response".
protein_meta = configure_default_protein_metadata_service().annotate(PTPN11_protein_id)

ValueError: Could not find an entry for NP_002825.3 in Uniprot response

In [None]:
from gpsea.view import ProteinVisualizable, ProteinVisualizer, ProteinViewable

# Bundle the transcript coordinates, protein metadata and cohort for plotting,
# then draw the figure. `pvis` is reused by the protein table in the next cell.
pvis = ProteinVisualizable(
    tx_coordinates=tx_coordinates,
    protein_meta=protein_meta,
    cohort=cohort,
)
ProteinVisualizer().draw_fig(pvis=pvis)

In [None]:
# Render the protein-level summary produced from `pvis` as HTML and show it inline.
protein_viewer = ProteinViewable()
display(HTML(protein_viewer.process(cohort, pvis)))

## Genotype phenotype correlation analysis

### Genotype predicate

In [None]:
from gpsea.model import VariantEffect
from gpsea.analysis.predicate.genotype import VariantPredicates, ModeOfInheritancePredicate

# Variant-level test: does the variant have a missense effect on the PTPN11 transcript?
is_missense = VariantPredicates.variant_effect(
    VariantEffect.MISSENSE_VARIANT,
    PTPN11_transcript_id,
)

# Group individuals with the autosomal recessive mode-of-inheritance predicate,
# counting only the alleles that satisfy `is_missense`.
# NOTE(review): the diseases in this cohort (Noonan syndrome 1, LEOPARD syndrome 1,
# metachondromatosis) are generally described as autosomal dominant - confirm that
# the autosomal recessive predicate is really what is intended here.
moi_predicate = ModeOfInheritancePredicate.autosomal_recessive(variant_predicate=is_missense)
moi_predicate.display_question()

These are the categorizations that can be produced:

In [None]:
# Collect the categorizations that `moi_predicate` can produce.
# The bare `cats` expression on the last line displays them as the cell output;
# `cats` is indexed by position in the following cell.
cats = moi_predicate.get_categorizations()
cats

However, we are only interested in comparing `HET` vs `BIALLELIC_ALT`, so we will filter the other category away:

In [None]:
from gpsea.analysis.predicate.genotype import filtering_predicate

# Restrict the genotype predicate to the categorizations at positions 1 and 2 of `cats`.
# NOTE(review): the surrounding text says these are `HET` and `BIALLELIC_ALT`;
# the order of `cats` is not visible here - confirm before relying on the indices.
gt_predicate = filtering_predicate(
    predicate=moi_predicate,
    targets=(cats[1], cats[2]),
)
gt_predicate.display_question()

### Phenotype predicates

In [None]:
from gpsea.analysis.predicate.phenotype import prepare_predicates_for_terms_of_interest

# Prepare the phenotype predicates for the HPO terms of interest in the cohort.
# NOTE(review): `min_n_of_patients_with_term=2` presumably skips terms annotated in
# fewer than two individuals, and `missing_implies_excluded=False` presumably means
# an unannotated term is not treated as excluded - confirm against the GPSEA docs.
pheno_predicates = prepare_predicates_for_terms_of_interest(
    cohort=cohort,
    hpo=hpo,
    missing_implies_excluded=False,
    min_n_of_patients_with_term=2,
)
# Displayed as the cell output: the number of phenotype predicates to be tested.
len(pheno_predicates)

### MTC phenotype filter

In [None]:
from gpsea.analysis.mtc_filter import HpoMtcFilter
# Default HPO multiple-testing filter.
# NOTE(review): the exact meaning of `term_frequency_threshold=0.2` is defined by
# GPSEA and is not visible here - confirm against the `HpoMtcFilter` documentation.
mtc_filter = HpoMtcFilter.default_filter(hpo=hpo, term_frequency_threshold=0.2)

# Multiple-testing correction method and significance level passed to the analysis below.
mtc_correction = "fdr_bh"
mtc_alpha = 0.05

### Count statistic

In [None]:
from gpsea.analysis.pcats.stats import FisherExactTest

# Statistic handed to the analysis below as `count_statistic`.
count_statistic = FisherExactTest()

### Finalize the analysis

In [None]:
from gpsea.analysis.pcats import HpoTermAnalysis

# Assemble the analysis from the pieces configured in the previous cells:
# the count statistic, the HPO multiple-testing filter, and the correction settings.
#
# `analysis` must stay bound to this `HpoTermAnalysis` instance: the next cell calls
# `analysis.compare_genotype_vs_phenotypes(cohort=..., gt_predicate=..., pheno_predicates=...)`
# on it. An earlier revision of this cell rebound `analysis` to the object returned by
# `configure_cohort_analysis(...)`, which discarded `count_statistic`, `mtc_filter`,
# `mtc_correction` and `mtc_alpha`; that leftover has been removed.
analysis = HpoTermAnalysis(
    count_statistic=count_statistic,
    mtc_filter=mtc_filter,
    mtc_correction=mtc_correction,
    mtc_alpha=mtc_alpha,
)

## Run the analyses

Compare missense variants vs. other variants.

In [None]:
# Run the genotype-phenotype comparison for the cohort, using the filtered genotype
# predicate and the phenotype predicates prepared above. `result` is inspected in the
# following cells.
result = analysis.compare_genotype_vs_phenotypes(
    cohort=cohort,
    gt_predicate=gt_predicate,
    pheno_predicates=pheno_predicates,
)

In [None]:
# Displayed as the cell output: the `total_tests` attribute of the analysis result.
result.total_tests

In [None]:
from gpsea.analysis.predicate import PatientCategories

# Summarize the result for the `YES` patient category and show the first ten rows.
summary = result.summarize(hpo, PatientCategories.YES)
summary.head(10)

TODO - finalize!