In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
import requests
from io import StringIO

In [None]:
# Set up data folder
# DATA is the directory that the input files read later in this notebook
# (e.g. the TCGA MAF file) are resolved against.
DATA = Path('/content/drive/My Drive/data')

## Pathway mutation analysis

Use the KEGG REST API to obtain all human (`hsa`) KEGG pathways and the genes that belong to each of them.

In [None]:
# Download the gene -> pathway membership table (one row per gene/pathway pair).
KEGG_PATHWAY_LINK_BASE = "http://rest.kegg.jp/link/pathway/hsa"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as a table.
pathway_link = requests.get(KEGG_PATHWAY_LINK_BASE, timeout=60)
pathway_link.raise_for_status()
kegg = pd.read_csv(StringIO(pathway_link.text), header=None, sep="\t")
kegg.columns = ["gene_id", "pathway_id"]

# Download the human-readable name of each pathway.
KEGG_PATHWAY_LIST_BASE = "http://rest.kegg.jp/list/pathway/hsa"
pathway_list = requests.get(KEGG_PATHWAY_LIST_BASE, timeout=60)
pathway_list.raise_for_status()
kegg_name = pd.read_csv(StringIO(pathway_list.text), header=None, sep="\t")
kegg_name.columns = ["pathway_id", "pathway_name"]

# The membership table identifies pathways as "path:hsaNNNNN". Normalise the
# name table to the same form so the join still matches if the list endpoint
# returns bare "hsaNNNNN" identifiers (a no-op when the prefix is present).
kegg_name["pathway_id"] = "path:" + kegg_name["pathway_id"].astype(str).str.replace(
    r"^path:", "", regex=True)

# Attach the pathway name to every gene/pathway pair; the join key is spelled
# out instead of relying on whichever columns happen to be shared.
kegg = kegg.merge(kegg_name, on="pathway_id", how="inner")
kegg.head()

Unnamed: 0,gene_id,pathway_id,pathway_name
0,hsa:10327,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...
1,hsa:124,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...
2,hsa:125,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...
3,hsa:126,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...
4,hsa:127,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...


In [None]:
# Full path of the TCGA-BRCA MuTect somatic mutation file inside the data folder.
maf_file = DATA / "TCGA.BRCA.mutect.995c0111-d90b-4140-bee7-3845436c3b42.DR-10.0.somatic.maf.gz"

# Show every column when displaying the (very wide) MAF table.
pd.set_option('display.max_columns', None)
# maf_file already includes DATA, so it is passed as-is. Joining it with DATA a
# second time only worked because DATA is absolute and would yield a wrong path
# for a relative DATA. Lines starting with '#' are skipped via comment='#'.
maf = pd.read_csv(maf_file, sep="\t", comment='#', low_memory=False)
maf.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,dbSNP_RS,dbSNP_Val_Status,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Match_Norm_Seq_Allele1,Match_Norm_Seq_Allele2,Tumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2,Verification_Status,Validation_Status,Mutation_Status,Sequencing_Phase,Sequence_Source,Validation_Method,Score,BAM_File,Sequencer,Tumor_Sample_UUID,Matched_Norm_Sample_UUID,HGVSc,HGVSp,HGVSp_Short,Transcript_ID,Exon_Number,t_depth,t_ref_count,t_alt_count,n_depth,n_ref_count,n_alt_count,all_effects,Allele,Gene,Feature,Feature_type,One_Consequence,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,ALLELE_NUM,DISTANCE,TRANSCRIPT_STRAND,SYMBOL,SYMBOL_SOURCE,HGNC_ID,BIOTYPE,CANONICAL,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,RefSeq,SIFT,PolyPhen,EXON,INTRON,DOMAINS,GMAF,AFR_MAF,AMR_MAF,ASN_MAF,EAS_MAF,EUR_MAF,SAS_MAF,AA_MAF,EA_MAF,CLIN_SIG,SOMATIC,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,IMPACT,PICK,VARIANT_CLASS,TSL,HGVS_OFFSET,PHENO,MINIMISED,ExAC_AF,ExAC_AF_Adj,ExAC_AF_AFR,ExAC_AF_AMR,ExAC_AF_EAS,ExAC_AF_FIN,ExAC_AF_NFE,ExAC_AF_OTH,ExAC_AF_SAS,GENE_PHENO,FILTER,CONTEXT,src_vcf_id,tumor_bam_uuid,normal_bam_uuid,case_id,GDC_FILTER,COSMIC,MC3_Overlap,GDC_Validation_Status
0,USP24,23358,WUGSC,GRCh38,chr1,55159655,55159655,+,Missense_Mutation,SNP,T,T,C,rs150880897,by1000G;byCluster;byFrequency,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.1024A>G,p.Ile342Val,p.I342V,ENST00000294383,9/68,82,49,33,126,,,"USP24,missense_variant,p.I342V,ENST00000294383...",C,ENSG00000162402,ENST00000294383,Transcript,missense_variant,missense_variant,1024/10549,1024/7863,342/2620,I/V,Atc/Gtc,rs150880897,1,,-1.0,USP24,HGNC,HGNC:12623,protein_coding,YES,CCDS44154.2,ENSP00000294383,Q9UPU5,,UPI000059CFDE,NM_015306.2,tolerated(0.37),benign(0.003),9/68,,,0.002,0.0076,0.0,,0.0,0.0,0.0,0.0059,0.0005,,,,,,,,MODERATE,1.0,SNV,5.0,,,1,0.000715,0.002251,0.00947,0.0,0.0,0.0,0.002752,0.003676,0.0,,panel_of_normals,CTGGATTGTAG,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,common_in_exac;gdc_pon,,True,Unknown
1,ERICH3,127254,WUGSC,GRCh38,chr1,74571494,74571494,+,Missense_Mutation,SNP,C,C,T,,,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.4216G>A,p.Glu1406Lys,p.E1406K,ENST00000326665,14/15,86,42,44,64,,,"ERICH3,missense_variant,p.E1406K,ENST000003266...",T,ENSG00000178965,ENST00000326665,Transcript,missense_variant,missense_variant,4435/7159,4216/4593,1406/1530,E/K,Gag/Aag,,1,,-1.0,ERICH3,HGNC,HGNC:25346,protein_coding,YES,CCDS30755.1,ENSP00000322609,Q5RHP9,,UPI0000237200,NM_001002912.4,deleterious(0.01),possibly_damaging(0.833),14/15,,PROSITE_profiles:PS50313,,,,,,,,,,,,,,,,,MODERATE,1.0,SNV,5.0,,,1,,,,,,,,,,,PASS,TTCCTCTACCA,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM1474194,True,Unknown
2,KIF26B,55083,WUGSC,GRCh38,chr1,245419680,245419680,+,Silent,SNP,G,G,T,,,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.1101G>T,p.=,p.S367S,ENST00000407071,4/15,85,29,56,59,,,"KIF26B,synonymous_variant,p.S367S,ENST00000407...",T,ENSG00000162849,ENST00000407071,Transcript,synonymous_variant,synonymous_variant,1541/7287,1101/6327,367/2108,S,tcG/tcT,,1,,1.0,KIF26B,HGNC,HGNC:25484,protein_coding,YES,CCDS44342.1,ENSP00000385545,Q2KJY2,,UPI0000695D71,NM_018012.3,,,4/15,,,,,,,,,,,,,,,,,,,LOW,1.0,SNV,1.0,,,1,,,,,,,,,,,PASS,GCCTCGCAGGG,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM1473725;COSM1473726,True,Unknown
3,USP34,9736,WUGSC,GRCh38,chr2,61189055,61189055,+,Silent,SNP,G,G,C,,,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.9888C>G,p.=,p.L3296L,ENST00000398571,79/80,149,114,35,152,,,"USP34,synonymous_variant,p.L3296L,ENST00000398...",C,ENSG00000115464,ENST00000398571,Transcript,synonymous_variant,synonymous_variant,9965/11357,9888/10641,3296/3546,L,ctC/ctG,,1,,-1.0,USP34,HGNC,HGNC:20066,protein_coding,YES,CCDS42686.1,ENSP00000381577,Q70CQ2,,UPI0000410E09,NM_014709.3,,,79/80,,Low_complexity_(Seg):Seg,,,,,,,,,,,,,,,,,LOW,,SNV,2.0,,,1,,,,,,,,,,,PASS,AAAGCGAGTGC,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM1483177,True,Unknown
4,ANTXR1,84168,WUGSC,GRCh38,chr2,69245305,69245305,+,Silent,SNP,G,G,A,rs573467642,by1000G,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.1515G>A,p.=,p.S505S,ENST00000303714,18/18,96,73,23,89,,,"ANTXR1,synonymous_variant,p.S505S,ENST00000303...",A,ENSG00000169604,ENST00000303714,Transcript,synonymous_variant,synonymous_variant,1837/5859,1515/1695,505/564,S,tcG/tcA,rs573467642,1,,1.0,ANTXR1,HGNC,HGNC:21014,protein_coding,YES,CCDS1892.1,ENSP00000301945,Q9H6X2,,UPI0000049806,NM_032208.2,,,18/18,,Low_complexity_(Seg):Seg;Prints_domain:PR01217,0.0002,0.0008,0.0,,0.0,0.0,0.0,,,,,,,,,,LOW,1.0,SNV,1.0,,,1,1.6e-05,1.7e-05,9.9e-05,0.0,0.0,0.0,1.5e-05,0.0,0.0,,PASS,TCCTCGCCGCC,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM1409122,True,Unknown


In [None]:
# print('Filter:', maf.FILTER.unique())
# print('Mutation_Status:', maf.Mutation_Status.unique())
# print('Variant_Classification:', maf.Variant_Classification.unique())
# print('IMPACT:', maf.IMPACT.unique())

In [None]:
# make sure the mutations pass the "filter" and they are not modifiers
mut = maf[(maf.FILTER == 'PASS') & (maf.IMPACT != 'MODIFIER')].copy()

print("{} out of {} mutations passed the filtering criteria, {:.2f}%.".format(
    len(mut), len(maf), 100 * len(mut) / len(maf)))

# mut.head()

95711 out of 120988 mutations passed the filtering criteria, 79.11%.


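The gene IDs in the KEGG pathway table carry an `hsa:` prefix, so the same prefix is added to the Entrez gene IDs of the mutation table before the two are matched.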

In [None]:
# Add the 'hsa:' prefix to the Entrez gene IDs of the filtered mutation table so
# they can be matched against the gene_id column of the kegg table.
# Work on a copy so `mut` keeps its original Entrez_Gene_Id values.
mut_hsa = mut.copy()
# Vectorised string concatenation replaces the row-wise apply(axis=1), which
# built a full row object for every mutation just to read a single field.
mut_hsa['Entrez_Gene_Id'] = 'hsa:' + mut_hsa['Entrez_Gene_Id'].astype(str)
len(mut_hsa)

95711

In [None]:
# Preview the first rows to check that Entrez_Gene_Id now carries the prefix.
mut_hsa.head(5)

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,dbSNP_RS,dbSNP_Val_Status,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Match_Norm_Seq_Allele1,Match_Norm_Seq_Allele2,Tumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2,Verification_Status,Validation_Status,Mutation_Status,Sequencing_Phase,Sequence_Source,Validation_Method,Score,BAM_File,Sequencer,Tumor_Sample_UUID,Matched_Norm_Sample_UUID,HGVSc,HGVSp,HGVSp_Short,Transcript_ID,Exon_Number,t_depth,t_ref_count,t_alt_count,n_depth,n_ref_count,n_alt_count,all_effects,Allele,Gene,Feature,Feature_type,One_Consequence,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,ALLELE_NUM,DISTANCE,TRANSCRIPT_STRAND,SYMBOL,SYMBOL_SOURCE,HGNC_ID,BIOTYPE,CANONICAL,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,RefSeq,SIFT,PolyPhen,EXON,INTRON,DOMAINS,GMAF,AFR_MAF,AMR_MAF,ASN_MAF,EAS_MAF,EUR_MAF,SAS_MAF,AA_MAF,EA_MAF,CLIN_SIG,SOMATIC,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,IMPACT,PICK,VARIANT_CLASS,TSL,HGVS_OFFSET,PHENO,MINIMISED,ExAC_AF,ExAC_AF_Adj,ExAC_AF_AFR,ExAC_AF_AMR,ExAC_AF_EAS,ExAC_AF_FIN,ExAC_AF_NFE,ExAC_AF_OTH,ExAC_AF_SAS,GENE_PHENO,FILTER,CONTEXT,src_vcf_id,tumor_bam_uuid,normal_bam_uuid,case_id,GDC_FILTER,COSMIC,MC3_Overlap,GDC_Validation_Status
1,ERICH3,hsa:127254,WUGSC,GRCh38,chr1,74571494,74571494,+,Missense_Mutation,SNP,C,C,T,,,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.4216G>A,p.Glu1406Lys,p.E1406K,ENST00000326665,14/15,86,42,44,64,,,"ERICH3,missense_variant,p.E1406K,ENST000003266...",T,ENSG00000178965,ENST00000326665,Transcript,missense_variant,missense_variant,4435/7159,4216/4593,1406/1530,E/K,Gag/Aag,,1,,-1.0,ERICH3,HGNC,HGNC:25346,protein_coding,YES,CCDS30755.1,ENSP00000322609,Q5RHP9,,UPI0000237200,NM_001002912.4,deleterious(0.01),possibly_damaging(0.833),14/15,,PROSITE_profiles:PS50313,,,,,,,,,,,,,,,,,MODERATE,1.0,SNV,5.0,,,1,,,,,,,,,,,PASS,TTCCTCTACCA,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM1474194,True,Unknown
2,KIF26B,hsa:55083,WUGSC,GRCh38,chr1,245419680,245419680,+,Silent,SNP,G,G,T,,,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.1101G>T,p.=,p.S367S,ENST00000407071,4/15,85,29,56,59,,,"KIF26B,synonymous_variant,p.S367S,ENST00000407...",T,ENSG00000162849,ENST00000407071,Transcript,synonymous_variant,synonymous_variant,1541/7287,1101/6327,367/2108,S,tcG/tcT,,1,,1.0,KIF26B,HGNC,HGNC:25484,protein_coding,YES,CCDS44342.1,ENSP00000385545,Q2KJY2,,UPI0000695D71,NM_018012.3,,,4/15,,,,,,,,,,,,,,,,,,,LOW,1.0,SNV,1.0,,,1,,,,,,,,,,,PASS,GCCTCGCAGGG,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM1473725;COSM1473726,True,Unknown
3,USP34,hsa:9736,WUGSC,GRCh38,chr2,61189055,61189055,+,Silent,SNP,G,G,C,,,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.9888C>G,p.=,p.L3296L,ENST00000398571,79/80,149,114,35,152,,,"USP34,synonymous_variant,p.L3296L,ENST00000398...",C,ENSG00000115464,ENST00000398571,Transcript,synonymous_variant,synonymous_variant,9965/11357,9888/10641,3296/3546,L,ctC/ctG,,1,,-1.0,USP34,HGNC,HGNC:20066,protein_coding,YES,CCDS42686.1,ENSP00000381577,Q70CQ2,,UPI0000410E09,NM_014709.3,,,79/80,,Low_complexity_(Seg):Seg,,,,,,,,,,,,,,,,,LOW,,SNV,2.0,,,1,,,,,,,,,,,PASS,AAAGCGAGTGC,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM1483177,True,Unknown
4,ANTXR1,hsa:84168,WUGSC,GRCh38,chr2,69245305,69245305,+,Silent,SNP,G,G,A,rs573467642,by1000G,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.1515G>A,p.=,p.S505S,ENST00000303714,18/18,96,73,23,89,,,"ANTXR1,synonymous_variant,p.S505S,ENST00000303...",A,ENSG00000169604,ENST00000303714,Transcript,synonymous_variant,synonymous_variant,1837/5859,1515/1695,505/564,S,tcG/tcA,rs573467642,1,,1.0,ANTXR1,HGNC,HGNC:21014,protein_coding,YES,CCDS1892.1,ENSP00000301945,Q9H6X2,,UPI0000049806,NM_032208.2,,,18/18,,Low_complexity_(Seg):Seg;Prints_domain:PR01217,0.0002,0.0008,0.0,,0.0,0.0,0.0,,,,,,,,,,LOW,1.0,SNV,1.0,,,1,1.6e-05,1.7e-05,9.9e-05,0.0,0.0,0.0,1.5e-05,0.0,0.0,,PASS,TCCTCGCCGCC,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM1409122,True,Unknown
5,SCN9A,hsa:6335,WUGSC,GRCh38,chr2,166199365,166199365,+,Silent,SNP,G,G,A,,,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.5274C>T,p.=,p.V1758V,ENST00000303354,27/27,150,122,28,134,,,"SCN9A,synonymous_variant,p.V1758V,ENST00000303...",A,ENSG00000169432,ENST00000303354,Transcript,synonymous_variant,synonymous_variant,5615/9786,5274/5967,1758/1988,V,gtC/gtT,,1,,-1.0,SCN9A,HGNC,HGNC:10597,protein_coding,,,ENSP00000304748,Q15858,,UPI0001881757,,,,27/27,,Transmembrane_helices:Tmhmm;Pfam_domain:PF0052...,,,,,,,,,,,,,,,,,LOW,,SNV,5.0,,,1,,,,,,,,,,,PASS,AGTATGACTGC,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM1482144;COSM4814664,True,Unknown


In [None]:
# Parallel lists with one entry per KEGG pathway:
#   path_id      - pathway identifier
#   num_mut      - number of rows of mut_hsa whose gene belongs to the pathway
#   pathway_name - pathway name (a single string)
path_id = []
num_mut = []
pathway_name = []

mutated_gene_ids = mut_hsa['Entrez_Gene_Id']
for pid, members in kegg.groupby('pathway_id'):
    path_id.append(pid)
    # Series.sum() on the boolean mask counts the matches without iterating
    # over the Series element by element as the builtin sum() does.
    num_mut.append(int(mutated_gene_ids.isin(members['gene_id']).sum()))
    # Every row of a group carries the same name, so keep one scalar. Appending
    # the whole Series leaked its index into the table and made the list ragged.
    pathway_name.append(members['pathway_name'].iloc[0])

In [None]:
# # this cell takes too long, so use the one above

# # for n, g in kegg.groupby('pathway_id'):
# #     print(n, g)

# path_id = []
# num_mut = []

# for name, group in kegg.groupby('pathway_id'):
#   path_id.append(name)
#   num_mut.append(sum(x in list(group.gene_id) for x in mut_hsa.Entrez_Gene_Id))
# #     print(name, list(group.gene_id))
# #     print(name, sum(x in list(group.gene_id) for x in mut_hsa.Entrez_Gene_Id))

In [None]:
# Assemble the per-pathway summary table from the parallel lists built above.
# Building the frame column by column (instead of stacking with np.c_) keeps the
# counts numeric rather than coercing every column to one string/object dtype.
path_count = pd.DataFrame({
    'pathway_id': list(path_id),
    # Column name kept as 'num_samples' because the following cells refer to it.
    'num_samples': [int(count) for count in num_mut],
    # Collapse an entry to a single string if it is a whole Series of
    # (identical) names, so the column never holds Series objects.
    'pathway_name': [
        entry.iloc[0] if isinstance(entry, pd.Series) else entry
        for entry in pathway_name
    ],
})
print(path_count.shape)
path_count.head()

(336, 3)


Unnamed: 0,pathway_id,num_samples,pathway_name
0,path:hsa00010,241,0 Glycolysis / Gluconeogenesis - Homo sapi...
1,path:hsa00020,131,68 Citrate cycle (TCA cycle) - Homo sapiens...
2,path:hsa00030,111,98 Pentose phosphate pathway - Homo sapien...
3,path:hsa00040,125,128 Pentose and glucuronate interconversion...
4,path:hsa00051,133,162 Fructose and mannose metabolism - Homo ...


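The cell below ranks the KEGG pathways by the number of PASS-filtered, non-MODIFIER mutation records that fall in genes of each pathway. Note that this value, stored in the `num_samples` column, counts mutation records rather than distinct samples.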

In [None]:
# Make sure the counts are integers so that sorting is numeric, then list the
# pathways from the highest count to the lowest.
path_count['num_samples'] = path_count['num_samples'].astype(int)
path_count.sort_values(by='num_samples', ascending=False)

Unnamed: 0,pathway_id,num_samples,pathway_name
87,path:hsa01100,6593,2966 Metabolic pathways - Homo sapiens (hum...
297,path:hsa05200,4055,27318 Pathways in cancer - Homo sapiens (hu...
294,path:hsa05168,3239,26414 Herpes simplex virus 1 infection - Ho...
291,path:hsa05165,3159,25679 Human papillomavirus infection - Homo...
147,path:hsa04151,3137,10932 PI3K-Akt signaling pathway - Homo sap...
115,path:hsa04010,2246,6241 MAPK signaling pathway - Homo sapiens ...
169,path:hsa04510,2237,13499 Focal adhesion - Homo sapiens (human)...
301,path:hsa05205,2164,28318 Proteoglycans in cancer - Homo sapien...
292,path:hsa05166,2101,26009 Human T-cell leukemia virus 1 infecti...
121,path:hsa04024,1949,7423 cAMP signaling pathway - Homo sapiens ...


The following cells calculate the mutation frequency of the top-ranked pathway: number of patients with a mutated pathway / total number of patients.

In [None]:
# Get the gene IDs under the top pathway from kegg
kegg_top = kegg[kegg.pathway_id == 'path:hsa01100']
kegg_top.head()

Unnamed: 0,gene_id,pathway_id,pathway_name
2966,hsa:10,path:hsa01100,Metabolic pathways - Homo sapiens (human)
2967,hsa:100,path:hsa01100,Metabolic pathways - Homo sapiens (human)
2968,hsa:10005,path:hsa01100,Metabolic pathways - Homo sapiens (human)
2969,hsa:10007,path:hsa01100,Metabolic pathways - Homo sapiens (human)
2970,hsa:100137049,path:hsa01100,Metabolic pathways - Homo sapiens (human)


In [None]:
# Find the gene IDs from cancer samples that match the gene IDs under the top pathway
mut_top = mut_hsa[mut_hsa.Entrez_Gene_Id.isin(kegg_top.gene_id)]
mut_top.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,dbSNP_RS,dbSNP_Val_Status,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Match_Norm_Seq_Allele1,Match_Norm_Seq_Allele2,Tumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2,Verification_Status,Validation_Status,Mutation_Status,Sequencing_Phase,Sequence_Source,Validation_Method,Score,BAM_File,Sequencer,Tumor_Sample_UUID,Matched_Norm_Sample_UUID,HGVSc,HGVSp,HGVSp_Short,Transcript_ID,Exon_Number,t_depth,t_ref_count,t_alt_count,n_depth,n_ref_count,n_alt_count,all_effects,Allele,Gene,Feature,Feature_type,One_Consequence,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,ALLELE_NUM,DISTANCE,TRANSCRIPT_STRAND,SYMBOL,SYMBOL_SOURCE,HGNC_ID,BIOTYPE,CANONICAL,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,RefSeq,SIFT,PolyPhen,EXON,INTRON,DOMAINS,GMAF,AFR_MAF,AMR_MAF,ASN_MAF,EAS_MAF,EUR_MAF,SAS_MAF,AA_MAF,EA_MAF,CLIN_SIG,SOMATIC,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,IMPACT,PICK,VARIANT_CLASS,TSL,HGVS_OFFSET,PHENO,MINIMISED,ExAC_AF,ExAC_AF_Adj,ExAC_AF_AFR,ExAC_AF_AMR,ExAC_AF_EAS,ExAC_AF_FIN,ExAC_AF_NFE,ExAC_AF_OTH,ExAC_AF_SAS,GENE_PHENO,FILTER,CONTEXT,src_vcf_id,tumor_bam_uuid,normal_bam_uuid,case_id,GDC_FILTER,COSMIC,MC3_Overlap,GDC_Validation_Status
11,PIK3CA,hsa:5290,WUGSC,GRCh38,chr3,179218303,179218303,+,Missense_Mutation,SNP,G,G,A,rs104886003,byCluster,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.1633G>A,p.Glu545Lys,p.E545K,ENST00000263967,10/21,114,70,44,83,,,"PIK3CA,missense_variant,p.E545K,ENST0000026396...",A,ENSG00000121879,ENST00000263967,Transcript,missense_variant,missense_variant,1790/9093,1633/3207,545/1068,E/K,Gag/Aag,rs104886003,1,,1.0,PIK3CA,HGNC,HGNC:8975,protein_coding,YES,CCDS43171.1,ENSP00000263967,P42336,,UPI000013D494,NM_006218.2,deleterious(0.02),probably_damaging(0.959),10/21,,Pfam_domain:PF00613;SMART_domains:SM00145;Supe...,,,,,,,,,,not_provided;pathogenic,,21264207.0,,,,,MODERATE,1.0,SNV,2.0,,1.0,1,8e-06,8e-06,0.0,0.0,0.0,0.0,1.5e-05,0.0,0.0,,PASS,TCACTGAGCAG,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM125370;COSM763,True,Unknown
12,MCCC1,hsa:56922,WUGSC,GRCh38,chr3,183092485,183092485,+,Missense_Mutation,SNP,C,C,T,rs569042803,by1000G,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.197G>A,p.Arg66His,p.R66H,ENST00000265594,3/19,191,165,26,124,,,"MCCC1,missense_variant,p.R66H,ENST00000265594,...",T,ENSG00000078070,ENST00000265594,Transcript,missense_variant,missense_variant,344/2545,197/2178,66/725,R/H,cGc/cAc,rs569042803,1,,-1.0,MCCC1,HGNC,HGNC:6936,protein_coding,YES,CCDS3241.1,ENSP00000265594,Q96RQ3,,UPI000013D646,NM_020166.3,deleterious(0.02),benign(0.276),3/19,,Pfam_domain:PF00289;PROSITE_profiles:PS50979;S...,,0.0,0.0,,0.0,0.0,0.001,,,,,,,,,,MODERATE,1.0,SNV,1.0,,,1,8e-06,8e-06,0.0,0.0,0.0,0.0,1.5e-05,0.0,0.0,,PASS,CTGTGCGCATC,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM1041745,True,Unknown
14,UGT8,hsa:7368,WUGSC,GRCh38,chr4,114665726,114665727,+,Frame_Shift_Del,DEL,AT,AT,-,,,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.1013_1014delTA,p.Ile338ArgfsTer7,p.I338Rfs*7,ENST00000310836,4/6,152,113,39,166,,,"UGT8,frameshift_variant,p.I338Rfs*7,ENST000003...",-,ENSG00000174607,ENST00000310836,Transcript,frameshift_variant,frameshift_variant,1534-1535/4084,1012-1013/1626,338/541,I/X,ATa/a,,1,,1.0,UGT8,HGNC,HGNC:12555,protein_coding,YES,CCDS3705.1,ENSP00000311648,Q16880,,UPI000013F094,NM_001128174.1,,,4/6,,Pfam_domain:PF00201;Superfamily_domains:SSF53756,,,,,,,,,,,,,,,,,HIGH,1.0,deletion,1.0,1.0,,1,,,,,,,,,,,PASS,AAACTCATAGAAT,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM1485618,True,Unknown
21,GUSB,hsa:2990,WUGSC,GRCh38,chr7,65974700,65974700,+,Missense_Mutation,SNP,C,C,T,,,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.1070G>A,p.Arg357Gln,p.R357Q,ENST00000304895,7/12,31,19,12,29,,,"GUSB,missense_variant,p.R357Q,ENST00000304895,...",T,ENSG00000169919,ENST00000304895,Transcript,missense_variant,missense_variant,1201/2300,1070/1956,357/651,R/Q,cGa/cAa,,1,,-1.0,GUSB,HGNC,HGNC:4696,protein_coding,YES,CCDS5530.1,ENSP00000302728,P08236,,UPI000013E9E0,NM_000181.3,deleterious(0),probably_damaging(0.992),7/12,,Pfam_domain:PF02836;Superfamily_domains:SSF51445,,,,,,,,,,,,,,,,,MODERATE,1.0,SNV,1.0,,,1,,,,,,,,,,,PASS,TCCCTCGGATC,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM1488634,True,Unknown
28,EHMT1,hsa:79813,WUGSC,GRCh38,chr9,137777960,137777960,+,Silent,SNP,G,G,A,rs756140987,byFrequency,TCGA-D8-A1XY-01A-11D-A14K-09,TCGA-D8-A1XY-10A-01D-A14K-09,,,,,,,,,Somatic,,,,,,Illumina HiSeq 2000,edb6d161-8f50-4c11-8246-487c4ea9a55d,8dea96d9-5017-4872-a84e-33bfd2f37b7a,c.2097G>A,p.=,p.T699T,ENST00000460843,13/27,30,18,12,42,,,"EHMT1,synonymous_variant,p.T699T,ENST000004608...",A,ENSG00000181090,ENST00000460843,Transcript,synonymous_variant,synonymous_variant,2163/5137,2097/3897,699/1298,T,acG/acA,rs756140987,1,,1.0,EHMT1,HGNC,HGNC:24650,protein_coding,YES,CCDS7050.2,ENSP00000417980,Q9H9B1,,UPI000194EC2D,NM_024757.4,,,13/27,,,,,,,,,,,,,,,,,,,LOW,1.0,SNV,5.0,,,1,4.1e-05,4.1e-05,0.0,0.0,0.000349,0.0,1.5e-05,0.0,6.1e-05,,PASS,CCAACGGGACC,d083d669-6646-463b-853e-c58da8d06439,4374e19d-c5e7-49cf-8707-05ae5aeb7369,aadee87c-6a68-4580-bd10-64ac273b1e3d,0130d616-885e-4a6c-9d03-2f17dd692a05,,COSM1489850;COSM1489851;COSM5226076,True,Unknown


In [None]:
# Number of patients with a mutated pathway: distinct tumor sample barcodes that
# have at least one record in mut_top.
num_top = mut_top['Tumor_Sample_Barcode'].nunique()
num_top

924

In [None]:
# Total number of patients: distinct tumor sample barcodes in the full,
# unfiltered MAF table.
num_total = maf['Tumor_Sample_Barcode'].nunique()
num_total

986

In [None]:
# Mutation frequency of the pathway:
# number of patients with mutated pathway / total number of patients.
mut_freq = num_top / num_total
mut_freq

0.9371196754563894

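The most mutated pathway in the TCGA breast cancer data set is hsa01100 (Metabolic pathways - Homo sapiens (human)). The mutation frequency of this pathway is 0.937, i.e. 93.7% of the samples carry at least one PASS-filtered, non-MODIFIER mutation in a gene of this pathway.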