# PyMol Mutation Visualization

This notebook can be used to visualize mutation statistics from cBioPortal in PyMol. First, the mutation statistics will be analyzed. Then a text file will be generated whose content can be used in PyMol.

In [1]:
import pandas as pd

In [2]:
# Per-gene configuration: where the cBioPortal mutation table is read from,
# where the PyMol command file is written to, and the inclusive residue range
# ("start".."end") that the analysis below treats as the kinase domain.
ntrks_data = dict(
    ntrk1=dict(
        input_path="../../data/NTRK1_lollipop.tsv",
        output_path="../../data/NTRK1_pymol_commands",
        start=510,
        end=781,
    ),
    ntrk2=dict(
        input_path="../../data/NTRK2_lollipop.tsv",
        output_path="../../data/NTRK2_pymol_commands",
        start=538,
        end=807,
    ),
    ntrk3=dict(
        input_path="../../data/NTRK3_lollipop.tsv",
        output_path="../../data/NTRK3_pymol_commands",
        start=538,
        end=839,
    ),
)

In [3]:
# Load the tab-separated NTRK1 mutation table and preview its first rows.
mutations = pd.read_table(ntrks_data["ntrk1"]["input_path"], sep="\t")
mutations.head(n=5)

Unnamed: 0,Study,Sample ID,Cancer Type,Protein Change,Annotation,Functional Impact,Mutation Type,Variant Type,Copy #,COSMIC,...,Allele Freq (N),Variant Reads,Ref Reads,Variant Reads (Normal),Ref Reads (Normal),# Mut in Sample,Exon,gnomAD,ClinVar ID,dbSNP
0,"Sarcoma (TCGA, PanCancer Atlas)",TCGA-DX-A3UA-01,Leiomyosarcoma,TPM3-NTRK1,"OncoKB: Likely Oncogenic, level_1;CIViC: NA;My...",MutationAssessor: NA;SIFT: NA;Polyphen-2: NA,Fusion,,diploid,,...,,,,,,26.0,,,,
1,"Thyroid Carcinoma (TCGA, PanCancer Atlas)",TCGA-EL-A3D4-01,Papillary Thyroid Cancer,TPM3-NTRK1,"OncoKB: Likely Oncogenic, level_1;CIViC: NA;My...",MutationAssessor: NA;SIFT: NA;Polyphen-2: NA,Fusion,,diploid,,...,,,,,,6.0,,,,
2,"Bladder Urothelial Carcinoma (TCGA, PanCancer ...",TCGA-FD-A6TI-01,Bladder Urothelial Carcinoma,TPM3-NTRK1,"OncoKB: Likely Oncogenic, level_1;CIViC: NA;My...",MutationAssessor: NA;SIFT: NA;Polyphen-2: NA,Fusion,,gain,,...,,,,,,115.0,,,,
3,"MSK-IMPACT Clinical Sequencing Cohort (MSKCC, ...",P-0003329-T01-IM5,Gallbladder Cancer,NTRK1-LMNA fusion,"OncoKB: Oncogenic, level_1;CIViC: NA;MyCancerG...",MutationAssessor: NA;SIFT: NA;Polyphen-2: NA,Fusion,,diploid,,...,,,,,,1.0,,,,
4,"MSK-IMPACT Clinical Sequencing Cohort (MSKCC, ...",P-0003705-T01-IM5,Mucinous Adenocarcinoma of the Appendix,NTRK1-LMNA fusion,"OncoKB: Oncogenic, level_1;CIViC: NA;MyCancerG...",MutationAssessor: NA;SIFT: NA;Polyphen-2: NA,Fusion,,diploid,,...,,,,,,3.0,,,,


In [4]:
# Keep only the protein-change labels of rows annotated as missense mutations.
missense_mutations = mutations.loc[
    mutations["Mutation Type"] == "Missense_Mutation", "Protein Change"
].tolist()
print(missense_mutations[:5])
print(f"Number of missense mutations: {len(missense_mutations)}")

['G595R', 'G595W', 'G595W', 'T264M', 'T264M']
Number of missense mutations: 594


In [5]:
def get_missense_mutations_dict(missense_mutations, start, end):
    """
    Count missense mutations per residue inside an inclusive residue range.

    Each label is parsed as ``<one character><residue number><one character>``,
    e.g. ``"G595R"``: everything between the first and the last character must
    be an integer residue number, and the last character is recorded as the new
    amino acid.

    Parameters
    ----------
    missense_mutations : iterable
        Protein change labels. Labels whose middle part is not an integer are
        reported via ``print`` and skipped. Non-string entries (for example the
        NaN that pandas yields for an empty cell, or ``None``) are reported and
        skipped as well instead of raising a ``TypeError``.
    start : int
        First residue number to include.
    end : int
        Last residue number to include.

    Returns
    -------
    dict
        Maps residue number to ``{"count": int, "amino_acids": set}``, where
        ``count`` is the number of labels seen for that residue and
        ``amino_acids`` holds the distinct last characters of those labels.
    """
    missense_mutations_dict = {}
    for mutation in missense_mutations:
        # Slicing a float/None would raise TypeError, which the ValueError
        # handler below does not cover, so filter such entries out up front.
        if not isinstance(mutation, str):
            print(f"Skipping non-string protein change: {mutation!r}")
            continue
        try:
            resid = int(mutation[1:-1])
        except ValueError as e:
            # Not a simple substitution label (e.g. a delins); report and skip.
            print(e)
            continue
        if not start <= resid <= end:
            continue
        residue_stats = missense_mutations_dict.setdefault(
            resid, {"count": 0, "amino_acids": set()}
        )
        residue_stats["count"] += 1
        residue_stats["amino_acids"].add(mutation[-1])
    return missense_mutations_dict

In [6]:
def get_sphere_scale(number):
    """
    Convert a count into a sphere scale value.

    The scale starts at 0.5, increases by 0.1 per counted unit
    (``number / 10``) and is capped at 1.5.

    Parameters
    ----------
    number : int or float
        Count to convert.

    Returns
    -------
    float
        ``0.5 + number / 10``, or 1.5 if that value exceeds 1.5.
    """
    lower_bound, upper_bound = 0.5, 1.5
    scaled = lower_bound + (number / 10)
    return upper_bound if scaled > upper_bound else scaled

In [7]:
def write_pymol_visualization(output_path, missense_mutations_dict):
    """
    Write a text file of PyMol commands, four lines per mutated residue.

    For every residue number in ``missense_mutations_dict`` the file receives a
    ``create`` line (selection: that residue's CA atom), a ``show_as sphere``
    line, a ``set sphere_scale`` line whose value comes from
    ``get_sphere_scale`` applied to the residue's ``"count"``, and a ``color``
    line using 0x9e000d.

    Parameters
    ----------
    output_path : str
        File to write; it is opened in ``"w"`` mode, so existing content is
        replaced.
    missense_mutations_dict : dict
        Maps residue number to a dict that has at least a ``"count"`` entry.
    """
    def _command_lines():
        # Lazily yield the command lines in file order.
        for resid, data in missense_mutations_dict.items():
            yield f"create {resid}, resid {resid} and name CA\n"
            yield f"show_as sphere, {resid}\n"
            yield f"set sphere_scale, {get_sphere_scale(data['count'])}, {resid}\n"
            yield f"color 0x9e000d, {resid}\n"

    with open(output_path, "w") as wf:
        wf.writelines(_command_lines())

In [13]:
# Run the full analysis for every configured gene: load its mutation table,
# count missense mutations inside the configured residue range, print summary
# statistics and write the PyMol command file.
for ntrk, data in ntrks_data.items():
    print(f"Analyzing mutations for {ntrk} ...")
    # NOTE: `mutations` and `missense_mutations` are rebound here, replacing the
    # NTRK1-only values created in the exploratory cells above.
    mutations = pd.read_csv(data["input_path"], sep="\t")
    missense_mutations = mutations[mutations["Mutation Type"] == "Missense_Mutation"]["Protein Change"].to_list()
    print(f"Number of missense mutations: {len(missense_mutations)}")
    # Only residues with start <= resid <= end are kept; labels that cannot be
    # parsed are printed by the helper and skipped.
    missense_mutations_dict = get_missense_mutations_dict(missense_mutations, data["start"], data["end"])
    # One dictionary key per mutated residue position.
    print(f"Number of mutated residues in kinase domain: {len(missense_mutations_dict)}")
    # Distinct (position, new amino acid) combinations.
    print(f"Number of unique mutations in kinase domain: {sum([len(value['amino_acids']) for value in missense_mutations_dict.values()])}")
    # Sum of per-residue counts, i.e. every in-range label including repeats.
    print(f"Total number of observed mutations: {sum([value['count'] for value in missense_mutations_dict.values()])}")
    print("Writing pymol visulaization script ...")
    write_pymol_visualization(data["output_path"], missense_mutations_dict)

Analyzing mutations for ntrk1 ...
Number of missense mutations: 594
invalid literal for int() with base 10: '216_V217delinsH'
invalid literal for int() with base 10: '216_V217delinsH'
invalid literal for int() with base 10: '216_V217delinsH'
Number of mutated residues in kinase domain: 109
Number of unique mutations in kinase domain: 149
Total number of observed mutations: 211
Writing pymol visulaization script ...
Analyzing mutations for ntrk2 ...
Number of missense mutations: 459
Number of mutated residues in kinase domain: 116
Number of unique mutations in kinase domain: 138
Total number of observed mutations: 182
Writing pymol visulaization script ...
Analyzing mutations for ntrk3 ...
Number of missense mutations: 835
Number of mutated residues in kinase domain: 166
Number of unique mutations in kinase domain: 224
Total number of observed mutations: 325
Writing pymol visulaization script ...
