# PyMol Mutation Visualization

This notebooks can be used to visualize mutation statistics from cBioPortal in PyMol. First mutation statistics will analyzed. Then a text file will be generated whose content can be used in PyMol.

In [1]:
import pandas as pd

In [1]:
ntrks_data = {
    "ntrk1": {
        "input_path": "../../data/cbioportal_data.tsv",
        "output_path": "../../data/NTRK1_pymol_commands",
        "start": 510,
        "end": 781},
    # "ntrk2": {
    #     "input_path": "../../data/NTRK2_lollipop.tsv",
    #     "output_path": "../../data/NTRK2_pymol_commands",
    #     "start": 538,
    #     "end": 807},
    # "ntrk3": {
    #     "input_path": "../../data/NTRK3_lollipop.tsv",
    #     "output_path": "../../data/NTRK3_pymol_commands",
    #     "start": 538,
    #     "end": 839}
}

In [3]:
import pandas as pd
mutations = pd.read_csv(ntrks_data["ntrk1"]["input_path"], sep="\t")
mutations.head()

Unnamed: 0,Study of Origin,Sample ID,Cancer Type,Cancer Type Detailed,Protein Change,Annotation,Functional Impact,Mutation Type,Variant Type,Copy #,...,Pathology Cancer Maximum Dimension (cm),Disease Surgical Margin Status,CIS,Distant Recurrence,Time to distant recurrence,Time to local recurrence status,Urothelial Primary,Year of Surgery,Event Type,Histology Classification in Primary Tumor
0,"Thyroid Carcinoma (TCGA, PanCancer Atlas)",TCGA-EL-A3D4-01,,Papillary Thyroid Cancer,TPM3-NTRK1,"OncoKB: Likely Oncogenic, level_1;CIViC: Predi...",MutationAssessor: NA;SIFT: NA;Polyphen-2: NA,fusion,FUSION,diploid,...,,,,,,,,,,
1,"Thyroid Carcinoma (TCGA, PanCancer Atlas)",TCGA-ET-A40S-01,,Papillary Thyroid Cancer,SQSTM1-NTRK1,"OncoKB: Likely Oncogenic, level_1;CIViC: Predi...",MutationAssessor: NA;SIFT: NA;Polyphen-2: NA,fusion,FUSION,shallowdel,...,,,,,,,,,,
2,"Sarcoma (TCGA, PanCancer Atlas)",TCGA-DX-A3UA-01,,Leiomyosarcoma,TPM3-NTRK1,"OncoKB: Likely Oncogenic, level_1;CIViC: Predi...",MutationAssessor: NA;SIFT: NA;Polyphen-2: NA,fusion,FUSION,diploid,...,,,,,,,,,,
3,"Bladder Urothelial Carcinoma (TCGA, PanCancer ...",TCGA-FD-A6TI-01,,Bladder Urothelial Carcinoma,TPM3-NTRK1,"OncoKB: Likely Oncogenic, level_1;CIViC: Predi...",MutationAssessor: NA;SIFT: NA;Polyphen-2: NA,fusion,FUSION,gain,...,,,,,,,,,,
4,"Skin Cutaneous Melanoma (TCGA, PanCancer Atlas)",TCGA-D3-A8GE-06,,Cutaneous Melanoma,LMNA-NTRK1,"OncoKB: Oncogenic, level_1;CIViC: Diagnostic: ...",MutationAssessor: NA;SIFT: NA;Polyphen-2: NA,fusion,FUSION,deepdel,...,,,,,,,,,,


In [4]:
missense_mutations = mutations[mutations["Mutation Type"] == "Missense_Mutation"]["Protein Change"].to_list()
print(missense_mutations[:5])
print(f"Number of missense mutations: {len(missense_mutations)}")

['G595R', 'G595R', 'G595E', 'G595W', 'G595W']
Number of missense mutations: 616


In [5]:
def get_missense_mutations_dict(missense_mutations, start, end):
    missense_mutations_dict = {}
    for mutation in missense_mutations:
        try:
            resid = int(mutation[1:-1])
        except ValueError as e:
            print(e)
            continue
        residue_new = mutation[-1]
        if start <= resid <= end:
            if resid not in missense_mutations_dict.keys():
                missense_mutations_dict[resid] = {"count": 1, "amino_acids": set(residue_new)}
            else:
                missense_mutations_dict[resid]["count"] += 1
                missense_mutations_dict[resid]["amino_acids"].add(residue_new)
    return missense_mutations_dict

In [285]:
def get_missense_mutations_dictv2(missense_mutations, start, end):
    missense_mutations_dict = {}
    for mutation in missense_mutations:
        
        try:
            resid = int(mutation[1:-1])
            residue_new = mutation[-1]
        except ValueError as e:
            print(e)
            continue
        
        if start <= resid <= end:

            if resid not in missense_mutations_dict.keys():

                missense_mutations_dict[resid] = {"count": 1, "amino_acids": {}}
            
            else:
                missense_mutations_dict[resid]["count"] += 1
            
            if resid in missense_mutations_dict.keys() and residue_new not in missense_mutations_dict[resid]["amino_acids"].keys():

                missense_mutations_dict[resid]["amino_acids"][residue_new] = 1

            else: 
                missense_mutations_dict[resid]["amino_acids"][residue_new] += 1
    
    return missense_mutations_dict

In [224]:
def get_sphere_scale(number):
    min_sphere_scale = 0.5
    max_sphere_scale = 1.5
    sphere_scale = min_sphere_scale + (number / 10)
    if sphere_scale > max_sphere_scale:
        return max_sphere_scale
    else:
        return sphere_scale

In [7]:
def write_pymol_visualization(output_path, missense_mutations_dict):
    with open(output_path, "w") as wf:
        for resid, data in missense_mutations_dict.items():
            wf.write(f"create {resid}, resid {resid} and name CA\n")
            wf.write(f"show_as sphere, {resid}\n")
            wf.write(f"set sphere_scale, {get_sphere_scale(data['count'])}, {resid}\n")
            wf.write(f"color 0x9e000d, {resid}\n")

In [205]:
# for ntrk, data in ntrks_data.items():
#     print(f"Analyzing mutations for {ntrk} ...")
#     mutations = pd.read_csv(data["input_path"], sep="\t")
#     missense_mutations = mutations[mutations["Mutation Type"] == "Missense_Mutation"]["Protein Change"].to_list()
#     print(f"Number of missense mutations: {len(missense_mutations)}")
#     missense_mutations_dict = get_missense_mutations_dict(missense_mutations, data["start"], data["end"])
#     print(f"Number of mutated residues in kinase domain: {len(missense_mutations_dict)}")
#     print(f"Number of unique mutations in kinase domain: {sum([len(value['amino_acids']) for value in missense_mutations_dict.values()])}")
#     print(f"Total number of observed mutations: {sum([value['count'] for value in missense_mutations_dict.values()])}")
#     print("Writing pymol visulaization script ...")
#     write_pymol_visualization(data["output_path"], missense_mutations_dict)

In [286]:
for ntrk, data in ntrks_data.items():
    print(f"Analyzing mutations for {ntrk} ...")
    
    mutations = pd.read_csv(data["input_path"], sep="\t")
    missense_mutations = mutations[mutations["Mutation Type"] == "Missense_Mutation"]["Protein Change"].to_list()
    
    print(f"Number of missense mutations: {len(missense_mutations)}")
    missense_mutations_dict = get_missense_mutations_dictv2(missense_mutations, data["start"], data["end"])
    
    print(f"Number of mutated residues in kinase domain: {len(missense_mutations_dict)}")
    print(f"Number of unique mutations in kinase domain: {sum([len(value['amino_acids']) for value in missense_mutations_dict.values()])}")
    print(f"Total number of observed mutations: {sum([value['count'] for value in missense_mutations_dict.values()])}")
    print("Writing pymol visulaization script ...")
    
    # write_pymol_visualization(data["output_path"], missense_mutations_dict)

Analyzing mutations for ntrk1 ...
Number of missense mutations: 616
invalid literal for int() with base 10: '216_V217delinsH'
invalid literal for int() with base 10: '216_V217delinsH'
invalid literal for int() with base 10: '216_V217delinsH'
Number of mutated residues in kinase domain: 112
Number of unique mutations in kinase domain: 154
Total number of observed mutations: 219
Writing pymol visulaization script ...


In [292]:
missense_mutations_dict

{595: {'count': 5, 'amino_acids': {'R': 2, 'E': 1, 'W': 2}},
 776: {'count': 1, 'amino_acids': {'E': 1}},
 712: {'count': 2, 'amino_acids': {'I': 1, 'R': 1}},
 686: {'count': 8, 'amino_acids': {'C': 4, 'H': 3, 'G': 1}},
 710: {'count': 1, 'amino_acids': {'L': 1}},
 598: {'count': 1, 'amino_acids': {'S': 1}},
 692: {'count': 7, 'amino_acids': {'C': 4, 'H': 3}},
 537: {'count': 1, 'amino_acids': {'E': 1}},
 649: {'count': 6, 'amino_acids': {'W': 2, 'Q': 3, 'L': 1}},
 516: {'count': 1, 'amino_acids': {'P': 1}},
 627: {'count': 1, 'amino_acids': {'R': 1}},
 579: {'count': 1, 'amino_acids': {'S': 1}},
 591: {'count': 1, 'amino_acids': {'N': 1}},
 654: {'count': 5, 'amino_acids': {'C': 2, 'H': 3}},
 578: {'count': 2, 'amino_acids': {'I': 2}},
 744: {'count': 2, 'amino_acids': {'H': 2}},
 661: {'count': 4, 'amino_acids': {'R': 1, 'E': 2, 'K': 1}},
 647: {'count': 1, 'amino_acids': {'L': 1}},
 599: {'count': 5, 'amino_acids': {'H': 2, 'C': 3}},
 612: {'count': 2, 'amino_acids': {'V': 1, 'T': 1

In [34]:
# missense_mutations_dict
sorted_keys = sorted(missense_mutations_dict, key=lambda x: (missense_mutations_dict[x]['count']))

In [35]:
sorted_keys[::-10]

[695, 643, 637, 511, 593, 626, 727, 641, 771, 539, 584, 710]

In [36]:
for residue in sorted_keys[::-10]:

    print(f"Residue {residue}", missense_mutations_dict[residue])

Residue 695 {'count': 9, 'amino_acids': {'Q'}}
Residue 643 {'count': 4, 'amino_acids': {'D', 'S', 'C'}}
Residue 637 {'count': 2, 'amino_acids': {'E'}}
Residue 511 {'count': 2, 'amino_acids': {'M'}}
Residue 593 {'count': 2, 'amino_acids': {'W'}}
Residue 626 {'count': 1, 'amino_acids': {'K'}}
Residue 727 {'count': 1, 'amino_acids': {'H'}}
Residue 641 {'count': 1, 'amino_acids': {'M'}}
Residue 771 {'count': 1, 'amino_acids': {'C'}}
Residue 539 {'count': 1, 'amino_acids': {'I'}}
Residue 584 {'count': 1, 'amino_acids': {'L'}}
Residue 710 {'count': 1, 'amino_acids': {'L'}}


In [39]:
top_ten = {key: missense_mutations_dict[key] for key in sorted_keys[::-10]}
write_pymol_visualization(data["output_path"], top_ten)

In [40]:
top_ten

{695: {'count': 9, 'amino_acids': {'Q'}},
 643: {'count': 4, 'amino_acids': {'C', 'D', 'S'}},
 637: {'count': 2, 'amino_acids': {'E'}},
 511: {'count': 2, 'amino_acids': {'M'}},
 593: {'count': 2, 'amino_acids': {'W'}},
 626: {'count': 1, 'amino_acids': {'K'}},
 727: {'count': 1, 'amino_acids': {'H'}},
 641: {'count': 1, 'amino_acids': {'M'}},
 771: {'count': 1, 'amino_acids': {'C'}},
 539: {'count': 1, 'amino_acids': {'I'}},
 584: {'count': 1, 'amino_acids': {'L'}},
 710: {'count': 1, 'amino_acids': {'L'}}}

We'll now use MDAnalysis to only count residues within X of the binding site. We'll define the binding site by KLIFS residues.

In [182]:
import MDAnalysis as mda
import numpy as np

def filter_mutations(pdb_file, mutations, cutoff):

    klifs_residues = ["514:526", "541:546", "556:568", "570:578", "586:603", "640:658", "666:671", "677"]

    u = mda.Universe(pdb_file)

    missense_mutations_res_sel = " ".join([str(key) for key in mutations.keys()])
    klifs_res_sel = "".join([res_sel + " " for res_sel in klifs_residues])

    klifs_sel = u.select_atoms(f"resid {klifs_res_sel}")
    missense_mutations_sel = u.select_atoms(f"resid {missense_mutations_res_sel}")

    missense_mutations_filter = missense_mutations_sel.select_atoms(f"around {cutoff} resid {klifs_res_sel}")

    print(f"{len(missense_mutations_sel.select_atoms('name CA').resids)} missense mutations")

    print(f"{len(klifs_sel.select_atoms('name CA').resids)} klifs residues")

    print(f"Filtered out {np.abs(len(klifs_sel.select_atoms('name CA').resids) - len(missense_mutations_sel.select_atoms('name CA').resids))} residues greater than 6 Angstroms from KLIFS residues")

    print(f"Total of {len(missense_mutations_filter.select_atoms('name CA').resids)} within {cutoff} Angstroms of KLIFS residues")

    return [int(resid) for resid in missense_mutations_filter.select_atoms('name CA').resids], [int(resid) for resid in klifs_sel.select_atoms('name CA').resids]


In [183]:
pdb = "../kinoml_modeling/data/kinoml_OEKLIFSKinaseHybridDockingFeaturizer_NTRK1_larotrectinib_complex.pdb"

filter, klifs_residues = filter_mutations(pdb_file=pdb, mutations=missense_mutations_dict, cutoff=5)

112 missense mutations
85 klifs residues
Filtered out 27 residues greater than 6 Angstroms from KLIFS residues
Total of 19 within 5 Angstroms of KLIFS residues


In [184]:
filtered_missense = {key: missense_mutations_dict[key] for key in filter}

In [185]:
filtered_missense_keys_sorted = sorted(filtered_missense, key=lambda x: (filtered_missense[x]['count']))

In [186]:
final_filtered_missense_mutations = {}

for residue in filtered_missense_keys_sorted[::-1][:10]:
    try:
        print(f"Residue {residue}", filtered_missense[residue])
        final_filtered_missense_mutations[residue] = filtered_missense[residue]
    except:
        continue



Residue 604 {'count': 5, 'amino_acids': {'Y'}}
Residue 725 {'count': 2, 'amino_acids': {'T', 'E'}}
Residue 679 {'count': 2, 'amino_acids': {'Y', 'N'}}
Residue 659 {'count': 2, 'amino_acids': {'C', 'S'}}
Residue 637 {'count': 2, 'amino_acids': {'E'}}
Residue 636 {'count': 2, 'amino_acids': {'V', 'E'}}
Residue 612 {'count': 2, 'amino_acids': {'V', 'T'}}
Residue 705 {'count': 1, 'amino_acids': {'I'}}
Residue 690 {'count': 1, 'amino_acids': {'L'}}
Residue 672 {'count': 1, 'amino_acids': {'I'}}


In [165]:
write_pymol_visualization(data["output_path"], final_filtered_missense_mutations)

Now we'll check if any mutations are in the KLIFS residues

In [288]:
mutations_in_klifs = {}
for key in klifs_residues:
    try:
        mutations_in_klifs[key] = missense_mutations_dict[key]
    except:
        continue

In [289]:
mutations_in_klifs_keys_sorted = sorted(mutations_in_klifs, key=lambda x: (mutations_in_klifs[x]['count']))

In [290]:
final_filtered_missense_mutations_in_klifs = {}

for residue in mutations_in_klifs_keys_sorted[::-1][:10]:
    try:
        print(f"Residue {residue}", mutations_in_klifs[residue])
        final_filtered_missense_mutations_in_klifs[residue] = mutations_in_klifs[residue]
    except:
        continue

Residue 649 {'count': 6, 'amino_acids': {'W': 2, 'Q': 3, 'L': 1}}
Residue 654 {'count': 5, 'amino_acids': {'C': 2, 'H': 3}}
Residue 599 {'count': 5, 'amino_acids': {'H': 2, 'C': 3}}
Residue 595 {'count': 5, 'amino_acids': {'R': 2, 'E': 1, 'W': 2}}
Residue 643 {'count': 4, 'amino_acids': {'S': 1, 'D': 2, 'C': 1}}
Residue 526 {'count': 4, 'amino_acids': {'F': 2, 'H': 1, 'R': 1}}
Residue 522 {'count': 3, 'amino_acids': {'E': 3}}
Residue 671 {'count': 2, 'amino_acids': {'T': 1, 'L': 1}}
Residue 668 {'count': 2, 'amino_acids': {'N': 2}}
Residue 658 {'count': 2, 'amino_acids': {'L': 1, 'A': 1}}


In [293]:
final_filtered_missense_mutations_in_klifs

{649: {'count': 6, 'amino_acids': {'W': 2, 'Q': 3, 'L': 1}},
 654: {'count': 5, 'amino_acids': {'C': 2, 'H': 3}},
 599: {'count': 5, 'amino_acids': {'H': 2, 'C': 3}},
 595: {'count': 5, 'amino_acids': {'R': 2, 'E': 1, 'W': 2}},
 643: {'count': 4, 'amino_acids': {'S': 1, 'D': 2, 'C': 1}},
 526: {'count': 4, 'amino_acids': {'F': 2, 'H': 1, 'R': 1}},
 522: {'count': 3, 'amino_acids': {'E': 3}},
 671: {'count': 2, 'amino_acids': {'T': 1, 'L': 1}},
 668: {'count': 2, 'amino_acids': {'N': 2}},
 658: {'count': 2, 'amino_acids': {'L': 1, 'A': 1}}}

In [195]:
write_pymol_visualization(data["output_path"], final_filtered_missense_mutations_in_klifs)

{515: {'count': 1, 'amino_acids': {'K'}},
 516: {'count': 1, 'amino_acids': {'P'}},
 517: {'count': 2, 'amino_acids': {'R', 'V'}},
 518: {'count': 1, 'amino_acids': {'K'}},
 519: {'count': 2, 'amino_acids': {'D', 'S'}},
 522: {'count': 3, 'amino_acids': {'E'}},
 524: {'count': 1, 'amino_acids': {'I'}},
 526: {'count': 4, 'amino_acids': {'F', 'H', 'R'}},
 541: {'count': 1, 'amino_acids': {'M'}},
 542: {'count': 1, 'amino_acids': {'T'}},
 543: {'count': 1, 'amino_acids': {'G'}},
 557: {'count': 1, 'amino_acids': {'Y'}},
 561: {'count': 1, 'amino_acids': {'V'}},
 574: {'count': 1, 'amino_acids': {'H'}},
 578: {'count': 2, 'amino_acids': {'I'}},
 587: {'count': 2, 'amino_acids': {'I'}},
 591: {'count': 1, 'amino_acids': {'N'}},
 593: {'count': 2, 'amino_acids': {'W'}},
 595: {'count': 5, 'amino_acids': {'E', 'R', 'W'}},
 596: {'count': 1, 'amino_acids': {'N'}},
 598: {'count': 1, 'amino_acids': {'S'}},
 599: {'count': 5, 'amino_acids': {'C', 'H'}},
 603: {'count': 1, 'amino_acids': {'P'}},

In [None]:
mutations_in_klifs
