### List of Imports

In [1]:
import os
import warnings
import requests
import pandas as pd
import numpy as np
import pandas as pd
from Bio.PDB import Selection, NeighborSearch
from collections import defaultdict
from Bio.PDB import PDBParser
import dask.dataframe as dd

In a future release, Dask DataFrame will use a new implementation that
contains several improvements including a logical query planning.
The user-facing DataFrame API will remain unchanged.

The new implementation is already available and can be enabled by
installing the dask-expr library:

    $ pip install dask-expr

and turning the query planning option on:

    >>> import dask
    >>> dask.config.set({'dataframe.query-planning': True})
    >>> import dask.dataframe as dd

API documentation for the new implementation is available at
https://docs.dask.org/en/stable/dask-expr-api.html

Any feedback can be reported on the Dask issue tracker
https://github.com/dask/dask/issues 


    # via Python

    # via CLI


  import dask.dataframe as dd


# Paths definition

In [2]:
# User dependent paths (must be passed as arguments)
work_dir= "/Users/fede/Desktop/Lab/Git/project_orellana_lab/mutational_clustering_work"
out_dir= "/Users/fede/Desktop/Lab/Projects/mutational_clustering"
cosmic_genome_screens_mutant_tsv="/Users/fede/Heavyfiles/Cosmic_GenomeScreensMutant_Tsv_v99_GRCh37/Cosmic_GenomeScreensMutant_v99_GRCh37.tsv"
clinvar_variant_summary_txt="/Users/fede/Heavyfiles/clinvar_variant_summary.txt"

# User independent paths (repo files)
cosmic_classification= f"{work_dir}/Cosmic_Classification_v99_GRCh37.tsv"

# Other paths
test_pdb=f"{work_dir}/1yoh.pdb"

# Class object that we utilize as framework

Initialized with either:
- gene_name
- uniprot_id

In [25]:
class MutationalClusterer:
    def __init__(self, work_dir, out_dir,
                 cosmic_classification=None, 
                 cosmic_genome_screens_mutant_tsv=None, 
                 clinvar_variant_summary_txt=None, 
                 uniprot_id= None,
                 gene_name= None,
                 logging=True):
        '''
        General File structure should be something like:
        https://tree.nathanfriend.io/?s=(%27opGHs!(%27fancy8~fullPath!fJse~trailingSlash8~rootDot8)~K(%27K%27work_dir5C0QD64Swissprot.DB4OthA_files_for_Mlookup_in_d6.DB--IntAmediates4F_protein_mulGple_seq2.Ji4summary_staGsGcs29quJity2_df93-Results4*Mdf9*MplotsL*4*additHJ4*5Pymol_output-Example4screenshotL35PDBsQ4D6_parsA_s7%204NeighbFhood_clustAA.py43-3-%27)~vAsiH!%271%27)*%20%20-5*0HsAvaGH2_Jignment3...%2F4-*5%5Cn*6atabase7cripts8!true9.csv4AerFourGtiHonJalKsFce!L.png4Mc0_Q-S7-%01QMLKJHGFA987654320-*
        .
        └── work_dir/
            ├── Conservation/
            │   ├── Scripts
            │   ├── Database/
            │   │   ├── Swissprot.DB
            │   │   └── Other_files_for_conservation_lookup_in_database.DB
            │   ├── Intermediates/
            │   │   ├── our_protein_multiple_seq_alignment.ali
            │   │   ├── summary_statistics_alignment.csv
            │   │   ├── quality_alignment_df.csv
            │   │   └── .../
            │   └── Results/
            │       ├── conservation_df.csv
            │       ├── conservation_plots.png
            │       └── additonal
            ├── Pymol_output/
            │   └── Example/
            │       ├── screenshot.png
            │       └── .../
            └── PDBs/
                ├── Scripts/
                │   ├── Database_parser_scripts 
                │   ├── Neighbourhood_clusterer.py
                │   └── .../
                └── .../
        
        '''
        self.work_dir = work_dir #the directory we want to work in
        self.log_dir = logging # we want to store log results for whatever we do.
        self.cosmic_classification = cosmic_classification
        self.cosmic_genome_screens_mutant_tsv= cosmic_genome_screens_mutant_tsv
        self.clinvar_variant_summary_txt= clinvar_variant_summary_txt
        self.uniprot_id= uniprot_id
        self.gene_name= gene_name
        self.out_dir= out_dir
        #Your other features of the class that we need.

        self.genename_or_uniprotid()
        
    def genename_or_uniprotid(self):
        """
        if uniprot_id is provided to the class, this is always converted 
        to gene name
        """
        if self.gene_name is not None:
            pass
        elif self.uniprot_id is not None:
            self.gene_name = self._get_gene_name(self.uniprot_id)
        else:
            raise ValueError("Either uniprot_id or gene_name must be provided")

    def _get_gene_name(self, uniprot_id:str):
        """
        returns gene name from uniprot id
        """
    
        fields = "id"
        
        URL = f"https://rest.uniprot.org/uniprotkb/search?format=tsv&fields={fields}&query={uniprot_id}"
        resp = self._get_url(URL)
        resp = resp.text
        resp = resp.split("\n")

        if not resp[1]:
            raise ValueError("No gene name found for this Uniprot ID")

        if resp[2]:
            #ambigous result
            warnings.warn("Gene name search resulted in more than one hit:\n" + str(resp[1:]))
        
        #don't care about organism specification e.g. EFGR_HUMAN -> EGFR
        return resp[1].split("_")[0]

    def _visualize_clusters_pymol(self, pdb:str) -> None:
        """
        Utility function to visualize cluster in pymol.
        """
        
        #from pymol import cmd
        pass

    def _plot_clusters(self, pdb:str) -> object:
        """
        Helper function to plot some statistics or quick interactive plots to investigate clustering.
        Mostly thought about pyplot or plotly interactive plots i.e alignment where we can see the conservation etc.
        https://plotly.com/python/alignment-chart/
        """

    def compute_neighbours(self, pdb:str, cutoff=8.0) -> pd.DataFrame:

        parser= PDBParser()
        # Initialize parser and retrieve structure
        structure = parser.get_structure("default", pdb)
        atom_list = Selection.unfold_entities(structure, "A")  # Retrieve all atoms
    
        # Initialize NeighborSearch with all atoms and prepare to store results
        ns = NeighborSearch(atom_list)
        neighbour_dict = defaultdict(set)  # Use set to avoid duplicates
    
        # Define list of standard amino acids to exclude solvents and ligands
        aa_lst = [
            "VAL", "ALA", "GLY", "TRP", "ARG", "LYS", "LEU", "ILE", "ASP", "ASN",
            "GLN", "GLU", "PRO", "TYR", "PHE", "SER", "THR", "CYS", "MET", "HIS"
        ]
    
        # Search for neighboring residues for each atom
        for atom in atom_list:
            residue = atom.get_parent()
            res_name = residue.get_resname()
            res_id = residue.get_id()[1]
    
            # Skip non-amino acid residues
            if res_name not in aa_lst:
                continue
    
            # Search for neighboring residues within the cutoff distance
            for neighbour in ns.search(atom.get_coord(), cutoff, "R"):
                neighbour_id = neighbour.get_id()[1]
                if neighbour_id != res_id:  # Exclude the residue itself
                    neighbour_dict[res_name + str(res_id)].add(neighbour_id)
    
            
        # Convert the neighbor dictionary to a list of tuples
        neighbour_data = [(res_id, ' '.join(map(str, sorted(neighbours)))) for res_id, neighbours in neighbour_dict.items()]

        # Create a pandas DataFrame
        df_neighbours = pd.DataFrame(neighbour_data, columns=['Residue_ID', 'Neighbours'])
        return df_neighbours
    

    def conservation(self, uniprot_id):
        '''Gets 3 different types of Conservation:
        - Shannon conservation: 
        Shannon entropy. 
        Higher values indicate lower conservation and greater variability at the site.
        
        - Relative conservation:
        Kullback-Leibler divergence.
        Higher values indicate greater conservation and lower variability at the site.
        
        - Lockless conservation
        Evolutionary conservation parameter defined by Lockless and Ranganathan (1999). 
        Higher values indicate greater conservation and lower variability at the site.
        '''

        if self.log_dir and not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir)
        
        mmseq_fasta_result = self._mmseq_multi_fasta(uniprot_id=uniprot_id, outdir=self.work_dir)
        #get 3 different conservation scores in a pandas df.
        conserv_df = self._get_conservation(path_to_msa=mmseq_fasta_result)
        self.conservation_df = conserv_df

        conserv_df.to_csv(f"{self.log_dir}/conservation_df.csv")

        
    def _mmseq_multi_fasta(self, uniprot_id:str, outdir:str, 
                      sensitivity=7, filter_msa=0,
                     query_id = 0.6):
        """
        uniprot_id: The unique uniprot identifier used to fetch the corresponding fasta file that will be used as a template for mmseq2
        outdir: location where result files will be stored.
        sensitivity: mmseq2 specific parameter that goes from 1-7. The higher the more sensitive the search.
        filter_msa = 0 default. if 1 hits are stricter.
        query_id = 0.6 [0, 1]  the higher the more identity with query is retrieved. 1 means ONLY the query hits while 0 means take everything possible.
        """

        #we blast with this fasta as query.
        trgt_fasta_seq = self._get_gene_fasta(uniprot_id)
        #Make outdir for all required files.
        #we need to write it out to file.
        with open(f"{self.work_dir}/{uniprot_id}_fasta.fa", "w") as fasta_out:
            fasta_out.write(f">{uniprot_id}\n")
            fasta_out.write(trgt_fasta_seq)

        #fetch pre downloaded database from a parent folder.
        msa_file = None
        new_location = None
        try:
            DB_storage_location = f"{work_dir}"
            #shutil.copy(previous_path, savepath)
            bash_curl_cmd = f"mmseqs createdb {self.work_dir}/{uniprot_id}_fasta.fa {DB_storage_location}/query_fastaDB" 
            bash_curl_cmd_rdy = bash_curl_cmd.split()
            #run first cmd which setups query database based on our input fasta file
            result_setup_query_db = run(bash_curl_cmd_rdy, stdout=PIPE, stderr=PIPE, 
                                 universal_newlines=True)
            bash_curl_cmd_2 = f"mmseqs search {DB_storage_location}/query_fastaDB {DB_storage_location}/swiss_DB {DB_storage_location}/result_DB {DB_storage_location}/tmp -s {sensitivity}"    
            bash_curl_cmd_rdy_2 = bash_curl_cmd_2.split()
            #run 2nd cmd which blasts against swiss_DB and generates the resultDB (i.e our hits that were found)
            result_setup_blast_db = run(bash_curl_cmd_rdy_2, stdout=PIPE, stderr=PIPE, 
                                 universal_newlines=True)
            #mmseqs convert2fasta DB_clu_rep DB_clu_rep.fasta
            bash_curl_cmd_5 = f"mmseqs result2msa {DB_storage_location}/query_fastaDB {DB_storage_location}/swiss_DB {DB_storage_location}/result_DB {DB_storage_location}/{uniprot_id}_out.fasta --msa-format-mode 3 --filter-msa {filter_msa} --qid {query_id}" 
            bash_curl_cmd_5_rdy = bash_curl_cmd_5.split()
            result_setup_msa_convert = run(bash_curl_cmd_5_rdy, stdout=PIPE, stderr=PIPE, 
                                 universal_newlines=True)
            #delete last line.. required.
            sed_cmd = f'sed -e 1,4d -e $d {DB_storage_location}/{uniprot_id}_out.fasta'        
            bash_curl_cmd_6_rdy = sed_cmd.split()
            #f"{DB_storage_location}/{uniprot_id}_new_out.fasta"
            with open(f"{DB_storage_location}/{uniprot_id}_new_out.fasta", "w") as new_fasta:
                result_truncation = run(bash_curl_cmd_6_rdy, stdout=new_fasta, stderr=PIPE, 
                                 universal_newlines=True)
            # Specify the path to your MSA file
            msa_file = f"{DB_storage_location}/{uniprot_id}_new_out.fasta"
            #transfer the meta file to another location and delete useless files.
            # we need to delete : all uniprot* files. 
            # all query*. All result* 
            new_location = f"{self.work_dir}/{uniprot_id}.fasta"
            shutil.copy(msa_file, new_location)
            #remove_files_and_dirs_msa(DB_storage_location, uniprot_id=uniprot_id)
            
        except Exception as error:
            print(error)
        #we want the path to msa_file for downstream analysis.
        return new_location

    def _get_gene_fasta(self, uniprot_id:str):
        '''
        Helper function to grab the sequence 
        based on the Uniprot ID
        '''
        fields = "sequence"
        URL = f"https://rest.uniprot.org/uniprotkb/search?format=fasta&fields={fields}&query={uniprot_id}"
        resp = self._get_url(URL)
        resp = resp.iter_lines(decode_unicode=True)
        seq = ""
        i = 0
        for lines in resp:
            if i > 0:
                seq += lines
            i += 1
        return seq

    def _get_conservation(self, path_to_msa:str):    
        '''
        Helper function to compute 3 different types of conservation.
        
        - Shannon conservation: 
        Shannon entropy. 
        Higher values indicate lower conservation and greater variability at the site.
        
        - Relative conservation:
        Kullback-Leibler divergence.
        Higher values indicate greater conservation and lower variability at the site.
        
        - Lockless conservation
        Evolutionary conservation parameter defined by Lockless and Ranganathan (1999). 
        Higher values indicate greater conservation and lower variability at the site.
        '''
        canal = Canal(fastafile=path_to_msa, #Multiple sequence alignment (MSA) of homologous sequences
          ref=0, #Position of reference sequence in MSA, use first sequence always
          startcount=0, # ALways 0 because our seqs are always from 1 - end
          verbose=False) # no verbosity 
    
        result_cons = canal.analysis(method="all")
        return result_cons

    def _get_url(self, url):
        '''Helper function that uses requests for Downloads.'''
        try:
            response = requests.get(url)  
            if not response.ok:
                print(response.text)
        except:
            response.raise_for_status()
            #sys.exit() 
        return response

    def _get_cosmic_mutations(self, **kwargs)->pd.DataFrame:
        """
        retrieves mutations from cosmic database from gene name (or uniprot id from higher level)
        """
            
        path=self.cosmic_genome_screens_mutant_tsv             

            
        #default columns we want to retrieve. can be changed / added through kwargs later
        usecols=['GENE_SYMBOL',
         'MUTATION_AA', 'MUTATION_DESCRIPTION', 'CHROMOSOME', 
                 'GENOME_START', 'GENOME_STOP', 'COSMIC_PHENOTYPE_ID', 'MUTATION_SOMATIC_STATUS', 'GENOMIC_MUTATION_ID']

        df = dd.read_csv(path, sep="\t", dtype={'CHROMOSOME': 'object', 
        'GENOME_START': 'float64',
       'GENOME_STOP': 'float64'}, usecols=usecols)  #specify dtype / usecols to minimize memory usage required through load in.


        #we need to switch these tuples and then map the 1letter aa code to 3letter aa 
        #for later compatibility.
        lst =  [('Val',"V"), ('Ile',"I"), ('Leu',"L"), ('Glu',"E"), ('Gln',"Q"),
                    ('Asp',"D"), ('Asn',"N"), ('His',"H"), ('Trp',"W"), ('Phe',"F"), ('Tyr',"Y"), 
                    ('Arg',"R"), ('Lys',"K"), ('Ser',"S"), ('Thr',"T"), ('Met',"M"), ('Ala',"A"), 
                    ('Gly',"G"), ('Pro',"P"), ('Cys',"C")]
        
        lst = [(y, x) for x, y in lst] #switch y and x position for convinience

        canonical_aas = defaultdict(lambda: "X", lst) #default if key not found = "X"

        #filtering based on "missense" mutation. this can be tricky and sometimes messy but lets stick with that
        df_re = df[df["MUTATION_DESCRIPTION"].str.contains("missense")]

        #now lets filter our uniprot gene name
        df_re = df_re[df_re["GENE_SYMBOL"] == f"{self.gene_name}"]

        #retrieve relevant information
        meta = ('Gene name', 'str') 
        df_re['CHROMOSOME'] = df_re['CHROMOSOME'].astype('object')
        df_re['WT_AA'] = df_re['MUTATION_AA'].str[2].apply(lambda x: canonical_aas[x], meta=meta)
        df_re['MUTATION_POSITION'] = df_re['MUTATION_AA'].str[3:-1]
        df_re['MUTATED_AA'] = df_re['MUTATION_AA'].str[-1].apply(lambda x: canonical_aas[x], meta=meta)

        #redundant so we drop it
        df_re = df_re.drop("MUTATION_AA", axis=1)

        #now we use compute() which finally does the computation (before all actions were "lazy" computations
        # so we dont actually need the RAM. now we do it though.)
        cosmic_df = df_re.compute()
    
        cosmic_df["GENOME_START"] = cosmic_df["GENOME_START"].astype(int)
        cosmic_df["GENOME_STOP"] = cosmic_df["GENOME_STOP"].astype(int)


        #Primary site is not directly retrievable
        #Fetch primary site from classification file based on cosmic_phenotype_id and merge
        classification_df = pd.read_csv(self.cosmic_classification, sep='\t')
        cosmic_df = pd.merge(cosmic_df, classification_df[['COSMIC_PHENOTYPE_ID', 'PRIMARY_SITE']], on='COSMIC_PHENOTYPE_ID', how='left')
        cosmic_df = cosmic_df.drop('COSMIC_PHENOTYPE_ID', axis=1)
            
        cosmic_df['MUTATION_POSITION'] = pd.to_numeric(cosmic_df['MUTATION_POSITION'], errors='coerce')
        cosmic_df = cosmic_df.dropna(subset=['MUTATION_POSITION'])
        cosmic_df['MUTATION_POSITION'] = cosmic_df['MUTATION_POSITION'].astype(int)
        cosmic_df = cosmic_df.sort_values('MUTATION_POSITION', ascending=True)
        
        #return df
        return cosmic_df


    def _get_gnomad_mutations(self, gnomad_data_table_path:str, **kwargs)-> pd.DataFrame:
        """Documentation.
        Currently this part does not convert the result to a df.
        I will implement it and return a pandas DF
        """
        mt = hl.read_matrix_table(path)  #matrix table because df would not work with such large data.


        #string based search because there is NO API for gnomAD.
        substring1 = self.gene_name
        substring2 = "missense"
        
        mt = mt.annotate_rows(Gene_names=mt.info.vep.map(
            lambda x: x.split("\|")[3]) ,
                          type_of_change = mt.info.vep.map(
            lambda x: x.split("\|")[1]) , 
                          AA_change = mt.info.vep.map(
            lambda x: x.split("\|")[11]) , 
                          ENST_identifier= mt.info.vep.map(
            lambda x: x.split("\|")[6])
    
        ) 
                 
        filtered_mt_2 = mt.filter_rows(
        #hl.any(lambda x: hl.str(x).contains(substring3), mt.AA_change)
        hl.any(lambda x: hl.str(x).contains(substring1), mt.info.vep) &
        hl.any(lambda x: hl.str(x).contains(substring2), mt.info.vep)
        
        )
                         
        filtered_mt_3 = filtered_mt_2.annotate_rows(
            Allele_count_int = filtered_mt_2.info.AC,
            Allele_frequency_float = filtered_mt_2.info.AF,
            Allele_number_int = filtered_mt_2.info.AN,
            Gene_name_str = _replace_empty(filtered_mt_2.Gene_names), 
            Mutation_change_str = _replace_empty(filtered_mt_2.AA_change),
            Type_of_change_str = _replace_empty(filtered_mt_2.type_of_change))
        
        #this can be again regulated later trough kwargs**
        rows_to_keep = ["Gene_name_str", "Mutation_change_str", "Type_of_change_str", "Allele_count_int",
                    "Allele_frequency_float", "Allele_number_int"]
    
    
        selected_rows = filtered_mt_3.select_rows(
            Allele_count_int=filtered_mt_3.Allele_count_int,
            Allele_frequency_float=filtered_mt_3.Allele_frequency_float,
            Allele_number_int=filtered_mt_3.Allele_number_int,
            Gene_name_str=hl.str(filtered_mt_3.Gene_name_str),
            Mutation_change_str=hl.str(filtered_mt_3.Mutation_change_str),
            Type_of_change_str=hl.str(filtered_mt_3.Type_of_change_str)
                )
    
        save_buffer = selected_rows.select_rows(*rows_to_keep)
        select_rows_out = save_buffer.rows()

        return (select_rows_out)
        #part missing to convert to pandas DF.
        
        #return pd.DataFrame() 

    def _get_clinvar_mutations(self, **kwargs)-> pd.DataFrame:
        """
        Documentation missing.
        """
        
        #can be regulated through kwargs
        use_cols = ["Type", "Name", "GeneSymbol",
           "ClinicalSignificance", "PhenotypeList",
           "Assembly", "ChromosomeAccession", 
           "Chromosome", "Start", "Stop"]
        
        #this here as well / mapping needed to save memory at load in.
        column_data_types = {
        "Type": str,
        "Name": str,
        "GeneSymbol": str,
        "ClinicalSignificance": str,
        "PhenotypeList": str,
        "Assembly": str,
        "ChromosomeAccession": str,
        "Chromosome": str,
        "Start": int,
        "Stop": int
        }

        #lets read in the clinvar all var file.
        df_work = pd.read_csv(self.clinvar_variant_summary_txt, sep="\t", usecols=use_cols, dtype=column_data_types)
    
        df_work.loc[:, "AA_change"] = df_work["Name"].str.split().str.get(-1)
        df_work.loc[:, "AA_change"] = df_work["AA_change"].str.replace("(", "")
        df_work.loc[:, "AA_change"] = df_work["AA_change"].str.replace(")", "")
        
        df_work.loc[:,"Original_AA"] = df_work["AA_change"].str[2:5]
        df_work.loc[:,"Modified_AA"] = df_work["AA_change"].str[-3:]
        df_work['Position'] = pd.to_numeric(df_work['AA_change'].str[5:-3], errors='coerce')
        
        # Drop rows with NaN values in the 'Position' column
        df_work.dropna(subset=['Position'], inplace=True)
        df_work['Position'] = df_work['Position'].astype(int)
        
        df_work["Genomic_location"] = df_work["Chromosome"] + ":" + df_work["Start"].astype(str)
        df_work["gnomad_aa_change"] = "p." + df_work["Original_AA"] + df_work["Position"].astype(str) + df_work["Modified_AA"]
        
        df_work = df_work.drop("AA_change", axis=1)
        df_work = df_work.drop("Name", axis=1)
        df_work = df_work.drop("Chromosome", axis=1)
        df_work = df_work.drop("Start", axis=1)
        df_work = df_work.drop("Stop", axis=1)
        
        
        accepted_residues = ["Ala", "Gly", "Ser", "Leu", "Pro",
                        "Ile", "Val", "Phe", "Tyr", "Trp",
                         "His", "Thr", "Asn", "Gln", "Asp", 
                         "Glu","Cys", "Met", "Lys", "Arg"]
        
        #filtering based on our Gene name.
        df_clinvar = df_work[(df_work["Type"] == "single nucleotide variant") & 
            (df_work["GeneSymbol"] == self.gene_name) &
            (df_work["Assembly"] == "GRCh37") & 
            (df_work['Original_AA'].isin(accepted_residues)) &
            (df_work['Modified_AA'].isin(accepted_residues)) ]
        
        #export

        df_clinvar=df_clinvar.sort_values('Position', ascending=True)
        
        return df_clinvar

    def export(self, **kwargs):
        #cosmic
        if 'cosmic' in kwargs and kwargs['cosmic']:
            self._get_cosmic_mutations().to_csv(os.path.join(self.out_dir, f'df_cosmic_{self.gene_name}.tsv'), sep='\t', index=False)
        #clinvar
        if 'clinvar' in kwargs and kwargs['clinvar']:
            self._get_clinvar_mutations().to_csv(os.path.join(self.out_dir, f'df_clinvar_{self.gene_name}.tsv'), sep='\t', index=False)


mmcluster = MutationalClusterer(work_dir=work_dir, out_dir=out_dir,
                                cosmic_classification=cosmic_classification, 
                                cosmic_genome_screens_mutant_tsv=cosmic_genome_screens_mutant_tsv,
                                clinvar_variant_summary_txt=clinvar_variant_summary_txt,
                                gene_name="EGFR"
                               )

In [26]:
mmcluster.export(cosmic=True)

In [68]:
mmcluster.compute_neighbours(test_pdb)

Unnamed: 0,Residue_ID,Neighbours
0,VAL1,2 3 4 6 7 79 80 133 134 137 140 235 240 246 25...
1,LEU2,1 3 4 5 6 7 8 9 10 11 76 79 80 127 129 130 131...
2,SER3,1 2 4 5 6 7 8 9 10 79 130 133 211 242 246 251 ...
3,GLU4,1 2 3 5 6 7 8 9 10 11 79 81 242 246 251 274 29...
4,GLY5,2 3 4 6 7 8 9 10 11 12 79 126 127 130 211 242 ...
...,...,...
148,LEU149,87 90 91 92 93 94 95 142 143 144 145 146 147 1...
149,GLY150,94 95 143 144 145 146 147 148 149 151 152 153 ...
150,TYR151,90 91 92 93 94 95 96 97 98 99 100 101 142 143 ...
151,GLN152,94 100 101 139 140 141 142 143 144 145 146 147...


In [21]:
mmcluster._get_cosmic_mutations()

Unnamed: 0,GENE_SYMBOL,MUTATION_DESCRIPTION,CHROMOSOME,GENOME_START,GENOME_STOP,WT_AA,MUTATION_POSITION,MUTATED_AA,PRIMARY_SITE
1114,EGFR,missense_variant,7,55210077,55210077,Gly,10,Arg,central_nervous_system
1115,EGFR,missense_variant,7,55210077,55210077,Gly,10,Arg,central_nervous_system
100,EGFR,missense_variant,7,55211056,55211056,Pro,100,His,large_intestine
807,EGFR,missense_variant,7,55211055,55211055,Pro,100,Ser,skin
808,EGFR,missense_variant,7,55211055,55211055,Pro,100,Ser,skin
...,...,...,...,...,...,...,...,...,...
3438,EGFR,missense_variant,7,55269438,55269438,Ser,997,Asn,kidney
4029,EGFR,missense_variant,7,55269464,55269464,Asp,998,Asn,breast
1585,EGFR,missense_variant,7,55269468,55269468,Arg,999,Ile,lung
2570,EGFR,missense_variant,7,55268929,55268929,Arg,999,Cys,small_intestine


In [96]:
mmcluster._get_clinvar_mutations()

Unnamed: 0,Type,GeneSymbol,ClinicalSignificance,PhenotypeList,Assembly,ChromosomeAccession,Original_AA,Modified_AA,Position,Genomic_location,gnomad_aa_change
30259,single nucleotide variant,EGFR,drug response,"Adenocarcinoma of lung, response to tyrosine k...",GRCh37,NC_000007.13,Leu,Arg,858,7:55259515,p.Leu858Arg
30263,single nucleotide variant,EGFR,Pathogenic/Likely pathogenic; drug response,"Nonsmall cell lung cancer, response to tyrosin...",GRCh37,NC_000007.13,Gly,Cys,719,7:55241707,p.Gly719Cys
30265,single nucleotide variant,EGFR,Uncertain significance; drug response,"Nonsmall cell lung cancer, response to tyrosin...",GRCh37,NC_000007.13,Gly,Ser,719,7:55241707,p.Gly719Ser
30267,single nucleotide variant,EGFR,drug response,"Nonsmall cell lung cancer, resistance to tyros...",GRCh37,NC_000007.13,Thr,Met,790,7:55249071,p.Thr790Met
56361,single nucleotide variant,EGFR,Uncertain significance,not specified,GRCh37,NC_000007.13,Val,Leu,689,7:55241617,p.Val689Leu
...,...,...,...,...,...,...,...,...,...,...,...
5470225,single nucleotide variant,EGFR,Uncertain significance,EGFR-related lung cancer,GRCh37,NC_000007.13,Met,Thr,1002,7:55268939,p.Met1002Thr
5474521,single nucleotide variant,EGFR,Uncertain significance,EGFR-related lung cancer,GRCh37,NC_000007.13,Arg,Pro,23,7:55087038,p.Arg23Pro
5475753,single nucleotide variant,EGFR,Uncertain significance,EGFR-related lung cancer,GRCh37,NC_000007.13,His,Tyr,1156,7:55273143,p.His1156Tyr
5476979,single nucleotide variant,EGFR,Uncertain significance,EGFR-related lung cancer,GRCh37,NC_000007.13,Gly,Asp,288,7:55221819,p.Gly288Asp


In [8]:
# Initialize variables to avoid errors
updated_clinvar_df = cbioport_df = cosmic_df = gnomad_df = clinvar_df = gnomad_mut_dict = gnomad_mutation_dict = None

    
# Step 1: Cosmic mutations
try:
    cosmic_df = get_cosmic_mutations(gene_name=main_prot_name)
except Exception as error:
    print(error)
#we save it in the folder for the protein outside of monomer / pos at the base level. 
save_DataFrame_to_csv(cosmic_df, path, "cosmic_mutations")
# Step 2: Map gnomad mutations
try:
    gnomad_table_path = map_gnomad(Gene_name=main_prot_name, outpath=oligo_state_to_check)
    gnomad_df, gnomad_mutation_dict = gnomad_to_pandas(Gene_name=main_prot_name, path_to_tsv=gnomad_table_path, fasta_seq=main_prot_seq)
except Exception as error:
    print(error)

save_DataFrame_to_csv(gnomad_df, path, "gnomad_mutations")
# Step 3: Gather mutations from clinvar
try:
    clinvar_df = map_clinvar(Gene_name=main_prot_name)
    clinvar_map_outpath = f"{path}/clinvar_intermediate.csv"
except Exception as error:
    print(error)

save_DataFrame_to_csv(clinvar_df, path, "clinvar_intermediate")
# Step 4: Map clinvar to gnomad
try:
    list_to_be_searched, clinvar_df = map_clinvar_to_gnomad_1(Gene_name=main_prot_name, clinvar_df=clinvar_df,
                                                              gnomad_mut_dict=gnomad_mutation_dict, clinvar_mapped_df_path=clinvar_map_outpath)
except Exception as error:
    print(error)
    # Update clinvar muts that were found 1 step before.
    try:
        updated_clinvar_df = update_clinvar_muts_based_on_gnomad(clinvar_df=clinvar_df, gnomad_dict=gnomad_mutation_dict)
    except Exception as error:
        print(error)

save_DataFrame_to_csv(updated_clinvar_df, path, "clinvar_mutations")
# Step 5: Fetch additional info from cbioportal
try:
    gene_name = get_hugo_name(uniprot_id)
    print(f"This is gene name in hugo: {gene_name}")
    cbioport_df = get_cbioportal_info(gene_name=gene_name)
    save_DataFrame_to_csv(cbioport_df, path, "cbioport_mutations")
except Exception as error:
    print(error)

# Print shapes (if available)
DataFrames = [cosmic_df, updated_clinvar_df, gnomad_df, cbioport_df]
for df in DataFrames:
    try:
        print(f"This is df shape: {df.shape}")
    except Exception as error:
        print(error)

name 'get_cosmic_mutations' is not defined


NameError: name 'save_DataFrame_to_csv' is not defined