# Permutation test


This notebook contains the functions used to randomly permute mutations within the internal exon-centered windows in order to compute the empirical p-value (two-sided test) of the difference between exons and introns. The results obtained here have been used to fill in most of the main and Supplementary Tables.

For legacy reasons, the permutation test for some of the analyses is performed in another notebook (after the respective internal exon-centered analysis).

---

## Output files

Files of the form ``exons_centered_kmer_permut.tsv`` that contain the expected number of mutations for each of the permutations.

## Input files

Files in **data** directory.

- *middle_exons_coords*: file with the coordinates of the middle exons
- *intron_coords*: file with the coordinates of the introns

Files in **results** directory.

- *kmer_DNM_signatures.txt*: file with the mutational signatures

### Other inputs

- mutations_folder: base directory where the files with the mutations are located
- tumor_type: id of the tumor (germinal in our case)
- cluster_id: id of the cluster

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tabix  # package pytabix
import numpy.polynomial.polynomial as poly
import matplotlib.ticker as plticker
import gzip, pickle
from os import path
import math
import pybedtools
from bgreference import hg19
import statistics as st

In [2]:
# Input files read by this notebook (paths relative to the working directory)
consequence_ranking_file = 'data/consequence/consequence_ranking_info.tsv'
consequence_type_file = 'data/consequence/consequence_ranking.tsv.bgz'
middle_exons_coords = 'data/coordinates/genes_middle_exon_coords.bed.gz'

# Mutation data location and tumor identifier
tumor_type = 'germinal'
mutations_folder = 'non_provided_data/mutations/'

## Functions

All the functions needed for this notebook are coded below

In [3]:
def remove_version_ensembl(x, colname):
    """
    Strip the version suffix from an Ensembl identifier.

    Args:
        x: dataframe row (any object indexable by ``colname``)
        colname: name of the column that holds the Ensembl ID

    Returns:
        str. The part of the ID that precedes the first '.'; the whole ID
        when it contains no '.'

    """
    versioned_id = x[colname]
    # partition() splits on the first '.', which is all we need here.
    unversioned_id, _, _ = versioned_id.partition('.')
    return unversioned_id

In [4]:
def stacked_sequence_level_permutation(mutations_df, exons_coords_symbol_df, signatures_file, tumor_type,
                                       middle_distance_threshold, k, n_permutations=1000, seed=None):
    """
    Get the co-ordinates of exons and flanking intronic sequences and perform the random sampling of mutations
    (``n_permutations`` times, 1000 by default). For every exon-centered window that holds at least one
    observed mutation, as many mutations as observed in that window (exon and intron together) are redistributed
    over the window positions, each position being weighted by the summed signature probability of its
    possible changes.

    Args:
        mutations_df: dataframe of mutations; the columns 'chr', 'start', 'end' and 'sample' are used.
        exons_coords_symbol_df: dataframe of exon coordinates; the columns 'chr', 'start', 'end' and 'ID'
            are used. NOTE: this dataframe is modified in place, the columns 'exon_size',
            'exon_middle_start', 'exon_middle_end', 'region_start' and 'region_end' are added to it.
        signatures_file: tab-separated file with a 'mutation' column (the key, written as
            ``str((ref_kmer, alt_kmer))``) and a 'Probability_<tumor_type>' column.
        tumor_type: suffix that selects the probability column of the signatures file.
        middle_distance_threshold: number of positions taken at each side of the exon middle position;
            the window spans 2*middle_distance_threshold+1 positions.
        k: size of the kmer; the mutated base is the one at index k//2.
        n_permutations: number of random samplings (default 1000).
        seed: optional seed. When None (default) the global numpy random state is used; otherwise a
            dedicated ``np.random.RandomState(seed)`` makes the sampling reproducible.

    Returns:
        tuple (collect_df, exonic_obs, intronic_obs). ``collect_df`` has one column per permutation and the
        rows 'exon_Exp' and 'intron_Exp' with the sampled counts; ``exonic_obs`` and ``intronic_obs`` are the
        observed number of mutations mapped to exonic and intronic intervals.

    Raises:
        ValueError: if the positions of a window with observed mutations have no probability mass at all.

    """

    nucleotides = set(['A', 'T', 'C', 'G'])
    half = k // 2

    # Get signatures
    all_signatures = pd.read_csv(signatures_file, sep='\t')
    probability_name = 'Probability_' + tumor_type
    sub_signatures_df = all_signatures[['mutation', probability_name]]
    signatures_dict = sub_signatures_df.set_index('mutation').T.to_dict()

    # Create bed of the selected mutations
    sub_mutations_df = mutations_df[['chr', 'start', 'end', 'sample']]
    mutations_bed = pybedtools.BedTool.from_dataframe(sub_mutations_df)

    exons_coords_symbol_df['exon_size'] = (exons_coords_symbol_df['end'] - exons_coords_symbol_df['start'])
    # Integer floor division gives floor(start + exon_size/2) without a row-wise apply
    # (which also fails on an empty dataframe).
    exons_coords_symbol_df['exon_middle_start'] = (exons_coords_symbol_df['start'] +
                                                   exons_coords_symbol_df['exon_size'] // 2)

    exons_coords_symbol_df['exon_middle_end'] = exons_coords_symbol_df['exon_middle_start'] + 1
    exons_coords_symbol_df['region_start'] = exons_coords_symbol_df['exon_middle_start'] - middle_distance_threshold
    exons_coords_symbol_df['region_end'] = exons_coords_symbol_df['exon_middle_end'] + middle_distance_threshold

    # Left flank "intronic" sequences => from region_start to the exon start.
    intron1_df = exons_coords_symbol_df[['chr', 'region_start', 'start', 'ID']]
    ## The region may span less than the centered exonic sequence. Only take left flank intronic coordinates with sense.
    intron1_df = intron1_df[intron1_df['region_start'] <= intron1_df['start']]
    intron1_df.columns = ['chr', 'start', 'end', 'ID']

    # Right flank "intronic" sequences => from the exon end to region_end.
    intron2_df = exons_coords_symbol_df[['chr', 'end', 'region_end', 'ID']]
    ## The region may span less than the centered exonic sequence. Only take right flank intronic coordinates with sense.
    intron2_df = intron2_df[intron2_df['end'] <= intron2_df['region_end']]
    intron2_df.columns = ['chr', 'start', 'end', 'ID']
    ## Take them together
    intron_df = pd.concat([intron1_df, intron2_df])
    intron_df['label'] = 'intron'

    # Exonic sequences
    all_exon_df = exons_coords_symbol_df[['chr', 'start', 'end', 'ID', 'region_start', 'region_end']]
    exon_length = all_exon_df['end'] - all_exon_df['start']
    window_length = 2 * middle_distance_threshold + 1
    ## Exons that surpass the region size are trimmed to the region.
    large_exon_df = all_exon_df[exon_length > window_length]
    large_exon_df = large_exon_df[['chr', 'region_start', 'region_end', 'ID']]
    large_exon_df.columns = ['chr', 'start', 'end', 'ID']
    ## Exons that do not surpass the region size are taken whole.
    short_exon_df = all_exon_df[exon_length <= window_length]
    short_exon_df = short_exon_df[['chr', 'start', 'end', 'ID']]
    short_exon_df.columns = ['chr', 'start', 'end', 'ID']
    ## Take them together
    exon_df = pd.concat([short_exon_df, large_exon_df])
    exon_df['label'] = 'exon'

    # Merge exons and introns
    coords_symbol_df = pd.concat([exon_df, intron_df])
    coords_symbol_bed = pybedtools.BedTool.from_dataframe(coords_symbol_df)

    # Filter mutations by the position of interest
    my_bed = coords_symbol_bed.intersect(mutations_bed, wo=True)

    # Raw string: "\s" is not a valid escape sequence in a normal string literal.
    mutations_mapped = pd.read_table(my_bed.fn, names = ['chr', 'start', 'end', 'ID','label', 'mut_chr',
                                    'mut_start', 'mut_end', 'sample', 'overlap_bp'],  sep=r"\s+", index_col=False)

    print("Total amount of mutations at the middle exon-centered sequences is "+ str(len(mutations_mapped)))
    exonic_obs = mutations_mapped[mutations_mapped['label'] == 'exon'].shape[0]
    intronic_obs = mutations_mapped[mutations_mapped['label'] == 'intron'].shape[0]
    print("Observed at middle exons: " + str(exonic_obs))
    print("Observed at flanking introns: " + str(intronic_obs))

    # Compute expected mutation by randomization approach
    rng = np.random if seed is None else np.random.RandomState(seed)
    exon_expected = np.zeros(n_permutations, dtype=np.int64)
    intron_expected = np.zeros(n_permutations, dtype=np.int64)

    # For each unique stacked exon-centered sequence with one or more mutations
    for gene in mutations_mapped['ID'].unique():
        position_prob_lol = list()
        label_flag_lol = list()

        # Take the exonic and the flanking intronic sequences.
        for _, row in coords_symbol_df[coords_symbol_df['ID']==gene].iterrows():
            my_chr = row['chr'][3:]  # drop the first three characters of the chromosome name
            my_start = int(row['start'])
            my_end = int(row['end'])
            n_bases = my_end - my_start

            ## Get sequence, extended k//2 bases at each side so every position has a full kmer.
            my_exon_bases = hg19(my_chr, my_start+1-half, size=n_bases+2*half)
            ## Divide sequences into kmers
            my_kmers = [my_exon_bases[i:i+k] for i in range(len(my_exon_bases)-(k-1))]

            label_flag = 1 if row['label']=='exon' else 0  # 1 marks exon, 0 marks intron

            ## Foreach kmer
            for my_kmer in my_kmers:
                my_ref_base = my_kmer[half:half+1].upper() #Get the reference
                previous_base = my_kmer[0:half] #Get previous and next bases
                next_base = my_kmer[half+1:]
                tri_ref = previous_base + str(my_ref_base) + next_base
                my_base_probs = 0

                ### Foreach alternative, each of the three possible changes given a kmer.
                for alt_base in nucleotides - set(my_ref_base):
                    tri_alt = previous_base + str(alt_base) + next_base
                    #### Create key for the signature dictionary
                    my_key = str((tri_ref, tri_alt))
                    try:
                        ##### Add the probability of the three possible changes
                        my_base_probs = my_base_probs + signatures_dict[my_key][probability_name]
                    except KeyError:
                        # Change absent from the signatures table: it contributes no probability.
                        pass

                position_prob_lol.append(my_base_probs)
                label_flag_lol.append(label_flag)

        # Total number of mutations falling in the stacked sequence.
        mutcnt = len(mutations_mapped[mutations_mapped['ID']==gene])
        # Normalize the probability vector and perform sampling
        prb_vector = np.array(position_prob_lol, dtype=float)
        total_prob = prb_vector.sum()
        if not total_prob > 0:
            raise ValueError("No signature probability available for the positions of window '{}'; "
                             "cannot sample mutations.".format(gene))
        prb_vector = prb_vector/total_prob
        label_vector = np.array(label_flag_lol)
        mutation_rand_label = rng.choice(label_vector, size=mutcnt*n_permutations, replace=True, p=prb_vector)

        # One row per permutation; labels are 0/1, so the row sum is the exon count
        # and the remainder of the mutcnt draws fell in introns.
        exon_draws = mutation_rand_label.reshape(n_permutations, mutcnt).sum(axis=1)
        exon_expected += exon_draws
        intron_expected += mutcnt - exon_draws

    collect_rand = {
        sam: {'exon_Exp': exon_expected[sam], 'intron_Exp': intron_expected[sam]}
        for sam in range(n_permutations)
    }
    collect_df = pd.DataFrame.from_dict(collect_rand)

    return(collect_df, exonic_obs, intronic_obs)

In [5]:
def compute_CpG_site(my_ref, my_pre, my_post):
    """
    Compute if mutation falls on CpG site

    Args:
        my_ref: reference base
        my_pre: base(s) preceding the reference base
        my_post: base(s) following the reference base

    Returns:
        str. 'CpG' (C followed by G), 'nonCpG' (any other C), 'GpC' (G preceded by C),
        'nonGpC' (any other G), 'A' or 'T'. None when the reference base is none of
        A, C, G, T (e.g. 'N'), instead of failing on an unassigned variable.

    """
    if my_ref == 'C':
        return 'CpG' if my_post == 'G' else 'nonCpG'
    if my_ref == 'G':
        return 'GpC' if my_pre == 'C' else 'nonGpC'
    if my_ref in ('A', 'T'):
        return my_ref
    # Unknown reference base: no site class can be assigned.
    return None

In [6]:
def stacked_sequence_level_permutation_CpG(mutations_df, exons_coords_symbol_df, signatures_file, tumor_type,
                                       middle_distance_threshold, k, n_permutations=1000, seed=None):
    """
    Get the co-ordinates of exons and flanking intronic sequences and perform the random sampling of mutations
    (``n_permutations`` times, 1000 by default). For every exon-centered window that holds at least one
    observed mutation, as many mutations as observed in that window (exon and intron together) are redistributed
    over the window positions. Each position is weighted by the summed signature probability of its possible
    changes, where the reference context is the site class returned by ``compute_CpG_site``.

    Args:
        mutations_df: dataframe of mutations; the columns 'chr', 'start', 'end' and 'sample' are used.
        exons_coords_symbol_df: dataframe of exon coordinates; the columns 'chr', 'start', 'end' and 'ID'
            are used. NOTE: this dataframe is modified in place, the columns 'exon_size',
            'exon_middle_start', 'exon_middle_end', 'region_start' and 'region_end' are added to it.
        signatures_file: tab-separated file with a 'mutation' column (the key, written as
            ``str((site_class, alt_base))``) and a 'Probability_<tumor_type>' column.
        tumor_type: suffix that selects the probability column of the signatures file.
        middle_distance_threshold: number of positions taken at each side of the exon middle position;
            the window spans 2*middle_distance_threshold+1 positions.
        k: size of the kmer; the mutated base is the one at index k//2.
        n_permutations: number of random samplings (default 1000).
        seed: optional seed. When None (default) the global numpy random state is used; otherwise a
            dedicated ``np.random.RandomState(seed)`` makes the sampling reproducible.

    Returns:
        tuple (collect_df, exonic_obs, intronic_obs). ``collect_df`` has one column per permutation and the
        rows 'exon_Exp' and 'intron_Exp' with the sampled counts; ``exonic_obs`` and ``intronic_obs`` are the
        observed number of mutations mapped to exonic and intronic intervals.

    Raises:
        ValueError: if the positions of a window with observed mutations have no probability mass at all.

    """

    nucleotides = set(['A', 'T', 'C', 'G'])
    half = k // 2

    # Get signatures
    all_signatures = pd.read_csv(signatures_file, sep='\t')
    probability_name = 'Probability_' + tumor_type
    sub_signatures_df = all_signatures[['mutation', probability_name]]
    signatures_dict = sub_signatures_df.set_index('mutation').T.to_dict()

    # Create bed of the selected mutations
    sub_mutations_df = mutations_df[['chr', 'start', 'end', 'sample']]
    mutations_bed = pybedtools.BedTool.from_dataframe(sub_mutations_df)

    exons_coords_symbol_df['exon_size'] = (exons_coords_symbol_df['end'] - exons_coords_symbol_df['start'])
    # Integer floor division gives floor(start + exon_size/2) without a row-wise apply
    # (which also fails on an empty dataframe).
    exons_coords_symbol_df['exon_middle_start'] = (exons_coords_symbol_df['start'] +
                                                   exons_coords_symbol_df['exon_size'] // 2)

    exons_coords_symbol_df['exon_middle_end'] = exons_coords_symbol_df['exon_middle_start'] + 1
    exons_coords_symbol_df['region_start'] = exons_coords_symbol_df['exon_middle_start'] - middle_distance_threshold
    exons_coords_symbol_df['region_end'] = exons_coords_symbol_df['exon_middle_end'] + middle_distance_threshold

    # Left flank "intronic" sequences => from region_start to the exon start.
    intron1_df = exons_coords_symbol_df[['chr', 'region_start', 'start', 'ID']]
    ## The region may span less than the centered exonic sequence. Only take left flank intronic coordinates with sense.
    intron1_df = intron1_df[intron1_df['region_start'] <= intron1_df['start']]
    intron1_df.columns = ['chr', 'start', 'end', 'ID']

    # Right flank "intronic" sequences => from the exon end to region_end.
    intron2_df = exons_coords_symbol_df[['chr', 'end', 'region_end', 'ID']]
    ## The region may span less than the centered exonic sequence. Only take right flank intronic coordinates with sense.
    intron2_df = intron2_df[intron2_df['end'] <= intron2_df['region_end']]
    intron2_df.columns = ['chr', 'start', 'end', 'ID']
    ## Take them together
    intron_df = pd.concat([intron1_df, intron2_df])
    intron_df['label'] = 'intron'

    # Exonic sequences
    all_exon_df = exons_coords_symbol_df[['chr', 'start', 'end', 'ID', 'region_start', 'region_end']]
    exon_length = all_exon_df['end'] - all_exon_df['start']
    window_length = 2 * middle_distance_threshold + 1
    ## Exons that surpass the region size are trimmed to the region.
    large_exon_df = all_exon_df[exon_length > window_length]
    large_exon_df = large_exon_df[['chr', 'region_start', 'region_end', 'ID']]
    large_exon_df.columns = ['chr', 'start', 'end', 'ID']
    ## Exons that do not surpass the region size are taken whole.
    short_exon_df = all_exon_df[exon_length <= window_length]
    short_exon_df = short_exon_df[['chr', 'start', 'end', 'ID']]
    short_exon_df.columns = ['chr', 'start', 'end', 'ID']
    ## Take them together
    exon_df = pd.concat([short_exon_df, large_exon_df])
    exon_df['label'] = 'exon'

    # Merge exons and introns
    coords_symbol_df = pd.concat([exon_df, intron_df])
    coords_symbol_bed = pybedtools.BedTool.from_dataframe(coords_symbol_df)

    # Filter mutations by the position of interest
    my_bed = coords_symbol_bed.intersect(mutations_bed, wo=True)

    # Raw string: "\s" is not a valid escape sequence in a normal string literal.
    mutations_mapped = pd.read_table(my_bed.fn, names = ['chr', 'start', 'end', 'ID','label', 'mut_chr',
                                    'mut_start', 'mut_end', 'sample', 'overlap_bp'],  sep=r"\s+", index_col=False)

    print("Total amount of mutations at the middle exon-centered sequences is "+ str(len(mutations_mapped)))
    exonic_obs = mutations_mapped[mutations_mapped['label'] == 'exon'].shape[0]
    intronic_obs = mutations_mapped[mutations_mapped['label'] == 'intron'].shape[0]
    print("Observed at middle exons: " + str(exonic_obs))
    print("Observed at flanking introns: " + str(intronic_obs))

    # Compute expected mutation by randomization approach
    rng = np.random if seed is None else np.random.RandomState(seed)
    exon_expected = np.zeros(n_permutations, dtype=np.int64)
    intron_expected = np.zeros(n_permutations, dtype=np.int64)

    # For each unique stacked exon-centered sequence with one or more mutations
    for gene in mutations_mapped['ID'].unique():
        position_prob_lol = list()
        label_flag_lol = list()

        # Take the exonic and the flanking intronic sequences.
        for _, row in coords_symbol_df[coords_symbol_df['ID']==gene].iterrows():
            my_chr = row['chr'][3:]  # drop the first three characters of the chromosome name
            my_start = int(row['start'])
            my_end = int(row['end'])
            n_bases = my_end - my_start

            ## Get sequence, extended k//2 bases at each side so every position has a full kmer.
            my_exon_bases = hg19(my_chr, my_start+1-half, size=n_bases+2*half)
            ## Divide sequences into kmers
            my_kmers = [my_exon_bases[i:i+k] for i in range(len(my_exon_bases)-(k-1))]

            label_flag = 1 if row['label']=='exon' else 0  # 1 marks exon, 0 marks intron

            ## Foreach kmer
            for my_kmer in my_kmers:
                my_ref_base = my_kmer[half:half+1].upper() #Get the reference
                previous_base = my_kmer[0:half] #Get previous and next bases
                next_base = my_kmer[half+1:]
                # The reference context is the site class instead of the kmer itself.
                tri_ref = compute_CpG_site(my_ref_base, previous_base, next_base)
                my_base_probs = 0

                ### Foreach alternative, each of the three possible changes given a kmer.
                for alt_base in nucleotides - set(my_ref_base):
                    #### Create key for the signature dictionary
                    my_key = str((tri_ref, alt_base))
                    try:
                        ##### Add the probability of the three possible changes
                        my_base_probs = my_base_probs + signatures_dict[my_key][probability_name]
                    except KeyError:
                        # Change absent from the signatures table: it contributes no probability.
                        pass

                position_prob_lol.append(my_base_probs)
                label_flag_lol.append(label_flag)

        # Total number of mutations falling in the stacked sequence.
        mutcnt = len(mutations_mapped[mutations_mapped['ID']==gene])
        # Normalize the probability vector and perform sampling
        prb_vector = np.array(position_prob_lol, dtype=float)
        total_prob = prb_vector.sum()
        if not total_prob > 0:
            raise ValueError("No signature probability available for the positions of window '{}'; "
                             "cannot sample mutations.".format(gene))
        prb_vector = prb_vector/total_prob
        label_vector = np.array(label_flag_lol)
        mutation_rand_label = rng.choice(label_vector, size=mutcnt*n_permutations, replace=True, p=prb_vector)

        # One row per permutation; labels are 0/1, so the row sum is the exon count
        # and the remainder of the mutcnt draws fell in introns.
        exon_draws = mutation_rand_label.reshape(n_permutations, mutcnt).sum(axis=1)
        exon_expected += exon_draws
        intron_expected += mutcnt - exon_draws

    collect_rand = {
        sam: {'exon_Exp': exon_expected[sam], 'intron_Exp': intron_expected[sam]}
        for sam in range(n_permutations)
    }
    collect_df = pd.DataFrame.from_dict(collect_rand)

    return(collect_df, exonic_obs, intronic_obs)

In [7]:
def synonymous_or_not(x):
    """
    Label a consequence type as synonymous or non synonymous
    according to its RANK value.

    Args:
        x: dataframe row (any object indexable by 'RANK')

    Returns:
        str. *non_synonymous* when RANK is below 15, *synonymous* otherwise

    """
    return 'non_synonymous' if x['RANK'] < 15 else 'synonymous'

In [8]:
def check_consequence(chromosome, start_coord, alt, tabix, rank_info):
    """
    Function that classifies a given exonic mutation into synonymous 
    or not given a rank info and a tabix file with predicted effect.

    Args:
        chromosome: chromosome coordinate; its first three characters are dropped before querying.
        start_coord: coordinate of the mutation
        alt: alternative nucleotide
        tabix: tabix indexed file (object exposing ``querys``) containing the predicted VEP classification
            effect for a given position and change.
        rank_info: dictionary with all the types of predicted effect and if we classify them as synonymous or not.
    Returns:
        The ``rank_info`` value (*synonymous* or *non_synonymous*) of the first record whose alternative
        matches ``alt``; the string 'None' when that record's consequence is not a key of ``rank_info``;
        None when no record matches.

    """
    my_chr = chromosome[3:]
    effects = tabix.querys("{}:{}-{}".format(my_chr, start_coord, start_coord))
    for effect in effects:
        # Records with fewer than six fields carry no consequence column.
        if len(effect) < 6:
            continue
        if effect[3] == alt:
            consequence = effect[5].rstrip('\r')
            try:
                return rank_info[consequence]
            except KeyError:
                # Only an unknown consequence maps to 'None'; any other error is a real
                # problem and is no longer swallowed.
                return 'None'
    return None

In [9]:
def compute_intron_probs(sequence, k, signatures_dict, tumor_type):
    """
    Function that computes the summed probabilities of the three possible kmer changes for each kmer
    in a given intronic sequence of a middle exon-centered window.
    
    Args:
        sequence: intronic sequence. It should contain k//2 extra nucleotides on each side to compute all the
            kmers.
        k: size of the kmer; the mutated base is the one at index k//2.
        signatures_dict: Dictionary containing the probability of each kmer change. Keys are
            ``str((ref_kmer, alt_kmer))`` and values are dictionaries holding a 'Probability_<tumor_type>' entry.
        tumor_type: suffix that selects the probability entry.
      
    Returns:
        list with one value per kmer of the sequence (in order): the sum of the probabilities of the
        changes found in ``signatures_dict``; 0 when none is found.

    """
    
    nucleotides = set(['A', 'T', 'C', 'G'])
    probability_name = 'Probability_' + tumor_type
    half = k // 2
    position_prob_lol = list()
    
    ## Divide intronic sequences into kmers
    my_kmers = [sequence[i:i+k] for i in range(len(sequence)-(k-1))]
        
    ## Foreach kmer
    for my_kmer in my_kmers:
        my_ref_base = my_kmer[half:half+1].upper() #Get the reference
        previous_base = my_kmer[0:half] #Get previous and next bases
        next_base = my_kmer[half+1:]
        tri_ref = previous_base + str(my_ref_base) + next_base
        my_base_probs = 0
            
        ### Foreach alternative, each of the three possible changes given a kmer.
        for alt_base in nucleotides - set(my_ref_base):
            tri_alt = previous_base + str(alt_base) + next_base
            #### Create key for the signature dictionary
            my_key = str((tri_ref, tri_alt))
            try:
                ##### Add the probability of the three possible changes
                my_base_probs = my_base_probs + signatures_dict[my_key][probability_name]
            except KeyError:
                # Change absent from the signatures: it contributes no probability.
                pass
                
        position_prob_lol.append(my_base_probs)
        
    return position_prob_lol

In [10]:
def compute_exon_probs(chromosome, abs_start, sequence, k, signatures_dict, tumor_type, rank_info, tabix, include):
    """
    Function that computes, for each kmer in a given exonic sequence of a middle exon-centered window, the
    summed probabilities of the changes whose predicted consequence class equals ``include``.
    
    Args:
        chromosome: chromosome name, used as given in the tabix query.
        abs_start: coordinate such that the first kmer is queried at ``abs_start + 1``; each following
            kmer is queried one position further.
        sequence: exonic sequence. It should contain k//2 extra nucleotides on each side to compute all
            the kmers.
        k: size of the kmer; the mutated base is the one at index k//2.
        signatures_dict: Dictionary containing the probability of each kmer change. Keys are
            ``str((ref_kmer, alt_kmer))`` and values are dictionaries holding a 'Probability_<tumor_type>' entry.
        tumor_type: suffix that selects the probability entry.
        rank_info: dictionary mapping each predicted consequence to its class.
        tabix: tabix indexed file (object exposing ``querys``) with the predicted effect per position
            and change; field 3 of a record is read as the alternative base and field 5 as the consequence.
        include: class of ``rank_info`` whose changes are taken into account.
      
    Returns:
        list with one value per kmer of the sequence (in order): the sum of the probabilities of the
        included changes; 0 when there is none.

    """
    
    probability_name = 'Probability_' + tumor_type
    half = k // 2
    position_prob_lol = list()
    
    ## Divide exonic sequences into kmers
    my_kmers = [sequence[i:i+k] for i in range(len(sequence)-(k-1))]
    
    #Foreach kmer
    for my_kmer in my_kmers:
        #Get the reference
        my_ref_base = my_kmer[half:half+1].upper()
        my_base_probs = 0
        #Get previous and next bases
        previous_base = my_kmer[0:half]
        next_base = my_kmer[half+1:]
        tri_ref = previous_base + str(my_ref_base) + next_base
        effects = tabix.querys("{}:{}-{}".format(chromosome, abs_start + 1, abs_start + 1))
        for effect in effects:
            if len(effect) < 6:
                # The original code referenced an undefined name here and raised NameError.
                print('At chromosome ' + str(chromosome) + ' position ' + str(abs_start + 1) + ' no effect is reported.')
                continue
            consequence = effect[5].rstrip('\r')
            try:
                effect_class = rank_info[consequence]
            except KeyError:
                # Consequence without a class: the change is ignored.
                continue
            if effect_class != include:
                continue
            #Build alternative kmer and the key for the signature dictionary.
            tri_alt = previous_base + str(effect[3]) + next_base
            my_key = str((tri_ref, tri_alt))
            try:
                #Sum the probability of the included changes.
                my_base_probs = my_base_probs + signatures_dict[my_key][probability_name]
            except KeyError:
                # Change absent from the signatures: it contributes no probability.
                pass
        
        position_prob_lol.append(my_base_probs)
        abs_start += 1
    
    return position_prob_lol

In [11]:
def stacked_sequence_level_permutation_excluding(mutations_df, exons_coords_symbol_df, signatures_file, tumor_type,
                                middle_distance_threshold, k, include, tb_consequence_type, consequence_rank_dict,
                                randomizations=1000, seed=None):
    """
    Permute the mutations observed in each middle exon-centered window between the exon and its flanking
    intronic sequences, counting exonic mutations only when their consequence label equals ``include``.

    For every window holding at least one countable mutation (any intronic mutation, or an exonic mutation
    whose consequence equals ``include``), the same number of mutations is redistributed ``randomizations``
    times over the positions of the window. Position weights come from ``compute_intron_probs`` (flanks) and
    ``compute_exon_probs`` (exon) and are normalized to sum to one within the window. The per-window draws
    are summed over all windows for each randomization.

    Args:
        mutations_df: DataFrame of mutations; the columns 'chr', 'start', 'end', 'alt' and 'sample' are used.
        exons_coords_symbol_df: DataFrame of middle exons with columns 'chr', 'start', 'end' and 'ID'.
            It is modified in place: the helper columns 'exon_size', 'exon_middle_start', 'exon_middle_end',
            'region_start' and 'region_end' are added to it.
        signatures_file: path to a tab-separated file with a 'mutation' column and a
            'Probability_<tumor_type>' column.
        tumor_type: suffix selecting the probability column of the signatures file.
        middle_distance_threshold: number of bases kept on each side of the exon midpoint.
        k: size of the k-mer context; k//2 extra bases are fetched on each side of every sequence.
        include: consequence label to keep; compared with the value returned by ``check_consequence`` and
            forwarded to ``compute_exon_probs``.
        tb_consequence_type: consequence lookup handle forwarded to ``check_consequence`` and
            ``compute_exon_probs`` (presumably an open tabix file - confirm against the caller).
        consequence_rank_dict: consequence ranking forwarded to the same two helpers.
        randomizations: number of random redistributions per window (default 1000, the value that used to be
            hard-coded).
        seed: optional seed. When given, sampling uses a dedicated ``numpy.random.RandomState(seed)`` so the
            result is reproducible; when None the global NumPy random state is used, as before.

    Returns:
        Tuple ``(collect_df, exonic_obs, intronic_obs)``. ``collect_df`` has one column per randomization and
        the rows 'exon_Exp' and 'intron_Exp' holding the summed expected counts; ``exonic_obs`` is the number
        of mapped exonic mutations whose consequence equals ``include``; ``intronic_obs`` is the number of
        mapped intronic mutations.

    Raises:
        ValueError: if a window with mutations to redistribute has zero total probability.
    """

    # Get signatures
    all_signatures = pd.read_csv(signatures_file, sep='\t')
    probability_name = 'Probability_' + tumor_type
    sub_signatures_df = all_signatures[['mutation', probability_name]]
    signatures_dict = sub_signatures_df.set_index('mutation').T.to_dict()

    # Create bed of the selected mutations
    sub_mutations_df = mutations_df[['chr', 'start', 'end', 'alt', 'sample']]
    mutations_bed = pybedtools.BedTool.from_dataframe(sub_mutations_df)

    # Exon midpoint (floored) and the window of middle_distance_threshold bases on each side of it.
    # NOTE: these helper columns are written onto the caller's DataFrame.
    exons_coords_symbol_df['exon_size'] = (exons_coords_symbol_df['end'] - exons_coords_symbol_df['start'])
    exons_coords_symbol_df['exon_middle_start'] = np.floor(
        exons_coords_symbol_df['start'] + exons_coords_symbol_df['exon_size']/2).astype(int)
    exons_coords_symbol_df['exon_middle_end'] = exons_coords_symbol_df['exon_middle_start'] + 1
    exons_coords_symbol_df['region_start'] = exons_coords_symbol_df['exon_middle_start'] - middle_distance_threshold
    exons_coords_symbol_df['region_end'] = exons_coords_symbol_df['exon_middle_end'] + middle_distance_threshold

    # Left flank "intronic" sequences => region_start and start from coordinates.
    intron1_df = exons_coords_symbol_df[['chr', 'region_start', 'start', 'ID']]
    ## The region may span less than the centered exonic sequence. Only take left flank intronic coordinates with sense.
    intron1_df = intron1_df[intron1_df['region_start'] <= intron1_df['start']]
    intron1_df.columns = ['chr', 'start', 'end', 'ID']

    # Right flank "intronic" sequences
    intron2_df = exons_coords_symbol_df[['chr', 'end', 'region_end', 'ID']]
    ## The region may span less than the centered exonic sequence. Only take right flank intronic coordinates with sense.
    intron2_df = intron2_df[intron2_df['end'] <= intron2_df['region_end']]
    intron2_df.columns = ['chr', 'start', 'end', 'ID']
    ## Take them together
    intron_df = pd.concat([intron1_df, intron2_df])
    intron_df['label'] = 'intron'

    # Exonic sequences
    all_exon_df = exons_coords_symbol_df[['chr', 'start', 'end', 'ID', 'region_start', 'region_end']]
    ## The region may span less than the centered exonic sequence. Process exons that surpass the region size.
    large_exon_df = all_exon_df[(all_exon_df['end']-all_exon_df['start']) > (2*middle_distance_threshold+1)]
    large_exon_df = large_exon_df[['chr', 'region_start', 'region_end', 'ID']]
    large_exon_df.columns = ['chr', 'start', 'end', 'ID']
    ## The region may span less than the centered exonic sequence. Process exons that do not surpass the region size.
    short_exon_df = all_exon_df[(all_exon_df['end']-all_exon_df['start']) <= (2*middle_distance_threshold+1)]
    short_exon_df = short_exon_df[['chr', 'start', 'end', 'ID']]
    short_exon_df.columns = ['chr', 'start', 'end', 'ID']
    ## Take them together
    exon_df = pd.concat([short_exon_df, large_exon_df])
    exon_df['label'] = 'exon'

    # Merge exons and introns
    coords_symbol_df = pd.concat([exon_df, intron_df])
    coords_symbol_bed = pybedtools.BedTool.from_dataframe(coords_symbol_df)

    # Filter mutations by the position of interest
    my_bed = coords_symbol_bed.intersect(mutations_bed, wo=True)

    # Raw string: "\s" in a plain string literal is an invalid escape sequence.
    mutations_mapped = pd.read_table(my_bed.fn, names = ['chr', 'start', 'end', 'ID','label', 'mut_chr', 'mut_start',
                                    'mut_end', 'mut_alt', 'sample', 'overlap_bp'],  sep=r"\s+", index_col=False)

    mutations_mapped['conseq'] = mutations_mapped.apply(lambda x:
                                    check_consequence(x['mut_chr'], x['mut_end'], x['mut_alt'], tb_consequence_type,
                                    consequence_rank_dict), axis=1)

    print("Total amount of mutations at the middle exon-centered sequences is "+ str(len(mutations_mapped)))
    exonic_obs = mutations_mapped[(mutations_mapped['label'] == 'exon') &
                                  (mutations_mapped['conseq'] == include)].shape[0]
    intronic_obs = mutations_mapped[mutations_mapped['label'] == 'intron'].shape[0]
    print("Observed at middle exons: " + str(exonic_obs))
    print("Observed at flanking introns: " + str(intronic_obs))

    # Mutations that take part in the permutation: every intronic one, and exonic ones with the kept consequence.
    countable = ((mutations_mapped['label'] == 'intron') |
                 ((mutations_mapped['label'] == 'exon') & (mutations_mapped['conseq'] == include)))

    # Expected counts per randomization, accumulated over all windows.
    exon_exp = np.zeros(randomizations, dtype=np.int64)
    intron_exp = np.zeros(randomizations, dtype=np.int64)

    # Global NumPy state unless a seed was requested.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # For each unique stacked exon-centered sequence with one or more mutations
    for gene in mutations_mapped['ID'].unique():
        # Total number of mutations to redistribute in this stacked sequence.
        mutcnt = int((countable & (mutations_mapped['ID'] == gene)).sum())
        if mutcnt == 0:
            # Only excluded exonic mutations fall here: nothing to redistribute, so skip the
            # sequence retrieval and the per-position consequence look-ups altogether.
            continue

        position_prob_lol = []
        label_flag_lol = []

        # Take the exonic and the flanking intronic sequences.
        for dx, row in coords_symbol_df[coords_symbol_df['ID']==gene].iterrows():
            my_chr = row['chr'][3:]  # drops the first three characters (assumes a 'chr' prefix)
            my_start = int(row['start'])
            my_end = int(row['end'])
            n_bases = my_end - my_start

            ## Get sequence, extended by k//2 bases on each side to give every position its full context.
            my_sequence = hg19(my_chr, my_start+1-(k//2), size=n_bases+2*(k//2))

            if row['label'] == 'intron':
                label_flag = 0
                probs = compute_intron_probs(my_sequence, k, signatures_dict, tumor_type)
            else:
                label_flag = 1
                probs = compute_exon_probs(my_chr, my_start, my_sequence, k, signatures_dict, tumor_type,
                                           consequence_rank_dict, tb_consequence_type, include)

            position_prob_lol = position_prob_lol + probs
            label_flag_lol = label_flag_lol + [label_flag]*len(probs)

        # Normalize the probability vector and perform sampling
        prb_vector = np.array(position_prob_lol)
        total_prob = prb_vector.sum()
        if not total_prob > 0:
            raise ValueError("Window with ID " + str(gene) + " has " + str(mutcnt) +
                             " mutation(s) to redistribute but zero total probability.")
        prb_vector = prb_vector/total_prob
        label_vector = np.array(label_flag_lol)
        mutation_rand_label = rng.choice(label_vector, size=mutcnt*randomizations, replace=True, p=prb_vector)

        # One row per randomization (consecutive chunks of mutcnt draws); label 1 marks an exonic position.
        exon_draws = (mutation_rand_label.reshape(randomizations, mutcnt) == 1).sum(axis=1)
        exon_exp += exon_draws
        intron_exp += mutcnt - exon_draws

    collect_rand = {sam: {'exon_Exp': exon_exp[sam], 'intron_Exp': intron_exp[sam]}
                    for sam in range(randomizations)}
    collect_df = pd.DataFrame.from_dict(collect_rand)

    return(collect_df, exonic_obs, intronic_obs)

## Load data

In [12]:
## Get exon coordinates
exons_coords_df = pd.read_csv(middle_exons_coords, sep="\t", header=None, low_memory=False)
exons_coords_df.columns = ['chr', 'start', 'end', 'ensembl', 'symbol', 'strand']
# .copy() turns the column subset into an independent frame, so the assignment below does not act on a
# slice of the original (avoids pandas' SettingWithCopyWarning).
exons_coords_df = exons_coords_df[['chr', 'start', 'end', 'ensembl']].copy()
exons_coords_df['ensembl'] = exons_coords_df.apply(lambda x: remove_version_ensembl(x, 'ensembl'), axis=1)
# Independent copy again: this frame receives an 'ID' column here, and
# stacked_sequence_level_permutation_excluding adds further columns to the frame it is given.
exons_coords_symbol_df = exons_coords_df[['chr', 'start', 'end', 'ensembl']].copy()
# 1-based identifier derived from the row index
exons_coords_symbol_df['ID'] = exons_coords_symbol_df.index + 1

## Get mutations
mutations_file = mutations_folder + 'germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']
# Reference to the full table: every mutation type, including the 'class' column.
sub_copy_mutations_df = mutations_df
# Substitutions only, without the 'class' column; copied so that adding 'ID' does not act on a slice.
mutations_df = mutations_df.loc[mutations_df['type'] == 'subs',
                                ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']].copy()
mutations_df['ID'] = mutations_df.index + 1

## Random permutation of mutations at stacked sequence level

### 1-mer, all datasets

In [None]:
# Run settings: k-mer size, half-width of the exon-centered window, signature file
k = 1
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_new.txt'

exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k
)

# Transpose the returned table, then save it without its index
exons_centered_results = exons_centered_results.T
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut.tsv'),
    sep="\t",
    header=True,
    index=False,
)

#### Compute empirical p-value

In [None]:
k = 1
RANDOMIZATION = 1000

# Load the per-permutation expected exon/intron mutation counts
rand_output = pd.read_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut.tsv'),
    sep="\t",
)

# Upper-tail count: permutations with at least as many expected exonic mutations as observed.
# A count of zero is raised to 1, so the smallest reportable p-value is 1/RANDOMIZATION.
# NOTE(review): the notebook introduction mentions a two-sided test; this count is one-sided - confirm.
count = max(1, int((rand_output['exon_Exp'] >= exonic_obs).sum()))

results = [[
    exonic_obs,
    rand_output['exon_Exp'].mean(),
    intronic_obs,
    rand_output['intron_Exp'].mean(),
    count / RANDOMIZATION,
]]

results_df = pd.DataFrame(
    results,
    columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt', 'empirical_pvalue'],
)
results_df

In [None]:
# Percentage by which the observed exonic count exceeds the mean expected count
exonic_increase = 100 * (results_df["obs_exon_cnt"] / results_df["exp_exon_cnt"] - 1)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = 100 * (exonic_obs / rand_output['exon_Exp'] - 1)
# Standard deviation of those percentages divided by sqrt(1000), used as a +/- band
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(1000)
(exonic_increase - error, exonic_increase + error)

### 3-mer, all datasets

In [None]:
# Run settings: k-mer size, half-width of the exon-centered window, signature file
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_new.txt'

exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k
)

# Transpose the returned table, then save it without its index
exons_centered_results = exons_centered_results.T
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut.tsv'),
    sep="\t",
    header=True,
    index=False,
)

#### Compute empirical p-value

In [None]:
k = 3
RANDOMIZATION = 1000

# Load the per-permutation expected exon/intron mutation counts
rand_output = pd.read_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut.tsv'),
    sep="\t",
)

# Upper-tail count: permutations with at least as many expected exonic mutations as observed.
# A count of zero is raised to 1, so the smallest reportable p-value is 1/RANDOMIZATION.
# NOTE(review): the notebook introduction mentions a two-sided test; this count is one-sided - confirm.
count = max(1, int((rand_output['exon_Exp'] >= exonic_obs).sum()))

results = [[
    exonic_obs,
    rand_output['exon_Exp'].mean(),
    intronic_obs,
    rand_output['intron_Exp'].mean(),
    count / RANDOMIZATION,
]]

results_df = pd.DataFrame(
    results,
    columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt', 'empirical_pvalue'],
)
results_df

In [None]:
# Percentage by which the observed exonic count exceeds the mean expected count
exonic_increase = 100 * (results_df["obs_exon_cnt"] / results_df["exp_exon_cnt"] - 1)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = 100 * (exonic_obs / rand_output['exon_Exp'] - 1)
# Standard deviation of those percentages divided by sqrt(1000), used as a +/- band
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(1000)
(exonic_increase - error, exonic_increase + error)

### 3-mer, all datasets, CCDS signatures

In [14]:
# Run settings: k-mer size, half-width of the exon-centered window, CCDS signature file
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_CCDS.txt'

exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k
)

# Transpose the returned table, then save it without its index
exons_centered_results = exons_centered_results.T
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_CCDS.tsv'),
    sep="\t",
    header=True,
    index=False,
)

Total amount of mutations at the middle exon-centered sequences is 50780
Observed at middle exons: 4669
Observed at flanking introns: 46111


#### Compute empirical p-value

In [15]:
k = 3
RANDOMIZATION = 1000

# Load the per-permutation expected exon/intron mutation counts (CCDS signatures)
rand_output = pd.read_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_CCDS.tsv'),
    sep="\t",
)

# Upper-tail count: permutations with at least as many expected exonic mutations as observed.
# A count of zero is raised to 1, so the smallest reportable p-value is 1/RANDOMIZATION.
# NOTE(review): the notebook introduction mentions a two-sided test; this count is one-sided - confirm.
count = max(1, int((rand_output['exon_Exp'] >= exonic_obs).sum()))

results = [[
    exonic_obs,
    rand_output['exon_Exp'].mean(),
    intronic_obs,
    rand_output['intron_Exp'].mean(),
    count / RANDOMIZATION,
]]

results_df = pd.DataFrame(
    results,
    columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt', 'empirical_pvalue'],
)
results_df

Unnamed: 0,obs_exon_cnt,exp_exon_cnt,obs_intron_cnt,exp_intron_cnt,empirical_pvalue
0,4669,4325.909,46111,46454.091,0.001


In [16]:
# Percentage by which the observed exonic count exceeds the mean expected count
exonic_increase = 100 * (results_df["obs_exon_cnt"] / results_df["exp_exon_cnt"] - 1)
exonic_increase

0    7.931073
dtype: float64

In [17]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = 100 * (exonic_obs / rand_output['exon_Exp'] - 1)
# Standard deviation of those percentages divided by sqrt(1000), used as a +/- band
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(1000)
(exonic_increase - error, exonic_increase + error)

(0    7.88393
 dtype: float64, 0    7.978216
 dtype: float64)

In [18]:
# Sample standard deviation of the per-permutation percentage increases (not divided by sqrt(1000))
st.stdev(rand_output['increase'].tolist())

1.490800762925067

### 3-mer, specific datasets

#### Francioli

In [None]:
# Keep the mutations whose 'sample' column equals this dataset label
tumor_type = 'GONL'
sub_mutations_df = mutations_df.loc[mutations_df['sample'] == tumor_type]

In [None]:
# Run settings: k-mer size, half-width of the exon-centered window, signature file
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/ALL_signatures.txt'

# Permutation on the dataset-specific mutations
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    sub_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k
)

# Transpose the returned table, then save it under a dataset-specific file name
exons_centered_results = exons_centered_results.T
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
    header=True,
    index=False,
)

##### Compute empirical p-value

In [None]:
# Observed exonic / intronic counts set by hand; they replace whatever the permutation cell returned
tumor_type = 'GONL'
exonic_obs, intronic_obs = 64, 617

In [None]:
k = 3
RANDOMIZATION = 1000

# Load the per-permutation expected exon/intron mutation counts saved for this dataset
rand_output = pd.read_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
)

# Upper-tail count: permutations with at least as many expected exonic mutations as observed.
# A count of zero is raised to 1, so the smallest reportable p-value is 1/RANDOMIZATION.
# NOTE(review): the notebook introduction mentions a two-sided test; this count is one-sided - confirm.
count = max(1, int((rand_output['exon_Exp'] >= exonic_obs).sum()))

results = [[
    exonic_obs,
    rand_output['exon_Exp'].mean(),
    intronic_obs,
    rand_output['intron_Exp'].mean(),
    count / RANDOMIZATION,
]]

results_df = pd.DataFrame(
    results,
    columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt', 'empirical_pvalue'],
)
results_df

In [None]:
# Percentage by which the observed exonic count exceeds the mean expected count
exonic_increase = 100 * (results_df["obs_exon_cnt"] / results_df["exp_exon_cnt"] - 1)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = 100 * (exonic_obs / rand_output['exon_Exp'] - 1)
# Standard deviation of those percentages divided by sqrt(1000), used as a +/- band
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(1000)
(exonic_increase - error, exonic_increase + error)

#### Goldmann 2016

In [None]:
# Keep the mutations whose 'sample' column equals this dataset label
tumor_type = 'Goldmann2016'
sub_mutations_df = mutations_df.loc[mutations_df['sample'] == tumor_type]

In [None]:
# Run settings: k-mer size, half-width of the exon-centered window, signature file
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/ALL_signatures.txt'

# Permutation on the dataset-specific mutations
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    sub_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k
)

# Transpose the returned table, then save it under a dataset-specific file name
exons_centered_results = exons_centered_results.T
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
    header=True,
    index=False,
)

##### Compute empirical p-value

In [None]:
# Observed exonic / intronic counts set by hand; they replace whatever the permutation cell returned
tumor_type = 'Goldmann2016'
exonic_obs, intronic_obs = 289, 2729

In [None]:
k = 3
RANDOMIZATION = 1000

# Load the per-permutation expected exon/intron mutation counts saved for this dataset
rand_output = pd.read_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
)

# Upper-tail count: permutations with at least as many expected exonic mutations as observed.
# A count of zero is raised to 1, so the smallest reportable p-value is 1/RANDOMIZATION.
# NOTE(review): the notebook introduction mentions a two-sided test; this count is one-sided - confirm.
count = max(1, int((rand_output['exon_Exp'] >= exonic_obs).sum()))

results = [[
    exonic_obs,
    rand_output['exon_Exp'].mean(),
    intronic_obs,
    rand_output['intron_Exp'].mean(),
    count / RANDOMIZATION,
]]

results_df = pd.DataFrame(
    results,
    columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt', 'empirical_pvalue'],
)
results_df

In [None]:
# Percentage by which the observed exonic count exceeds the mean expected count
exonic_increase = 100 * (results_df["obs_exon_cnt"] / results_df["exp_exon_cnt"] - 1)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = 100 * (exonic_obs / rand_output['exon_Exp'] - 1)
# Standard deviation of those percentages divided by sqrt(1000), used as a +/- band
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(1000)
(exonic_increase - error, exonic_increase + error)

#### Goldmann 2018

In [None]:
# Keep the mutations whose 'sample' column equals this dataset label
tumor_type = 'Goldmann2018'
sub_mutations_df = mutations_df.loc[mutations_df['sample'] == tumor_type]

In [None]:
# Run settings: k-mer size, half-width of the exon-centered window, signature file
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/ALL_signatures.txt'

# Permutation on the dataset-specific mutations
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    sub_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k
)

# Transpose the returned table, then save it under a dataset-specific file name
exons_centered_results = exons_centered_results.T
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
    header=True,
    index=False,
)

##### Compute empirical p-value

In [None]:
# Observed exonic / intronic counts set by hand; they replace whatever the permutation cell returned
tumor_type = 'Goldmann2018'
exonic_obs, intronic_obs = 485, 5113

In [None]:
k = 3
RANDOMIZATION = 1000

# Load the per-permutation expected exon/intron mutation counts saved for this dataset
rand_output = pd.read_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
)

# Upper-tail count: permutations with at least as many expected exonic mutations as observed.
# A count of zero is raised to 1, so the smallest reportable p-value is 1/RANDOMIZATION.
# NOTE(review): the notebook introduction mentions a two-sided test; this count is one-sided - confirm.
count = max(1, int((rand_output['exon_Exp'] >= exonic_obs).sum()))

results = [[
    exonic_obs,
    rand_output['exon_Exp'].mean(),
    intronic_obs,
    rand_output['intron_Exp'].mean(),
    count / RANDOMIZATION,
]]

results_df = pd.DataFrame(
    results,
    columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt', 'empirical_pvalue'],
)
results_df

In [None]:
# Percentage by which the observed exonic count exceeds the mean expected count
exonic_increase = 100 * (results_df["obs_exon_cnt"] / results_df["exp_exon_cnt"] - 1)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = 100 * (exonic_obs / rand_output['exon_Exp'] - 1)
# Standard deviation of those percentages divided by sqrt(1000), used as a +/- band
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(1000)
(exonic_increase - error, exonic_increase + error)

#### Halldorsson2019

In [None]:
# Keep the mutations whose 'sample' column equals this dataset label
tumor_type = 'Halldorsson2019'
sub_mutations_df = mutations_df.loc[mutations_df['sample'] == tumor_type]

In [None]:
# Run settings: k-mer size, half-width of the exon-centered window, signature file
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/ALL_signatures.txt'

# Permutation on the dataset-specific mutations
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    sub_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k
)

# Transpose the returned table, then save it under a dataset-specific file name
exons_centered_results = exons_centered_results.T
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
    header=True,
    index=False,
)

##### Compute empirical p-value

In [None]:
# Observed exonic / intronic counts set by hand; they replace whatever the permutation cell returned
tumor_type = 'Halldorsson2019'
exonic_obs, intronic_obs = 1236, 11659

In [None]:
k = 3
RANDOMIZATION = 1000

# Load the per-permutation expected exon/intron mutation counts saved for this dataset
rand_output = pd.read_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
)

# Upper-tail count: permutations with at least as many expected exonic mutations as observed.
# A count of zero is raised to 1, so the smallest reportable p-value is 1/RANDOMIZATION.
# NOTE(review): the notebook introduction mentions a two-sided test; this count is one-sided - confirm.
count = max(1, int((rand_output['exon_Exp'] >= exonic_obs).sum()))

results = [[
    exonic_obs,
    rand_output['exon_Exp'].mean(),
    intronic_obs,
    rand_output['intron_Exp'].mean(),
    count / RANDOMIZATION,
]]

results_df = pd.DataFrame(
    results,
    columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt', 'empirical_pvalue'],
)
results_df

In [None]:
# Percentage by which the observed exonic count exceeds the mean expected count
exonic_increase = 100 * (results_df["obs_exon_cnt"] / results_df["exp_exon_cnt"] - 1)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = 100 * (exonic_obs / rand_output['exon_Exp'] - 1)
# Standard deviation of those percentages divided by sqrt(1000), used as a +/- band
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(1000)
(exonic_increase - error, exonic_increase + error)

#### An

In [None]:
# Keep the mutations whose 'sample' column equals this dataset label
tumor_type = 'An2019'
sub_mutations_df = mutations_df.loc[mutations_df['sample'] == tumor_type]

In [None]:
# Run settings: k-mer size, half-width of the exon-centered window, signature file
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/ALL_signatures.txt'

# Permutation on the dataset-specific mutations
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    sub_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k
)

# Transpose the returned table, then save it under a dataset-specific file name
exons_centered_results = exons_centered_results.T
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
    header=True,
    index=False,
)

##### Compute empirical p-value

In [None]:
# Observed exonic / intronic counts set by hand; they replace whatever the permutation cell returned
tumor_type = 'An2019'
exonic_obs, intronic_obs = 1610, 16379

In [None]:
k = 3
RANDOMIZATION = 1000

# Load the per-permutation expected exon/intron mutation counts saved for this dataset
rand_output = pd.read_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
)

# Upper-tail count: permutations with at least as many expected exonic mutations as observed.
# A count of zero is raised to 1, so the smallest reportable p-value is 1/RANDOMIZATION.
# NOTE(review): the notebook introduction mentions a two-sided test; this count is one-sided - confirm.
count = max(1, int((rand_output['exon_Exp'] >= exonic_obs).sum()))

results = [[
    exonic_obs,
    rand_output['exon_Exp'].mean(),
    intronic_obs,
    rand_output['intron_Exp'].mean(),
    count / RANDOMIZATION,
]]

results_df = pd.DataFrame(
    results,
    columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt', 'empirical_pvalue'],
)
results_df

In [None]:
# Percentage by which the observed exonic count exceeds the mean expected count
exonic_increase = 100 * (results_df["obs_exon_cnt"] / results_df["exp_exon_cnt"] - 1)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = 100 * (exonic_obs / rand_output['exon_Exp'] - 1)
# Standard deviation of those percentages divided by sqrt(1000), used as a +/- band
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(1000)
(exonic_increase - error, exonic_increase + error)

#### An (healthy)

In [None]:
# Keep the rows whose 'sample' and 'class' columns both equal the dataset label.
# The source is the full mutation table, so mutation types other than 'subs' are not filtered out here.
# NOTE(review): comparing 'class' with the sample label looks unintended for a "healthy" subset -
# confirm the expected 'class' value.
tumor_type = 'An2019'
sub_mutations_df = sub_copy_mutations_df[
    (sub_copy_mutations_df['sample'] == tumor_type) & (sub_copy_mutations_df['class'] == tumor_type)
]

In [None]:
# Run settings: k-mer size, half-width of the exon-centered window, signature file
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/ALL_signatures.txt'

# Permutation on the dataset-specific mutations
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    sub_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k
)

# Transpose the returned table, then save it.
# NOTE(review): tumor_type is 'An2019' here as in the "An" section, so this writes to the same file
# path and replaces that section's permutation output - confirm whether a distinct name is intended.
exons_centered_results = exons_centered_results.T
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
    header=True,
    index=False,
)

##### Compute empirical p-value

In [None]:
# Observed exonic / intronic counts set by hand; they replace whatever the permutation cell returned.
# NOTE(review): these are the same values as in the "An" section - confirm they apply to the healthy subset.
tumor_type = 'An2019'
exonic_obs, intronic_obs = 1610, 16379

In [None]:
k = 3
RANDOMIZATION = 1000

# Load the per-permutation expected exon/intron mutation counts saved under the current tumor_type label
rand_output = pd.read_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
)

# Upper-tail count: permutations with at least as many expected exonic mutations as observed.
# A count of zero is raised to 1, so the smallest reportable p-value is 1/RANDOMIZATION.
# NOTE(review): the notebook introduction mentions a two-sided test; this count is one-sided - confirm.
count = max(1, int((rand_output['exon_Exp'] >= exonic_obs).sum()))

results = [[
    exonic_obs,
    rand_output['exon_Exp'].mean(),
    intronic_obs,
    rand_output['intron_Exp'].mean(),
    count / RANDOMIZATION,
]]

results_df = pd.DataFrame(
    results,
    columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt', 'empirical_pvalue'],
)
results_df

In [None]:
# Percentage by which the observed exonic count exceeds the mean expected count
exonic_increase = 100 * (results_df["obs_exon_cnt"] / results_df["exp_exon_cnt"] - 1)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = 100 * (exonic_obs / rand_output['exon_Exp'] - 1)
# Standard deviation of those percentages divided by sqrt(1000), used as a +/- band
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(1000)
(exonic_increase - error, exonic_increase + error)

#### Yuen

In [None]:
# Keep the mutations whose 'sample' column equals this dataset label
tumor_type = 'Yuen2017'
sub_mutations_df = mutations_df.loc[mutations_df['sample'] == tumor_type]

In [None]:
# Run settings: k-mer size, half-width of the exon-centered window, signature file
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/ALL_signatures.txt'

# Permutation on the dataset-specific mutations
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    sub_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k
)

# Transpose the returned table, then save it under a dataset-specific file name
exons_centered_results = exons_centered_results.T
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
    header=True,
    index=False,
)

##### Compute empirical p-value

In [None]:
# Observed exonic / intronic counts set by hand; they replace whatever the permutation cell returned
tumor_type = 'Yuen2017'
exonic_obs, intronic_obs = 790, 7777

In [None]:
k = 3
RANDOMIZATION = 1000

# Load the per-permutation expected exon/intron mutation counts saved for this dataset
rand_output = pd.read_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
)

# Upper-tail count: permutations with at least as many expected exonic mutations as observed.
# A count of zero is raised to 1, so the smallest reportable p-value is 1/RANDOMIZATION.
# NOTE(review): the notebook introduction mentions a two-sided test; this count is one-sided - confirm.
count = max(1, int((rand_output['exon_Exp'] >= exonic_obs).sum()))

results = [[
    exonic_obs,
    rand_output['exon_Exp'].mean(),
    intronic_obs,
    rand_output['intron_Exp'].mean(),
    count / RANDOMIZATION,
]]

results_df = pd.DataFrame(
    results,
    columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt', 'empirical_pvalue'],
)
results_df

In [None]:
# Percentage by which the observed exonic count exceeds the mean expected count
exonic_increase = 100 * (results_df["obs_exon_cnt"] / results_df["exp_exon_cnt"] - 1)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = 100 * (exonic_obs / rand_output['exon_Exp'] - 1)
# Standard deviation of those percentages divided by sqrt(1000), used as a +/- band
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(1000)
(exonic_increase - error, exonic_increase + error)

#### Sasani

In [None]:
# Keep the mutations whose 'sample' column equals this dataset label
tumor_type = 'Sasani2019'
sub_mutations_df = mutations_df.loc[mutations_df['sample'] == tumor_type]

In [None]:
# Run settings: k-mer size, half-width of the exon-centered window, signature file
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/ALL_signatures.txt'

# Permutation on the dataset-specific mutations
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    sub_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k
)

# Transpose the returned table, then save it under a dataset-specific file name
exons_centered_results = exons_centered_results.T
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
    header=True,
    index=False,
)

##### Compute empirical p-value

In [None]:
# Observed exonic / intronic counts set by hand; they replace whatever the permutation cell returned
tumor_type = 'Sasani2019'
exonic_obs, intronic_obs = 195, 1837

In [None]:
k = 3
RANDOMIZATION = 1000

# Load the per-permutation expected exon/intron mutation counts saved for this dataset
rand_output = pd.read_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + str(tumor_type) + '.tsv'),
    sep="\t",
)

# Upper-tail count: permutations with at least as many expected exonic mutations as observed.
# A count of zero is raised to 1, so the smallest reportable p-value is 1/RANDOMIZATION.
# NOTE(review): the notebook introduction mentions a two-sided test; this count is one-sided - confirm.
count = max(1, int((rand_output['exon_Exp'] >= exonic_obs).sum()))

results = [[
    exonic_obs,
    rand_output['exon_Exp'].mean(),
    intronic_obs,
    rand_output['intron_Exp'].mean(),
    count / RANDOMIZATION,
]]

results_df = pd.DataFrame(
    results,
    columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt', 'empirical_pvalue'],
)
results_df

In [None]:
# Percentage by which the observed exonic count exceeds the mean expected count
exonic_increase = 100 * (results_df["obs_exon_cnt"] / results_df["exp_exon_cnt"] - 1)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = 100 * (exonic_obs / rand_output['exon_Exp'] - 1)
# Standard deviation of those percentages divided by sqrt(1000), used as a +/- band
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(1000)
(exonic_increase - error, exonic_increase + error)

### 1-mer with CpG, all datasets

In [None]:
# Back to the pooled dataset label after the per-dataset sections
tumor_type = 'germinal'
# Run settings: context size passed to the CpG permutation, window half-width, signature file
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/CpG_signatures_DNM_new.txt'

exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation_CpG(
    mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k
)

# Transpose the returned table, then save it without its index
exons_centered_results = exons_centered_results.T
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_CpGmer_permut.tsv'),
    sep="\t",
    header=True,
    index=False,
)

#### Compute empirical p-value

In [None]:
RANDOMIZATION = 1000

# Load the per-permutation expected exon/intron mutation counts of the CpG run
rand_output = pd.read_csv(
    path.join('results', 'exons_centered_CpGmer_permut.tsv'),
    sep="\t",
)

# Upper-tail count: permutations with at least as many expected exonic mutations as observed.
# A count of zero is raised to 1, so the smallest reportable p-value is 1/RANDOMIZATION.
# NOTE(review): the notebook introduction mentions a two-sided test; this count is one-sided - confirm.
count = max(1, int((rand_output['exon_Exp'] >= exonic_obs).sum()))

results = [[
    exonic_obs,
    rand_output['exon_Exp'].mean(),
    intronic_obs,
    rand_output['intron_Exp'].mean(),
    count / RANDOMIZATION,
]]

results_df = pd.DataFrame(
    results,
    columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt', 'empirical_pvalue'],
)
results_df

In [None]:
# Percentage by which the observed exonic count exceeds the mean expected count
exonic_increase = 100 * (results_df["obs_exon_cnt"] / results_df["exp_exon_cnt"] - 1)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = 100 * (exonic_obs / rand_output['exon_Exp'] - 1)
# Standard deviation of those percentages divided by sqrt(1000), used as a +/- band
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(1000)
(exonic_increase - error, exonic_increase + error)

### 5-mer, all datasets

In [None]:
# Run settings: k-mer size, half-width of the exon-centered window, signature file
k = 5
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_new.txt'

exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k
)

# Transpose the returned table, then save it without its index
exons_centered_results = exons_centered_results.T
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut.tsv'),
    sep="\t",
    header=True,
    index=False,
)

#### Compute empirical p-value

In [None]:
k = 5

# Reload the expected exon and intron mutation counts produced by the permutations
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut.tsv'), sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is at least the observed one
count = int((rand_output['exon_Exp'] >= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

### 7-mer, all datasets

In [None]:
k = 7
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_DNM_signatures_decomp.txt'

# Run the permutations; the call also returns the observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
k = 7

# Reload the expected exon and intron mutation counts produced by the permutations
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut.tsv'), sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is at least the observed one
count = int((rand_output['exon_Exp'] >= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

## Random permutation of syn/non-syn mutations at stacked sequence level

### Load extra data

In [None]:
# Open the consequence-type file through tabix
tb_consequence_type = tabix.open(consequence_type_file)

# Read the consequence ranking table, label each row with synonymous_or_not and
# build a CONSEQUENCE -> TYPE lookup
consequence_rank_info = pd.read_csv(consequence_ranking_file, sep='\t')
consequence_rank_info['TYPE'] = consequence_rank_info.apply(synonymous_or_not, axis=1)
consequence_rank_info = consequence_rank_info[['CONSEQUENCE', 'TYPE']]
consequence_rank_dict = dict(zip(consequence_rank_info['CONSEQUENCE'], consequence_rank_info['TYPE']))

tumor_type = 'germinal'

### 3-mer, all datasets, synonymous

In [None]:
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_new.txt'
include = 'synonymous'

# Run the permutations for the selected consequence class; the call also returns the
# observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation_excluding(
    mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold,
    k, include, tb_consequence_type, consequence_rank_dict)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
k = 3

# Reload the expected exon and intron mutation counts produced by the permutations
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '.tsv'),
                          sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is lower than or equal to the observed one
# (this cell tests in the opposite direction to the '>=' cells)
count = int((rand_output['exon_Exp'] <= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the mean expected exonic count relative to the observed count
results_df["exp_exon_cnt"].div(results_df["obs_exon_cnt"]).sub(1).mul(100)

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

### 3-mer, all datasets, non-synonymous

In [None]:
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_new.txt'
include = 'non_synonymous'

# Run the permutations for the selected consequence class; the call also returns the
# observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation_excluding(
    mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold,
    k, include, tb_consequence_type, consequence_rank_dict)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
# Reload the expected exon and intron mutation counts produced by the permutations
# (k and include are the values set in the permutation cell above)
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '.tsv'),
                          sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is at least the observed one
count = int((rand_output['exon_Exp'] >= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

## Load sample data

In [None]:
# Read the sample mutations file, name its columns, discard 'class' and keep substitutions only
sample_mutations_file = mutations_folder + 'germinal_sample.bed.gz'
sample_mutations_df = pd.read_csv(sample_mutations_file, sep="\t", header=None)
sample_mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class', 'ID']
sample_mutations_df = sample_mutations_df.drop(columns=['class'])
sample_mutations_df = sample_mutations_df[sample_mutations_df['type'].eq('subs')]

### 3-mer, random sample, synonymous

In [None]:
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_sample.txt'
include = 'synonymous'

# Run the permutations for the selected consequence class; the call also returns the
# observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation_excluding(
    sample_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold,
    k, include, tb_consequence_type, consequence_rank_dict)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '_sample.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
k = 3

# Reload the expected exon and intron mutation counts produced by the permutations
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '_sample.tsv'),
                          sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is lower than or equal to the observed one
# (this cell tests in the opposite direction to the '>=' cells)
count = int((rand_output['exon_Exp'] <= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the mean expected exonic count relative to the observed count
results_df["exp_exon_cnt"].div(results_df["obs_exon_cnt"]).sub(1).mul(100)

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

### 3-mer, random sample, non-synonymous

In [None]:
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_sample.txt'
include = 'non_synonymous'

# Run the permutations for the selected consequence class; the call also returns the
# observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation_excluding(
    sample_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold,
    k, include, tb_consequence_type, consequence_rank_dict)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '_sample.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
# Reload the expected exon and intron mutation counts produced by the permutations
# (k and include are the values set in the permutation cell above)
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '_sample.tsv'),
                          sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is at least the observed one
count = int((rand_output['exon_Exp'] >= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

## Load healthy data

In [None]:
# Read the healthy mutations file, name its columns, discard 'class' and keep substitutions only.
# Note that this rebinds sample_mutations_df, replacing the dataset loaded earlier.
sample_mutations_file = mutations_folder + 'germinal_healthy.bed.gz'
sample_mutations_df = pd.read_csv(sample_mutations_file, sep="\t", header=None)
sample_mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class', 'ID']
sample_mutations_df = sample_mutations_df.drop(columns=['class'])
sample_mutations_df = sample_mutations_df[sample_mutations_df['type'].eq('subs')]

### 3-mer, healthy, synonymous

In [None]:
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_healthy.txt'
include = 'synonymous'

# Run the permutations for the selected consequence class; the call also returns the
# observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation_excluding(
    sample_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold,
    k, include, tb_consequence_type, consequence_rank_dict)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '_healthy.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
# NOTE(review): these literals override the observed counts returned by the permutation cell
# above; presumably copied from an earlier run so the p-value can be recomputed without
# re-running the permutations - confirm they still match the data.
exonic_obs, intronic_obs = 404, 15657
include = 'synonymous'

In [None]:
k = 3

# Reload the expected exon and intron mutation counts produced by the permutations
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '_healthy.tsv'),
                          sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is lower than or equal to the observed one
# (this cell tests in the opposite direction to the '>=' cells)
count = int((rand_output['exon_Exp'] <= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count.
# NOTE(review): the other synonymous sections display exp/obs in this position, whereas this
# cell displays obs/exp (same value as the next cell) - confirm which one is intended.
results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

### 3-mer, healthy, non-synonymous

In [None]:
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_healthy.txt'
include = 'non_synonymous'

# Run the permutations for the selected consequence class; the call also returns the
# observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation_excluding(
    sample_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold,
    k, include, tb_consequence_type, consequence_rank_dict)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '_healthy.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
# NOTE(review): these literals override the observed counts returned by the permutation cell
# above; presumably copied from an earlier run so the p-value can be recomputed without
# re-running the permutations - confirm they still match the data.
exonic_obs, intronic_obs = 1085, 15657
include = 'non_synonymous'

In [None]:
# Reload the expected exon and intron mutation counts produced by the permutations
# (k and include are the values set in the cells above)
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '_healthy.tsv'),
                          sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is at least the observed one
count = int((rand_output['exon_Exp'] >= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

## Load autism data

In [None]:
# Read the autism mutations file, name its columns, discard 'class' and keep substitutions only.
# Note that this rebinds sample_mutations_df, replacing the dataset loaded earlier.
sample_mutations_file = mutations_folder + 'germinal_autism.bed.gz'
sample_mutations_df = pd.read_csv(sample_mutations_file, sep="\t", header=None)
sample_mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class', 'ID']
sample_mutations_df = sample_mutations_df.drop(columns=['class'])
sample_mutations_df = sample_mutations_df[sample_mutations_df['type'].eq('subs')]

### 3-mer, autism, synonymous

In [None]:
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_autism.txt'
include = 'synonymous'

# Run the permutations for the selected consequence class; the call also returns the
# observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation_excluding(
    sample_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold,
    k, include, tb_consequence_type, consequence_rank_dict)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '_autism.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
# NOTE(review): these literals override the observed counts returned by the permutation cell
# above; presumably copied from an earlier run so the p-value can be recomputed without
# re-running the permutations - confirm they still match the data.
exonic_obs, intronic_obs = 406, 16066
include = 'synonymous'

In [None]:
k = 3

# Reload the expected exon and intron mutation counts produced by the permutations
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '_autism.tsv'),
                          sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is lower than or equal to the observed one
# (this cell tests in the opposite direction to the '>=' cells)
count = int((rand_output['exon_Exp'] <= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the mean expected exonic count relative to the observed count
results_df["exp_exon_cnt"].div(results_df["obs_exon_cnt"]).sub(1).mul(100)

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

### 3-mer, autism, non-synonymous

In [None]:
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_autism.txt'
include = 'non_synonymous'

# Run the permutations for the selected consequence class; the call also returns the
# observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation_excluding(
    sample_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold,
    k, include, tb_consequence_type, consequence_rank_dict)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '_autism.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
# NOTE(review): these literals override the observed counts returned by the permutation cell
# above; presumably copied from an earlier run so the p-value can be recomputed without
# re-running the permutations - confirm they still match the data.
exonic_obs, intronic_obs = 1241, 16066
include = 'non_synonymous'

In [None]:
# Reload the expected exon and intron mutation counts produced by the permutations
# (k and include are the values set in the cells above)
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut_' + include + '_autism.tsv'),
                          sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is at least the observed one
count = int((rand_output['exon_Exp'] >= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

## Context dependency analysis, Golden dataset (Goldmann2018)

In [None]:
# Restrict the mutations to the rows whose 'sample' is Goldmann2018
sub_mutations_df = mutations_df[mutations_df['sample'].eq('Goldmann2018')]

### 1-mer

In [None]:
k = 1
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_golden.txt'

# Run the permutations; the call also returns the observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    sub_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_golden.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
k = 1

# Reload the expected exon and intron mutation counts produced by the permutations
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut_golden.tsv'), sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is at least the observed one
count = int((rand_output['exon_Exp'] >= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

### 1-mer with CpG

In [None]:
# NOTE(review): k is 3 although this section is titled "1-mer with CpG"; presumably the CpG
# variant needs the flanking bases - confirm against stacked_sequence_level_permutation_CpG
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/CpG_signatures_DNM_golden.txt'

# Run the permutations; the call also returns the observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation_CpG(
    sub_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_CpGmer_permut_golden.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
# Reload the expected exon and intron mutation counts produced by the permutations
rand_output = pd.read_csv(path.join('results', 'exons_centered_CpGmer_permut_golden.tsv'), sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is at least the observed one
count = int((rand_output['exon_Exp'] >= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

### 3-mer

In [None]:
k = 3
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_golden.txt'

# Run the permutations; the call also returns the observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    sub_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_golden.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
k = 3

# Reload the expected exon and intron mutation counts produced by the permutations
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut_golden.tsv'), sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is at least the observed one
count = int((rand_output['exon_Exp'] >= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

### 5-mer

In [None]:
k = 5
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_DNM_signatures_decomp_golden.txt'

# Run the permutations; the call also returns the observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    sub_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_golden.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
k = 5

# Reload the expected exon and intron mutation counts produced by the permutations
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut_golden.tsv'), sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is at least the observed one
count = int((rand_output['exon_Exp'] >= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)

### 7-mer

In [None]:
k = 7
middle_distance_threshold = 1000
signatures_file = 'results/' + str(k) + 'mer_DNM_signatures_decomp_golden.txt'

# Run the permutations; the call also returns the observed exonic and intronic counts
exons_centered_results, exonic_obs, intronic_obs = stacked_sequence_level_permutation(
    sub_mutations_df, exons_coords_symbol_df, signatures_file, tumor_type, middle_distance_threshold, k)

# Transpose the results and store them so the p-value cell can reload them
exons_centered_results = exons_centered_results.transpose()
exons_centered_results.to_csv(
    path.join('results', 'exons_centered_' + str(k) + 'mer_permut_golden.tsv'),
    sep="\t", header=True, index=False)

#### Compute empirical p-value

In [None]:
k = 7

# Reload the expected exon and intron mutation counts produced by the permutations
rand_output = pd.read_csv(path.join('results', 'exons_centered_' + str(k) + 'mer_permut_golden.tsv'), sep="\t")

# Use the number of permutations actually stored in the file as the p-value denominator,
# so the estimate stays correct if the permutation count ever differs from 1000
RANDOMIZATION = len(rand_output)

# Number of permutations whose expected exonic count is at least the observed one
count = int((rand_output['exon_Exp'] >= exonic_obs).sum())

# Floor at one so the reported empirical p-value is never exactly zero
count = max(count, 1)

results = [[exonic_obs, rand_output['exon_Exp'].mean(), intronic_obs, rand_output['intron_Exp'].mean(),
            count / RANDOMIZATION]]

results_df = pd.DataFrame(results, columns=['obs_exon_cnt', 'exp_exon_cnt', 'obs_intron_cnt', 'exp_intron_cnt',
                                            'empirical_pvalue'])
results_df

In [None]:
# Percentage difference of the observed exonic count relative to the mean expected count
exonic_increase = results_df["obs_exon_cnt"].div(results_df["exp_exon_cnt"]).sub(1).mul(100)
exonic_increase

In [None]:
# Percentage excess of the observed exonic count over each permutation's expected count
rand_output['increase'] = (exonic_obs / rand_output['exon_Exp'] - 1) * 100
# Standard error of the mean increase; divide by the number of permutations actually loaded
# instead of a hard-coded 1000 so the interval stays correct if that number changes
error = st.stdev(rand_output['increase'].tolist()) / np.sqrt(len(rand_output))
(exonic_increase - error, exonic_increase + error)