# Mutation signatures for all k-mers

The aim of this notebook is to compute the probability of the different k-mer changes through the approach described by Frigola et al. (2017). The signature probability is defined by the number of mutations found for a particular alteration taking into account its context, divided by the number of sites with the same reference k-mer. More information in the Materials and Methods section.

---

## Output

The outputs of the notebook are files of the form ``kmer_signatures.txt`` that contain, for each k-mer, the probability of mutating into each of the three possible alternatives.

An intermediate output (goes to data folder) contains the abundance of k-mers in the autosomal genome (or the CCDS).

In [2]:
from os import path

from pyfasta import Fasta
import pandas as pd
from bgreference import hg19
import math
import pybedtools
import sys
import numpy as np

## Input

Files in **data** directory.

- *kmer_freq_file*: file that contains the counts of k-mers, which will be used for normalization
**This file is computed in this notebook if it is not yet available.**

Files in **non_provided_data/mutations** directory.

- *germinal_ultimate_dataset.bed.gz*: file that contains the autosomal *de novo* mutations (DNM) from 7 different datasets.

Files in **genomes** directory.

- *hg19.fa*: file with the genome hg19 from UCSC in fasta format.

### Other inputs

- mutations_path: base directory where the files with the mutations are located

In [3]:
# Handle on the hg19 reference genome in FASTA format (genomes directory)
f = Fasta('genomes/hg19/hg19.fa')

# Base directory where the files with the mutations are located
mutations_path = 'non_provided_data/mutations/'

## Functions

All the functions needed for this notebook are coded below

In [6]:
def remove_version_ensembl(x, colname):
    """
    Strip the version suffix from an Ensembl identifier.

    Args:
        x: dataframe row (or any mapping) holding the identifier
        colname: name of the column/key where the identifier is stored

    Returns:
        str. The part of the identifier that precedes the first '.'
        (the whole identifier when it contains no '.')

    """
    unversioned_id, _, _ = x[colname].partition('.')
    return unversioned_id

In [7]:
def compute_kmer_alt(x,k):
    """
    Build the mutated k-mer for one alteration.

    Args:
        x: DataFrame row or dict with the keys 'alt' (alternate nucleotide)
           and 'ref_kmer' (reference k-mer)
        k: k-mer length; the position k//2 is the one that gets replaced

    Returns:
        str. The reference k-mer with its central position replaced by 'alt'
    """
    alternate = x['alt']
    reference = x['ref_kmer']
    centre = k // 2

    return ''.join((reference[:centre], alternate, reference[centre + 1:]))

In [8]:
def create_kmer_dict(k):
    """
    Build a dictionary with one entry per possible central substitution.

    The keys are tuples (reference k-mer, alternate k-mer) where the alternate
    k-mer differs from the reference only at position k//2; every value is 0.
    Reference k-mers are visited in alphabetical order and, for each of them,
    the alternate nucleotides are tried in the order A, C, T, G.

    Args:
        k: k-mer length

    Returns:
        dict. {(ref_kmer, alt_kmer): 0}
    """
    from itertools import product

    centre = k // 2
    substitutions = ('A', 'C', 'T', 'G')
    references = sorted(''.join(letters) for letters in product('ACGT', repeat=k))

    return {
        (reference, reference[:centre] + substitute + reference[centre + 1:]): 0
        for reference in references
        for substitute in substitutions
        if substitute != reference[centre:centre + 1]
    }

In [9]:
def create_CpG_dict():
    """
    Build a dictionary with one entry per (site class, alternate nucleotide).

    The site classes are 'A', 'T', 'nonCpG', 'CpG', 'GpC' and 'nonGpC'; the
    reference nucleotide of each class is excluded from its alternates.
    Every value is 0.

    Returns:
        dict. {(site, alt_nucleotide): 0}
    """
    # Each site class paired with the reference nucleotide it stands for
    site_reference = (
        ('A', 'A'),
        ('T', 'T'),
        ('nonCpG', 'C'),
        ('CpG', 'C'),
        ('GpC', 'G'),
        ('nonGpC', 'G'),
    )
    alternates = ('A', 'C', 'T', 'G')

    return {
        (site, alternate): 0
        for site, reference in site_reference
        for alternate in alternates
        if alternate != reference
    }

In [10]:
def kmer_generator(characters,k):
    """
    Generate every string of length k over the given alphabet.

    Args:
        characters: collection of symbols that defines the alphabet
        k: length of the strings to build

    Returns:
        dict. All the strings of length k that can be formed from the alphabet
        (lexicographically sorted) as keys, each one with 0 as value.
    """
    # Cartesian product of the alphabet with itself, k times
    from itertools import product

    return dict.fromkeys(sorted(map(''.join, product(characters, repeat=k))), 0)

In [11]:
def kmer_abundance_autosomes(k,f):
    """
    Function that computes the counts of k-mers in the autosomal hg19 reference genome.

    Only k-mers made exclusively of A, C, G and T are counted (case-insensitive);
    windows that contain any other symbol are skipped.

    Args:
        k: k-mer length
        f: hg19 in Fasta; indexed by 'chr1' ... 'chr22'

    Returns:
        :class:`~pandas.DataFrame`. Table with each k-mer ('kmer') and its abundance
        ('count') in the autosomal genome.
    """

    # Create the dictionary
    kmer_count = kmer_generator('ACGT',k)
    # For each chromosome
    for chromosome in range(1,23):
        chrom = 'chr' + str(chromosome)
        ## Read the chromosome once, instead of once per position
        sequence = str(f[chrom]).upper()
        ## Iterate over the sequence by kmers.
        for i in range(len(sequence)-(k-1)):
            my_kmer = sequence[i:i+k]
            # Explicit membership test: k-mers with symbols other than ACGT are
            # skipped without hiding unrelated errors (or a KeyboardInterrupt).
            if my_kmer in kmer_count:
                kmer_count[my_kmer] += 1

    # Return a data.frame with the counts.
    counts_df = pd.DataFrame({'kmer' : list(kmer_count.keys()) , 'count' : list(kmer_count.values()) })
    counts_df = counts_df[['kmer', 'count']]
    return(counts_df)

In [12]:
def signature(k, mutation_df, count_file, sampleID):
    """
    Compute the probability of every k-mer substitution for a set of mutations.

    The number of mutations observed for each (reference k-mer, alternate k-mer)
    pair is divided by the number of occurrences of the reference k-mer listed
    in ``count_file``.

    Args:
        k: k-mer length
        mutation_df: table with the mutations; the columns 'chr', 'start' and 'alt'
            are read. The table is not modified.
        count_file: tab-separated file without header with two columns (k-mer, count)
        sampleID: label appended to the name of the probability column

    Returns:
        :class:`~pandas.DataFrame`. One row per pair returned by ``create_kmer_dict(k)``
        with the columns 'mutation' and 'Probability_<sampleID>'.

    Raises:
        KeyError: when a reference k-mer is missing from ``count_file``.
    """
    kmer_dict = create_kmer_dict(k)

    ## K-mer counts used for the normalization
    k_freq_df = pd.read_csv(count_file, sep='\t', header=None, low_memory=False)
    k_freq_df.columns = ['kmer', 'count']
    # Lookup table built once (first row wins when a k-mer is repeated) instead of
    # scanning the whole table for every mutation type.
    unique_freq_df = k_freq_df.drop_duplicates(subset='kmer')
    ref_counts = dict(zip(unique_freq_df['kmer'], unique_freq_df['count'].values))

    ## Compute the kmer changes (XYX -> XZX) of each alteration and count them.
    ## The columns are iterated directly, so the caller's table is left untouched
    ## and an empty table simply yields all-zero counts.
    flank = k // 2
    for my_chr, my_start, my_alt in zip(mutation_df['chr'], mutation_df['start'], mutation_df['alt']):
        my_ref_kmer = hg19(my_chr, my_start + 1 - flank, 1 + 2 * flank)
        my_alt_kmer = compute_kmer_alt({'ref_kmer': my_ref_kmer, 'alt': my_alt}, k)
        my_mutation = (my_ref_kmer, my_alt_kmer)
        # Changes that are not one of the expected substitutions are ignored
        if my_mutation in kmer_dict:
            kmer_dict[my_mutation] += 1

    # Divide each count by the counts of the reference kmer
    for my_mutation in kmer_dict:
        my_ref_kmer = my_mutation[0]
        if my_ref_kmer not in ref_counts:
            raise KeyError('k-mer {!r} not found in the count file'.format(my_ref_kmer))
        kmer_dict[my_mutation] = kmer_dict[my_mutation] / ref_counts[my_ref_kmer]

    column_name = 'Probability_' + str(sampleID)
    my_probs_df = pd.DataFrame({'mutation' : list(kmer_dict.keys()) , column_name : list(kmer_dict.values()) })
    return(my_probs_df)

In [13]:
def compute_CpG_site(x):
    """
    Compute if mutation falls on CpG site (or CpG sites where G is the mutated position,
    they are CpG sites in the other strand. We call them GpC, though it is not a good term).

    Args:
        x: DataFrame row or dict with the keys 'nt' (reference nucleotide),
           'pre' (previous nucleotide) and 'post' (next nucleotide)

    Returns:
        str. One of 'CpG', 'nonCpG', 'GpC', 'nonGpC', 'A' or 'T'

    Raises:
        ValueError: when the reference nucleotide is not one of A, C, G or T
    """
    my_ref = x['nt']
    my_pre = x['pre']
    my_post = x['post']

    if my_ref == 'C':
        return 'CpG' if my_post == 'G' else 'nonCpG'
    if my_ref == 'G':
        return 'GpC' if my_pre == 'C' else 'nonGpC'
    if my_ref in ('A', 'T'):
        return my_ref

    # Any other symbol (e.g. 'N') used to end in an UnboundLocalError
    raise ValueError('Unexpected reference nucleotide: {!r}'.format(my_ref))

In [14]:
def signature_CpG(k, mutation_df, count_file, sampleID):
    """
    Compute the probability of each substitution per site class
    ('A', 'T', 'nonCpG', 'CpG', 'GpC', 'nonGpC').

    The number of mutations observed for each (site, alternate nucleotide) pair
    is divided by the number of sites of that class listed in ``count_file``.

    Args:
        k: length of the context retrieved around each mutation; the mutated base and
            its two immediate neighbours are used, so at least one flanking base is needed
        mutation_df: table with the mutations; the columns 'chr', 'start' and 'alt'
            are read. The table is not modified.
        count_file: tab-separated file without header with two columns (site, count)
        sampleID: label appended to the name of the probability column

    Returns:
        :class:`~pandas.DataFrame`. One row per pair returned by ``create_CpG_dict()``
        with the columns 'mutation' and 'Probability_<sampleID>'.

    Raises:
        ValueError: when k does not provide a flanking base on each side.
        KeyError: when a site class is missing from ``count_file``.
    """
    flank = k // 2
    if flank < 1:
        raise ValueError('k must provide at least one flanking nucleotide on each side')

    kmer_dict = create_CpG_dict()

    ## Site counts used for the normalization
    CpG_df = pd.read_csv(count_file, sep='\t', header=None, low_memory=False)
    CpG_df.columns = ['kmer', 'count']
    # Lookup table built once (first row wins when a site is repeated)
    unique_CpG_df = CpG_df.drop_duplicates(subset='kmer')
    site_counts = dict(zip(unique_CpG_df['kmer'], unique_CpG_df['count'].values))

    ## Classify the site of each alteration and count the changes.
    ## The columns are iterated directly, so the caller's table is left untouched.
    for my_chr, my_start, my_alt in zip(mutation_df['chr'], mutation_df['start'], mutation_df['alt']):
        my_ref_kmer = hg19(my_chr, my_start + 1 - flank, 1 + 2 * flank)
        my_site = compute_CpG_site({'pre': my_ref_kmer[flank - 1],
                                    'nt': my_ref_kmer[flank],
                                    'post': my_ref_kmer[flank + 1]})
        my_mutation = (my_site, my_alt)
        # Changes that are not one of the expected substitutions are ignored
        if my_mutation in kmer_dict:
            kmer_dict[my_mutation] += 1

    # Divide each count by the counts of the reference site
    for my_mutation in kmer_dict:
        my_site = my_mutation[0]
        if my_site not in site_counts:
            raise KeyError('site {!r} not found in the count file'.format(my_site))
        kmer_dict[my_mutation] = kmer_dict[my_mutation] / site_counts[my_site]

    column_name = 'Probability_' + str(sampleID)
    my_probs_df = pd.DataFrame({'mutation' : list(kmer_dict.keys()) , column_name : list(kmer_dict.values()) })
    return(my_probs_df)

In [15]:
def kmer_abundance_window(k, middle_exon_coords, middle_distance_threshold):
    """
    Compute the counts of k-mers in windows centered at the middle of each region.

    For every row the middle position is floor(start + (end - start)/2); the window
    spans from ``middle - middle_distance_threshold`` to
    ``middle + 1 + middle_distance_threshold``. The k-mers of the windows are counted
    by :func:`kmer_abundance_coords`.

    Args:
        k: k-mer length
        middle_exon_coords: table with the columns 'chr', 'start' and 'end'.
            The table is not modified.
        middle_distance_threshold: number of positions added at each side of the
            middle position

    Returns:
        :class:`~pandas.DataFrame`. Table with each k-mer ('kmer') and its abundance
        ('count') in the windows.
    """
    exon_size = middle_exon_coords['end'] - middle_exon_coords['start']
    exon_middle_start = np.floor(middle_exon_coords['start'] + exon_size / 2).astype(int)
    exon_middle_end = exon_middle_start + 1

    # Derived coordinates go to a new table instead of being added to the caller's one
    window_coords = pd.DataFrame({'chr': middle_exon_coords['chr'],
                                  'start': exon_middle_start - middle_distance_threshold,
                                  'end': exon_middle_end + middle_distance_threshold})
    window_coords = window_coords[['chr', 'start', 'end']]

    # The counting itself is the same as for any other set of coordinates
    return kmer_abundance_coords(k, window_coords)

In [16]:
def kmer_abundance_coords(k, coords):
    """
    Compute the counts of k-mers in a set of genomic regions.

    Each region is extended k//2 positions at both sides so that every position
    of the region is the center of one k-mer. Only k-mers made exclusively of
    A, C, G and T are counted (case-insensitive). Every row is counted on its own,
    so positions shared by several rows are counted once per row.

    Args:
        k: k-mer length
        coords: coordinates; table whose first three columns are the chromosome,
            the start and the end of each region

    Returns:
        :class:`~pandas.DataFrame`. Table with each k-mer ('kmer') and its abundance
        ('count') in the regions.
    """

    # Create the dictionary
    kmer_count = kmer_generator('ACGT',k)
    flank = k // 2

    for my_row in coords.values.tolist():

        ## Get coordinates
        my_chr = my_row[0]
        my_start = int(my_row[1])
        my_end = int(my_row[2])
        n_bases = my_end - my_start

        ## Get sequence
        my_bases = hg19(my_chr, my_start+1-flank, size=n_bases+2*flank).upper()

        ## Divide sequence into k-mers and count each one
        for i in range(len(my_bases)-(k-1)):
            my_kmer = my_bases[i:i+k]
            # Explicit membership test: k-mers with symbols other than ACGT are
            # skipped without hiding unrelated errors.
            if my_kmer in kmer_count:
                kmer_count[my_kmer] += 1

    # Return a data.frame with the counts.
    counts_df = pd.DataFrame({'kmer' : list(kmer_count.keys()) , 'count' : list(kmer_count.values()) })
    counts_df = counts_df[['kmer', 'count']]
    return(counts_df)

## Compute the nucleotide counts at the autosomal genome

We retrieve the hg19 autosomal genome and compute the counts of the 4 nucleotides.

**IGNORE THIS BLOCK OF CODE IF THEY ARE ALREADY COMPUTED**

In [None]:
# Count the single nucleotides of the autosomes and store the table in the
# data folder (tab-separated, without header and index).
k = 1
filename = '{}mer.txt'.format(k)

results = kmer_abundance_autosomes(k=k, f=f)
results.to_csv(path.join('data', filename), sep="\t", index=False, header=False)

## Compute the trinucleotide counts at the autosomal genome

We retrieve the hg19 autosomal genome and compute the counts of the 64 possible trinucleotides.

**IGNORE THIS BLOCK OF CODE IF THEY ARE ALREADY COMPUTED**

In [None]:
# Count the trinucleotides of the autosomes and store the table in the
# data folder (tab-separated, without header and index).
k = 3
filename = '{}mer.txt'.format(k)

results = kmer_abundance_autosomes(k=k, f=f)
results.to_csv(path.join('data', filename), sep="\t", index=False, header=False)

## Compute the CpG counts at the autosomal genome

Based on the trinucleotide counts.

**IGNORE THIS BLOCK OF CODE IF THEY ARE ALREADY COMPUTED**

In [None]:
# Collapse the trinucleotide counts into the six site classes used by the
# CpG signature, according to the central nucleotide and its neighbours.
k = 3
filename = 'data/{}mer.txt'.format(k)

k_freq_df = pd.read_csv(filename, sep='\t', header=None, low_memory=False)
k_freq_df.columns = ['kmer', 'count']

# Previous, central and next nucleotide of each trinucleotide
k_freq_df['pre'] = k_freq_df['kmer'].str[0]
k_freq_df['nt'] = k_freq_df['kmer'].str[1]
k_freq_df['post'] = k_freq_df['kmer'].str[2]

central = k_freq_df['nt']
followed_by_G = k_freq_df['post'] == 'G'
preceded_by_C = k_freq_df['pre'] == 'C'

CpG_dict = {
    'A': k_freq_df.loc[central == 'A', 'count'].sum(),
    'T': k_freq_df.loc[central == 'T', 'count'].sum(),
    'nonCpG': k_freq_df.loc[(central == 'C') & ~followed_by_G, 'count'].sum(),
    'CpG': k_freq_df.loc[(central == 'C') & followed_by_G, 'count'].sum(),
    'nonGpC': k_freq_df.loc[(central == 'G') & ~preceded_by_C, 'count'].sum(),
    'GpC': k_freq_df.loc[(central == 'G') & preceded_by_C, 'count'].sum(),
}
CpG_df = pd.DataFrame(list(CpG_dict.items()), columns=['kmer', 'count'])

CpG_df.to_csv(path.join('data', 'CpGmer.txt'), sep="\t", index=False, header=False)

## Compute the pentanucleotide counts at the autosomal genome

We retrieve the hg19 autosomal genome and compute the counts of the 1024 possible pentanucleotides.

**IGNORE THIS BLOCK OF CODE IF THEY ARE ALREADY COMPUTED**

In [None]:
# Count the pentanucleotides of the autosomes and store the table in the
# data folder (tab-separated, without header and index).
k = 5
filename = '{}mer.txt'.format(k)

results = kmer_abundance_autosomes(k=k, f=f)
results.to_csv(path.join('data', filename), sep="\t", index=False, header=False)

## Compute the heptanucleotide counts at the autosomal genome

We retrieve the hg19 autosomal genome and compute the counts of the 16,384 possible heptanucleotides.

**IGNORE THIS BLOCK OF CODE IF THEY ARE ALREADY COMPUTED**

In [None]:
# Count the heptanucleotides of the autosomes and store the table in the
# data folder (tab-separated, without header and index).
k = 7
filename = '{}mer.txt'.format(k)

results = kmer_abundance_autosomes(k=k, f=f)
results.to_csv(path.join('data', filename), sep="\t", index=False, header=False)

## Compute the nonanucleotide counts at the autosomal genome

We retrieve the hg19 autosomal genome and compute the counts of the possible nonanucleotides.

**IGNORE THIS BLOCK OF CODE IF THEY ARE ALREADY COMPUTED**

In [None]:
# Count the nonanucleotides of the autosomes and store the table in the
# data folder (tab-separated, without header and index).
k = 9
filename = '{}mer.txt'.format(k)

results = kmer_abundance_autosomes(k=k, f=f)
results.to_csv(path.join('data', filename), sep="\t", index=False, header=False)

## Compute the trinucleotide counts at the entire CCDS exons and the respective introns

**IGNORE THIS BLOCK OF CODE IF THEY ARE ALREADY COMPUTED**

In [15]:
# Columns of the coordinate files and the subset kept for the analysis
ccds_columns = ['chr', 'start', 'end', 'ensembl', 'symbol', 'strand']
kept_columns = ['chr', 'start', 'end', 'ensembl']

# Load CCDS exon coords
exons_file = 'data/coordinates/exons_CCDS.bed.gz'
exons_coords_df = pd.read_csv(exons_file, sep="\t", header=None, low_memory=False)
exons_coords_df.columns = ccds_columns
# .copy() gives an independent table, so the assignment below does not write on a
# slice of the loaded one (SettingWithCopyWarning)
exons_coords_df = exons_coords_df[kept_columns].copy()
# Remove the version from the ensembl ID (text before the first '.')
exons_coords_df['ensembl'] = exons_coords_df['ensembl'].map(lambda ensembl_id: ensembl_id.split('.')[0])

# Load respective intron coords
introns_file = 'data/coordinates/introns_CCDS.bed.gz'
introns_coords_df = pd.read_csv(introns_file, sep="\t", header=None, low_memory=False)
introns_coords_df.columns = ccds_columns
introns_coords_df = introns_coords_df[kept_columns].copy()
introns_coords_df['ensembl'] = introns_coords_df['ensembl'].map(lambda ensembl_id: ensembl_id.split('.')[0])

# Merge them
frames = [exons_coords_df, introns_coords_df]
all_coords_df = pd.concat(frames)

# Count the trinucleotides of all the regions and store the table in the data folder
k = 3

results = kmer_abundance_coords(k, all_coords_df)
filename = str(k) + 'mer_CCDS.txt'
results.to_csv(path.join('data', filename), sep="\t", header=False, index=False)

## Compute the signature for the golden dataset (Goldmann 2018)

In [17]:
# Label used in the name of the probability column
sampleID = 'germinal'

# Read the DNM dataset and keep the substitutions reported by Goldmann2018
mutations_file = mutations_path + 'germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']

is_selected = (mutations_df['type'] == 'subs') & (mutations_df['sample'] == 'Goldmann2018')
mutations_df = mutations_df.loc[is_selected, ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']]

### Mutation signature 1-mer with DNM, WG counts

In [None]:
# 1-mer signature of the mutations currently held in mutations_df, normalized
# by the counts of data/1mer.txt
k = 1
k_mer_count_file = 'data/{}mer.txt'.format(k)
filename = '{}mer_signatures_DNM_golden.txt'.format(k)

results = signature(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 1-mer 9 classes with CpG with DNM, WG counts

In [None]:
# Site-class (CpG-aware) signature of the mutations currently held in mutations_df,
# normalized by the counts of data/CpGmer.txt
k = 3
k_mer_count_file = 'data/CpGmer.txt'
filename = 'CpG_signatures_DNM_golden.txt'

results = signature_CpG(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 3-mer with DNM, WG counts

In [None]:
# 3-mer signature of the mutations currently held in mutations_df, normalized
# by the counts of data/3mer.txt
k = 3
k_mer_count_file = 'data/{}mer.txt'.format(k)
filename = '{}mer_signatures_DNM_golden.txt'.format(k)

results = signature(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 5-mer with DNM, WG counts

In [None]:
# 5-mer signature of the mutations currently held in mutations_df, normalized
# by the counts of data/5mer.txt
k = 5
k_mer_count_file = 'data/{}mer.txt'.format(k)
filename = '{}mer_signatures_DNM_golden.txt'.format(k)

results = signature(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 7-mer with DNM, WG counts

In [None]:
# 7-mer signature of the mutations currently held in mutations_df, normalized
# by the counts of data/7mer.txt
k = 7
k_mer_count_file = 'data/{}mer.txt'.format(k)
filename = '{}mer_signatures_DNM_golden.txt'.format(k)

results = signature(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

## Compute the signature for the largest dataset (Halldorsson 2019).

In [18]:
# Label used in the name of the probability column
sampleID = 'germinal'

# Read the DNM dataset and keep the substitutions reported by Halldorsson2019
mutations_file = mutations_path + 'germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']

is_selected = (mutations_df['type'] == 'subs') & (mutations_df['sample'] == 'Halldorsson2019')
mutations_df = mutations_df.loc[is_selected, ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']]

### Mutation signature 1-mer with DNM, WG counts

In [19]:
# 1-mer signature of the mutations currently held in mutations_df, normalized
# by the counts of data/1mer.txt
k = 1
k_mer_count_file = 'data/{}mer.txt'.format(k)
filename = '{}mer_signatures_DNM_Halldorsson.txt'.format(k)

results = signature(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 1-mer 9 classes with CpG with DNM, WG counts

In [20]:
# Site-class (CpG-aware) signature of the mutations currently held in mutations_df,
# normalized by the counts of data/CpGmer.txt
k = 3
k_mer_count_file = 'data/CpGmer.txt'
filename = 'CpG_signatures_DNM_Halldorsson.txt'

results = signature_CpG(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 3-mer with DNM, WG counts

In [21]:
# 3-mer signature of the mutations currently held in mutations_df, normalized
# by the counts of data/3mer.txt
k = 3
k_mer_count_file = 'data/{}mer.txt'.format(k)
filename = '{}mer_signatures_DNM_Halldorsson.txt'.format(k)

results = signature(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 5-mer with DNM, WG counts

In [22]:
# 5-mer signature of the mutations currently held in mutations_df, normalized
# by the counts of data/5mer.txt
k = 5
k_mer_count_file = 'data/{}mer.txt'.format(k)
filename = '{}mer_signatures_DNM_Halldorsson.txt'.format(k)

results = signature(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 7-mer with DNM, WG counts

In [23]:
# 7-mer signature of the mutations currently held in mutations_df, normalized
# by the counts of data/7mer.txt
k = 7
k_mer_count_file = 'data/{}mer.txt'.format(k)
filename = '{}mer_signatures_DNM_Halldorsson.txt'.format(k)

results = signature(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

## Compute the DNM signature for all datasets

In [17]:
# Label used in the name of the probability column
sampleID = 'germinal'

# Read the DNM dataset and keep the substitutions of every study
mutations_file = mutations_path + 'germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']

is_substitution = mutations_df['type'] == 'subs'
mutations_df = mutations_df.loc[is_substitution, ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']]

### Mutation signature 1-mer with DNM, WG counts

In [None]:
# 1-mer signature of the mutations currently held in mutations_df, normalized
# by the counts of data/1mer.txt
k = 1
k_mer_count_file = 'data/{}mer.txt'.format(k)
filename = '{}mer_signatures_DNM_new.txt'.format(k)

results = signature(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 1-mer 9 classes with CpG with DNM, WG counts

In [None]:
# Site-class (CpG-aware) signature of the mutations currently held in mutations_df,
# normalized by the counts of data/CpGmer.txt
k = 3
k_mer_count_file = 'data/CpGmer.txt'
filename = 'CpG_signatures_DNM_new.txt'

results = signature_CpG(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 3-mer with DNM, WG counts

In [None]:
# 3-mer signature of the mutations currently held in mutations_df, normalized
# by the counts of data/3mer.txt
k = 3
k_mer_count_file = 'data/{}mer.txt'.format(k)
filename = '{}mer_signatures_DNM_new.txt'.format(k)

results = signature(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 5-mer with DNM, WG counts

In [None]:
# 5-mer signature of the mutations currently held in mutations_df, normalized
# by the counts of data/5mer.txt
k = 5
k_mer_count_file = 'data/{}mer.txt'.format(k)
filename = '{}mer_signatures_DNM_new.txt'.format(k)

results = signature(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 7-mer with DNM, WG counts

In [None]:
# 7-mer signature of the mutations currently held in mutations_df, normalized
# by the counts of data/7mer.txt
k = 7
k_mer_count_file = 'data/{}mer.txt'.format(k)
filename = '{}mer_signatures_DNM_new.txt'.format(k)

results = signature(k=k, mutation_df=mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 3-mer with DNM, CCDS exons and introns (only the genes)

In [16]:
# Columns of the coordinate files and the subset kept for the analysis
ccds_columns = ['chr', 'start', 'end', 'ensembl', 'symbol', 'strand']
kept_columns = ['chr', 'start', 'end', 'ensembl']

# Load CCDS exon coords
exons_file = 'data/coordinates/exons_CCDS.bed.gz'
exons_coords_df = pd.read_csv(exons_file, sep="\t", header=None, low_memory=False)
exons_coords_df.columns = ccds_columns
# .copy() gives an independent table, so the assignment below does not write on a
# slice of the loaded one (SettingWithCopyWarning)
exons_coords_df = exons_coords_df[kept_columns].copy()
# Remove the version from the ensembl ID (text before the first '.')
exons_coords_df['ensembl'] = exons_coords_df['ensembl'].map(lambda ensembl_id: ensembl_id.split('.')[0])

# Load respective intron coords
introns_file = 'data/coordinates/introns_CCDS.bed.gz'
introns_coords_df = pd.read_csv(introns_file, sep="\t", header=None, low_memory=False)
introns_coords_df.columns = ccds_columns
introns_coords_df = introns_coords_df[kept_columns].copy()
introns_coords_df['ensembl'] = introns_coords_df['ensembl'].map(lambda ensembl_id: ensembl_id.split('.')[0])

# Merge them
frames = [exons_coords_df, introns_coords_df]
all_coords_df = pd.concat(frames)

In [19]:
# Convert mutations into bed to intersect values
# NOTE(review): the 12 column names below assume that mutations_df holds 7 columns
# (the table without 'class') and all_coords_df holds 4 -- confirm the cell order.
mutations_bed = pybedtools.BedTool.from_dataframe(mutations_df)

region_coords_bed = pybedtools.BedTool.from_dataframe(all_coords_df)

# Filter mutations by the full region of interest
my_bed = region_coords_bed.intersect(mutations_bed, wao=True)

# Raw string for the separator: "\s" is an invalid escape sequence in a normal string
sub_mutations_df = pd.read_table(my_bed.fn, names = ['range_chr', 'range_start', 'range_end',
                        'ensembl', 'mut_chr', 'mut_start', 'mut_end', 'mut_ref', 'mut_alt',
                        'mut_sample', 'mut_type', 'overlap_bp'],  sep=r"\s+", index_col=False)
# Keep only the rows where a mutation overlaps a region
sub_mutations_df = sub_mutations_df[sub_mutations_df['overlap_bp'] != 0]
# .copy() gives an independent table that can be safely renamed and extended later
sub_mutations_df = sub_mutations_df[['mut_chr', 'mut_start', 'mut_end', 'mut_ref', 'mut_alt', 'mut_sample',
                                     'mut_type']].copy()
sub_mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']
print("The total amount of mutations at the middle exon-centered sequences is " +
        str(len(sub_mutations_df.index)))

The total amount of mutations at the middle exon-centered sequences is 291137


In [20]:
# 3-mer signature of the mutations that overlap the CCDS regions, normalized
# by the counts of data/3mer_CCDS.txt
k = 3
k_mer_count_file = 'data/{}mer_CCDS.txt'.format(k)
filename = '{}mer_signatures_DNM_CCDS.txt'.format(k)

results = signature(k=k, mutation_df=sub_mutations_df, count_file=k_mer_count_file, sampleID=sampleID)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

## Compute the 3-mer signature for each dataset

In [None]:
# Read the DNM dataset and keep the substitutions (all the columns are kept)
mutations_file = mutations_path + 'germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']

is_substitution = mutations_df['type'] == 'subs'
mutations_df = mutations_df[is_substitution]

In [None]:
k = 3
count_file = 'data/' + str(k) + 'mer.txt'

# Table with one row per trinucleotide change; one probability column is added per study
my_probs_df = pd.DataFrame()
my_probs_df['mutation'] = list(create_kmer_dict(k).keys())

studies_list = mutations_df['sample'].unique().tolist()

for my_study in studies_list:
    sub_mutations_df = mutations_df[mutations_df['sample'] == my_study].copy()

    # Same computation as for the other signatures: counts of each trinucleotide
    # change divided by the counts of the reference trinucleotide. The column of
    # the result is named 'Probability_<study>'.
    study_probs_df = signature(k, sub_mutations_df, count_file, my_study)
    my_probs_df = pd.merge(my_probs_df, study_probs_df, on='mutation')

my_probs_df.to_csv(path.join('results', 'ALL_signatures.txt'), header=True, index=None, sep='\t')