# Decomposition signatures

The aim of this notebook is to compute the probability of each k-mer change, so that it can be used later on to compute the expected mutations. The computation is done using an approximation based on probability decomposition, as detailed in the Methods section of the paper.

---

## Input

Files in **data** directory.

- *kmer_freq_file*: file that contains the counts of k-mers, which will be used for normalization.

Files in **non_provided_data/mutations** directory.

- *germinal_ultimate_dataset.bed.gz*: file containing the mutations.

---

## Output

Files in **results** directory.

- *kmer_DNM_signatures_decomp.txt*: files containing decomposed signatures from all datasets.
- *kmer_DNM_signatures_decomp_golden.txt*: files containing decomposed signatures from Goldmann2018 dataset.
- *kmer_DNM_signatures_decomp_Halldorsson.txt*: files containing decomposed signatures from Halldorsson2019 dataset.


In [1]:
from os import path

import pandas as pd
from bgreference import hg19

## Functions

All the functions needed for this notebook are coded below

In [15]:
def compute_kmer_alt(x, k):
    """
    Return the mutated k-mer for one mutation record.

    `x` is anything indexable by the keys 'alt' and 'ref_kmer' (a DataFrame
    row or a dict). The character at index k//2 of the reference k-mer is
    replaced by the alternate nucleotide; everything else is kept.
    """
    substituted = x['alt']
    reference = x['ref_kmer']
    centre = k // 2

    left_part = reference[:centre]
    right_part = reference[centre + 1:]
    return left_part + substituted + right_part

In [9]:
def create_kmer_dict(k):
    """
    Build a zero-initialised dictionary of every single-nucleotide k-mer change.

    Keys are (reference k-mer, alternate k-mer) tuples in which only the
    nucleotide at index k//2 differs; every value is 0. Reference k-mers are
    visited in sorted order and, for each one, alternates are tried in the
    order A, C, T, G.
    """
    from itertools import product

    centre = k // 2
    substitutions = {}

    for ref_kmer in sorted(''.join(letters) for letters in product('ACGT', repeat=k)):
        prefix = ref_kmer[:centre]
        centre_nt = ref_kmer[centre:centre + 1]
        suffix = ref_kmer[centre + 1:]

        for alt_nt in ('A', 'C', 'T', 'G'):
            # A change to the same nucleotide is not a mutation.
            if alt_nt == centre_nt:
                continue
            substitutions[(ref_kmer, prefix + alt_nt + suffix)] = 0

    return substitutions

In [10]:
def generate_keys(k, h, ref_core, alt_core):
    """
    List every (reference, alternate) k-mer pair that shares the given core.

    Each pair is built by surrounding `ref_core` and `alt_core` with the same
    left and right flank, for all combinations of flanks of length
    int((k-h)/2) over the alphabet ACGT. Pairs are ordered by left flank and
    then by right flank, both in sorted order.
    """
    from itertools import product

    flank_len = int((k - h) / 2)
    flanks = sorted(''.join(letters) for letters in product('ACGT', repeat=flank_len))

    return [
        (left + ref_core + right, left + alt_core + right)
        for left in flanks
        for right in flanks
    ]

In [11]:
def filter_keys(set_keys, nt_filter, position):
    """
    Keep the keys whose reference k-mer has `nt_filter` at index `position`.

    Each key is a (reference k-mer, alternate k-mer) pair; only the first
    element is inspected. `position` is used as a plain Python index, so
    negative values count from the end of the k-mer. The input order is
    preserved and a new list is returned.
    """
    return [key for key in set_keys if key[0][position] == nt_filter]

In [None]:
def compute_probs(list_keys, mutation_df, k_freq_df):
    """
    Estimate the mutation probability of a set of k-mer changes.

    Parameters
    ----------
    list_keys : list of (reference k-mer, alternate k-mer) tuples.
    mutation_df : DataFrame with a 'mutation' column holding one
        (reference k-mer, alternate k-mer) tuple per observed mutation.
    k_freq_df : DataFrame with columns 'kmer' and 'count'.

    Returns
    -------
    The number of rows of `mutation_df` whose 'mutation' is in `list_keys`,
    divided by the sum of the 'count' of the reference k-mer of every key.
    A reference k-mer that appears in several keys is counted once per key.

    Raises
    ------
    IndexError if a reference k-mer has no row in `k_freq_df`.
    ZeroDivisionError if `list_keys` is empty.
    """
    # Hash-based membership: the keys are tuples of strings, so a set avoids
    # scanning the whole key list for every distinct mutation.
    wanted = set(list_keys)

    # Count the observed mutations that belong to the set of keys.
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    obs_count = 0
    for mutation, n_observed in mutation_df['mutation'].value_counts().items():
        if mutation in wanted:
            obs_count += n_observed

    # Sum the abundance of the reference context of every key.
    context_count = 0
    for ref_kmer, _alt_kmer in list_keys:
        context_count += k_freq_df[k_freq_df['kmer'] == ref_kmer]['count'].values[0]

    # Probability = observed mutations / abundance of the contexts
    return obs_count / context_count

In [None]:
def decomp_prob(k, mutation_df, count_file, h):
    """
    Compute a decomposed probability for every single-nucleotide k-mer change.

    For each (reference, alternate) k-mer pair the probability of its central
    h-mer core change is computed over all flanks, then multiplied, for every
    flanking position, by the ratio between the probability restricted to the
    keys carrying that flank nucleotide and the core probability.

    Parameters
    ----------
    k : length of the k-mers.
    mutation_df : DataFrame of mutations; the columns 'chr', 'start' and
        'alt' are read. It is modified in place: the columns 'ref_kmer',
        'alt_kmer' and 'mutation' are added (or overwritten).
    count_file : path of a tab-separated, header-less file whose two columns
        are read as 'kmer' and 'count'.
    h : length of the central core.

    Returns
    -------
    DataFrame with the columns 'mutation' (tuple of reference and alternate
    k-mer) and 'Probability_germinal'. The values are not normalised.
    """
    half_k = k // 2
    half_h = h // 2
    core_start = half_k - half_h
    core_end = half_k + half_h + 1

    my_probs_df = pd.DataFrame()
    my_probs_df['mutation'] = list(create_kmer_dict(k).keys())

    ## Kmer counts
    k_freq_df = pd.read_csv(count_file, sep='\t', header=None, low_memory=False)
    k_freq_df.columns = ['kmer', 'count']

    ## Compute the kmer changes (XYX -> XZX) of each alteration
    # NOTE(review): presumably 'start' is 0-based (BED) and hg19 takes a
    # 1-based start plus a length -- confirm against bgreference.
    mutation_df['ref_kmer'] = mutation_df.apply(
        lambda row: hg19(row['chr'], row['start'] + 1 - half_k, 1 + 2 * half_k),
        axis=1,
    )
    mutation_df['alt_kmer'] = mutation_df.apply(
        lambda row: compute_kmer_alt(row, k),
        axis=1,
    )
    mutation_df['mutation'] = list(zip(mutation_df.ref_kmer, mutation_df.alt_kmer))
    kmer_dict = create_kmer_dict(k)

    ## Compute the decomposition
    for mutation in kmer_dict:
        ref_kmer, alt_kmer = mutation

        all_keys = generate_keys(
            k, h, ref_kmer[core_start:core_end], alt_kmer[core_start:core_end]
        )
        core_prob = compute_probs(all_keys, mutation_df, k_freq_df)
        if core_prob == 0.0:
            # No mutation observed for this core: the product is zero anyway,
            # and the ratios below would divide by zero.
            kmer_dict[mutation] = 0.0
            continue

        left_flank = ref_kmer[:core_start]
        right_flank = ref_kmer[core_end:]

        # (index within the k-mer, nucleotide expected there). Left flank
        # positions are indexed from the start, right flank positions with
        # negative indices from the end.
        flank_sites = [(pos, left_flank[pos]) for pos in range(0, len(left_flank))]
        flank_sites += [(pos, right_flank[pos]) for pos in range(-len(right_flank), 0)]

        prob_product = core_prob
        for pos, nucleotide in flank_sites:
            keys = filter_keys(all_keys, nucleotide, pos)
            sub_prob = compute_probs(keys, mutation_df, k_freq_df)
            prob_product = prob_product * (sub_prob / core_prob)

        kmer_dict[mutation] = prob_product

    # NOTE: the values are returned as computed; a step dividing each value by
    # the sum of all values used to live here but is disabled.

    my_results_df = pd.DataFrame({
        'mutation': list(kmer_dict.keys()),
        'Probability_germinal': list(kmer_dict.values()),
    })
    my_probs_df = pd.merge(my_probs_df, my_results_df, on='mutation')
    return my_probs_df

## Compute the signature for the golden dataset (Goldmann 2018).

In [12]:
## Get the mutations
## Get the mutations
mutations_file = 'non_provided_data/mutations/germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']
mutations_df = mutations_df[['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']]

# Keep only the substitutions of the Goldmann2018 dataset. The explicit copy
# makes the result an independent frame, so the columns that decomp_prob adds
# to it later do not raise pandas' SettingWithCopyWarning.
mutations_df = mutations_df[
    (mutations_df['type'] == 'subs') & (mutations_df['sample'] == 'Goldmann2018')
].copy()

### Mutation signature 5-mer with DNMs, autosomal genome counts

In [7]:
# 5-mer signature decomposed around a 3-mer core
k, h = 5, 3

# File with the counts of every k-mer, used for normalization
k_mer_count_file = 'data/{}mer.txt'.format(k)

results = decomp_prob(k, mutations_df, k_mer_count_file, h)

# Save the table as a tab-separated file with a header and no index column
filename = '{}mer_DNM_signatures_decomp_golden.txt'.format(k)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

TypeError: 'int' object is not iterable

### Mutation signature 7-mer with DNMs, autosomal genome counts

In [12]:
# 7-mer signature decomposed around a 3-mer core
k, h = 7, 3

# File with the counts of every k-mer, used for normalization
k_mer_count_file = 'data/{}mer.txt'.format(k)

results = decomp_prob(k, mutations_df, k_mer_count_file, h)

# Save the table as a tab-separated file with a header and no index column
filename = '{}mer_DNM_signatures_decomp_golden.txt'.format(k)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

## Compute the signature for the largest dataset (Halldorsson 2019).

In [13]:
## Get the mutations
## Get the mutations
mutations_file = 'non_provided_data/mutations/germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']
mutations_df = mutations_df[['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']]

# Keep only the substitutions of the Halldorsson2019 dataset. The explicit
# copy makes the result an independent frame, so the columns that decomp_prob
# adds to it later do not raise pandas' SettingWithCopyWarning.
mutations_df = mutations_df[
    (mutations_df['type'] == 'subs') & (mutations_df['sample'] == 'Halldorsson2019')
].copy()

### Mutation signature 5-mer with DNMs, autosomal genome counts

In [16]:
# 5-mer signature decomposed around a 3-mer core
k, h = 5, 3

# File with the counts of every k-mer, used for normalization
k_mer_count_file = 'data/{}mer.txt'.format(k)

results = decomp_prob(k, mutations_df, k_mer_count_file, h)

# Save the table as a tab-separated file with a header and no index column
filename = '{}mer_DNM_signatures_decomp_Halldorsson.txt'.format(k)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 7-mer with DNMs, autosomal genome counts

In [17]:
# 7-mer signature decomposed around a 3-mer core
k, h = 7, 3

# File with the counts of every k-mer, used for normalization
k_mer_count_file = 'data/{}mer.txt'.format(k)

results = decomp_prob(k, mutations_df, k_mer_count_file, h)

# Save the table as a tab-separated file with a header and no index column
filename = '{}mer_DNM_signatures_decomp_Halldorsson.txt'.format(k)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

## Compute the signature for all DNM.

In [8]:
## Get the mutations
## Get the mutations
mutations_file = 'non_provided_data/mutations/germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']
mutations_df = mutations_df[['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']]

# Keep the substitutions of every dataset. The explicit copy makes the result
# an independent frame, so the columns that decomp_prob adds to it later do
# not raise pandas' SettingWithCopyWarning.
mutations_df = mutations_df[mutations_df['type'] == 'subs'].copy()

### Mutation signature 3-mer with DNMs, autosomal genome counts

**Only used for comparison at Supplementary Figure S4** 

In [18]:
# 3-mer signature decomposed around a 1-mer core
k, h = 3, 1

# File with the counts of every k-mer, used for normalization
k_mer_count_file = 'data/{}mer.txt'.format(k)

results = decomp_prob(k, mutations_df, k_mer_count_file, h)

# Save the table as a tab-separated file with a header and no index column
filename = '{}mer_DNM_signatures_decomp.txt'.format(k)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 5-mer with DNMs, autosomal genome counts

**Only used for comparison at Supplementary Figure S4**

In [None]:
# 5-mer signature decomposed around a 3-mer core
k, h = 5, 3

# File with the counts of every k-mer, used for normalization
k_mer_count_file = 'data/{}mer.txt'.format(k)

results = decomp_prob(k, mutations_df, k_mer_count_file, h)

# Save the table as a tab-separated file with a header and no index column
filename = '{}mer_DNM_signatures_decomp.txt'.format(k)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)

### Mutation signature 7-mer with DNMs, autosomal genome counts

In [12]:
# 7-mer signature decomposed around a 5-mer core
# NOTE(review): the other 7-mer cells of this notebook use h = 3 -- confirm
# that the 5-mer core is intended here.
k, h = 7, 5

# File with the counts of every k-mer, used for normalization
k_mer_count_file = 'data/{}mer.txt'.format(k)

results = decomp_prob(k, mutations_df, k_mer_count_file, h)

# Save the table as a tab-separated file with a header and no index column
filename = '{}mer_DNM_signatures_decomp.txt'.format(k)
results.to_csv(path.join('results', filename), sep='\t', header=True, index=None)