# Preprocess for R plots

In [2]:
from os import path

from bgreference import hg19
import pandas as pd

In [3]:
# Folder prefix (with trailing slash) that is concatenated with the mutation
# dataset file name in the loading cells further down.
mutations_folder = 'non_provided_data/mutations/'

## Functions

In [None]:
def rev_comp(seq):
    """
    Return the reverse complement of a nucleotide sequence.

    Only the upper-case letters A, C, G, T and N are accepted; any other
    character raises KeyError.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    return ''.join(pairing[base] for base in reversed(seq))

In [None]:
def compute_sixway_spectrum(x):
    """
    Label a substitution with its 6-way spectrum class.

    `x` is a DataFrame row or dict with the keys 'ref' and 'alt'.  When the
    reference base is C or T the label is 'ref>alt'; otherwise both bases
    are complemented first, so every label starts with C or T.
    """
    ref_base, alt_base = x['ref'], x['alt']
    if ref_base not in ('C', 'T'):
        # Express the change on the opposite strand
        pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
        ref_base, alt_base = pairing[ref_base], pairing[alt_base]
    return ref_base + '>' + alt_base

In [None]:
def compute_CpG_site(x):
    """
    Tell whether a mutated base belongs to a CpG dinucleotide.

    `x` is a DataFrame row or dict with the keys 'ref', 'pre' and 'post'.
    Returns 'CpG' when the reference is a C followed by G, or a G preceded
    by C; returns 'nonCpG' in every other case.
    """
    ref_base = x['ref']
    upstream = x['pre']
    downstream = x['post']

    c_then_g = ref_base == 'C' and downstream == 'G'
    g_after_c = ref_base == 'G' and upstream == 'C'
    return 'CpG' if (c_then_g or g_after_c) else 'nonCpG'

In [None]:
def collapse_spectrum(x):
    """
    Collapse a substitution onto a C or A reference base.

    `x` is a DataFrame row or dict with the keys 'ref' and 'alt'.  Changes
    whose reference is C or A are returned untouched; changes whose
    reference is G or T are returned with both bases complemented.

    Returns a (ref, alt) tuple, or None when the pair is not one of the
    twelve single-base substitutions over A, C, G, T.
    """
    ref_base, alt_base = x['ref'], x['alt']
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

    # Anything that is not a genuine ACGT substitution is left unclassified
    if ref_base not in pairing or alt_base not in pairing or ref_base == alt_base:
        return None

    if ref_base in ('C', 'A'):
        return ref_base, alt_base
    return pairing[ref_base], pairing[alt_base]

In [None]:
def create_kmer_dict_collapsed(k):
    """
    Build a zero-initialised table of collapsed k-mer substitutions.

    Every k-mer over ACGT whose central base (index k // 2) is 'C' or 'A'
    is paired with the three k-mers that differ from it only at that
    central base.

    Returns a dict keyed by (reference k-mer, alternate k-mer) tuples, each
    mapped to 0.  Keys follow the sorted order of the reference k-mers and,
    within one reference k-mer, the substitute order A, C, T, G.
    """
    from itertools import product

    centre = k // 2
    substitutes = ('A', 'C', 'T', 'G')

    table = {}
    for ref_kmer in sorted(''.join(letters) for letters in product('ACGT', repeat=k)):
        central_base = ref_kmer[centre:centre + 1]
        if central_base not in ('C', 'A'):
            continue
        left, right = ref_kmer[:centre], ref_kmer[centre + 1:]
        for base in substitutes:
            if base != central_base:
                table[(ref_kmer, left + base + right)] = 0
    return table

In [None]:
def kmer_freqs_collapse(x,k):
    """
    Collapse a k-mer onto the strand whose central base is C or A.

    A k-mer whose central base (index k // 2) is C or A is returned as is;
    one centred on G or T is returned reverse-complemented.  Any other
    central base yields None.
    """
    centre = k // 2
    central_base = x[centre:centre + 1]

    if central_base in ('C', 'A'):
        return x
    if central_base in ('G', 'T'):
        return rev_comp(x)
    return None

In [None]:
def compute_kmer_ref(x,k):
    """
    Compute the collapsed reference kmer from a DataFrame row or a dict
    holding the genomic kmer under the key 'ref_genome'.

    The genomic kmer is kept when its central base (index k // 2) is C or A
    and reverse-complemented otherwise.
    """
    genome_kmer = x['ref_genome']
    centre = k // 2

    if genome_kmer[centre:centre + 1] in ('C', 'A'):
        return genome_kmer
    return rev_comp(genome_kmer)

In [None]:
def compute_kmer_alt(x,k):
    """
    Build the alternate kmer for a DataFrame row or a dict with the keys
    'alt', 'ref_kmer' and 'ref_genome'.

    The central base (index k // 2) of 'ref_kmer' is replaced by the
    alternate base.  When the genomic kmer is not centred on C or A, the
    alternate base is complemented before being inserted.
    """
    alt_base = x['alt']
    ref_kmer = x['ref_kmer']
    genome_kmer = x['ref_genome']
    centre = k // 2

    if genome_kmer[centre:centre + 1] not in ('C', 'A'):
        # The reference kmer sits on the opposite strand: complement the alt
        alt_base = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}[alt_base]

    return ref_kmer[:centre] + alt_base + ref_kmer[centre + 1:]

In [None]:
def signature_collapsed(k, mutation_df, count_file, sampleID):
    """
    Compute the collapsed k-mer mutation rates of a set of substitutions.

    Parameters
    ----------
    k : int
        Size of the sequence context.  The window requested from hg19 has
        1 + 2 * (k // 2) bases, which equals k when k is odd.
    mutation_df : pandas.DataFrame
        Substitutions with at least the columns 'chr', 'start' and 'alt'.
        The columns 'ref_genome', 'ref_kmer', 'alt_kmer' and 'mutation' are
        added to (or overwritten in) this frame in place.
    count_file : str
        Tab-separated file without header holding two columns: a k-mer and
        its count.
    sampleID : object
        Label used to name the result column ('Probability_<sampleID>').

    Returns
    -------
    pandas.DataFrame
        One row per (reference k-mer, alternate k-mer) pair produced by
        create_kmer_dict_collapsed(k), with the columns 'mutation' and
        'Probability_<sampleID>' (observed count divided by the collapsed
        count of the reference k-mer).

    Raises
    ------
    KeyError
        If a collapsed reference k-mer is absent from `count_file`.
    """
    # Build the table once; its keys also define the rows of the result
    kmer_dict = create_kmer_dict_collapsed(k)

    my_probs_df = pd.DataFrame()
    my_probs_df['mutation'] = list(kmer_dict.keys())

    ## K-mer counts, collapsed onto the C/A-centred strand
    k_freq_df = pd.read_csv(count_file, sep='\t', header=None, low_memory=False)
    k_freq_df.columns = ['kmer', 'count']
    k_freq_df['kmer'] = k_freq_df['kmer'].apply(lambda x: kmer_freqs_collapse(x, k))
    # Series indexed by collapsed k-mer: gives a direct lookup per reference
    # k-mer instead of scanning the whole table for every key
    ref_totals = k_freq_df.groupby('kmer')['count'].sum()

    ## Compute the kmer changes (XYX -> XZX) of each alteration
    mutation_df['ref_genome'] = mutation_df.apply(
        lambda x: hg19(x['chr'], x['start'] + 1 - (k // 2), 1 + 2 * (k // 2)), axis=1)
    mutation_df['ref_kmer'] = mutation_df.apply(lambda x: compute_kmer_ref(x, k), axis=1)
    mutation_df['alt_kmer'] = mutation_df.apply(lambda x: compute_kmer_alt(x, k), axis=1)
    mutation_df['mutation'] = list(zip(mutation_df.ref_kmer, mutation_df.alt_kmer))

    # Compute the counts of each k-mer alteration.  Series.items() replaces
    # Series.iteritems(), which no longer exists in pandas >= 2.0; pairs that
    # are not in the table (e.g. contexts containing N) are ignored.
    for mutation, n_observed in mutation_df['mutation'].value_counts().items():
        if mutation in kmer_dict:
            kmer_dict[mutation] = n_observed

    # Divide each count by the counts of the reference kmer
    for my_kmer in kmer_dict:
        kmer_dict[my_kmer] = kmer_dict[my_kmer] / ref_totals.loc[my_kmer[0]]

    column_name = 'Probability_' + str(sampleID)
    my_results_df = pd.DataFrame({'mutation': list(kmer_dict.keys()),
                                  column_name: list(kmer_dict.values())})
    my_probs_df = pd.merge(my_probs_df, my_results_df, on='mutation')
    return my_probs_df

In [None]:
def numbers_collapsed(k, mutation_df, count_file):
    """
    Count the substitutions observed in each collapsed k-mer context.

    Parameters
    ----------
    k : int
        Size of the sequence context.  The window requested from hg19 has
        1 + 2 * (k // 2) bases, which equals k when k is odd.
    mutation_df : pandas.DataFrame
        Substitutions with at least the columns 'chr', 'start' and 'alt'.
        The columns 'ref_genome', 'ref_kmer', 'alt_kmer' and 'mutation' are
        added to (or overwritten in) this frame in place.
    count_file : str
        Tab-separated file without header holding two columns: a k-mer and
        its count.

    Returns
    -------
    pandas.DataFrame
        One row per (reference k-mer, alternate k-mer) pair produced by
        create_kmer_dict_collapsed(k), with the columns 'mutation',
        'Number_mutations' (observed substitutions) and 'count' (collapsed
        count of the reference k-mer).

    Raises
    ------
    KeyError
        If a collapsed reference k-mer is absent from `count_file`.
    """
    # Build the table once; its keys also define the rows of the result
    kmer_dict = create_kmer_dict_collapsed(k)

    my_probs_df = pd.DataFrame()
    my_probs_df['mutation'] = list(kmer_dict.keys())

    ## K-mer counts, collapsed onto the C/A-centred strand
    k_freq_df = pd.read_csv(count_file, sep='\t', header=None, low_memory=False)
    k_freq_df.columns = ['kmer', 'count']
    k_freq_df['kmer'] = k_freq_df['kmer'].apply(lambda x: kmer_freqs_collapse(x, k))
    # Series indexed by collapsed k-mer, used for scalar lookups below
    ref_totals = k_freq_df.groupby('kmer')['count'].sum()

    ## Compute the kmer changes (XYX -> XZX) of each alteration
    mutation_df['ref_genome'] = mutation_df.apply(
        lambda x: hg19(x['chr'], x['start'] + 1 - (k // 2), 1 + 2 * (k // 2)), axis=1)
    mutation_df['ref_kmer'] = mutation_df.apply(lambda x: compute_kmer_ref(x, k), axis=1)
    mutation_df['alt_kmer'] = mutation_df.apply(lambda x: compute_kmer_alt(x, k), axis=1)
    mutation_df['mutation'] = list(zip(mutation_df.ref_kmer, mutation_df.alt_kmer))

    # Compute the counts of each k-mer alteration.  Series.items() replaces
    # Series.iteritems(), which no longer exists in pandas >= 2.0; pairs that
    # are not in the table (e.g. contexts containing N) are ignored.
    for mutation, n_observed in mutation_df['mutation'].value_counts().items():
        if mutation in kmer_dict:
            kmer_dict[mutation] = n_observed

    column_name = 'Number_mutations'
    my_results_df = pd.DataFrame({'mutation': list(kmer_dict.keys()),
                                  column_name: list(kmer_dict.values())})
    # Scalar lookup per reference k-mer; avoids calling int() on a
    # one-element Series, which recent pandas versions deprecate.
    my_results_df['count'] = [int(ref_totals.loc[ref_kmer]) for ref_kmer, _ in kmer_dict]
    my_probs_df = pd.merge(my_probs_df, my_results_df, on='mutation')
    return my_probs_df

## Preprocess for 9-way spectrum plot (drawn in the R script)

In [8]:
# Get the mutations: read the headerless, tab-separated dataset, name its
# columns and keep only the substitutions.
mutations_file = mutations_folder + 'germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']
# .copy() makes the filtered frame independent of the full table, so the
# columns assigned to it in the following cells do not trigger
# SettingWithCopyWarning.
mutations_df = mutations_df[mutations_df['type'] == 'subs'].copy()

In [9]:
def check(ref, chrom, pos):
    """
    Compare an annotated reference allele with what hg19(chrom, pos) returns.

    Returns 'Valid' when both are equal and 'Error' otherwise.
    """
    return 'Valid' if hg19(chrom, pos) == ref else 'Error'

In [10]:
# Flag every row as 'Valid' or 'Error' by checking its 'ref' allele against
# hg19 at the row's 'end' coordinate.
mutations_df['check'] = mutations_df.apply(
    lambda row: check(ref=row['ref'], chrom=row['chr'], pos=row['end']),
    axis=1,
)

In [13]:
# Keep only the rows whose reference allele passed the check
valid = mutations_df.loc[mutations_df['check'] == 'Valid']

In [14]:
# Save the validated rows as a tab-separated file with a header and no index
valid.to_csv(
    path.join('results', 'valid.bed'),
    sep='\t',
    header=True,
    index=None,
)

In [None]:
# 6-way substitution class of every row
mutations_df['sub'] = mutations_df.apply(compute_sixway_spectrum, axis=1)

# hg19 lookups at the positions right before and right after 'end'
# (presumably the flanking reference bases), needed for the CpG test
mutations_df['pre'] = mutations_df.apply(
    lambda row: hg19(row['chr'], row['end'] - 1), axis=1)
mutations_df['post'] = mutations_df.apply(
    lambda row: hg19(row['chr'], row['end'] + 1), axis=1)

# 'CpG' / 'nonCpG' label derived from 'ref', 'pre' and 'post'
mutations_df['site'] = mutations_df.apply(compute_CpG_site, axis=1)

mutations_df.to_csv(
    path.join('results', 'germinal_class.bed'),
    sep='\t',
    header=True,
    index=None,
)

## Preprocess for context dependency plot (drawn in the R script)

In [None]:
# Get the mutations: read the headerless, tab-separated dataset, name its
# columns, drop the 'class' column and keep only the substitutions.
mutations_file = mutations_folder + 'germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']
mutations_df = mutations_df[['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']]
# .copy() makes the filtered frame independent of the full table, so the
# columns assigned to it in the next cell do not trigger
# SettingWithCopyWarning.
mutations_df = mutations_df[mutations_df['type'] == 'subs'].copy()

In [None]:
# Width of the sequence context stored with every substitution
k = 21

# 6-way substitution class of every row
mutations_df['mut'] = mutations_df.apply(compute_sixway_spectrum, axis=1)

# hg19 window of 1 + 2 * (k // 2) bases starting k // 2 positions before
# start + 1
mutations_df['kmer'] = mutations_df.apply(
    lambda row: hg19(row['chr'], row['start'] + 1 - (k // 2), 1 + 2 * (k // 2)),
    axis=1,
)

mutations_df.to_csv(
    'results/frequencies_plot_new.tsv',
    sep="\t",
    header=True,
    index=False,
)

## Collapsed signatures by 6-class spectrum. Preprocess for heatmap plot (drawn in the R script)

In [None]:
# Get the mutations: read the headerless, tab-separated dataset, name its
# columns, drop the 'class' column and keep only the substitutions.
mutations_file = mutations_folder + 'germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']
mutations_df = mutations_df[['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']]
# .copy() makes the filtered frame independent of the full table:
# signature_collapsed() adds columns to the frame it receives, which would
# otherwise trigger SettingWithCopyWarning.
mutations_df = mutations_df[mutations_df['type'] == 'subs'].copy()
# Label used to name the probability column of the signature tables
sampleID = 'germinal_collapsed'

### 1-mer signatures

In [None]:
k = 1

# K-mer count table for this context size
k_mer_count_file = 'data/{}mer.txt'.format(k)

# Collapsed signature of the substitutions, saved as a tab-separated table
results = signature_collapsed(k, mutations_df, k_mer_count_file, sampleID)
filename = '{}mer_signatures_DNM_collapsed.txt'.format(k)
results.to_csv(
    path.join('results', filename),
    sep='\t',
    header=True,
    index=None,
)

### 3-mer signatures

In [None]:
k = 3

# K-mer count table for this context size
k_mer_count_file = 'data/{}mer.txt'.format(k)

# Collapsed signature of the substitutions, saved as a tab-separated table
results = signature_collapsed(k, mutations_df, k_mer_count_file, sampleID)
filename = '{}mer_signatures_DNM_collapsed.txt'.format(k)
results.to_csv(
    path.join('results', filename),
    sep='\t',
    header=True,
    index=None,
)

### 5-mer signatures

In [None]:
k = 5

# K-mer count table for this context size
k_mer_count_file = 'data/{}mer.txt'.format(k)

# Collapsed signature of the substitutions, saved as a tab-separated table
results = signature_collapsed(k, mutations_df, k_mer_count_file, sampleID)
filename = '{}mer_signatures_DNM_collapsed.txt'.format(k)
results.to_csv(
    path.join('results', filename),
    sep='\t',
    header=True,
    index=None,
)

### 7-mer signatures

In [None]:
k = 7

# K-mer count table for this context size
k_mer_count_file = 'data/{}mer.txt'.format(k)

# Collapsed signature of the substitutions, saved as a tab-separated table
results = signature_collapsed(k, mutations_df, k_mer_count_file, sampleID)
filename = '{}mer_signatures_DNM_collapsed.txt'.format(k)
results.to_csv(
    path.join('results', filename),
    sep='\t',
    header=True,
    index=None,
)

### 9-mer signatures

In [None]:
k = 9

# K-mer count table for this context size
k_mer_count_file = 'data/{}mer.txt'.format(k)

# Collapsed signature of the substitutions, saved as a tab-separated table
results = signature_collapsed(k, mutations_df, k_mer_count_file, sampleID)
filename = '{}mer_signatures_DNM_collapsed.txt'.format(k)
results.to_csv(
    path.join('results', filename),
    sep='\t',
    header=True,
    index=None,
)