# Model comparison

This notebook computes the loglikelihood for a given kmer model and the AIC based on the number of parameters.

---

## Output

Several files in the form of *model.txt*, containing the loglikelihood and the number of parameters.

## Input

Files in **data** directory.

- *kmer_freq_file*: file that contains the counts of k-mers.

Files in **non_provided_data/mutations** directory.

- *germinal_ultimate_dataset.bed.gz*: file containing the mutations.

In [None]:
from os import path

import pandas as pd
import numpy as np
import math
import tabix  # package pytabix
import pybedtools
from bgreference import hg19
import gzip, pickle

## Functions

All the functions needed for this notebook are coded below

In [2]:
def remove_version_ensembl(x, colname):
    """
    Remove version from the ensembl ID

    Args:
        x: dataframe row
        colname: column name

    Returns:
        str. Text before a '.'

    """
    ensembl_id = x[colname]
    ensembl_id_only = ensembl_id.split('.')[0]

    return ensembl_id_only

In [15]:
def log_likelihood(k, mutation_df, middle_exon_coords, signatures_file, middle_distance_threshold):
    
    nucleotides = set(['A', 'T', 'C', 'G'])
    
    # Process signatures
    all_signatures = pd.read_csv(signatures_file, sep='\t')
    name = 'Probability_germinal'
    sub_signatures_df = all_signatures[['mutation', name]]
    prob_dict = sub_signatures_df.set_index('mutation')[name].T.to_dict()
    
    # Add probability of no mutation
    for my_kmer in kmer_generator('ACGT',k):
        my_ref_base = my_kmer[(k//2):(k//2)+1].upper()
        my_alt_bases = list(nucleotides - set(my_ref_base)) #The rest are alternative ones
        my_key1 = str((my_kmer , my_kmer[0:(k//2)] + my_alt_bases[0] + my_kmer[(k//2)+1:]))
        my_key2 = str((my_kmer , my_kmer[0:(k//2)] + my_alt_bases[1] + my_kmer[(k//2)+1:]))
        my_key3 = str((my_kmer , my_kmer[0:(k//2)] + my_alt_bases[2] + my_kmer[(k//2)+1:]))
        new_key = str((my_kmer , my_kmer))
        prob_dict[new_key] = 1 - prob_dict[my_key1] - prob_dict[my_key2] - prob_dict[my_key3]
    
    # Process coordinates
    middle_exon_coords['exon_size'] = (middle_exon_coords['end'] - middle_exon_coords['start'])
    middle_exon_coords['exon_middle_start'] = (middle_exon_coords['start'] + middle_exon_coords['exon_size']/2)
    middle_exon_coords['exon_middle_start'] = middle_exon_coords.apply(lambda x:
                                                                    math.floor(x['exon_middle_start']),
                                                                    axis=1)
    
    middle_exon_coords['exon_middle_end'] = middle_exon_coords['exon_middle_start'] + 1
    middle_exon_coords['region_start'] = middle_exon_coords['exon_middle_start'] - middle_distance_threshold
    middle_exon_coords['region_end'] = middle_exon_coords['exon_middle_end'] + middle_distance_threshold
    sub_exons_coords = middle_exon_coords[['chr', 'region_start', 'region_end', 'ensembl', 'exon_size']]
    
    # Process mutations
    sub_exons_coords_bed = pybedtools.BedTool.from_dataframe(sub_exons_coords)
    mutations_bed = pybedtools.BedTool.from_dataframe(mutation_df)
    my_bed = sub_exons_coords_bed.intersect(mutations_bed, wao=True)

    mutations_in_range = pd.read_table(my_bed.fn, names = ['chr', 'region_start', 'region_end',
                        'ensembl', 'exon_size', 'mut_chr', 'mut_start', 'mut_end', 'mut_ref', 'mut_alt',
                        'mut_sample', 'mut_type', 'overlap_bp'],  sep="\s+", index_col=False)
    mutations_in_range = mutations_in_range[mutations_in_range['overlap_bp'] != 0]
    mutations_in_range = mutations_in_range[['mut_chr', 'mut_start', 'mut_end', 'mut_ref',
                                             'mut_alt']].drop_duplicates()
    mutations_in_range.columns = ['chr', 'start', 'end', 'ref', 'alt']
    mutations_in_range['ref_kmer'] = mutations_in_range.apply(lambda x: hg19(x['chr'],
                                                                            x['start']+1-(k//2),1+2*(k//2)), axis=1)
    mutations_in_range['alt_kmer'] = mutations_in_range.apply(lambda x: compute_kmer_alt(x, k), axis=1)
    mutations_in_range['mutation'] = list(zip(mutations_in_range.ref_kmer, mutations_in_range.alt_kmer))
    mutations_in_range['coords'] = list(zip(mutations_in_range.chr, mutations_in_range.start))
    mut_dict = mutations_in_range.set_index('coords')['mutation'].T.to_dict()
    
    log_ll = 0
    
    # For each exon, each row, each 2001-nt sequence
    for my_row in sub_exons_coords.values.tolist():

        ## Get coordinates
        my_chr = my_row[0]
        my_start = int(my_row[1])
        my_end = int(my_row[2])
        n_bases = my_end - my_start
        
        ## Get sequence.
        my_exon_bases = hg19(my_chr, my_start+1-(k//2), size=n_bases+2*(k//2))
        ## Divide sequences into kmers
        my_kmers = [my_exon_bases[i:i+k] for i in range(len(my_exon_bases)-(k-1))]

        for i in range(0,len(my_kmers)):
            pos = my_start + i
            try:
                log_ll = log_ll + np.log(prob_dict[str(mut_dict[(my_chr, pos)])])
            except:
                log_ll = log_ll + np.log(prob_dict[str(tuple((my_kmers[i],my_kmers[i])))])

    return(log_ll)

In [16]:
def log_likelihood_CpG(k, mutation_df, middle_exon_coords, signatures_file, middle_distance_threshold):
    
    nucleotides = set(['A', 'T', 'C', 'G'])
    
    # Process signatures
    all_signatures = pd.read_csv(signatures_file, sep='\t')
    name = 'Probability_germinal'
    sub_signatures_df = all_signatures[['mutation', name]]
    prob_dict = sub_signatures_df.set_index('mutation')[name].T.to_dict()
    
    # Add probability of no mutation (manually)
    prob_dict[str(('A','A'))] = 1 - prob_dict[str(('A','C'))] - prob_dict[str(('A','G'))] - prob_dict[str(('A','T'))]
    prob_dict[str(('T','T'))] = 1 - prob_dict[str(('T','C'))] - prob_dict[str(('T','G'))] - prob_dict[str(('T','A'))]
    prob_dict[str(('CpG','CpG'))] = 1 - prob_dict[str(('CpG','T'))] - prob_dict[str(('CpG','G'))] - prob_dict[str(('CpG','A'))]
    prob_dict[str(('GpC','GpC'))] = 1 - prob_dict[str(('GpC','A'))] - prob_dict[str(('GpC','C'))] - prob_dict[str(('GpC','T'))]
    prob_dict[str(('nonCpG','nonCpG'))] = 1 - prob_dict[str(('nonCpG','T'))] - prob_dict[str(('nonCpG','G'))] - prob_dict[str(('nonCpG','A'))]
    prob_dict[str(('nonGpC','nonGpC'))] = 1 - prob_dict[str(('nonGpC','A'))] - prob_dict[str(('nonGpC','C'))] - prob_dict[str(('nonGpC','T'))]
    
    # Process coordinates
    middle_exon_coords['exon_size'] = (middle_exon_coords['end'] - middle_exon_coords['start'])
    middle_exon_coords['exon_middle_start'] = (middle_exon_coords['start'] + middle_exon_coords['exon_size']/2)
    middle_exon_coords['exon_middle_start'] = middle_exon_coords.apply(lambda x:
                                                                    math.floor(x['exon_middle_start']),
                                                                    axis=1)
    
    middle_exon_coords['exon_middle_end'] = middle_exon_coords['exon_middle_start'] + 1
    middle_exon_coords['region_start'] = middle_exon_coords['exon_middle_start'] - middle_distance_threshold
    middle_exon_coords['region_end'] = middle_exon_coords['exon_middle_end'] + middle_distance_threshold
    sub_exons_coords = middle_exon_coords[['chr', 'region_start', 'region_end', 'ensembl', 'exon_size']]
    
    # Process mutations
    sub_exons_coords_bed = pybedtools.BedTool.from_dataframe(sub_exons_coords)
    mutations_bed = pybedtools.BedTool.from_dataframe(mutation_df)
    my_bed = sub_exons_coords_bed.intersect(mutations_bed, wao=True)

    mutations_in_range = pd.read_table(my_bed.fn, names = ['chr', 'region_start', 'region_end',
                        'ensembl', 'exon_size', 'mut_chr', 'mut_start', 'mut_end', 'mut_ref', 'mut_alt',
                        'mut_sample', 'mut_type', 'overlap_bp'],  sep="\s+", index_col=False)
    mutations_in_range = mutations_in_range[mutations_in_range['overlap_bp'] != 0]
    mutations_in_range = mutations_in_range[['mut_chr', 'mut_start', 'mut_end', 'mut_ref',
                                             'mut_alt']].drop_duplicates()
    mutations_in_range.columns = ['chr', 'start', 'end', 'ref', 'alt']
    mutations_in_range['ref_kmer'] = mutations_in_range.apply(lambda x: hg19(x['chr'],
                                                                            x['start']+1-(k//2),1+2*(k//2)), axis=1)
    mutations_in_range[['pre', 'nt', 'post']] = mutations_in_range['ref_kmer'].apply(lambda x: pd.Series(list(x)))
    mutations_in_range['site'] = mutations_in_range.apply(lambda x: compute_CpG_site(x), axis=1)
    mutations_in_range['mutation'] = list(zip(mutations_in_range.site, mutations_in_range.alt))
    mutations_in_range['coords'] = list(zip(mutations_in_range.chr, mutations_in_range.start))
    mut_dict = mutations_in_range.set_index('coords')['mutation'].T.to_dict()
    
    log_ll = 0
    
    # For each exon, each row, each 2001-nt sequence
    for my_row in sub_exons_coords.values.tolist():

        ## Get coordinates
        my_chr = my_row[0]
        my_start = int(my_row[1])
        my_end = int(my_row[2])
        n_bases = my_end - my_start
        
        ## Get sequence.
        my_exon_bases = hg19(my_chr, my_start+1-(k//2), size=n_bases+2*(k//2))
        ## Divide sequences into kmers
        my_kmers = [my_exon_bases[i:i+k] for i in range(len(my_exon_bases)-(k-1))]

        for i in range(0,len(my_kmers)):
            pos = my_start + i
            site = compute_CpG_site_string(my_kmers[i])
            try:
                log_ll = log_ll + np.log(prob_dict[str(mut_dict[(my_chr, pos)])])
            except:
                log_ll = log_ll + np.log(prob_dict[str((site,site))])

    return(log_ll)

In [17]:
def compute_AIC(log_likelihood, n_params):
    return(2*n_params - 2*log_likelihood)

In [21]:
def remove_muts(mut_data, IDs_list):
    """
    Function that removes mutations from a data frame, only if the mutation matches and ID
    and it is enclosed in the exon coordinates of the specific 2001-nt window.

    Args:
        mut_data: dataframe with columns "range_chr range_start range_end ensembl region_size
                    region_middle_start region_middle_end exon_start exon_end mut_chr mut_start
                    mut_end mut_ref mut_alt mut_sample mut_type mut_ID overlap_bp
        IDs_list: list of IDs that must be removed
    Returns:
        pandas data frame with mutations with the ID removed, they sould be removed only once if the IDs are repeated.

    """
    rows_to_remove = []
    for ID in IDs_list:
        sub_frame = mut_data[mut_data['mut_ID'] == ID]
        for index, row in sub_frame.iterrows():
            if (row['exon_start'] <= row['mut_end'] and row['exon_end'] >= row['mut_end']):
                rows_to_remove.append(index)
    return(mut_data.drop(rows_to_remove))

In [22]:
# This function was made for testing purposes:
def countX(lst, x): 
    count = 0
    for ele in lst: 
        if (ele == x): 
            count = count + 1
    return count

In [23]:
# This function was made for testing purposes:
def check_consequence_TEST(chromosome, start_coord, alt, tabix):
    """
    Function that classifies a given exonic mutation into the VEP classification given
    tabix file with predicted effect.

    Args:
        chromosome: chromosome coordinate
        start_coord: coordinate of the mutation
        alt: alternative nucleotide
        tabix: tabix indexed file containing the information
        rank_info: dictionary with all the types of predicted effect and if we classify them as synonymous or not.
    Returns:
        str. Class of mutation on VEP classification

    """
    
    my_chr = chromosome[3:]
    effects = tabix.querys("{}:{}-{}".format(my_chr, start_coord, start_coord))
    for effect in effects:
        if len(effect) < 6:
            continue
        if effect[3] == alt:
            consequence = effect[5].rstrip('\r')
            return(consequence)

In [24]:
def check_consequence(chromosome, start_coord, alt, tabix, rank_info):
    """
    Function that classifies a given exonic mutation into synonymous 
    or not given a rank info and a tabix file with predicted effect.

    Args:
        chromosome: chromosome coordinate
        start_coord: coordinate of the mutation
        alt: alternative nucleotide
        tabix: tabix indexed file containing the predicted VEP classification effect for a given position and change.
        rank_info: dictionary with all the types of predicted effect and if we classify them as synonymous or not.
    Returns:
        str. *synonymous* or *non_synonymous*

    """
    
    my_chr = chromosome[3:]
    effects = tabix.querys("{}:{}-{}".format(my_chr, start_coord, start_coord))
    for effect in effects:
        if len(effect) < 6:
            continue
        if effect[3] == alt:
            consequence = effect[5].rstrip('\r')
            try:
                con = rank_info[consequence]
                return(con)
            except:
                return('None')

In [25]:
def synonymous_or_not(x):
    """
    Classifies consequence types between synonymous and non synonymous
    using the RANK column

    Args:
        x: dataframe row

    Returns:
        str. *synonymous* or *non_synonymous*

    """

    my_rank = x['RANK']

    if my_rank < 15:
        my_consequence = 'non_synonymous'

    else:
        my_consequence = 'synonymous'

    return my_consequence

In [26]:
def log_likelihood_excluding(k, mutation_df, exon_coords, signatures_file, middle_distance_threshold, include):
    
    nucleotides = set(['A', 'T', 'C', 'G'])
    
    middle_exon_coords = exon_coords.copy(deep=True)
    
    # Process signatures
    all_signatures = pd.read_csv(signatures_file, sep='\t')
    name = 'Probability_germinal'
    sub_signatures_df = all_signatures[['mutation', name]]
    prob_dict = sub_signatures_df.set_index('mutation')[name].T.to_dict()
    
    # Add probability of no mutation
    for my_kmer in kmer_generator('ACGT',k):
        my_ref_base = my_kmer[(k//2):(k//2)+1].upper()
        my_alt_bases = list(nucleotides - set(my_ref_base)) #The rest are alternative ones
        my_key1 = str((my_kmer , my_kmer[0:(k//2)] + my_alt_bases[0] + my_kmer[(k//2)+1:]))
        my_key2 = str((my_kmer , my_kmer[0:(k//2)] + my_alt_bases[1] + my_kmer[(k//2)+1:]))
        my_key3 = str((my_kmer , my_kmer[0:(k//2)] + my_alt_bases[2] + my_kmer[(k//2)+1:]))
        new_key = str((my_kmer , my_kmer))
        prob_dict[new_key] = 1 - prob_dict[my_key1] - prob_dict[my_key2] - prob_dict[my_key3]
    
    # Convert exonic coordinates into bed to intersect values
    exon_coords_bed = pybedtools.BedTool.from_dataframe(middle_exon_coords)
    
    # Process coordinates
    middle_exon_coords['exon_size'] = (middle_exon_coords['end'] - middle_exon_coords['start'])
    middle_exon_coords['exon_middle_start'] = (middle_exon_coords['start'] + middle_exon_coords['exon_size']/2)
    middle_exon_coords['exon_middle_start'] = middle_exon_coords.apply(lambda x:
                                                                    math.floor(x['exon_middle_start']),
                                                                    axis=1)
    
    middle_exon_coords['exon_middle_end'] = middle_exon_coords['exon_middle_start'] + 1
    middle_exon_coords['region_start'] = middle_exon_coords['exon_middle_start'] - middle_distance_threshold
    middle_exon_coords['region_end'] = middle_exon_coords['exon_middle_end'] + middle_distance_threshold
    region_coords = middle_exon_coords[['chr', 'region_start', 'region_end', 'ensembl', 'exon_size', 'start', 'end']]
    region_coords_bed = pybedtools.BedTool.from_dataframe(region_coords)
    
    # Process mutations
    mutations_bed = pybedtools.BedTool.from_dataframe(mutation_df)
    my_bed = region_coords_bed.intersect(mutations_bed, wao=True)
    
    # Filter mutations by the full region of interest
    mutations_in_range = pd.read_table(my_bed.fn, names = ['chr', 'region_start', 'region_end',
                        'ensembl', 'exon_size', 'exon_start', 'exon_end', 'mut_chr', 'mut_start', 'mut_end', 'mut_ref', 'mut_alt',
                        'mut_sample', 'mut_type', 'mut_ID', 'overlap_bp'],  sep="\s+", index_col=False)
    mutations_in_range = mutations_in_range[mutations_in_range['overlap_bp'] != 0]
    
    
    # Filter mutations by the exonic regions
    my_bed2 = exon_coords_bed.intersect(mutations_bed, wao=True)
    mutations_in_range2 = pd.read_table(my_bed2.fn, names = ['exon_chr', 'exon_start', 'exon_end',
                        'ensembl', 'mut_chr', 'mut_start', 'mut_end', 'mut_ref', 'mut_alt',
                        'mut_sample', 'mut_type', 'mut_ID', 'overlap_bp'],  sep="\s+", index_col=False)
    mutations_in_range2 = mutations_in_range2[mutations_in_range2['overlap_bp'] != 0]
    mutations_in_range2 = mutations_in_range2[['mut_chr', 'mut_start', 'mut_end', 'mut_ref', 
                        'mut_alt', 'mut_sample', 'mut_type', 'mut_ID']]
    mutations_in_range2.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'ID']
    
    
    mutations_in_range2['category'] = mutations_in_range2.apply(lambda x: 
                           check_consequence_TEST(x['chr'], x['end'], x['alt'], tb_consequence_type), axis=1)
    all_elements = mutations_in_range2['category'].tolist()
    unique_elements = set(all_elements)
    for n in unique_elements:
        count = countX(all_elements, n)
        print ("The category " + str(n) + " is found " + str(count) + " times.")

    # Find if mutations at exonic regions are synonymous or not.
    mutations_in_range2['conseq'] = mutations_in_range2.apply(lambda x: 
                       check_consequence(x['chr'], x['end'], x['alt'], tb_consequence_type, consequence_rank_dict),
                       axis=1)
    
    # Drop exonic outsise 2001-nt window
    muts_to_drop = mutations_in_range2[mutations_in_range2['conseq'] != include]
    IDs_to_drop = muts_to_drop['ID'].tolist()
    mutations_in_range = remove_muts(mutations_in_range, IDs_to_drop)
    
    # Drop recurrent mutations
    mutations_in_range = mutations_in_range[['mut_chr', 'mut_start', 'mut_end', 'mut_ref',
                                             'mut_alt']].drop_duplicates()
    mutations_in_range.columns = ['chr', 'start', 'end', 'ref', 'alt']
    
    # Compute dictionary of mutations
    mutations_in_range['ref_kmer'] = mutations_in_range.apply(lambda x: hg19(x['chr'],
                                                                            x['start']+1-(k//2),1+2*(k//2)), axis=1)
    mutations_in_range['alt_kmer'] = mutations_in_range.apply(lambda x: compute_kmer_alt(x, k), axis=1)
    mutations_in_range['mutation'] = list(zip(mutations_in_range.ref_kmer, mutations_in_range.alt_kmer))
    mutations_in_range['coords'] = list(zip(mutations_in_range.chr, mutations_in_range.start))
    mut_dict = mutations_in_range.set_index('coords')['mutation'].T.to_dict()
    
    log_ll = 0
    
    # For each exon, each row, each 2001-nt sequence
    for my_row in region_coords.values.tolist():

        ## Get coordinates
        my_chr = my_row[0]
        my_start = int(my_row[1])
        my_end = int(my_row[2])
        n_bases = my_end - my_start
        
        ## Get sequence.
        my_exon_bases = hg19(my_chr, my_start+1-(k//2), size=n_bases+2*(k//2))
        ## Divide sequences into kmers
        my_kmers = [my_exon_bases[i:i+k] for i in range(len(my_exon_bases)-(k-1))]

        for i in range(0,len(my_kmers)):
            pos = my_start + i
            try:
                log_ll = log_ll + np.log(prob_dict[str(mut_dict[(my_chr, pos)])])
            except:
                log_ll = log_ll + np.log(prob_dict[str(tuple((my_kmers[i],my_kmers[i])))])

    return(log_ll)

In [27]:
def log_likelihood_excluding_CpG(k, mutation_df, exon_coords, signatures_file, middle_distance_threshold, include):
    
    nucleotides = set(['A', 'T', 'C', 'G'])
    
    middle_exon_coords = exon_coords.copy(deep=True)
    
    # Process signatures
    all_signatures = pd.read_csv(signatures_file, sep='\t')
    name = 'Probability_germinal'
    sub_signatures_df = all_signatures[['mutation', name]]
    prob_dict = sub_signatures_df.set_index('mutation')[name].T.to_dict()
    
    # Add probability of no mutation (manually)
    prob_dict[str(('A','A'))] = 1 - prob_dict[str(('A','C'))] - prob_dict[str(('A','G'))] - prob_dict[str(('A','T'))]
    prob_dict[str(('T','T'))] = 1 - prob_dict[str(('T','C'))] - prob_dict[str(('T','G'))] - prob_dict[str(('T','A'))]
    prob_dict[str(('CpG','CpG'))] = 1 - prob_dict[str(('CpG','T'))] - prob_dict[str(('CpG','G'))] - prob_dict[str(('CpG','A'))]
    prob_dict[str(('GpC','GpC'))] = 1 - prob_dict[str(('GpC','A'))] - prob_dict[str(('GpC','C'))] - prob_dict[str(('GpC','T'))]
    prob_dict[str(('nonCpG','nonCpG'))] = 1 - prob_dict[str(('nonCpG','T'))] - prob_dict[str(('nonCpG','G'))] - prob_dict[str(('nonCpG','A'))]
    prob_dict[str(('nonGpC','nonGpC'))] = 1 - prob_dict[str(('nonGpC','A'))] - prob_dict[str(('nonGpC','C'))] - prob_dict[str(('nonGpC','T'))]
    
    # Convert exonic coordinates into bed to intersect values
    exon_coords_bed = pybedtools.BedTool.from_dataframe(middle_exon_coords)
    
    # Process coordinates
    middle_exon_coords['exon_size'] = (middle_exon_coords['end'] - middle_exon_coords['start'])
    middle_exon_coords['exon_middle_start'] = (middle_exon_coords['start'] + middle_exon_coords['exon_size']/2)
    middle_exon_coords['exon_middle_start'] = middle_exon_coords.apply(lambda x:
                                                                    math.floor(x['exon_middle_start']),
                                                                    axis=1)
    
    middle_exon_coords['exon_middle_end'] = middle_exon_coords['exon_middle_start'] + 1
    middle_exon_coords['region_start'] = middle_exon_coords['exon_middle_start'] - middle_distance_threshold
    middle_exon_coords['region_end'] = middle_exon_coords['exon_middle_end'] + middle_distance_threshold
    region_coords = middle_exon_coords[['chr', 'region_start', 'region_end', 'ensembl', 'exon_size', 'start', 'end']]
    region_coords_bed = pybedtools.BedTool.from_dataframe(region_coords)
    
    # Process mutations
    mutations_bed = pybedtools.BedTool.from_dataframe(mutation_df)
    my_bed = region_coords_bed.intersect(mutations_bed, wao=True)
    
    # Filter mutations by the full region of interest
    mutations_in_range = pd.read_table(my_bed.fn, names = ['chr', 'region_start', 'region_end',
                        'ensembl', 'exon_size', 'exon_start', 'exon_end', 'mut_chr', 'mut_start', 'mut_end', 'mut_ref', 'mut_alt',
                        'mut_sample', 'mut_type', 'mut_ID', 'overlap_bp'],  sep="\s+", index_col=False)
    mutations_in_range = mutations_in_range[mutations_in_range['overlap_bp'] != 0]
    
    
    # Filter mutations by the exonic regions
    my_bed2 = exon_coords_bed.intersect(mutations_bed, wao=True)
    mutations_in_range2 = pd.read_table(my_bed2.fn, names = ['exon_chr', 'exon_start', 'exon_end',
                        'ensembl', 'mut_chr', 'mut_start', 'mut_end', 'mut_ref', 'mut_alt',
                        'mut_sample', 'mut_type', 'mut_ID', 'overlap_bp'],  sep="\s+", index_col=False)
    mutations_in_range2 = mutations_in_range2[mutations_in_range2['overlap_bp'] != 0]
    mutations_in_range2 = mutations_in_range2[['mut_chr', 'mut_start', 'mut_end', 'mut_ref', 
                        'mut_alt', 'mut_sample', 'mut_type', 'mut_ID']]
    mutations_in_range2.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'ID']
    
    
    mutations_in_range2['category'] = mutations_in_range2.apply(lambda x: 
                           check_consequence_TEST(x['chr'], x['end'], x['alt'], tb_consequence_type), axis=1)
    all_elements = mutations_in_range2['category'].tolist()
    unique_elements = set(all_elements)
    for n in unique_elements:
        count = countX(all_elements, n)
        print ("The category " + str(n) + " is found " + str(count) + " times.")

    # Find if mutations at exonic regions are synonymous or not.
    mutations_in_range2['conseq'] = mutations_in_range2.apply(lambda x: 
                       check_consequence(x['chr'], x['end'], x['alt'], tb_consequence_type, consequence_rank_dict),
                       axis=1)
    
    # Drop exonic outsise 2001-nt window
    muts_to_drop = mutations_in_range2[mutations_in_range2['conseq'] != include]
    IDs_to_drop = muts_to_drop['ID'].tolist()
    mutations_in_range = remove_muts(mutations_in_range, IDs_to_drop)
    
    # Drop recurrent mutations
    mutations_in_range = mutations_in_range[['mut_chr', 'mut_start', 'mut_end', 'mut_ref',
                                             'mut_alt']].drop_duplicates()
    mutations_in_range.columns = ['chr', 'start', 'end', 'ref', 'alt']
    
    # Compute dictionary of mutations
    mutations_in_range['ref_kmer'] = mutations_in_range.apply(lambda x: hg19(x['chr'],
                                                                            x['start']+1-(k//2),1+2*(k//2)), axis=1)
    mutations_in_range[['pre', 'nt', 'post']] = mutations_in_range['ref_kmer'].apply(lambda x: pd.Series(list(x)))
    mutations_in_range['site'] = mutations_in_range.apply(lambda x: compute_CpG_site(x), axis=1)
    mutations_in_range['mutation'] = list(zip(mutations_in_range.site, mutations_in_range.alt))
    mutations_in_range['coords'] = list(zip(mutations_in_range.chr, mutations_in_range.start))
    mut_dict = mutations_in_range.set_index('coords')['mutation'].T.to_dict()
    
    log_ll = 0
    
    # For each exon, each row, each 2001-nt sequence
    for my_row in region_coords.values.tolist():

        ## Get coordinates
        my_chr = my_row[0]
        my_start = int(my_row[1])
        my_end = int(my_row[2])
        n_bases = my_end - my_start
        
        ## Get sequence.
        my_exon_bases = hg19(my_chr, my_start+1-(k//2), size=n_bases+2*(k//2))
        ## Divide sequences into kmers
        my_kmers = [my_exon_bases[i:i+k] for i in range(len(my_exon_bases)-(k-1))]

        for i in range(0,len(my_kmers)):
            pos = my_start + i
            site = compute_CpG_site_string(my_kmers[i])
            try:
                log_ll = log_ll + np.log(prob_dict[str(mut_dict[(my_chr, pos)])])
            except:
                log_ll = log_ll + np.log(prob_dict[str((site,site))])

    return(log_ll)

## Goldmann 2018: Compute log likelihood for each model

In [None]:
## Get the mutations
mutations_file = 'non_provided_data/mutations/germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']
mutations_df = mutations_df[['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']]
mutations_df = mutations_df[mutations_df['type'] == 'subs']
mutations_df = mutations_df[mutations_df['sample'] == 'Goldmann2018']

In [None]:
middle_exons_file = 'data/coordinates/genes_middle_exon_coords.bed.gz'

## Get exon coordinates
exons_coords_df = pd.read_csv(middle_exons_file, sep="\t", header=None, low_memory=False)
exons_coords_df.columns = ['chr', 'start', 'end', 'ensembl', 'symbol', 'strand']
exons_coords_df = exons_coords_df[['chr', 'start', 'end', 'ensembl']]
exons_coords_df['ensembl'] = exons_coords_df.apply(lambda x: remove_version_ensembl(x, 'ensembl'), axis=1)
exons_coords_df.columns = ['chr', 'start', 'end', 'ensembl']

In [None]:
results_dict = {'model': [], 'log_likelihood': [], 'n_param': []}

### 1-mer model

In [None]:
k = 1
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_golden.txt'

result = log_likelihood(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

### 1-mer with CpG

In [None]:
k = 3
middle_distance_threshold = 1000

signatures_file = 'results/CpG_signatures_DNM_golden.txt'

result = log_likelihood_CpG(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold)
results_dict['model'].append('CpG')
results_dict['n_param'].append(18)
results_dict['log_likelihood'].append(result)

### 3-mer model

In [None]:
k = 3
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_golden.txt'

result = log_likelihood(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

### 5-mer model

In [None]:
k = 5
h = 3
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_DNM_decomp_golden.txt'

result = log_likelihood(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(((4**h*3)*((k-h)*3)) # Linear increase form the core exponential increase
results_dict['log_likelihood'].append(result)

### 7-mer model

In [None]:
k = 7
h = 3
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_DNM_decomp_golden.txt'

result = log_likelihood(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(((4**h)*3)*((k-h)*3)) # Linear increase form the core exponential increase
results_dict['log_likelihood'].append(result)

#### Save into file

In [None]:
results_df = pd.DataFrame(results_dict, columns=['model', 'log_likelihood', 'n_param'])
filename = 'model_golden.txt'

results_df.to_csv(path.join('results', filename), header=True, index=None, sep='\t')

### Evaluate models

#### Compute AIC

In [None]:
# Load model information
models_file = 'results/model_golden.txt'
models_df = pd.read_csv(models_file, sep="\t", header='infer')

In [None]:
# Compute AIC
models_df['AIC'] = models_df.apply(lambda x: compute_AIC(x['log_likelihood'], x['n_param']), axis=1)

In [None]:
models_df[]

## All DNM: Compute log likelihood for each model

In [None]:
## Get the mutations
mutations_file = 'non_provided_data/mutations/germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']
mutations_df = mutations_df[['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']]
mutations_df = mutations_df[mutations_df['type'] == 'subs']

In [None]:
middle_exons_file = 'data/coordinates/genes_middle_exon_coords.bed.gz'

## Get exon coordinates
exons_coords_df = pd.read_csv(middle_exons_file, sep="\t", header=None, low_memory=False)
exons_coords_df.columns = ['chr', 'start', 'end', 'ensembl', 'symbol', 'strand']
exons_coords_df = exons_coords_df[['chr', 'start', 'end', 'ensembl']]
exons_coords_df['ensembl'] = exons_coords_df.apply(lambda x: remove_version_ensembl(x, 'ensembl'), axis=1)
exons_coords_df.columns = ['chr', 'start', 'end', 'ensembl']

In [None]:
results_dict = {'model': [], 'log_likelihood': [], 'n_param': []}

### 1-mer model

In [None]:
k = 1
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM.txt'

result = log_likelihood(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

### 1-mer with CpG

In [None]:
k = 3
middle_distance_threshold = 1000

signatures_file = 'results/CpG_signatures_DNM.txt'

result = log_likelihood_CpG(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold)
results_dict['model'].append('CpG')
results_dict['n_param'].append(18)
results_dict['log_likelihood'].append(result)

### 3-mer model

In [None]:
k = 3
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM.txt'

result = log_likelihood(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

### 5-mer model

In [None]:
k = 5
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM.txt'

result = log_likelihood(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

### 7-mer model

In [None]:
k = 7
h = 5
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_DNM_decomp.txt'

result = log_likelihood(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(((4**h)*3)*((k-h)*3)) # Linear increase form the core exponential increase
results_dict['log_likelihood'].append(result)

#### Save into file

In [None]:
results_df = pd.DataFrame(results_dict, columns=['model', 'log_likelihood', 'n_param'])
filename = 'model_all_k7h5.txt'

results_df.to_csv(path.join('results', filename), header=True, index=None, sep='\t')

### Evaluate models

#### Compute AIC

In [None]:
# Load model information
models_file = 'results/model_all_k7h5.txt'
models_df = pd.read_csv(models_file, sep="\t", header='infer')

In [None]:
# Compute AIC
models_df['AIC'] = models_df.apply(lambda x: compute_AIC(x['log_likelihood'], x['n_param']), axis=1)

In [None]:
models_df

## Goldmann, synonymous vs non-synonymous

In [28]:
## Get the mutations
mutations_file = 'non_provided_data/mutations/germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']
mutations_df = mutations_df[['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']]
mutations_df = mutations_df[mutations_df['type'] == 'subs']
mutations_df = mutations_df[mutations_df['sample'] == 'Goldmann2018']
mutations_df['ID'] = mutations_df.index + 1

consequence_type_file = 'data/consequence/consequence_ranking.tsv.bgz'
consequence_ranking_file = 'data/consequence/consequence_ranking_info.tsv'

## Get consequence tabix file
tb_consequence_type = tabix.open(consequence_type_file)

## Get the consequence rank info and classify separate syn from other types of non-syn (missense, non-sense,...)
consequence_rank_info = pd.read_csv(consequence_ranking_file, delimiter='\t')
consequence_rank_info['TYPE'] = consequence_rank_info.apply(lambda x:synonymous_or_not(x) ,1)
consequence_rank_info = consequence_rank_info[['CONSEQUENCE', 'TYPE']]
consequence_rank_dict = dict(zip(consequence_rank_info['CONSEQUENCE'], consequence_rank_info['TYPE']))

In [29]:
middle_exons_file = 'data/coordinates/genes_middle_exon_coords.bed.gz'

## Get exon coordinates
exons_coords_df = pd.read_csv(middle_exons_file, sep="\t", header=None, low_memory=False)
exons_coords_df.columns = ['chr', 'start', 'end', 'ensembl', 'symbol', 'strand']
exons_coords_df = exons_coords_df[['chr', 'start', 'end', 'ensembl']]
exons_coords_df['ensembl'] = exons_coords_df.apply(lambda x: remove_version_ensembl(x, 'ensembl'), axis=1)
exons_coords_df.columns = ['chr', 'start', 'end', 'ensembl']

In [30]:
results_dict = {'model': [], 'log_likelihood': [], 'n_param': []}

## SYNONYMOUS

In [31]:
include = 'synonymous'

### 1-mer model

In [32]:
k = 1
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_golden.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 2 times.
The category synonymous_variant is found 130 times.
The category initiator_codon_variant is found 1 times.
The category missense_variant is found 336 times.
The category stop_gained is found 25 times.


### 1-mer with CpG

In [33]:
k = 3
middle_distance_threshold = 1000

signatures_file = 'results/CpG_signatures_DNM_golden.txt'

result = log_likelihood_excluding_CpG(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append('CpG')
results_dict['n_param'].append(18)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 2 times.
The category synonymous_variant is found 130 times.
The category initiator_codon_variant is found 1 times.
The category missense_variant is found 336 times.
The category stop_gained is found 25 times.


### 3-mer model

In [34]:
k = 3
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_golden.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 2 times.
The category synonymous_variant is found 130 times.
The category initiator_codon_variant is found 1 times.
The category missense_variant is found 336 times.
The category stop_gained is found 25 times.


### 5-mer model

In [36]:
k = 5
h = 3
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_DNM_decomp_golden.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(((4**h)*3)*((k-h)*3)) # Linear increase form the core exponential increase
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 2 times.
The category synonymous_variant is found 130 times.
The category initiator_codon_variant is found 1 times.
The category missense_variant is found 336 times.
The category stop_gained is found 25 times.


### 7-mer model

In [37]:
k = 7
h = 5
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_DNM_decomp_golden.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(((4**h)*3)*((k-h)*3)) # Linear increase form the core exponential increase
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 2 times.
The category synonymous_variant is found 130 times.
The category initiator_codon_variant is found 1 times.
The category missense_variant is found 336 times.
The category stop_gained is found 25 times.


#### Save into file

In [38]:
results_df = pd.DataFrame(results_dict, columns=['model', 'log_likelihood', 'n_param'])
filename = 'model_golden_syn.txt'

results_df.to_csv(path.join('results', filename), header=True, index=None, sep='\t')

### Evaluate models

#### Compute AIC

In [39]:
# Load model information
models_file = 'results/model_golden_syn.txt'
models_df = pd.read_csv(models_file, sep="\t", header='infer')

In [40]:
# Compute AIC
models_df['AIC'] = models_df.apply(lambda x: compute_AIC(x['log_likelihood'], x['n_param']), axis=1)

In [42]:
models_df

Unnamed: 0,model,log_likelihood,n_param,AIC
0,1mer,-65704.130856,12,131432.261713
1,CpG,-64306.21885,18,128648.437699
2,3mer,-64036.529684,192,128457.059368
3,5mer,-63795.550749,3072,133735.101498
4,7mer,-63725.577705,49152,225755.155409


## NON_SYNONYMOUS

In [45]:
include = 'non_synonymous'

### 1-mer model

In [46]:
k = 1
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_golden.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 2 times.
The category synonymous_variant is found 130 times.
The category initiator_codon_variant is found 1 times.
The category missense_variant is found 336 times.
The category stop_gained is found 25 times.


### 1-mer with CpG

In [47]:
k = 3
middle_distance_threshold = 1000

signatures_file = 'results/CpG_signatures_DNM_golden.txt'

result = log_likelihood_excluding_CpG(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append('CpG')
results_dict['n_param'].append(18)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 2 times.
The category synonymous_variant is found 130 times.
The category initiator_codon_variant is found 1 times.
The category missense_variant is found 336 times.
The category stop_gained is found 25 times.


### 3-mer model

In [48]:
k = 3
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM_golden.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 2 times.
The category synonymous_variant is found 130 times.
The category initiator_codon_variant is found 1 times.
The category missense_variant is found 336 times.
The category stop_gained is found 25 times.


### 5-mer model

In [49]:
k = 5
h = 3
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_DNM_decomp_golden.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(((4**h)*3)*((k-h)*3)) # Linear increase form the core exponential increase
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 2 times.
The category synonymous_variant is found 130 times.
The category initiator_codon_variant is found 1 times.
The category missense_variant is found 336 times.
The category stop_gained is found 25 times.


### 7-mer model

In [50]:
k = 7
h = 3
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_DNM_decomp_golden.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(((4**h)*3)*((k-h)*3)) # Linear increase form the core exponential increase
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 2 times.
The category synonymous_variant is found 130 times.
The category initiator_codon_variant is found 1 times.
The category missense_variant is found 336 times.
The category stop_gained is found 25 times.


#### Save into file

In [51]:
results_df = pd.DataFrame(results_dict, columns=['model', 'log_likelihood', 'n_param'])
filename = 'model_golden_nonsyn.txt'

results_df.to_csv(path.join('results', filename), header=True, index=None, sep='\t')

### Evaluate models

#### Compute AIC

In [52]:
# Load model information
models_file = 'results/model_golden_nonsyn.txt'
models_df = pd.read_csv(models_file, sep="\t", header='infer')

In [53]:
# Compute AIC
models_df['AIC'] = models_df.apply(lambda x: compute_AIC(x['log_likelihood'], x['n_param']), axis=1)

In [54]:
models_df

Unnamed: 0,model,log_likelihood,n_param,AIC
0,1mer,-65704.130856,12,131432.261713
1,CpG,-64306.21885,18,128648.437699
2,3mer,-64036.529684,192,128457.059368
3,5mer,-63795.550749,3072,133735.101498
4,7mer,-63725.577705,49152,225755.155409
5,1mer,-67336.345083,12,134696.690167
6,CpG,-65870.714371,18,131777.428742
7,3mer,-65592.225899,192,131568.451797
8,5mer,-65332.703955,3072,136809.40791
9,7mer,-65256.830314,49152,228817.660627


## All datasets, synonymous vs non-synonymous

In [55]:
## Get the mutations
mutations_file = 'non_provided_data/mutations/germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_file, sep="\t", header=None)
mutations_df.columns = ['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type', 'class']
mutations_df = mutations_df[['chr', 'start', 'end', 'ref', 'alt', 'sample', 'type']]
mutations_df = mutations_df[mutations_df['type'] == 'subs']
mutations_df['ID'] = mutations_df.index + 1

consequence_type_file = 'data/consequence/consequence_ranking.tsv.bgz'
consequence_ranking_file = 'data/consequence/consequence_ranking_info.tsv'

## Get consequence tabix file
tb_consequence_type = tabix.open(consequence_type_file)

## Get the consequence rank info and classify separate syn from other types of non-syn (missense, non-sense,...)
consequence_rank_info = pd.read_csv(consequence_ranking_file, delimiter='\t')
consequence_rank_info['TYPE'] = consequence_rank_info.apply(lambda x:synonymous_or_not(x) ,1)
consequence_rank_info = consequence_rank_info[['CONSEQUENCE', 'TYPE']]
consequence_rank_dict = dict(zip(consequence_rank_info['CONSEQUENCE'], consequence_rank_info['TYPE']))

In [56]:
middle_exons_file = 'data/coordinates/genes_middle_exon_coords.bed.gz'

## Get exon coordinates
exons_coords_df = pd.read_csv(middle_exons_file, sep="\t", header=None, low_memory=False)
exons_coords_df.columns = ['chr', 'start', 'end', 'ensembl', 'symbol', 'strand']
exons_coords_df = exons_coords_df[['chr', 'start', 'end', 'ensembl']]
exons_coords_df['ensembl'] = exons_coords_df.apply(lambda x: remove_version_ensembl(x, 'ensembl'), axis=1)
exons_coords_df.columns = ['chr', 'start', 'end', 'ensembl']

In [57]:
results_dict = {'model': [], 'log_likelihood': [], 'n_param': []}

## SYNONYMOUS

In [58]:
include = 'synonymous'

### 1-mer model

In [59]:
k = 1
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 58 times.
The category None is found 10 times.
The category 5 is found 2 times.
The category synonymous_variant is found 1184 times.
The category downstream_gene_variant is found 2 times.
The category initiator_codon_variant is found 6 times.
The category splice_acceptor_variant is found 1 times.
The category missense_variant is found 3280 times.
The category upstream_gene_variant is found 8 times.
The category stop_gained is found 202 times.
The category intron_variant is found 1 times.
The category splice_donor_variant is found 3 times.


### 1-mer with CpG

In [60]:
k = 3
middle_distance_threshold = 1000

signatures_file = 'results/CpG_signatures_DNM.txt'

result = log_likelihood_excluding_CpG(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append('CpG')
results_dict['n_param'].append(18)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 58 times.
The category None is found 10 times.
The category 5 is found 2 times.
The category synonymous_variant is found 1184 times.
The category downstream_gene_variant is found 2 times.
The category initiator_codon_variant is found 6 times.
The category splice_acceptor_variant is found 1 times.
The category missense_variant is found 3280 times.
The category upstream_gene_variant is found 8 times.
The category stop_gained is found 202 times.
The category intron_variant is found 1 times.
The category splice_donor_variant is found 3 times.


### 3-mer model

In [61]:
k = 3
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 58 times.
The category None is found 10 times.
The category 5 is found 2 times.
The category synonymous_variant is found 1184 times.
The category downstream_gene_variant is found 2 times.
The category initiator_codon_variant is found 6 times.
The category splice_acceptor_variant is found 1 times.
The category missense_variant is found 3280 times.
The category upstream_gene_variant is found 8 times.
The category stop_gained is found 202 times.
The category intron_variant is found 1 times.
The category splice_donor_variant is found 3 times.


### 5-mer model

In [62]:
k = 5
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 58 times.
The category None is found 10 times.
The category 5 is found 2 times.
The category synonymous_variant is found 1184 times.
The category downstream_gene_variant is found 2 times.
The category initiator_codon_variant is found 6 times.
The category splice_acceptor_variant is found 1 times.
The category missense_variant is found 3280 times.
The category upstream_gene_variant is found 8 times.
The category stop_gained is found 202 times.
The category intron_variant is found 1 times.
The category splice_donor_variant is found 3 times.


### 7-mer model

In [63]:
k = 7
h = 5
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_DNM_decomp.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(((4**h)*3)*((k-h)*3)) # Linear increase form the core exponential increase
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 58 times.
The category None is found 10 times.
The category 5 is found 2 times.
The category synonymous_variant is found 1184 times.
The category downstream_gene_variant is found 2 times.
The category initiator_codon_variant is found 6 times.
The category splice_acceptor_variant is found 1 times.
The category missense_variant is found 3280 times.
The category upstream_gene_variant is found 8 times.
The category stop_gained is found 202 times.
The category intron_variant is found 1 times.
The category splice_donor_variant is found 3 times.


#### Save into file

In [64]:
results_df = pd.DataFrame(results_dict, columns=['model', 'log_likelihood', 'n_param'])
filename = 'model_syn.txt'

results_df.to_csv(path.join('results', filename), header=True, index=None, sep='\t')

### Evaluate models

#### Compute AIC

In [65]:
# Load model information
models_file = 'results/model_syn.txt'
models_df = pd.read_csv(models_file, sep="\t", header='infer')

In [66]:
# Compute AIC
models_df['AIC'] = models_df.apply(lambda x: compute_AIC(x['log_likelihood'], x['n_param']), axis=1)

In [67]:
models_df

Unnamed: 0,model,log_likelihood,n_param,AIC
0,1mer,-483499.62229,12,967023.2
1,CpG,-468484.357064,18,937004.7
2,3mer,-466394.156015,192,933172.3
3,5mer,-464468.425375,3072,935080.9
4,7mer,-462878.93421,49152,1024062.0


## NON_SYNONYMOUS

In [68]:
include = 'non_synonymous'

### 1-mer model

In [69]:
k = 1
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 58 times.
The category None is found 10 times.
The category 5 is found 2 times.
The category synonymous_variant is found 1184 times.
The category downstream_gene_variant is found 2 times.
The category initiator_codon_variant is found 6 times.
The category splice_acceptor_variant is found 1 times.
The category missense_variant is found 3280 times.
The category upstream_gene_variant is found 8 times.
The category stop_gained is found 202 times.
The category intron_variant is found 1 times.
The category splice_donor_variant is found 3 times.


### 1-mer with CpG

In [70]:
k = 3
middle_distance_threshold = 1000

signatures_file = 'results/CpG_signatures_DNM.txt'

result = log_likelihood_excluding_CpG(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append('CpG')
results_dict['n_param'].append(18)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 58 times.
The category None is found 10 times.
The category 5 is found 2 times.
The category synonymous_variant is found 1184 times.
The category downstream_gene_variant is found 2 times.
The category initiator_codon_variant is found 6 times.
The category splice_acceptor_variant is found 1 times.
The category missense_variant is found 3280 times.
The category upstream_gene_variant is found 8 times.
The category stop_gained is found 202 times.
The category intron_variant is found 1 times.
The category splice_donor_variant is found 3 times.


### 3-mer model

In [71]:
k = 3
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 58 times.
The category None is found 10 times.
The category 5 is found 2 times.
The category synonymous_variant is found 1184 times.
The category downstream_gene_variant is found 2 times.
The category initiator_codon_variant is found 6 times.
The category splice_acceptor_variant is found 1 times.
The category missense_variant is found 3280 times.
The category upstream_gene_variant is found 8 times.
The category stop_gained is found 202 times.
The category intron_variant is found 1 times.
The category splice_donor_variant is found 3 times.


### 5-mer model

In [72]:
k = 5
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_signatures_DNM.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(4**k*3)
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 58 times.
The category None is found 10 times.
The category 5 is found 2 times.
The category synonymous_variant is found 1184 times.
The category downstream_gene_variant is found 2 times.
The category initiator_codon_variant is found 6 times.
The category splice_acceptor_variant is found 1 times.
The category missense_variant is found 3280 times.
The category upstream_gene_variant is found 8 times.
The category stop_gained is found 202 times.
The category intron_variant is found 1 times.
The category splice_donor_variant is found 3 times.


### 7-mer model

In [73]:
k = 7
h = 5
middle_distance_threshold = 1000

signatures_file = 'results/' + str(k) + 'mer_DNM_decomp.txt'

result = log_likelihood_excluding(k, mutations_df, exons_coords_df, signatures_file, middle_distance_threshold, include)
results_dict['model'].append(str(k) + 'mer')
results_dict['n_param'].append(((4**h)*3)*((k-h)*3)) # Linear increase form the core exponential increase
results_dict['log_likelihood'].append(result)

The category splice_region_variant is found 58 times.
The category None is found 10 times.
The category 5 is found 2 times.
The category synonymous_variant is found 1184 times.
The category downstream_gene_variant is found 2 times.
The category initiator_codon_variant is found 6 times.
The category splice_acceptor_variant is found 1 times.
The category missense_variant is found 3280 times.
The category upstream_gene_variant is found 8 times.
The category stop_gained is found 202 times.
The category intron_variant is found 1 times.
The category splice_donor_variant is found 3 times.


#### Save into file

In [74]:
results_df = pd.DataFrame(results_dict, columns=['model', 'log_likelihood', 'n_param'])
filename = 'model_nonsyn.txt'

results_df.to_csv(path.join('results', filename), header=True, index=None, sep='\t')

### Evaluate models

#### Compute AIC

In [75]:
# Load model information
models_file = 'results/model_nonsyn.txt'
models_df = pd.read_csv(models_file, sep="\t", header='infer')

In [76]:
# Compute AIC
models_df['AIC'] = models_df.apply(lambda x: compute_AIC(x['log_likelihood'], x['n_param']), axis=1)

In [77]:
models_df

Unnamed: 0,model,log_likelihood,n_param,AIC
0,1mer,-483499.62229,12,967023.2
1,CpG,-468484.357064,18,937004.7
2,3mer,-466394.156015,192,933172.3
3,5mer,-464468.425375,3072,935080.9
4,7mer,-462878.93421,49152,1024062.0
5,1mer,-495582.659663,12,991189.3
6,CpG,-479643.425319,18,959322.9
7,3mer,-477516.727115,192,955417.5
8,5mer,-475512.667354,3072,957169.3
9,7mer,-473870.16053,49152,1046044.0
