# Gene and sample analysis

This notebook performs the per-gene and per-sample precomputations needed for the regression of H3K36me3 and nucleosomes (Figure 4 and Supplementary Figure S6).

---

## Output

Two tab-separated files: *germinal_gene_dependent_results.txt* and *germinal_sample_dependent_results.txt*.

The first file contains information at the gene level: number of bp in exons and introns, mean replication time, number of synonymous and non-synonymous variants, and the summed probabilities of exon, intron, synonymous and non-synonymous mutations occurring.

The second file contains information at the sample level: number of observed mutations in exons and introns, and number of observed synonymous and non-synonymous mutations.

In [1]:
from os import path

import bgdata
import tabix  # package pytabix
import numpy as np
import pandas as pd
import pybedtools
from collections import Counter
from bgreference import hg19

## Input

Files in the **data** directory.

- *middle_exons_coords*: file with the coordinates of the middle exons
- *consequence_type_file*: tabix file with the consequence types of exonic mutations
- *replication_time_path*: tab separated file containing information about replication time (processed but not used in our analysis unlike Frigola et al. 2017).

### Other inputs

- mutations_folder: base directory where the files with the mutations are located
- tumor_type: id of the tumor (germinal in our case)

In [2]:
# Tabix file with the consequence types of exonic mutations (opened with tabix.open further down)
consequence_type_file = 'data/consequence/consequence_ranking.tsv.bgz'

# Base directory of the mutation files (not shipped with the notebook)
mutations_folder = 'non_provided_data/mutations/'
# Directory with the filtered exon/intron coordinates produced by another notebook
coordinates_folder = 'data/coordinates/'
# Tab separated file with the replication time (not shipped with the notebook)
replication_time_path = 'non_provided_data/replication_timing.txt'

## Functions

All the functions needed for this notebook are coded below

In [3]:
def remove_version_ensembl(x, colname):
    """
    Strip the version suffix from an Ensembl identifier

    Args:
        x: dataframe row (any object that can be indexed with ``colname``)
        colname: name of the column that holds the identifier

    Returns:
        str. The part of the identifier that precedes the first '.'
        (the whole identifier when it has no '.')

    """
    unversioned_id, _, _ = x[colname].partition('.')

    return unversioned_id

In [4]:
def synonymous_or_not(x):
    """
    Label a consequence type as synonymous or non synonymous
    according to its RANK value

    Args:
        x: dataframe row with a *RANK* field

    Returns:
        str. *non_synonymous* when the rank is below 15, *synonymous* otherwise

    """
    return 'non_synonymous' if x['RANK'] < 15 else 'synonymous'

In [5]:
def parse_rep_time_data(replication_time_path, whole_gene_coords):
    """
    Average, for every gene, the replication time of the positions that overlap its regions

    Args:
        replication_time_path (str): path to a headerless tabulated file with 3 columns
            (chromosome without the 'chr' prefix, position and replication time)
        whole_gene_coords (:class:`~pandas.DataFrame`): genomic coordinates table
            (chromosome, start, end and gene identifier, in that order)

    Returns:
        :class:`~pandas.DataFrame`. Indexed by gene, with a single *rep_time* column holding the mean
        replication time. Rows are sorted from highest to lowest value and non finite means are removed.

    """
    # Raw table -> named columns, 'chr'-prefixed chromosomes and a BED-like start (position - 1)
    rep_time_table = pd.read_csv(replication_time_path, sep='\t', header=None)
    rep_time_table.columns = ['chr', 'position', 'rep_time']
    rep_time_table['chr'] = 'chr' + rep_time_table['chr'].astype(str)
    rep_time_table['position'] = rep_time_table['position'].astype(np.int64)
    rep_time_table['start'] = rep_time_table['position'] - 1

    positions_bed = pybedtools.BedTool.from_dataframe(rep_time_table[['chr', 'start', 'position']])
    genes_bed = pybedtools.BedTool.from_dataframe(whole_gene_coords)

    # Positions with a replication time that fall inside any of the gene regions
    overlaps_bed = genes_bed.intersect(positions_bed)
    overlaps = pd.read_table(overlaps_bed.fn, names=['chr', 'start', 'position', 'gene'])

    # Bring the replication time back (the intersection only keeps the coordinates and the gene)
    timed_positions = pd.merge(overlaps[['chr', 'position', 'gene']], rep_time_table, on=['chr', 'position'])

    gene_means = timed_positions[['gene', 'rep_time']].groupby('gene').mean()
    gene_means = gene_means.sort_values(by='rep_time', ascending=False)
    gene_means = gene_means[np.isfinite(gene_means['rep_time'])]

    # Remove the temporary files created by pybedtools
    for bed in (genes_bed, positions_bed, overlaps_bed):
        bed.delete_temporary_history(ask=False)

    return gene_means

In [6]:
def _add_region_counts(counter, regions_df, gene, nucleotides):
    """Add to ``counter`` the (reference triplet, alternate triplet) counts of all the regions of ``gene``."""
    gene_regions = regions_df[regions_df['ensembl'] == gene].values.tolist()

    for region in gene_regions:

        chrom = region[0][3:]  # chromosome name without the 'chr' prefix
        start = int(region[1])
        end = int(region[2])

        # Sequence of the region; only its length is used, to know how many triplets to read
        region_bases = hg19(chrom, start + 1, end - start)

        for offset in range(len(region_bases)):

            # Triplet whose central base is the base at index ``offset`` of the region
            signature = hg19(chrom, start + offset, 3)

            ref_base = signature[1].upper()
            tri_ref = signature[0] + ref_base + signature[-1]

            for alt_base in nucleotides - set(ref_base):
                tri_alt = signature[0] + alt_base + signature[-1]
                counter[(tri_ref, tri_alt)] += 1


def precompute_counts(my_genes_list, exons_coords_symbol_df, introns_coords_symbol_df, nucleotides):
    """
    For each gene, count all trinucleotides seen. Set a counter where the key is the
    combination of the reference triplet and each of the possible alterations
    (taken from the remaining nucleotides).
    As the count is only done for the reference, each combination of reference triplet
    and possible alteration has the same count as the reference triplet.

    .. note::

       Every base of a region is used as the central base of a triplet (assuming that
       ``hg19(chrom, pos, n)`` returns ``n`` bases starting at ``pos``), so the flanking base of
       the first and of the last triplet lies just outside the region.

    .. note::

       Only the central base is upper-cased; the flanking bases are used as returned by ``hg19``.

    Args:
        my_genes_list (list): list of gene Ensembl identifiers
        exons_coords_symbol_df (:class:`~pandas.DataFrame`): table with the genomic coordinates of the exons
            (chromosome with 'chr' prefix, start, end and an *ensembl* column)
        introns_coords_symbol_df (:class:`~pandas.DataFrame`): table with the genomic coordinates of the introns
            (same layout as the exons table)
        nucleotides (set): nucleotides to consider (typically A, C, G and T)

    Returns:
        dict. For each gene two entries, keyed ``(gene, 'exons_count')`` and ``(gene, 'introns_count')``,
        each holding a :class:`~collections.Counter` of ``(reference triplet, alternate triplet)`` pairs.
        When a ``ValueError`` is raised while processing a gene, a message is printed and the counts
        gathered so far for that gene are kept.

    """

    counters_dict = dict()

    for my_gene in my_genes_list:

        introns_counter = Counter()
        exons_counter = Counter()

        # Exons
        try:
            _add_region_counts(exons_counter, exons_coords_symbol_df, my_gene, nucleotides)

        except ValueError:
            print("problem with exons from gene: ", my_gene)

        # Introns
        try:
            _add_region_counts(introns_counter, introns_coords_symbol_df, my_gene, nucleotides)

        except ValueError:
            print("problem with introns from gene: ", my_gene)

        counters_dict[(my_gene, 'exons_count')] = exons_counter
        counters_dict[(my_gene, 'introns_count')] = introns_counter

    return counters_dict

In [53]:
def _read_region_hits(intersection_bed):
    """Parse the output of ``intersect(..., wao=True)`` keeping only the regions hit by a mutation."""
    hits_df = pd.read_table(intersection_bed.fn,
                            names=['range_chr', 'range_start', 'range_end', 'ensembl', 'mut_chr',
                                   'mut_start', 'mut_end', 'mut_sample', 'mut_ID', 'overlap_bp'],
                            sep=r"\s+", index_col=False)

    # With wao=True the regions without any mutation are also reported, with 0 overlapping bp
    hits_df = hits_df[hits_df['overlap_bp'] != 0]
    hits_df = hits_df[['mut_chr', 'mut_start', 'mut_end', 'ensembl', 'mut_sample', 'mut_ID']]
    hits_df.columns = ['chr', 'start', 'end', 'ensembl', 'sample', 'ID']

    return hits_df


def get_muts_per_gene(mutations_df, exons_coords_df, introns_coords_df):
    """
    Intersects the mutations and the coordinates file getting mutations in introns and exons

    The input tables are not modified: the renaming of the columns needed for the intersection
    is done on copies.

    Args:
        mutations_df (:class:`~pandas.DataFrame`): table with the mutations
            (*CHROMOSOME*, *POSITION*, *SAMPLE* and *ID* columns are used)
        exons_coords_df (:class:`~pandas.DataFrame`): table with the genomic coordinates of the exons
            (exactly 4 columns: chromosome, start, end and Ensembl identifier)
        introns_coords_df (:class:`~pandas.DataFrame`): table with the genomic coordinates of the introns
            (same layout as the exons table)

    Returns:
        :obj:`tuple` of :class:`~pandas.DataFrame`. One DataFrame with the mutations that fall in any of the exons
        and the other with the ones falling in introns. Both have the columns
        *chr*, *start*, *end*, *ensembl*, *sample* and *ID*.

    """

    # BED-like table of the mutations (start = position - 1, for pybedtools)
    sub_mutations_df = pd.DataFrame({
        'chr': mutations_df['CHROMOSOME'].astype(str),
        'start': mutations_df['POSITION'] - 1,
        'end': mutations_df['POSITION'],
        'sample': mutations_df['SAMPLE'],
        'ID': mutations_df['ID'],
    })

    # Work on copies so the caller's tables keep their own column names
    sub_exons_coords_df = exons_coords_df.copy()
    sub_exons_coords_df.columns = ['chr', 'start', 'end', 'ensembl']
    sub_introns_coords_df = introns_coords_df.copy()
    sub_introns_coords_df.columns = ['chr', 'start', 'end', 'ensembl']

    temporary_beds = []

    try:
        exons_bed = pybedtools.BedTool.from_dataframe(sub_exons_coords_df)
        temporary_beds.append(exons_bed)
        introns_bed = pybedtools.BedTool.from_dataframe(sub_introns_coords_df)
        temporary_beds.append(introns_bed)
        sub_mutations_bed = pybedtools.BedTool.from_dataframe(sub_mutations_df)
        temporary_beds.append(sub_mutations_bed)

        mutations_in_exons_bed = exons_bed.intersect(sub_mutations_bed, wao=True)
        temporary_beds.append(mutations_in_exons_bed)
        mutations_in_introns_bed = introns_bed.intersect(sub_mutations_bed, wao=True)
        temporary_beds.append(mutations_in_introns_bed)

        mutations_in_exons_symbol_df = _read_region_hits(mutations_in_exons_bed)
        mutations_in_introns_symbol_df = _read_region_hits(mutations_in_introns_bed)

    finally:
        # Temporary files are removed even when the intersection or the parsing fails
        for bed in temporary_beds:
            bed.delete_temporary_history(ask=False)

    return mutations_in_exons_symbol_df, mutations_in_introns_symbol_df

In [8]:
def build_my_positions_result2(genes_list, exons_coords_symbol_df, tb_consequence_type, CONSEQUENCE_RANK):
    """
    For a list of genes, get the consequence type associated with each combination of reference and
    alternate triplets.

    Args:
        genes_list (list): list of genes Ensembl identifiers
        exons_coords_symbol_df (:class:`~pandas.DataFrame`): table with the exons genomic coordinates
        tb_consequence_type: tabix with the consequence types
        CONSEQUENCE_RANK (:class:`~pandas.DataFrame`): table that maps each consequence type with its type
        (synonymous or non-synonymous)

    Returns:
        :class:`~pandas.DataFrame`. Table with the following fiels:
        chromosome, position, ref, alt, ensembl ID, consequence type, type of the consequence,
        reference triplet, alternate triplet

    """
    my_position_results = list()

    for my_gene in genes_list:

        try:
            exons_gene = exons_coords_symbol_df[exons_coords_symbol_df['ensembl'] == my_gene]
            exons_gene_lol = exons_gene.values.tolist()

            for my_exon in exons_gene_lol:

                my_chr = my_exon[0][3:]
                my_start = int(my_exon[1])
                my_end = int(my_exon[2])

                res_exons = tb_consequence_type.querys("{}:{}-{}".format(my_chr, my_start, my_end))

                for my_result in res_exons:
                    if len(my_result) < 6:
                        continue

                    my_result[5] = my_result[5].rstrip('\r')

                    signature = hg19(my_result[0], int(my_result[1])-1, 3)

                    tri_ref = signature[0] + my_result[2] + signature[-1]
                    tri_alt = signature[0] + my_result[3] + signature[-1]

                    my_result.append(tri_ref)
                    my_result.append(tri_alt)
                    my_result.append(my_gene)

                    my_position_results.append(my_result)

        except ValueError:
            print("problem with gene: ", my_gene)

    my_position_results_df = pd.DataFrame(my_position_results)
    my_position_results_df.columns = ['CHROMOSOME', 'POSITION', 'REF', 'ALT', 'ENSEMBL_NOVER', 'CONSEQUENCE', 'TRI_REF',
                                      'TRI_ALT', 'ENSEMBL']

    my_position_results_df = my_position_results_df.drop('ENSEMBL_NOVER', 1)
    my_position_results_df = my_position_results_df[
        ['CHROMOSOME', 'POSITION', 'REF', 'ALT', 'ENSEMBL', 'CONSEQUENCE', 'TRI_REF',
         'TRI_ALT']]

    my_position_results_df = pd.merge(my_position_results_df, CONSEQUENCE_RANK, on='CONSEQUENCE')
    my_position_results_df['POSITION'] = my_position_results_df['POSITION'].astype(int)
    my_position_results_df['CHROMOSOME'] = my_position_results_df['CHROMOSOME'].astype(str)

    return my_position_results_df

In [9]:
def compute_syn_nonsyn_muts2(genes_list, mutations_in_exons_symbol_df, tb_consequence_type, mutations_df, CONSEQUENCE_RANK):
    """
    For each gene in the list, get the consequence type associated with each combination of reference and
    alternate triplets at the mutated positions and merge it with the mutations.

    Args:
        genes_list (list): list of genes Ensembl identifiers
        mutations_in_exons_symbol_df (:class:`~pandas.DataFrame`): table with the mutations that fall in exons
            (chromosome with 'chr' prefix, start, end and an *ensembl* column)
        tb_consequence_type: tabix with the consequence types. Each record is expected to hold
            chromosome, position, ref, alt, Ensembl identifier and consequence type
        mutations_df (:class:`~pandas.DataFrame`): table with the mutations
            (*CHROMOSOME*, *POSITION*, *REF* and *ALT* columns are used for merging)
        CONSEQUENCE_RANK (:class:`~pandas.DataFrame`): table that maps each consequence type with its type
            (synonymous or non-synonymous)

    Returns:
        :class:`~pandas.DataFrame`. One row per mutation of ``mutations_df`` that matches (by chromosome,
        position, ref and alt) a consequence record of one of the genes. Columns present in both
        ``CONSEQUENCE_RANK`` and ``mutations_df`` get the pandas *_x* / *_y* suffixes.
        The table is empty when no record is found.

    """
    raw_columns = ['CHROMOSOME', 'POSITION', 'REF', 'ALT', 'ENSEMBL_ONLY',
                   'CONSEQUENCE', 'TRI_REF', 'TRI_ALT', 'ENSEMBL']

    # Group the exonic mutations once instead of scanning the whole table for every gene
    mutations_by_gene = {
        gene: gene_mutations.values.tolist()
        for gene, gene_mutations in mutations_in_exons_symbol_df.groupby('ensembl', sort=False)
    }

    my_position_results = list()

    for symbol_gene in genes_list:

        # For every exonic mutation in every gene, know if it is synonymous or nonsynonymous
        for gene_mutation in mutations_by_gene.get(symbol_gene, []):

            my_chr = gene_mutation[0][3:]  # chromosome name without the 'chr' prefix
            my_end = int(gene_mutation[2])

            # mutations are expected to have only 1 position so start and end are equal
            res_exons = tb_consequence_type.querys("{}:{}-{}".format(my_chr, my_end, my_end))

            for my_result in res_exons:
                # Skip truncated records (same guard as build_my_positions_result2)
                if len(my_result) < 6:
                    continue

                my_result[5] = my_result[5].rstrip('\r')

                # Triplet centred on the mutated position
                signature = hg19(my_result[0], int(my_result[1])-1, 3)

                tri_ref = signature[0] + my_result[2] + signature[-1]
                tri_alt = signature[0] + my_result[3] + signature[-1]

                my_result.append(tri_ref)
                my_result.append(tri_alt)
                my_result.append(symbol_gene)

                my_position_results.append(my_result)

    # Naming the columns in the constructor also works when no record was collected
    my_position_results_df = pd.DataFrame(my_position_results, columns=raw_columns)

    my_position_results_df = my_position_results_df.drop(columns='ENSEMBL_ONLY')

    my_position_results_df = pd.merge(my_position_results_df, CONSEQUENCE_RANK, on='CONSEQUENCE')
    my_position_results_df['POSITION'] = my_position_results_df['POSITION'].astype(int)
    my_position_results_df['CHROMOSOME'] = my_position_results_df['CHROMOSOME'].astype(str)
    # Several mutations at the same position yield the same consequence records
    my_position_results_df = my_position_results_df.drop_duplicates()
    # The mutations table uses 'chr'-prefixed chromosomes
    my_position_results_df['CHROMOSOME'] = 'chr' + my_position_results_df['CHROMOSOME'].astype(str)

    mutations_in_exons = pd.merge(my_position_results_df, mutations_df, on=['CHROMOSOME', 'POSITION', 'REF', 'ALT'])

    return mutations_in_exons

In [64]:
def compute_results_per_gene(sub_mutations_in_exons, my_gene, sub_exon, sub_intron, my_sample):
    """
    Count, for one gene, the observed synonymous and non-synonymous mutations
    and the observed mutations in exons and introns.

    Args:
        sub_mutations_in_exons (:class:`~pandas.DataFrame`): table with the exonic mutations and their type
            (*ENSEMBL* column with the gene and *TYPE_x* column with synonymous / non_synonymous)
        my_gene (str): Ensembl identifier
        sub_exon (:class:`~pandas.DataFrame`): table with the mutations in exons (*ensembl* column with the gene)
        sub_intron (:class:`~pandas.DataFrame`): table with the mutations in introns (*ensembl* column with the gene)
        my_sample (str): sample identifier, copied as is to the output

    Returns:
        list. Gene identifier, number of mutations in exons and in introns, number of synonymous and
        non-synonymous mutations and sample identifier.

    """
    in_gene = sub_mutations_in_exons['ENSEMBL'] == my_gene
    gene_consequences = sub_mutations_in_exons.loc[in_gene, 'TYPE_x']

    synonymous_count = int((gene_consequences == 'synonymous').sum())
    non_synonymous_count = int((gene_consequences == 'non_synonymous').sum())

    exon_count = int((sub_exon['ensembl'] == my_gene).sum())
    intron_count = int((sub_intron['ensembl'] == my_gene).sum())

    return [my_gene, exon_count, intron_count, synonymous_count, non_synonymous_count, my_sample]

## By gene analysis

In [19]:
nucleotides = set(['A', 'T', 'C', 'G'])

# Read common files
## Load consequences
tb_consequence_type = tabix.open(consequence_type_file)

## Obtain consequence rank
# NOTE(review): hard-coded location inside the local bgdata cache -- confirm it exists on the machine running this
CONSEQUENCE_RANK = pd.read_csv(path.join('~/.bgdata/intogen/ensembl/1.0-20150729', 'consequence_ranking.tsv'), delimiter='\t')

## Classify each consequence according to the rank
CONSEQUENCE_RANK['TYPE'] = CONSEQUENCE_RANK.apply(synonymous_or_not, axis=1)
CONSEQUENCE_RANK = CONSEQUENCE_RANK[['CONSEQUENCE', 'TYPE']]

## Load signatures
signatures_path = path.join('results', '3mer_signatures_DNM_new.txt')
all_signatures = pd.read_csv(signatures_path, sep='\t')

## Load mutations
mutations_path = mutations_folder + 'germinal_ultimate_dataset.bed.gz'
mutations_df = pd.read_csv(mutations_path, sep="\t", header=None)
mutations_df.columns = ['CHROMOSOME', 'START', 'POSITION', 'REF', 'ALT', 'SAMPLE', 'TYPE', 'CLASS']
# Keep substitutions only. The explicit copy makes the column assignment below act on an
# independent table instead of on a slice of the full one (avoids SettingWithCopyWarning)
mutations_df = mutations_df[mutations_df['TYPE'] == 'subs'].copy()
# Identifier of each mutation: its row number (1-based) in the input file
mutations_df['ID'] = mutations_df.index + 1

## Load coordinates
introns_coords = path.join(coordinates_folder, 'germinal' + '_filtered_introns_coords.txt') # Output from other notebook
middle_exons_coords = path.join(coordinates_folder, 'germinal' + '_filtered_exons_coords.txt') # Output from other notebook
exons_coords_df = pd.read_csv(middle_exons_coords, sep="\t", low_memory=False)
introns_coords_df = pd.read_csv(introns_coords, sep="\t", low_memory=False)
exons_coords_df['ensembl'] = exons_coords_df.apply(lambda x: remove_version_ensembl(x, 'ensembl'), axis=1)
introns_coords_df['ensembl'] = introns_coords_df.apply(lambda x: remove_version_ensembl(x, 'ensembl'), axis=1)
# Same tables with the 'position' column named 'end'; rename() returns new tables,
# so exons_coords_df and introns_coords_df keep their original column names
exons_coords_symbol_df = exons_coords_df[['chr', 'start', 'position', 'ensembl']].rename(columns={'position': 'end'})
introns_coords_symbol_df = introns_coords_df[['chr', 'start', 'position', 'ensembl']].rename(columns={'position': 'end'})

## Find mean replication time for all genes
whole_gene_coords = pd.concat([exons_coords_symbol_df, introns_coords_symbol_df])
rep_time_mean_per_gene_df = parse_rep_time_data(replication_time_path, whole_gene_coords)

In [None]:
# Gene by gene
my_genes_list = list(set(exons_coords_df['ensembl'].tolist()))
print('{} genes'.format(len(my_genes_list)))

# For each gene, get the counts of each trinucleotide. Useful for expected exon-intron muts.
# note that the counts are returned for pairs ref-alt triplets, where all possible alt have the same counts.
# This is done to make our life easier later on when doing some merging.
counters_dict = precompute_counts(my_genes_list, exons_coords_symbol_df, introns_coords_symbol_df, nucleotides)

# Assign syn/non-syn to all possible alterations in all exonic coordinates
gene_position_results_df = build_my_positions_result2(my_genes_list,
                                                      exons_coords_symbol_df,
                                                      tb_consequence_type,
                                                      CONSEQUENCE_RANK)

# Choose signature (useless, only one signature)
probability_name = 'Probability_germline'
prob_name = probability_name
sub_signatures_df = all_signatures[['mutation', probability_name]]


def _context_key_counts(variants_df):
    """
    Count how many variants share each (reference triplet, alternate triplet) pair

    Args:
        variants_df (:class:`~pandas.DataFrame`): table with *TRI_REF* and *TRI_ALT* columns

    Returns:
        :class:`~pandas.Series`. Counts indexed by the text form of the pair, e.g. "('ACG', 'AAG')"

    """
    context_keys = pd.Series(list(zip(variants_df['TRI_REF'], variants_df['TRI_ALT'])), dtype=object)

    return context_keys.astype(str).value_counts()


def _signature_weighted_sum(mutation_keys, counts, signatures_df, probability_column):
    """
    Sum over the mutation types of (number of sites) x (probability in the signature)

    Args:
        mutation_keys: mutation identifiers, written as in the *mutation* column of ``signatures_df``
        counts: number of sites of each mutation identifier
        signatures_df (:class:`~pandas.DataFrame`): table with a *mutation* column and the probability column
        probability_column (str): name of the column with the probabilities

    Returns:
        Sum of the products (0 when there is nothing to count). Mutation types missing
        in the signatures do not contribute.

    """
    counts_df = pd.DataFrame({'mutation': list(mutation_keys), 'count': list(counts)})

    if counts_df.empty:
        return 0

    weighted_df = pd.merge(counts_df, signatures_df, on='mutation')

    return sum(weighted_df['count'] * weighted_df[probability_column])


# Lists to store results
final_results_lol = list()

genes_other_info = list()

# Genes left out of the results, with the reason (previously they were dropped silently)
skipped_genes = list()

# Expected proportions at gene level (using the signature of the cluster)
for my_gene in my_genes_list:

    try:

        sub_positions_results_df = gene_position_results_df[gene_position_results_df['ENSEMBL'] == my_gene]

        # Count possible synonymous and non-synonymous mutations in the gene
        syn_sub_df = sub_positions_results_df[sub_positions_results_df['TYPE'] == 'synonymous']
        non_syn_sub_df = sub_positions_results_df[sub_positions_results_df['TYPE'] == 'non_synonymous']
        n_synonimous_variants = syn_sub_df.shape[0]
        n_non_synonimous_variants = non_syn_sub_df.shape[0]

        # Exclude genes with 0 synonymous and non synonymous variants
        if n_synonimous_variants == 0 and n_non_synonimous_variants == 0:
            continue

        # Genes without trinucleotide counts in exons or in introns are excluded
        # (they never made it into the results: building their counts table failed)
        exons_counter = counters_dict[(my_gene, 'exons_count')]
        introns_counter = counters_dict[(my_gene, 'introns_count')]
        if not exons_counter or not introns_counter:
            skipped_genes.append((my_gene, 'no trinucleotide counts in exons or introns'))
            continue

        # Compute probability of a syn/non-syn mutation to occur given its context
        # (and the probability associated from the signature)
        nonsyn_cnt = _context_key_counts(non_syn_sub_df)
        nonsyn_products_sum = _signature_weighted_sum(nonsyn_cnt.index, nonsyn_cnt.values,
                                                      sub_signatures_df, prob_name)

        syn_cnt = _context_key_counts(syn_sub_df)
        syn_products_sum = _signature_weighted_sum(syn_cnt.index, syn_cnt.values,
                                                   sub_signatures_df, prob_name)

        # Compute probability of any exon and intron mutation
        # (the counter keys are tuples; the signatures identify them by their text form)
        exons_products_sum = _signature_weighted_sum([str(key) for key in exons_counter],
                                                     exons_counter.values(),
                                                     sub_signatures_df, prob_name)

        introns_products_sum = _signature_weighted_sum([str(key) for key in introns_counter],
                                                       introns_counter.values(),
                                                       sub_signatures_df, prob_name)

        # Other gene information (number of base pairs, replication time)
        gene_exons_coords = exons_coords_symbol_df[exons_coords_symbol_df['ensembl'] == my_gene]
        exons_bp = sum(gene_exons_coords['end'] - gene_exons_coords['start'])

        gene_introns_coords = introns_coords_symbol_df[introns_coords_symbol_df['ensembl'] == my_gene]
        introns_bp = sum(gene_introns_coords['end'] - gene_introns_coords['start'])

        sub_gene_reptime = rep_time_mean_per_gene_df[rep_time_mean_per_gene_df.index == my_gene]['rep_time']
        gene_reptime = sub_gene_reptime.values[0] if len(sub_gene_reptime) > 0 else np.nan

        final_results_lol.append([my_gene, n_synonimous_variants, n_non_synonimous_variants,
                                  exons_products_sum, introns_products_sum, syn_products_sum,
                                  nonsyn_products_sum])

        genes_other_info.append([my_gene, exons_bp, introns_bp, gene_reptime])

    except Exception as error:
        # Best effort: a failing gene is left out, but the failure is recorded instead of hidden
        skipped_genes.append((my_gene, repr(error)))

print('{} valid genes'.format(len(final_results_lol)))

if skipped_genes:
    print('{} genes skipped, e.g. {}'.format(len(skipped_genes), skipped_genes[:5]))

final_results_df = pd.DataFrame(final_results_lol,
                                columns=['ensembl', 'n_synonimous_variants', 'n_non_synonimous_variants',
                                         'exons_products_sum', 'introns_products_sum', 'syn_products_sum',
                                         'nonsyn_products_sum'])

genelevel_results = final_results_df

new_genes_list = list(set(genelevel_results['ensembl'].tolist()))

genes_other_info_df = pd.DataFrame(genes_other_info,
                                   columns=['ensembl', 'exons_bp', 'introns_bp', 'reptime'])

genelevel_results_toprint = pd.merge(genelevel_results, genes_other_info_df, on='ensembl')

genelevel_results_toprint.to_csv(path.join('results', 'germinal' + '_gene_dependent_results.txt'), sep='\t', header=True, index=None)

## By sample analysis

In [70]:
# Reload the gene level results written by the by-gene analysis: only the genes present
# in that file are analysed here
by_gene_path = path.join('results', 'germinal' + '_gene_dependent_results.txt')
genelevel_results = pd.read_csv(by_gene_path, sep="\t", header='infer', low_memory=False)
new_genes_list = list(set(genelevel_results['ensembl'].tolist()))

# Get exon and intron mutations in the samples of the cluster (without sample identifier)
mutations_in_exons_symbol_df, mutations_in_introns_symbol_df = get_muts_per_gene(mutations_df=mutations_df,
                                                                                 exons_coords_df=exons_coords_df,
                                                                                 introns_coords_df=introns_coords_df)

# For every exonic mutation in every gene, know if it is synonymous or nonsynonymous
mutations_in_exons_cons = compute_syn_nonsyn_muts2(new_genes_list,
                                                          mutations_in_exons_symbol_df,
                                                          tb_consequence_type,
                                                          mutations_df,
                                                          CONSEQUENCE_RANK)

# All samples together
# 'subs' is matched below against TYPE_y, the TYPE column that comes from mutations_df
# (TYPE_x is the synonymous / non_synonymous label). mutations_df only holds 'subs'
# mutations, so this single pseudo-sample covers all of them.

common_samples = ['subs']

# Analysis at sample level (per cluster)
allgenes_results = list()
print('{} samples'.format(len(common_samples)))
for my_sample in common_samples:

    sub_mutations_in_exons = mutations_in_exons_cons[mutations_in_exons_cons['TYPE_y'] == my_sample]

    # No filtering by sample here: all the exonic and intronic mutations are counted
    sub_exon = mutations_in_exons_symbol_df.copy()
    sub_intron = mutations_in_introns_symbol_df.copy()

    for my_gene in new_genes_list:

        # Per each gene in each sample, count the observed exon, intron, syn
        # and nonsyn mutations
                
        allgenes_results.append(compute_results_per_gene(sub_mutations_in_exons,
                                         my_gene, sub_exon, sub_intron, my_sample))

# One row per (sample, gene)
allgenes_results_df = pd.DataFrame(allgenes_results)

allgenes_results_df.columns = ['ensembl', 'obs_exon_muts', 'obs_intron_muts',
                                       'obs_syn_muts', 'obs_nonsyn_muts', 'sample']

1 samples


ensembl            ENSG00000161573ENSG00000137941ENSG00000163508E...
obs_exon_muts                                                   4452
obs_intron_muts                                               158492
obs_syn_muts                                                    1116
obs_nonsyn_muts                                                 3331
dtype: object

In [72]:
# Add the class of each mutation (CLASS column of mutations_df) to the mutations in introns and exons
formated_mutations_df = mutations_df[['CHROMOSOME', 'POSITION', 'CLASS']]
formated_mutations_df.columns = ['chr', 'end', 'class']

# NOTE(review): the merge key is the position only (chr, end). If several mutations share a
# position, every combination is kept and the counts below are inflated -- confirm that
# positions are unique in the dataset, or merge on the mutation ID instead.
sample_mutations_in_exons_symbol_df = pd.merge(mutations_in_exons_symbol_df,
                                                      formated_mutations_df, on=['chr', 'end'])

sample_mutations_in_introns_symbol_df = pd.merge(mutations_in_introns_symbol_df,
                                                         formated_mutations_df, on=['chr',
                                                            'end'])

# Each mutation class is analysed as if it were a sample

common_samples = ['mixed', 'autism', 'healthy']

# Analysis at sample level (per cluster)
# (allgenes_results and allgenes_results_df from the previous cell are overwritten)
allgenes_results = list()
print('{} samples'.format(len(common_samples)))
for my_sample in common_samples:

    sub_mutations_in_exons = mutations_in_exons_cons[mutations_in_exons_cons['CLASS'] == my_sample]

    sub_exon = sample_mutations_in_exons_symbol_df[sample_mutations_in_exons_symbol_df['class'] == my_sample]
    sub_intron = sample_mutations_in_introns_symbol_df[sample_mutations_in_introns_symbol_df['class'] == my_sample]

    for my_gene in new_genes_list:

        # Per each gene in each sample, count the observed exon, intron, syn
        # and nonsyn mutations
                
        allgenes_results.append(compute_results_per_gene(sub_mutations_in_exons,
                                         my_gene, sub_exon, sub_intron, my_sample))

# One row per (class, gene)
allgenes_results_df = pd.DataFrame(allgenes_results)

allgenes_results_df.columns = ['ensembl', 'obs_exon_muts', 'obs_intron_muts',
                                       'obs_syn_muts', 'obs_nonsyn_muts', 'sample']
            

allgenes_results_df_toprint = allgenes_results_df

allgenes_results_df_toprint.to_csv(path.join('results', 'germinal_sample_dependent_results.txt'),
                                          sep='\t', header=True, index=None)

3 samples
