In [53]:
# Mutations_analysis by Marco Fumasoni

# This code has been written to analyze the list of mutations obtained from mutantanalysis by John Koschwanez (https://github.com/koschwanez/mutantanalysis)
# The script identify putative adaptive mutations when a gene is mutated more frequently than expected by chance across parallel
# and independent populations. The script run the same analysis in a list of mutations obtained from evolved wt, to remove from the 
# candidate adaptive genes, the one that are likely due to experimental conditions


# NOTE
# The code is made available for transparency reasons. At present, it is not intended to be readily usable on different datasets. 
# Also, it was not annotated and compiled to be user-friendly. Please, contact me privately for any inquiry related to the code usage.
# I will maintain this code with improved versions as soon as they are developed.

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import seaborn as sns
import xlsxwriter
import csv
from scipy.stats import nanmean
from scipy.special import gammaincc
from scipy.special import factorial

In [3]:
sns.set_context('paper', font_scale=2.2)

In [4]:
sns.set_style('ticks')

In [5]:
import pandas as pd

In [6]:
from collections import Counter

In [7]:
def hits_count(population, clones, wt, genotype):
    # Import the list of yeast ORFs with size and description and merge it with the list of mutations
    ORFs = pd.read_csv('ver_ORFs.tsv', sep='\t')
    pop= pd.merge(population, ORFs, on='sgdid')
    clo= pd.merge(clones, ORFs, on='sgdid')
    pop2=pop[['sys_name','sgdid','gene_name','size','chr_num','position','sample_name','snp_indel','ref','read','fraction','tot_reads','mutation_type','origin','description']]
    clo2=clo[['sys_name','sgdid','gene_name','size','chr_num','position','sample_name','snp_indel','ref','read','fraction','tot_reads','mutation_type','origin','description']]

    # Save the mutations to files before applying any other filter
    pop2.to_csv('output/'+genotype+'populations.csv')
    clo2.to_csv('output/'+genotype+'clones.csv')

    # change the name of the sample in 'clone'
    clo2.rename(columns = {'sample_name':'clone'}, inplace = True) 
    # put the fraction to 0
    clo2['fraction'] = 'NaN'
    # remove the secon part of the sample_name and create a population column so that after we can count only one mutation per population
    clo2['sample_name'] = clo2.clone.str.split('_').str[0]
    clo2['sample_name'] = clo2['sample_name'] + '_'

    # Join the population dataset with the clone dataset
    frames = [pop2, clo2]
    popclo = pd.concat(frames, ignore_index=True)

    # In case one gene has been mutated multiple times in the same POPULATION consider only one hit
    desired_rows = []
    for name, cur_data in popclo.groupby(['sys_name', 'sample_name']):
        cur_row = cur_data.iloc[0]
        desired_rows.append(cur_row)
    popclo_s = pd.DataFrame(desired_rows)

    # COUNT HOW MANY POPULATIONS HAVE A MUTATION IN A GIVEN GENE

    #Count the number of hits per genes and order them from highest to lowest
    counter_pop=Counter(popclo_s.sys_name) 
    genes = pd.DataFrame.from_dict(counter_pop, orient='index').reset_index() 
    genes = genes.rename(columns={'index':'sys_name', 0:'pop_hits'}) 
    genes=genes.dropna()

    #Merge the counter list with the original list
    list1= pd.merge(popclo_s, genes, on='sys_name', how='left')

    desired_rows = []
    for name, cur_data in list1.groupby(['sys_name', 'pop_hits']):
        cur_row = cur_data.iloc[0]
        desired_rows.append(cur_row)
    popclo_s_h = pd.DataFrame(desired_rows)
    hits=popclo_s_h[['sys_name','sgdid','gene_name','size','pop_hits','origin','description']]
    hits= hits.sort_values(by='pop_hits', ascending=0)
    hits.to_csv('output/' + genotype + 'pop_hits.csv')

    # COUNT HOW MANY INDEPENDENT HITS HAVE BEEN OBTAINED (taking into account diferent mutations within the same population)
    # Now, in the full list, remove the clones with the mutation in the same position otherwise they count as indivisual hits
    desired_rows = []
    for name, cur_data in popclo.groupby(['sys_name', 'sample_name','position']): # This is a bit dirty, i'm assuming two different genes where not mutated in the same population at the same position, basically if two clones from the same populations have mutations in the same position, I'm assuming is the same gene and therefore I count it as 1 hit that spread.
        cur_row = cur_data.iloc[0]
        desired_rows.append(cur_row)
    popclo_ss = pd.DataFrame(desired_rows)

    #Count the number of hits per genes and order them from highest to lowest
    counter_pop=Counter(popclo_ss.sys_name) 
    genes = pd.DataFrame.from_dict(counter_pop, orient='index').reset_index() 
    genes = genes.rename(columns={'index':'sys_name', 0:'hits'}) 
    genes= genes.dropna()

    #Merge the counter list with the original list
    list2= pd.merge(popclo_ss, genes, on='sys_name', how='left')

    desired_rows = []
    for name, cur_data in list2.groupby(['sys_name', 'hits']):
        cur_row = cur_data.iloc[0]
        desired_rows.append(cur_row)
    popclo_ss_h = pd.DataFrame(desired_rows)
    hits_all=popclo_ss_h[['sys_name','sgdid','gene_name','size','hits','origin','description']]
    hits_all= hits_all.sort_values(by='hits', ascending=0)
    hits_all.to_csv('output/'+ genotype + 'clones_hits.csv')

    #combine the two lists in a summary of hits
    summary = pd.merge(hits, hits_all, on='sys_name')

    # calculate the average frequency of the hits found in the populations
    summary['frequency']= 0
    for index, row in summary.iterrows():
        gene = pop2.loc[pop2['sys_name'] == row['sys_name'], :]
        freq=gene['fraction'].mean()
        if numpy.isnan(freq) == False:
            summary.set_value(index,'frequency',freq)
            
    #This is to make frequencies of 0 to <25
    #summary['frequency'] = summary['frequency'].replace(0,'<25')
    
    #order the list based on unique hits
    summary= summary.sort_values(by='hits', ascending=0)

    #save on a file
    summary = summary[['sys_name', 'sgdid_x', 'gene_name_x', 'size_x','pop_hits', 'hits','frequency', 'origin_x', 'description_x']]
    summary.to_csv('output/' + genotype + 'summary.csv')

    # removal of mutations selected in wt from the mutations found in the ctf4 evo experiment
    summary_filtered = summary[~summary.sys_name.isin(wt.sys_name)]
    summary_filtered.to_csv('output/' + genotype +'summary_filtered.csv')
    
    return summary_filtered

In [8]:
def hits_count_dip(population, clones, wt, genotype):
    # Import the list of yeast ORFs with size and description and merge it with the list of mutations
    ORFs = pd.read_csv('ver_ORFs.tsv', sep='\t')
    pop= pd.merge(population, ORFs, on='sgdid')
    clo= pd.merge(clones, ORFs, on='sgdid')
    pop2=pop[['sys_name','sgdid','gene_name','size','chr_num','position','sample_name','snp_indel','ref','read','fraction','tot_reads','mutation_type','origin','description']]
    clo2=clo[['sys_name','sgdid','gene_name','size','chr_num','position','sample_name','snp_indel','ref','read','fraction','tot_reads','mutation_type','origin','description']]

    # Save the mutations to files before applying any other filter
    pop2.to_csv('output/'+genotype+'populations.csv')
    clo2.to_csv('output/'+genotype+'clones.csv')

    # change the name of the sample in 'clone'
    clo2.rename(columns = {'sample_name':'clone'}, inplace = True) 
   
    # remove the secon part of the sample_name and create a population column so that after we can count only one mutation per population
    clo2['sample_name'] = clo2.clone.str.split('_').str[0]
    clo2['sample_name'] = clo2['sample_name'] + '_'

    # Join the population dataset with the clone dataset
    frames = [pop2, clo2]
    popclo = pd.concat(frames, ignore_index=True)

    # In case one gene has been mutated multiple times in the same POPULATION consider only one hit
    desired_rows = []
    for name, cur_data in popclo.groupby(['sys_name', 'sample_name']):
        cur_row = cur_data.iloc[0]
        desired_rows.append(cur_row)
    popclo_s = pd.DataFrame(desired_rows)

    # COUNT HOW MANY POPULATIONS HAVE A MUTATION IN A GIVEN GENE

    #Count the number of hits per genes and order them from highest to lowest
    counter_pop=Counter(popclo_s.sys_name) 
    genes = pd.DataFrame.from_dict(counter_pop, orient='index').reset_index() 
    genes = genes.rename(columns={'index':'sys_name', 0:'pop_hits'}) 
    genes=genes.dropna()

    #Merge the counter list with the original list
    list1= pd.merge(popclo_s, genes, on='sys_name', how='left')

    desired_rows = []
    for name, cur_data in list1.groupby(['sys_name', 'pop_hits']):
        cur_row = cur_data.iloc[0]
        desired_rows.append(cur_row)
    popclo_s_h = pd.DataFrame(desired_rows)
    hits=popclo_s_h[['sys_name','sgdid','gene_name','size','pop_hits','origin','description']]
    hits= hits.sort_values(by='pop_hits', ascending=0)
    hits.to_csv('output/' + genotype + 'pop_hits.csv')

    # COUNT HOW MANY INDEPENDENT HITS HAVE BEEN OBTAINED (taking into account diferent mutations within the same population)
    # Now, in the full list, remove the clones with the mutation in the same position otherwise they count as indivisual hits
    desired_rows = []
    for name, cur_data in popclo.groupby(['sys_name', 'sample_name','position']): # This is a bit dirty, i'm assuming two different genes where not mutated in the same population at the same position, basically if two clones from the same populations have mutations in the same position, I'm assuming is the same gene and therefore I count it as 1 hit that spread.
        cur_row = cur_data.iloc[0]
        desired_rows.append(cur_row)
    popclo_ss = pd.DataFrame(desired_rows)

    #Count the number of hits per genes and order them from highest to lowest
    counter_pop=Counter(popclo_ss.sys_name) 
    genes = pd.DataFrame.from_dict(counter_pop, orient='index').reset_index() 
    genes = genes.rename(columns={'index':'sys_name', 0:'hits'}) 
    genes= genes.dropna()

    #Merge the counter list with the original list
    list2= pd.merge(popclo_ss, genes, on='sys_name', how='left')

    desired_rows = []
    for name, cur_data in list2.groupby(['sys_name', 'hits']):
        cur_row = cur_data.iloc[0]
        desired_rows.append(cur_row)
    popclo_ss_h = pd.DataFrame(desired_rows)
    hits_all=popclo_ss_h[['sys_name','sgdid','gene_name','size','hits','origin','description']]
    hits_all= hits_all.sort_values(by='hits', ascending=0)
    hits_all.to_csv('output/'+ genotype + 'clones_hits.csv')

    #combine the two lists in a summary of hits
    summary = pd.merge(hits, hits_all, on='sys_name')

    # calculate the average frequency of the hits found in the populations
    summary['frequency']= 0
    for index, row in summary.iterrows():
        gene = pop2.loc[pop2['sys_name'] == row['sys_name'], :]
        freq=gene['fraction'].mean()
        if numpy.isnan(freq) == False:
            summary.set_value(index,'frequency',freq)
            
    # calculate the average frequency of the hits found in the clones
    summary['clone_frequency']= 0
    for index, row in summary.iterrows():
        gene = clo2.loc[clo2['sys_name'] == row['sys_name'], :]
        freq=gene['fraction'].mean()
        if numpy.isnan(freq) == False:
            summary.set_value(index,'clone_frequency',freq)
            
    # determine if on average the gene is found in het or hom in clones
    summary['zygosity'] = 0
    summary.loc[summary['clone_frequency'] < 76, 'zygosity'] = 'het'
    summary.loc[summary['clone_frequency'] > 76, 'zygosity'] = 'hom'
    
    # if the gene is often mutated in het, then mulifly it's population frequency X2 to obtain a virtual frequency
    summary['virtual_freq'] = summary['frequency']
    summary.loc[summary['zygosity'] == 'het', 'virtual_freq'] =summary.frequency * 2

    #This is to make frequencies of 0 to <25
    #summary['frequency'] = summary['frequency'].replace(0,'<25')
    
    #order the list based on unique hits
    summary= summary.sort_values(by='hits', ascending=0)

    #save on a file
    summary = summary[['sys_name', 'sgdid_x', 'gene_name_x', 'size_x','pop_hits', 'hits','frequency','virtual_freq','clone_frequency','zygosity', 'origin_x', 'description_x']]
    summary.to_csv('output/' + genotype + 'summary.csv')

    # removal of mutations selected in wt from the mutations found in the ctf4 evo experiment
    summary_filtered = summary[~summary.sys_name.isin(wt.sys_name)]
    summary_filtered.to_csv('output/' + genotype +'summary_filtered.csv')

    return summary_filtered

In [9]:
def stat(summary, genotype):

    #get the total mutations obtained at the end of the experiment (counting pop_hits I only consider unique mutations in pop)
    n=summary['hits'].sum()
    #get the size of all the genes mutated + 1000bp for promoters/terminators
    target_size = (summary['size_x'].sum())+(len(summary)*1000)
    #get the size of all the genes in the genome + 500bp for promoters/terminators
    genome_size = (ORFs['size'].sum())+(len(ORFs)*1000)
    #calculate the rate of mutations obtained at the end of the experiment
    random_rate=float(n)/genome_size
    
    #for each gene hit, calculate a mutation rate
    # Probability of having >=n number of hits
    summary['P']= 1-gammaincc(summary['hits']+1, random_rate*(summary['size_x']+1000))
    # Probability of having =n number of hits
    summary['Pe']= (random_rate*(summary['size_x']+1000))**summary['hits']*exp(-random_rate*(summary['size_x']+1000))/factorial(summary['hits'])

    #Benjamini-Hochberg correction
    positive_bh=summary.sort_values(by='P')
    positive_bh=positive_bh.reset_index()
    positive_bh2=positive_bh.loc[positive_bh['P'] < (0.05*(positive_bh.index+1)/len(ORFs)), :]
    #positive_bh=positive_bh.head(len(positive_bh2))
    positive_bh.to_csv('output/' + genotype + 'Pvalue_all.csv')
    positive_bh2.to_csv('output/'+ genotype + 'Benjamini-Hochberg.csv')

    #Bonferroni correction
    positive_bo = summary.loc[summary['P'] < (0.1/len(ORFs)), :]
    #positive_bo=positive_bo.reset_index()
    positive_bo.to_csv('output/'+ genotype + 'haploids_bonferroni.csv')

In [10]:
#Import the mutations selected in haploid and diploids
hap_sel = pd.read_csv('output/hap_hits_summary_Pvalue_all.csv')
dip_sel = pd.read_csv('output/dip_hits_summary_Pvalue_all.csv')
ORFs = pd.read_csv('ver_ORFs.tsv', sep='\t')

In [11]:
#Import the mutations found at 10% in the ctf4 evolved haploid populations (populations - 1000 generations)
ListByGenes1 = pd.read_csv('tabbed_output_by_gene_117f3_10.txt', sep='\t')
ListByGenes2 = pd.read_csv('tabbed_output_by_gene_124f3_10.txt', sep='\t')

#Import the mutations found at 90% in the ctf4 evolved haploid clones (populations - 1000 generations)
ListByGenes3 = pd.read_csv('tabbed_output_by_gene_117f3_75.txt', sep='\t')
ListByGenes4 = pd.read_csv('tabbed_output_by_gene_124f3_75.txt', sep='\t')

#Import the mutations found at 10% in the ctf4 evolved diploid populations (populations - 1000 generations)
ListByGenes5 = pd.read_csv('tabbed_output_by_gene_149f3_10.txt', sep='\t')
ListByGenes6 = pd.read_csv('tabbed_output_by_gene_150f3_10_noEVO14.txt', sep='\t')

#Import the mutations found at 40% in the ctf4 evolved dipolid clones (populations - 1000 generations)
ListByGenes7 = pd.read_csv('tabbed_output_by_gene_149f3_25.txt', sep='\t')
ListByGenes8 = pd.read_csv('tabbed_output_by_gene_150f3_25_noEVO14.txt', sep='\t')

#Import the mutations found at 10% in the ctf4 rad52 evolved haploid populations (populations - 1000 generations)
ListByGenes9 = pd.read_csv('tabbed_output_by_gene_135f3_10.txt', sep='\t')
ListByGenes10 = pd.read_csv('tabbed_output_by_gene_139f3_10.txt', sep='\t')

#Import the mutations found at 90% in the ctf4 rad52 evolved haploid clones (populations - 1000 generations)
ListByGenes11 = pd.read_csv('tabbed_output_by_gene_135f3_75.txt', sep='\t')
ListByGenes12 = pd.read_csv('tabbed_output_by_gene_139f3_75.txt', sep='\t')

# Create separate lists for haploids (hap), diploids (dip) and double (rad)

frames = [ListByGenes1, ListByGenes2]
hap_pop = pd.concat(frames, ignore_index=True)

frames = [ListByGenes3, ListByGenes4]
hap_clo = pd.concat(frames, ignore_index=True)

frames = [ListByGenes5, ListByGenes6]
dip_pop = pd.concat(frames, ignore_index=True)

frames = [ListByGenes7, ListByGenes8]
dip_clo = pd.concat(frames, ignore_index=True)

frames = [ListByGenes9, ListByGenes10]
rad_pop = pd.concat(frames, ignore_index=True)

frames = [ListByGenes11, ListByGenes12]
rad_clo = pd.concat(frames, ignore_index=True)

In [12]:
hap_pop25 = hap_pop[hap_pop.fraction >24]
haploids = hits_count(hap_pop25,hap_clo,hap_sel,'ctf4_haploids_ver_')
hap_pop_CDS = hap_pop25.loc[hap_pop25['mutation_type'] == 'Clone variant, nonsynonymous', :]
hap_clo_CDS = hap_clo.loc[hap_clo['mutation_type'] == 'Clone variant, nonsynonymous', :]
haploids_CDS = hits_count(hap_pop_CDS,hap_clo_CDS,hap_sel,'ctf4_haploids_CDS_ver_')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
rad_pop25 = rad_pop[rad_pop.fraction >24]
double = hits_count(rad_pop25,rad_clo,hap_sel,'ctf4_rad52_haploids_ver_')
rad_pop_CDS = rad_pop25.loc[rad_pop25['mutation_type'] == 'Clone variant, nonsynonymous', :]
rad_clo_CDS = rad_clo.loc[rad_clo['mutation_type'] == 'Clone variant, nonsynonymous', :]
double_CDS = hits_count(rad_pop_CDS,rad_clo_CDS,hap_sel,'ctf4_rad52_haploids_CDS_ver_')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
dip_pop25 = dip_pop[dip_pop.fraction >24]
dip_clo35 = dip_clo[dip_clo.fraction >34]
diploids = hits_count_dip(dip_pop25,dip_clo35,dip_sel,'ctf4_diploids_ver_')
dip_pop_CDS = dip_pop25.loc[dip_pop25['mutation_type'] == 'Clone variant, nonsynonymous', :]
dip_clo_CDS = dip_clo35.loc[dip_clo35['mutation_type'] == 'Clone variant, nonsynonymous', :]
diploids_CDS = hits_count_dip(dip_pop_CDS,dip_clo_CDS,dip_sel,'ctf4_diploids_CDS_ver_')

In [15]:
# try to put the list together and calculate total hits

frames = [hap_pop25, dip_pop25, rad_pop25]
joint_pop = pd.concat(frames, ignore_index=True)

frames = [hap_clo, dip_clo35, rad_clo]
joint_clo = pd.concat(frames, ignore_index=True)

joint = hits_count(joint_pop,joint_clo,hap_sel,'joint_hits_ver_')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
stat(haploids,'ctf4_haploids_ver_')
stat(double,'ctf4_rad52_haploids_ver_')
stat(diploids,'ctf4_diploids_ver_')
stat(joint,'ctf4_joint_ver_')

In [17]:
stat(haploids_CDS,'ctf4_haploids_CDS_ver_')
stat(double_CDS,'ctf4_rad52_haploids_CDS_ver_')
stat(diploids_CDS,'ctf4_diploids_CDS_ver_')