In [1]:
# Modified by Marco Fumasoni from "plot_tumor_depth_all_chromosome.py" of unknown origin

# This version of the script subtracts the 'normal' depth ratio (called anc) from the 'tumor' depth ratio (called evo)
# taken from a varscan .copynumber file. In this way, naturally occurring repeated sequences are buffered.
# This results in a more linear chromosome visualization and highlights only the fragments that have a real change in
# read depth across the genome. To facilitate the identification, the script uses a color gradient where red means gain
# of copies and blue means loss.
# The script generates 3 pdf files: one for the anc genome, one for the evo genome and one for the evo genome normalized
# to the ancestor.

# NOTE
# The code is made available for transparency reasons. At present, it is not intended to be readily usable on different datasets.
# It has also not been annotated or packaged to be user-friendly. Please contact me privately with any inquiry related to the code usage.
# I will update this code with improved versions as soon as they are developed.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import cm
import glob, os
import sys
#import seaborn as sns
from scipy.stats import mode
from matplotlib.patches import Rectangle


# Find the .copynumber files in the folder and generate 3 pdf for each file
os.chdir("../clones")
for file in glob.glob("*.copynumber"):
    # load the varscan dataset
    datafn=file

    # Read in the start position and log2_ratio for each window, saving each chromosome separately
    # Initialize a dictionary to hold the data, arranged by chromosome
    chrom_dict = {}
    with open(datafn) as datafile:
        # The first line of the file is the header line. Store in variable `header,' which we will ignore.
        header = datafile.readline()
        # Read the file one line at a time
        for line in datafile:
            # The first position in the line gives the chromosome
            chrom = line.split()[0]
            # If we don't already have an entry for the chromosome in our dictionary, create an entry.
            # The entry consists of two empty lists, one will hold the start positions and one will hold the coverage
            if chrom not in chrom_dict:
                chrom_dict[chrom] = ([], [],[],[])
            # Store the start position, coverage_ratio,normal_depth,tumor_depth in their respective lists
            chrom_dict[chrom][0].append(int(line.split()[1]))
            chrom_dict[chrom][1].append(float(line.split()[-2]))
            chrom_dict[chrom][2].append(float(line.split()[-4]))
            chrom_dict[chrom][3].append(float(line.split()[-3]))

    genome_list_anc = []
    genome_list_evo = []

    W = 1000 #set the size of the window
    
    #store anc depth for entire genome
    #calculate the median of normal depth across the whole genome
    for value in chrom_dict.itervalues():
        genome_list_anc.extend(value[2])

    genome_array_anc = np.array(genome_list_anc)
    genome_median_anc = np.median(genome_array_anc)

    # store evo depth for entire genome
    #calculate the median of tumor depth across the whole genome
    for value in chrom_dict.itervalues():
        genome_list_evo.extend(value[3])

    genome_array_evo = np.array(genome_list_evo)
    genome_median_evo = np.median(genome_array_evo)

    # Loop over the different chromosomes, plot the coverage, and save the figure

    chrom_list = sorted(chrom_dict.keys())
    x_vals = []
    y_vals = []
    x_vals_anc = []
    y_vals_anc = []
    x_vals_evo = []
    y_vals_evo = []
    for chrom in chrom_list:
        # Get the start positions and coverage ratio from our dictionary.
        # Convert the data from lists to numpy arrays (better for mathematical operations and plotting)
        start = np.array(chrom_dict[chrom][0])
        cov_ratio = np.array(chrom_dict[chrom][1])
        anc_depth = np.array(chrom_dict[chrom][2])
        evo_depth = np.array(chrom_dict[chrom][3])

        # Normalize by median of genome depth to control for difference sequencing depths
        normalized_depth_anc = anc_depth / genome_median_anc
        normalized_depth_evo = evo_depth / genome_median_evo
        normalized_depth = (normalized_depth_evo) - (normalized_depth_anc)
        
        # Calculate smoothed coverage over different window sizes for the ancestor

        smoothed = np.zeros((start[-1] / W) + 1)
        n_bins = np.zeros_like(smoothed)
        for i in range(len(normalized_depth)):
            smoothed[start[i]/W] += normalized_depth_anc[i]
            n_bins[start[i]/W] += 1
        smoothed /= n_bins

        x_vals_anc.append(W*np.arange(len(smoothed)))
        y_vals_anc.append(smoothed)
        
        # Calculate smoothed coverage over different window sizes for the evolved line
       
        smoothed = np.zeros((start[-1] / W) + 1)
        n_bins = np.zeros_like(smoothed)
        for i in range(len(normalized_depth)):
            smoothed[start[i]/W] += normalized_depth_evo[i]
            n_bins[start[i]/W] += 1
        smoothed /= n_bins

        x_vals_evo.append(W*np.arange(len(smoothed)))
        y_vals_evo.append(smoothed)
        
        # Calculate smoothed coverage over different window sizes for the normalized evo genome
        
        smoothed = np.zeros((start[-1] / W) + 1)
        n_bins = np.zeros_like(smoothed)
        for i in range(len(normalized_depth)):
            smoothed[start[i]/W] += normalized_depth[i]
            n_bins[start[i]/W] += 1
        smoothed /= n_bins

        x_vals.append(W*np.arange(len(smoothed)))
        y_vals.append(smoothed)


    # create a 2D array for the coordinates of the 16 centromeres
    centromeres = [[0 for x in range(2)] for y in range(16)] 
    # populate the array with the coordinates +- the windows size (to be sure to visualize the centromere)
    centromeres[0][0]= 151465 - W
    centromeres[0][1]= 151582 + W
    centromeres[1][0]= 238207 - W
    centromeres[1][1]= 238323 + W
    centromeres[2][0]= 114385 - W
    centromeres[2][1]= 114501 + W
    centromeres[3][0]= 449711 - W
    centromeres[3][1]= 449711 + W
    centromeres[4][0]= 151987 - W
    centromeres[4][1]= 152104 + W
    centromeres[5][0]= 148510 - W
    centromeres[5][1]= 148627 + W
    centromeres[6][0]= 496920 - W
    centromeres[6][1]= 497038 + W
    centromeres[7][0]= 105586 - W
    centromeres[7][1]= 105703 + W
    centromeres[8][0]= 355629 - W
    centromeres[8][1]= 355745 + W
    centromeres[9][0]= 436307 - W
    centromeres[9][1]= 436425 + W
    centromeres[10][0]= 440129 - W
    centromeres[10][1]= 440246 + W
    centromeres[11][0]= 150828 - W
    centromeres[11][1]= 150947 + W
    centromeres[12][0]= 268031 - W
    centromeres[12][1]= 268149 + W
    centromeres[13][0]= 628758 - W
    centromeres[13][1]= 628875 + W
    centromeres[14][0]= 326584 - W
    centromeres[14][1]= 326702 + W
    centromeres[15][0]= 555957 - W
    centromeres[15][1]= 556073 + W

    #get the names of the two genomes compared
    anc_text = file.partition("_vs_")[0]
    evo_text = file.partition("_vs_")[2]
    
    #Open a pdf for the anc figure
    fig_fn = anc_text + '.pdf'
    pdf_pages = PdfPages(fig_fn)
    
    #Plot the figure for the ancestor 
    #These are to make the y scale of all panels the same
    ymin = -1 
    ymax = 4 
    
    fig, ax = plt.subplots(4,4, sharey ='row')
    fig.text(0.5, 0.01, 'chromosome coordinate', ha='center', va='center',fontsize=8)
    fig.text(0.01, 0.5, 'normalized read depth', ha='center', va='center', rotation='vertical',fontsize=8)
    for i, a in enumerate(ax.flatten()):
        a.plot(x_vals_anc[i], y_vals_anc[i], alpha=0.3)
        a.scatter(x_vals_anc[i], y_vals_anc[i], marker='.', s=10, linewidth='0', vmin=-1, vmax=1, zorder=10)
        a.set_ylim([ymin, ymax])
        a.yaxis.set_ticks(np.arange(ymin,(ymax+0.5),0.5))
        xmax=max(x_vals[i])
        a.set_xlim(-15000, xmax+15000)
        a.xaxis.set_tick_params(labelsize=3)
        a.yaxis.set_tick_params(labelsize=3)
        a.set_title('chromosome '+ str(i+1), fontsize=8)  
    fig.tight_layout()
    pdf_pages.savefig(fig)
    #plt.show()
    #Save the pdf file
    pdf_pages.close()
    
    #Open a pdf for the evo figure
    fig_fn = evo_text + '.pdf'
    pdf_pages = PdfPages(fig_fn)
    
    #Plot the figure for the evolved line
    # These are to make the y scale of all panels the same
    ymin = -1 
    ymax = 4 
    
    fig, ax = plt.subplots(4,4, sharey ='row')
    fig.text(0.5, 0.01, 'chromosome coordinate', ha='center', va='center',fontsize=8)
    fig.text(0.01, 0.5, 'normalized read depth', ha='center', va='center', rotation='vertical',fontsize=8)
    for i, a in enumerate(ax.flatten()):
        a.plot(x_vals_evo[i], y_vals_evo[i], alpha=0.3)
        a.scatter(x_vals_evo[i], y_vals_evo[i], marker='.', s=10, linewidth='0', vmin=-1, vmax=1, zorder=10)
        a.set_ylim([ymin, ymax])
        a.yaxis.set_ticks(np.arange(ymin,(ymax+0.5),0.5))
        xmax=max(x_vals[i])
        a.set_xlim(-15000, xmax+15000)
        a.xaxis.set_tick_params(labelsize=3)
        a.yaxis.set_tick_params(labelsize=3)
        a.set_title('chromosome '+ str(i+1), fontsize=8)  
    fig.tight_layout()
    pdf_pages.savefig(fig)
    #plt.show()
    #Save the pdf file
    pdf_pages.close()
    
    # create two 2D arrays to store the centromere coordinates and the relative read depth
    x_vals_cen = []
    y_vals_cen = []
    # find the data for each chromosome of the normalized genome
    for i in range(0,16):
        cen_coordinate = ((x_vals[i] >= centromeres[i][0]) & (x_vals[i] <= centromeres[i][1]))
        chromosome = x_vals[i]
        chr_reads = y_vals[i]
        x_vals_cen.append([])
        y_vals_cen.append([])
        x_vals_cen[i].append(chromosome[cen_coordinate])
        y_vals_cen[i].append(chr_reads[cen_coordinate])
        x_vals[i] = chromosome[~cen_coordinate]
        y_vals[i] = chr_reads[~cen_coordinate]

    #Open a pdf for the normalized evo genome figure
    fig_fn = file + '_black.pdf'
    pdf_pages = PdfPages(fig_fn)
    
    #Plot the figure of the normalized genome with a color gradient
    fig, ax = plt.subplots(4,4,dpi=900)
    fig.text(0.5, 0.01, 'chromosome coordinate', ha='center', va='center',fontsize=8)
    fig.text(0.01, 0.5, 'normalized read depth', ha='center', va='center', rotation='vertical',fontsize=8)
    
    #x_max_temp=0
    #x_max=0
    #for i in range (0,len(x_vals)):
    #    x_max_temp = max(x_vals[i])
    #    x_max = max([x_max, x_max_temp])
    
    ymax=3.5
    ymin=-1
    
    for i, a in enumerate(ax.flatten()):
        cp= y_vals[i]
        
        
        x_vals[i]=x_vals[i]/1000
        red= [x_vals[i][np.where(y_vals[i]>0.5)], y_vals[i][np.where(y_vals[i]>0.5)]]
        blue= [x_vals[i][np.where(y_vals[i]<-0.5)], y_vals[i][np.where(y_vals[i]<-0.5)]]
        black= [x_vals[i][np.where((y_vals[i]>=-0.5) & (y_vals[i]<=0.5))], y_vals[i][np.where((y_vals[i]>=-0.5) & (y_vals[i]<=0.5))]]
        
        a.plot(x_vals[i], y_vals[i], color='black', alpha=0.3)
        #a.scatter(x_vals[i], y_vals[i], marker='.', s=10, linewidth='0', vmin=-1, vmax=1, color='black', alpha=0.4, zorder=10)
        a.scatter(red[0],red[1], marker='.', s=10, linewidth='0', vmin=-1, vmax=1, alpha=0.7, color='black', zorder=10)
        a.scatter(blue[0],blue[1], marker='.', s=10, linewidth='0', vmin=-1, vmax=1, alpha=0.7, color='black', zorder=10)
        a.scatter(black[0],black[1], marker='.', s=10, linewidth='0', vmin=-1, vmax=1, alpha=0.4, color='black', zorder=10)
        cen_avx=np.nanmean(x_vals_cen[i])
        cen_avy=np.nanmean(y_vals_cen[i])
        a.plot(cen_avx, cen_avy, 'ko', alpha=1, ms=2, zorder=20) 
        a.add_patch(Rectangle((cen_avx, cen_avy-1), (centromeres[i][1]-centromeres[i][0]) , 2, fill=None, alpha=0.4,zorder=20))
        a.set_ylim([ymin, ymax])
        a.yaxis.set_ticks(np.arange(ymin,(ymax+0.5),0.5))
        x_max=max(x_vals[i])
        a.set_xlim(-15, x_max+15)
        a.xaxis.set_tick_params(labelsize=5, length=2)
        a.yaxis.set_tick_params(labelsize=5, length=2)
        a.set_title('chromosome '+ str(i+1), fontsize=8)  
    fig.tight_layout()
    pdf_pages.savefig(fig)
    #plt.show()

    #Save the pdf file
    pdf_pages.close()

