In [None]:
#Imports and other setup
import subprocess
import matplotlib.pyplot as plt
import numpy as np
import pandas
import os
import linecache
import matplotlib.ticker as mtick
from matplotlib.collections import BrokenBarHCollection

In [None]:
#Paths used throughout the notebook -- edit working_dir to match your file system.
#working_dir already ends in "/", so the derived paths contain a doubled slash.
working_dir = "/Users/ahb232/Desktop/sorghum_stats/"
#Directory holding the per-chromosome coverage wiggle files
wiggle_dir = f"{working_dir}/wiggle/coverage/"
#Directory holding the BED files
bedfiles_dir = f"{working_dir}/ranges/"
#BED file inside bedfiles_dir
refranges_file = f"{bedfiles_dir}validBedFile.bed"

In [None]:
#Get distribution of coverage for each chromosome
#
#Builds `df` with one row per coverage level: a "coverage" column plus one
#column per chromosome holding the fraction of that chromosome's counted lines
#that have the given coverage.

def _coverage_fractions(wiggle_file):
    """Return a Series mapping coverage level -> fraction of lines in one wiggle file.

    The file is piped through `sort | uniq -c` to count how many lines carry
    each value. Lines that are not of the form "<count> <integer>" (for
    example header text or the trailing empty line) are ignored.

    Raises subprocess.CalledProcessError if `sort` or `uniq` exits non-zero.
    """
    p1 = subprocess.Popen(["sort", wiggle_file], stdout=subprocess.PIPE)
    try:
        p2 = subprocess.run(["uniq", "-c"], stdin=p1.stdout, text=True,
                            stdout=subprocess.PIPE, check=True)
    finally:
        #Always release the pipe and reap `sort` so no process/file handle leaks
        p1.stdout.close()
        p1.wait()
    if p1.returncode != 0:
        raise subprocess.CalledProcessError(p1.returncode, p1.args)

    counts = {}
    for line in p2.stdout.splitlines():
        #`uniq -c` typically right-aligns the count with leading spaces, so
        #split on any run of whitespace; splitting on a single " " produces an
        #empty first field and the row would be silently discarded.
        fields = line.split()
        if len(fields) == 2 and fields[0].isdecimal() and fields[1].isdecimal():
            level = int(fields[1])
            counts[level] = counts.get(level, 0) + int(fields[0])

    chr_len = sum(counts.values())
    if chr_len == 0:
        #No numeric lines at all: return an empty column instead of dividing by zero
        return pandas.Series(dtype=float)
    return pandas.Series({level: n / chr_len for level, n in counts.items()}, dtype=float)


fractions_by_chr = {}

#sorted() makes the column order independent of the file system's listing order
for file_name in sorted(os.listdir(wiggle_dir)):
    if (file_name.startswith("coverage") and file_name.endswith(".wig")):

        #Get chromosome name from file name
        chr_name = file_name.replace("coverage_", "").replace(".wig", "")

        wiggle_file = os.path.join(wiggle_dir, file_name)
        fractions_by_chr[chr_name] = _coverage_fractions(wiggle_file)

if not fractions_by_chr:
    raise FileNotFoundError("No coverage*.wig files found in " + wiggle_dir)

#Building the frame from Series aligns every chromosome on the coverage level
#itself, so a level missing from one chromosome becomes 0 instead of shifting
#(or length-mismatching) that chromosome's column.
df = (
    pandas.DataFrame(fractions_by_chr)
    .fillna(0.0)
    .rename_axis("coverage")
    .sort_index()
    .reset_index()
)

print(df)

In [None]:
#Sorted chromosome names, derived from the coverage_<name>.wig files in
#wiggle_dir, for use by later cells
chrom = sorted(
    wig_name.replace("coverage_", "").replace(".wig", "")
    for wig_name in os.listdir(wiggle_dir)
    if wig_name.startswith("coverage") and wig_name.endswith(".wig")
)

In [None]:
#Plot coverage distribution across all chromosomes: one curve per chromosome
coverage_values = df["coverage"]
for chr_label in chrom:
    plt.plot(coverage_values, df[chr_label], label=chr_label)

#One x tick per integer coverage level between the smallest and largest observed
first_tick = min(coverage_values)
tick_stop = max(coverage_values + 1)
plt.xticks(np.arange(first_tick, tick_stop))
plt.legend(title="chromosome", loc="upper left")
plt.xlabel("Coverage")
plt.ylabel("Percentage of total bases with the given coverage")
plt.title("Coverage across sorghum genomes")

#The plotted values are fractions of 1; render the y axis as percentages
percent_format = mtick.PercentFormatter(xmax=1, decimals=None, symbol='%', is_latex=False)
plt.gca().yaxis.set_major_formatter(percent_format)
plt.show()


In [None]:
#Cumulative distribution of coverage
#
#For every chromosome column, replace the fraction of bases with exactly the
#given coverage by the fraction with at least that coverage (the sum of the
#row itself and every row with a higher coverage level).

#Sort explicitly so the suffix sum is correct regardless of the row order of df
df_cumulative = df.sort_values(by=["coverage"])

for c in chrom:
    #Reverse, take the running total, reverse back: a linear-time suffix sum
    #instead of re-summing the tail of the column for every row
    df_cumulative[c] = df_cumulative[c].iloc[::-1].cumsum().iloc[::-1]

#Shown as the cell output only (highest coverage first); the result is not
#stored, so df_cumulative itself stays in ascending coverage order
df_cumulative.sort_values(by=["coverage"], ascending=False)


In [None]:
#Plot cumulative coverage distribution across all chromosomes
#Note the reversed x-axis: the highest coverage is drawn on the left
cumulative_cov = df_cumulative["coverage"]
for chr_label in chrom:
    plt.plot(cumulative_cov, df_cumulative[chr_label], label=chr_label)

lowest_cov = min(cumulative_cov)
highest_cov = max(cumulative_cov)

#One x tick per integer coverage level
plt.xticks(np.arange(lowest_cov, max(cumulative_cov + 1)))
plt.legend(title="chromosome", loc="lower right")
#Passing the x limits as [max, min] is what flips the axis; y spans 0 to 1
plt.axis([highest_cov, lowest_cov, 0, 1])
plt.xlabel("Coverage")
plt.ylabel("Percentage of total bases with at least the given coverage")
plt.title("Cumulative coverage across sorghum genomes")

#The plotted values are fractions of 1; render the y axis as percentages
percent_format = mtick.PercentFormatter(xmax=1, decimals=None, symbol='%', is_latex=False)
plt.gca().yaxis.set_major_formatter(percent_format)
plt.show()