In [None]:
#Imports and other setup
import subprocess
import matplotlib.pyplot as plt
import numpy as np
import pandas
import os
import linecache
import matplotlib.ticker as mtick

In [None]:
###########
# EDIT ME #
###########

# Adjust the paths below to match your file system.

# Main working directory
working_dir = "/Users/ahb232/Desktop/maize_stats/"

# Directory holding the coverage and identity wiggle files for each chromosome.
# Files should be named coverage_X.wig or identity_X.wig,
# where X is the name of each chromosome.
wiggle_dir = f"{working_dir}/wiggles/"

# Directory holding the reference fasta and gff
reference_dir = f"{working_dir}/ref/"

# Species name, shown in the plot titles
species_name = "maize"

In [None]:
###########
# EDIT ME #
###########

# The list of chromosome names used by the rest of the notebook.
#
# NOTE: The next code cell attempts to automatically identify and sort chromosomes
# based on the names of the wiggle files. It sorts numerical chromosome names first,
# and then appends any other chromosomes alphabetically.
# If you want to order the chromosomes in a different way, or use only a subset of them,
# edit the list below and skip the next code cell.

chrom = [
    "chr1",
    "chr2",
    "chr3",
    "chr4",
    "chr5",
    "chr6",
    "chr7",
    "chr8",
    "chr9",
    "chr10",
]

In [None]:
# Infer chromosome names from coverage wiggle files (coverage_X.wig -> X).
#
# The names are collected into a fresh set instead of being appended to whatever
# `chrom` already holds, so running this cell after a manually written list, or
# running it more than once, never produces duplicate chromosome entries.
wig_prefix = "coverage_"
wig_suffix = ".wig"

inferred_chrom = {
    file_name[len(wig_prefix):-len(wig_suffix)]
    for file_name in os.listdir(wiggle_dir)
    if file_name.startswith(wig_prefix) and file_name.endswith(wig_suffix)
}
# A file literally called "coverage_.wig" would yield an empty name; ignore it
inferred_chrom.discard("")

# Split names into numeric and non-numeric.
# Numeric names are sorted by their integer value but kept as the original strings,
# so a name such as "01" still matches its file name later on.
numeric_chrom = sorted((x for x in inferred_chrom if x.isdecimal()), key=int)

# Non-numeric names are sorted alphabetically
alphabetical_chrom = sorted(x for x in inferred_chrom if not x.isdecimal())

# Concatenate both lists to return the full list of chromosomes in the desired order
chrom = numeric_chrom + alphabetical_chrom

print(chrom)

In [None]:
#Get distribution of coverage for each chromosome
#
# For every chromosome, count how many lines of coverage_<chrom>.wig carry each
# coverage value (using `sort | uniq -c`) and convert the counts into fractions of
# all counted lines. The result, coverage_df, has one row per coverage value
# (column "coverage", ascending) and one column of fractions per chromosome.

if not chrom:
    raise ValueError("chrom is empty: there are no chromosomes to summarise")

# One Series of fractions per chromosome, indexed by coverage value
coverage_fractions = {}

for c in chrom:

    # Get filepath for coverage file
    wiggle_file = wiggle_dir + "/coverage_" + c + ".wig"

    #Count the number of lines with each level of coverage
    # The context manager closes the pipe and waits for `sort` once `uniq` is done
    with subprocess.Popen(["sort", wiggle_file], stdout=subprocess.PIPE) as p1:
        p2 = subprocess.run(["uniq", "-c"], stdin=p1.stdout, text=True, stdout=subprocess.PIPE)
    if p1.returncode != 0 or p2.returncode != 0:
        raise RuntimeError("Could not count the values in " + wiggle_file)

    #Reformat and remove lines that are non-numeric
    # `uniq -c` may pad the count with leading spaces, so split on any run of
    # whitespace; splitting on a single space would drop every padded line.
    counts = {}
    for line in p2.stdout.splitlines():
        fields = line.split()
        if len(fields) != 2:
            continue
        try:
            n_lines, value = int(fields[0]), int(fields[1])
        except ValueError:
            # e.g. a header line: not a "<count> <integer value>" pair
            continue
        counts[value] = counts.get(value, 0) + n_lines

    chr_len = sum(counts.values())
    if chr_len == 0:
        raise ValueError("No numeric coverage values found in " + wiggle_file)

    coverage_fractions[c] = pandas.Series(counts, dtype=float) / chr_len

# Combine the per-chromosome Series. They are aligned on the coverage value, so a
# value missing from one chromosome becomes 0 there instead of shifting its rows.
# Rows are sorted from lowest coverage to highest.
coverage_df = (
    pandas.DataFrame(coverage_fractions)
    .fillna(0.0)
    .sort_index()
    .rename_axis("coverage")
    .reset_index()
)

# get range for coverage (max_cov is exclusive, for use with np.arange)
min_cov = int(coverage_df["coverage"].min())
max_cov = int(coverage_df["coverage"].max()) + 1

In [None]:
############
# OPTIONAL #
############

# Print the coverage dataframe as plain text for inspection.
# This cell only displays coverage_df; it does not modify it.
print(coverage_df)

In [None]:
# Plot the coverage distribution, one line per chromosome
fig, ax = plt.subplots()

for c in chrom:
    ax.plot(coverage_df["coverage"], coverage_df[c], label=c)

# One tick for every integer coverage value in the observed range
ax.set_xticks(np.arange(min_cov, max_cov))
ax.legend(title="chromosome", loc="upper left")
ax.set_xlabel("Coverage")
ax.set_ylabel("Percentage of total bases with the given coverage")
ax.set_title("Alignment coverage of " + species_name + " assemblies against reference")

# The y values are fractions in [0, 1]; show them as percentages
percent_format = mtick.PercentFormatter(xmax=1, decimals=None, symbol='%', is_latex=False)
ax.yaxis.set_major_formatter(percent_format)
plt.show()

In [None]:
# Same information as the above graph, but cumulative:
# each entry is the fraction of bases with AT LEAST the given coverage.

# Sort explicitly so the result does not depend on the row order left by another cell
coverage_sorted = coverage_df.sort_values(by=["coverage"])
coverage_df_cumulative = coverage_sorted.copy()

for c in chrom:
    # Reverse the column, take the running total, then reverse back. Each row then
    # holds the sum of itself and all rows with higher coverage.
    # Always read from coverage_sorted so re-running (or a repeated name in chrom)
    # never accumulates an already-cumulative column.
    coverage_df_cumulative[c] = coverage_sorted[c].iloc[::-1].cumsum().iloc[::-1]

# No separate descending copy is made here: the plot of this dataframe reverses
# the x-axis itself.

In [None]:
# Plot the cumulative coverage distribution, one line per chromosome
# Note the reversed x-axis!
fig, ax = plt.subplots()

for c in chrom:
    ax.plot(coverage_df_cumulative["coverage"], coverage_df_cumulative[c], label=c)

ax.set_xticks(np.arange(min_cov, max_cov))
ax.legend(title="chromosome", loc="lower right")

# x runs from the highest coverage down to the lowest; y spans 0-100%
ax.axis([max_cov-1, min_cov, 0, 1])
ax.set_xlabel("Coverage")
ax.set_ylabel("Percentage of total bases with at least the given coverage")
ax.set_title("Cumulative  alignment coverage of " + species_name + " assemblies against reference")

# The y values are fractions in [0, 1]; show them as percentages
percent_format = mtick.PercentFormatter(xmax=1, decimals=None, symbol='%', is_latex=False)
ax.yaxis.set_major_formatter(percent_format)
plt.show()

In [None]:
#Get distribution of identity for each chromosome
#
# For every chromosome, count how many lines of identity_<chrom>.wig carry each
# identity value (using `sort | uniq -c`) and convert the counts into fractions of
# all counted lines. The result, identity_df, has one row per identity value
# (column "identity", ascending) and one column of fractions per chromosome.

if not chrom:
    raise ValueError("chrom is empty: there are no chromosomes to summarise")

# One Series of fractions per chromosome, indexed by identity value
identity_fractions = {}

for c in chrom:

    # Get filepath for identity file
    wiggle_file = wiggle_dir + "/identity_" + c + ".wig"

    #Count the number of lines with each level of identity
    # The context manager closes the pipe and waits for `sort` once `uniq` is done
    with subprocess.Popen(["sort", wiggle_file], stdout=subprocess.PIPE) as p1:
        p2 = subprocess.run(["uniq", "-c"], stdin=p1.stdout, text=True, stdout=subprocess.PIPE)
    if p1.returncode != 0 or p2.returncode != 0:
        raise RuntimeError("Could not count the values in " + wiggle_file)

    #Reformat and remove lines that are non-numeric
    # `uniq -c` may pad the count with leading spaces, so split on any run of
    # whitespace; splitting on a single space would drop every padded line.
    counts = {}
    for line in p2.stdout.splitlines():
        fields = line.split()
        if len(fields) != 2:
            continue
        try:
            n_lines, value = int(fields[0]), int(fields[1])
        except ValueError:
            # e.g. a header line: not a "<count> <integer value>" pair
            continue
        counts[value] = counts.get(value, 0) + n_lines

    chr_len = sum(counts.values())
    if chr_len == 0:
        raise ValueError("No numeric identity values found in " + wiggle_file)

    identity_fractions[c] = pandas.Series(counts, dtype=float) / chr_len

# Combine the per-chromosome Series. They are aligned on the identity value, so a
# value missing from one chromosome becomes 0 there instead of shifting its rows.
# Rows are sorted from lowest identity to highest.
identity_df = (
    pandas.DataFrame(identity_fractions)
    .fillna(0.0)
    .sort_index()
    .rename_axis("identity")
    .reset_index()
)

# get range for identity (max_idy is exclusive, for use with np.arange)
min_idy = int(identity_df["identity"].min())
max_idy = int(identity_df["identity"].max()) + 1

In [None]:
############
# OPTIONAL #
############

# Print the identity dataframe as plain text for inspection.
# This cell only displays identity_df; it does not modify it.
print(identity_df)

In [None]:
# Plot the identity distribution, one line per chromosome
fig, ax = plt.subplots()

for c in chrom:
    ax.plot(identity_df["identity"], identity_df[c], label=c)

# One tick for every integer identity value in the observed range
ax.set_xticks(np.arange(min_idy, max_idy))
ax.legend(title="chromosome", loc="upper left")
ax.set_xlabel("identity")
ax.set_ylabel("Percentage of total bases with the given identity")
ax.set_title("Alignment identity of " + species_name + " assemblies against reference")

# The y values are fractions in [0, 1]; show them as percentages
percent_format = mtick.PercentFormatter(xmax=1, decimals=None, symbol='%', is_latex=False)
ax.yaxis.set_major_formatter(percent_format)
plt.show()

In [None]:
# Same information as the above graph, but cumulative:
# each entry is the fraction of bases with AT LEAST the given identity.

# Sort explicitly so the result does not depend on the row order left by another cell
identity_sorted = identity_df.sort_values(by=["identity"])
identity_df_cumulative = identity_sorted.copy()

for c in chrom:
    # Reverse the column, take the running total, then reverse back. Each row then
    # holds the sum of itself and all rows with higher identity.
    # Always read from identity_sorted so re-running (or a repeated name in chrom)
    # never accumulates an already-cumulative column.
    identity_df_cumulative[c] = identity_sorted[c].iloc[::-1].cumsum().iloc[::-1]

# No separate descending copy is made here: the plot of this dataframe reverses
# the x-axis itself.

In [None]:
# Plot the cumulative identity distribution, one line per chromosome
# Note the reversed x-axis!
fig, ax = plt.subplots()

for c in chrom:
    ax.plot(identity_df_cumulative["identity"], identity_df_cumulative[c], label=c)

ax.set_xticks(np.arange(min_idy, max_idy))
ax.legend(title="chromosome", loc="lower right")

# x runs from the highest identity down to the lowest; y spans 0-100%
ax.axis([max_idy-1, min_idy, 0, 1])
ax.set_xlabel("identity")
ax.set_ylabel("Percentage of total bases with at least the given identity")
ax.set_title("Cumulative alignment identity of " + species_name + " assemblies against reference")

# The y values are fractions in [0, 1]; show them as percentages
percent_format = mtick.PercentFormatter(xmax=1, decimals=None, symbol='%', is_latex=False)
ax.yaxis.set_major_formatter(percent_format)
plt.show()