## Analyzing GA repeats in GAF-degron Hi-C data

### Main question: Do GA-repeats exhibit compartment switching when GAF is degraded?

### General outline:
- Identify reads where one end contains a (AAGAG)x5 repeat
- Map the other end of GA-repeat-containing reads, select those that map uniquely
- Determine whether there is a shift among these reads from A -> B or B -> A

*Identifying GA-repeat-containing reads is accomplished outside this notebook by sort_GArepeats.py. The complementary reads are then mapped using bowtie via my script hic_iterative_mapping_bowtie.py. Subsequent processing is done in this notebook.*

To dos:
- control random sampling of mapping

In [1]:
from Bio import SeqIO
import gzip
import re
import numpy as np
import matplotlib.pyplot as plt
import stadc as sc
import seaborn as sns
import pandas as pd
import scipy

In [2]:
# Functions.

#from re import X


def count_bins(filepath, bin_size=25_000, max_chr_size=5e7, chroms=None):
    """Converts unique mapping file to binned counts (1D) for each chromosome.

    Each line of the file is split on tabs; field 2 is read as the chromosome
    name and field 3 as the integer position. Multiple reads at the same
    position on the same chromosome are counted only once.

    Args:
        filepath: str
            path to a bowtie alignment file with only unique reads
        bin_size: int
            Size in bp of bins
        max_chr_size: int
            Maximum size in bp allowable for chromosome (for memory alloc)
        chroms: iterable of str, optional
            Chromosome names to count. Lines for any other chromosome are
            ignored. Defaults to ['X', '2L', '2R', '3L', '3R'].

    Returns:
        bincounts: dict of ndarrays
            keys are chromosome names, arrays are bin counts (0-indexed)
        maxbins: dict
            keys are chromosome names, values are the maximum 0-indexed
            bin number observed in the data for that chromosome

    Raises:
        IndexError: if a position falls in a bin beyond max_chr_size.
    """
    if chroms is None:
        chroms = ['X', '2L', '2R', '3L', '3R']

    n_bins = round(max_chr_size / bin_size)
    bincounts = {c: np.zeros(n_bins) for c in chroms}
    maxbins = {c: 0 for c in chroms}
    locs_seen = {c: set() for c in chroms}

    with open(filepath, 'r') as infile:
        for line in infile:
            items = line.rstrip().split('\t')
            # Blank or truncated lines have no chromosome/position fields.
            if len(items) < 4:
                continue
            chrom = items[2]
            # Check the chromosome before parsing the position so that lines
            # for ignored chromosomes can never fail the int() conversion.
            if chrom not in bincounts:
                continue
            pos = int(items[3])
            if pos in locs_seen[chrom]:
                continue
            locs_seen[chrom].add(pos)

            bin_num = int(pos // bin_size)
            if bin_num >= n_bins:
                raise IndexError(
                    'Position ' + str(pos) + ' on ' + chrom + ' falls in bin '
                    + str(bin_num) + ', beyond the ' + str(n_bins)
                    + ' bins allocated; increase max_chr_size'
                )
            bincounts[chrom][bin_num] += 1
            if bin_num > maxbins[chrom]:
                maxbins[chrom] = bin_num
    return bincounts, maxbins

#---------------------------------------------------------------------------
def process_plot_compartments(mat, start, end, plot=True):
    """Derives a compartment vector from a region of a contact matrix.

    The square sub-matrix mat[start:end, start:end] is passed through
    sc.norm_hic_matrix, sc.distnorm and sc.pca_matrix (all from the stadc
    library; see that library for the details of each step). The compartment
    vector is column 0 of the eigenvector matrix that np.linalg.eig returns
    for the first output of sc.pca_matrix.

    Args:
        mat: ndarray
            Raw counts matrix for a chromosome, chromosome arm, or region
        start: int
            First bin to include
        end: int
            Bin at which to stop (exclusive, as in the slice mat[start:end])
        plot: bool
            Whether to call sc.plot_compartments on the result

    Returns:
        comp: ndarray
            Column 0 of the eigenvectors of the centered matrix
    """
    region = mat[start:end, start:end]
    # The second output of pca_matrix is not needed here.
    centered, _, projected = sc.pca_matrix(sc.distnorm(sc.norm_hic_matrix(region)))

    # NOTE(review): np.linalg.eig does not promise any ordering of the
    # eigenvalues, so column 0 is not guaranteed to be the leading
    # eigenvector -- confirm this holds for the matrices used here.
    eigenvectors = np.linalg.eig(centered)[1]
    comp = eigenvectors[:, 0]

    if plot:
        sc.plot_compartments(centered, projected, invert=True)

    return comp

#---------------------------------------------------------------------------
def plot_1D_counts(bincounts1, bincounts2, maxbins, chrom, start=None, end=None,
                   bin_size=25_000):
    """Plot counts from two datasets as a function of chromosome position.

    For each dataset the counts for `chrom` get a pseudocount of 0.5, are
    trimmed to the bins up to and including maxbins[chrom], and are divided
    by their mean. The natural log of each normalized vector is plotted on a
    new figure, and the total raw counts of both datasets for `chrom` are
    printed.

    Args:
        bincounts1: dict of ndarrays
            Per-chromosome bin counts for the first dataset
        bincounts2: dict of ndarrays
            Per-chromosome bin counts for the second dataset
        maxbins: dict
            Maximum 0-indexed bin number to keep for each chromosome
        chrom: str
            Chromosome to plot
        start: int, optional
            First bin to plot (None means from the beginning)
        end: int, optional
            Bin at which to stop (exclusive; None means to the end)
        bin_size: int
            Size in bp of bins, used only for the x-axis label

    Returns:
        x1, x2: ndarrays
            Normalized (not log-transformed) counts for the plotted bins of
            the first and second dataset
    """
    def process_bincounts(bc):
        """Add 0.5 to take care of 0, get rid of excess beyond max, normalize to mean."""
        x = bc[chrom] + 0.5
        # maxbins holds a 0-indexed bin number, so +1 is needed to keep that
        # bin itself (and to avoid an empty slice when it is 0).
        x = x[:maxbins[chrom] + 1]
        return x / np.mean(x)

    plt.subplots(figsize=(15, 5))
    # Slicing with None is a no-op, so a lone start or end is honored too.
    x1 = process_bincounts(bincounts1)[start:end]
    x2 = process_bincounts(bincounts2)[start:end]

    plt.plot(np.log(x1), linewidth=1.5)
    plt.plot(np.log(x2), linewidth=1.5)
    plt.xlabel('Bin Number (' + format(bin_size / 1000, 'g') + ' kb)', fontsize=14)
    plt.ylabel('Log counts', fontsize=14)
    plt.title("Normalized Log counts for GA-repeat-linked reads (" + chrom + ")", fontsize=16)
    print(np.sum(bincounts1[chrom]), np.sum(bincounts2[chrom]))
    return x1, x2

#---------------------------------------------------------------------------
def plot_compartment_vs_foldchange(bincounts1, bincounts2, comp, chrom):
    """Scatter compartment score against log fold change between two datasets.

    Args:
        bincounts1: ndarray
            Per-bin counts for the first dataset (numerator)
        bincounts2: ndarray
            Per-bin counts for the second dataset (denominator)
        comp: ndarray
            Compartment score for each bin
        chrom: str
            Chromosome name, used in the plot title

    Raises:
        ValueError: if the three vectors are not all the same length.
    """
    n_bins = len(bincounts1)
    if not (n_bins == len(bincounts2) and n_bins == len(comp)):
        raise ValueError('All three vectors must be the same length')

    # Fixed axis ranges; the dashed guide lines span exactly these ranges.
    x_lo, x_hi = -2, 2
    y_lo, y_hi = -0.12, 0.12

    log_ratio = np.log(bincounts1 / bincounts2)
    plt.scatter(log_ratio, comp)
    plt.xlim(x_lo, x_hi)
    plt.ylim(y_lo, y_hi)
    plt.xlabel('Log Fold Change', fontsize=14)
    plt.ylabel('Compartment Score', fontsize=14)
    plt.hlines(0, x_lo, x_hi, linestyles='dashed', color='grey')
    plt.vlines(0, y_lo, y_hi, linestyles='dashed', color='grey')
    title = 'Compartment Score vs. Fold Change (' + chrom + ')'
    plt.title(title, fontsize=16)

#---------------------------------------------------------------------------
def boxplots(bincounts_allchr_1, bincounts_allchr_2, comp, chrom, start, end,
             labels=('gaf-gfp', 'gaf-degrad')):
    """Make a boxplot of the compartment scores for two datasets.

    The compartment score is determined for the bin containing each uniquely
    mapping read; the distribution of these scores for each dataset forms
    the input to the boxplot. The p-value of a two-sample t-test between the
    two distributions is written on the plot.

    Args:
        bincounts_allchr_1: dict of ndarrays
            Per-chromosome bin counts for the first dataset
        bincounts_allchr_2: dict of ndarrays
            Per-chromosome bin counts for the second dataset
        comp: ndarray
            Compartment score for each bin in [start, end)
        chrom: str
            Chromosome to use
        start: int
            First bin to include
        end: int
            Bin at which to stop (exclusive)
        labels: pair of str
            Names shown on the x-axis for the first and second dataset

    Returns:
        vals1, vals2: ndarrays
            Compartment scores repeated once per read, for each dataset

    Raises:
        ValueError: if comp does not have one entry per selected bin.
    """
    # Imported here because a bare `import scipy` is not guaranteed to make
    # the scipy.stats submodule available.
    from scipy import stats

    def make_vals(bincounts, comp):
        """Make vector containing bin compartment scores repeated according to the
        number of reads found in that bin."""
        occupied = bincounts > 0
        return np.repeat(comp[occupied], bincounts[occupied].astype(int))

    bincounts1 = bincounts_allchr_1[chrom][start:end]
    bincounts2 = bincounts_allchr_2[chrom][start:end]
    if (len(bincounts1) != len(comp)) or (len(bincounts2) != len(comp)):
        raise ValueError('comp must have one entry per bin in [start, end)')

    vals1 = make_vals(bincounts1, comp)
    vals2 = make_vals(bincounts2, comp)

    # Plot the actual per-read compartment scores (previously random
    # placeholder data was plotted instead).
    combined = pd.DataFrame({
        'score': np.concatenate([vals1, vals2]),
        'class': [labels[0]] * len(vals1) + [labels[1]] * len(vals2),
    })
    ax = sns.boxplot(data=combined, x='class', y='score', order=list(labels))

    pval = stats.ttest_ind(vals1, vals2)[1]
    # Axes-fraction coordinates keep the label inside the plot whatever the
    # scale of the compartment scores.
    ax.text(0.02, 0.97, 'pval=' + str(round(pval, 3)),
            transform=ax.transAxes, va='top')
    plt.xlabel('')
    plt.ylabel('Compartment Score', fontsize=14)
    plt.title("Compartment Score Distributions (" + chrom + ")", fontsize=16)
    return vals1, vals2

In [5]:
# Set these variables, run.
bin_size = 25_000
start = 200
end = 700
bowtie1 = '/Users/michaelstadler/Bioinformatics/Projects/insulators/data/mapping/GAreps/HiC-68-all.bowtie'
bowtie2 = '/Users/michaelstadler/Bioinformatics/Projects/insulators/data/mapping/GAreps/HiC-69-all.bowtie'
matrix_file_stem = '/Users/michaelstadler/Bioinformatics/Projects/insulators/outputs/hic-72_binCounts_25kB_chr' 
outstem = "/Users/michaelstadler/Bioinformatics/Projects/insulators/data/figure_data/HiC-68-69"

# count_bins returns counts for every chromosome at once, so each alignment
# file only needs to be parsed a single time, not once per chromosome.
bc1, maxbins = count_bins(bowtie1, bin_size)
bc2, _ = count_bins(bowtie2, bin_size)

for chrom in ['2L', '2R', '3L', '3R', 'X']:
    # Compartment vector from the contact matrix of this chromosome.
    mat = np.loadtxt(matrix_file_stem + chrom + '.txt.gz')
    comp = process_plot_compartments(mat, start, end, plot=True)
    plt.tight_layout()
    plt.savefig(outstem + '-' + chrom + '-compartments.png')
    plt.close()

    # Normalized 1D read counts for both datasets over the same bins.
    bc1_normed, bc2_normed = plot_1D_counts(bc1, bc2, maxbins, chrom, start, end)
    plt.tight_layout()
    plt.savefig(outstem + '-' + chrom + '-1Dcounts.png')
    plt.close()

    # Compartment score vs. log fold change between the datasets.
    plot_compartment_vs_foldchange(bc1_normed, bc2_normed, comp, chrom)
    plt.tight_layout()
    plt.savefig(outstem + '-' + chrom + '-comp-vs-foldchange.png')
    plt.close()

    # Per-read compartment score distributions.
    v1, v2 = boxplots(bc1, bc2, comp, chrom, start, end)
    plt.tight_layout()
    plt.savefig(outstem + '-' + chrom + '-boxplot.png')
    plt.close()

[0.73618618 0.0388958 ]


  plt.tight_layout()


30196.0 23021.0
[0.40604794 0.17087193]


  plt.tight_layout()


57517.0 48152.0
[0.66275786 0.05681775]


  plt.tight_layout()


27586.0 21073.0
[0.79668149 0.04013728]


  plt.tight_layout()


43352.0 35212.0
[0.71799179 0.01886924]


  plt.tight_layout()


20186.0 15772.0
