---
title: "Collection of Comparison Functions"
author: "Laura Vairus"
date: "2023-08-21"
---

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
# Comparison functions

def get_diffmats(mat1, mat2):
    """
    Computes the signed difference between two arrays, scaled column by column

    Every column of (mat1 - mat2) is divided by the largest value found in
    that column across both inputs.

    Parameters:
    mat1 (np array): first array
    mat2 (np array): second array (assumed to have the same shape as mat1)

    Returns:
    np array of (mat1 - mat2) divided by the column-wise max of mat1 and mat2
    """
    diffmat = mat1 - mat2

    # largest value per column, taken over both matrices
    # NOTE: this is the signed max (not the max of absolute values), and a
    # column whose max is 0 makes the division below produce inf/nan
    tot_colwise_maxes = np.maximum(np.max(mat1, axis=0), np.max(mat2, axis=0))

    return diffmat / tot_colwise_maxes


def get_summary(arr):
    """
    input: any non-empty np array
    output: dict of its mean, median, std_dev, minimum, maximum, total_sum, q1, q3, and iqr
    (every statistic is computed over all elements, whatever the array's shape)
    """
    # compute each quartile once and reuse it for the iqr
    q1 = np.percentile(arr, 25)
    q3 = np.percentile(arr, 75)

    summary = {
        "mean": np.mean(arr),
        "median": np.median(arr),
        "std_dev": np.std(arr),
        "minimum": np.min(arr),
        "maximum": np.max(arr),
        "total_sum": np.sum(arr),
        "q1": q1,
        "q3": q3,
        "iqr": q3 - q1,
    }
    return summary


def get_colwise_summary_df(arr):
    """
    Summarizes every column of an array

    Parameters:
    arr (np array): array with at least 2 dimensions; columns are taken along axis 1

    Returns:
    pd dataframe with one row per column index of arr and one column per
    statistic returned by get_summary()
    """
    # map each column index to the summary dict of that column
    summaries_by_col = {
        idx: get_summary(arr[:, idx])
        for idx in range(arr.shape[1])
    }
    # the dict keys become dataframe columns, so flip to get one row per array column
    return pd.DataFrame(summaries_by_col).T


def plot_tols_vs_outs(arr, mintol, maxtol, steps):
    """
    Scatter-plots, for a range of tolerances, the share of values in a np array
    that exceed each tolerance (and prints the numbers for every tolerance).
    Use it to compare candidate tolerances before picking one.

    Parameters:
    arr (np array): array you're interested in
    mintol (number): smallest tolerance to try
    maxtol (number): largest tolerance to try
    steps (int): how many evenly spaced tolerances to try, from maxtol down to mintol

    Returns:
    None
    """
    # tolerances are visited from the largest to the smallest
    tols = np.linspace(maxtol, mintol, steps)
    out_shares = []
    for tol in tols:
        num_outs = len(get_outliers(arr, tol))
        # share of all elements that are outliers, as a fraction between 0 and 1
        out_perc = num_outs/arr.size
        print(f'tolerance: {round(tol, 2)}, num outs: {num_outs}, out percent: {out_perc}')
        out_shares.append(out_perc)

    plt.scatter(tols, out_shares)
    plt.xlabel('tolerance')
    plt.ylabel('percentage of outliers')
    plt.show()


def get_outliers(arr, tol):
    """
    picks out the values of a np array that are strictly greater than a tolerance
    (with a tolerance of 1, every value above 1 counts as an outlier; 1 itself does not)

    Parameters:
    arr (np array): array you're interested in
    tol (number): max value you tolerate

    Returns:
    a 1D np array holding every value above the tolerance
    """
    # boolean mask, True wherever the value exceeds the tolerance
    above_tol = arr > tol
    return arr[above_tol]


def get_outlier_inds(arr, tol):
    """
    returns the index of every value of a 2D np array that is above a specified tolerance

    Parameters:
    arr (np array): 2D array you're interested in (for arrays with more
        dimensions only the first two index axes are reported)
    tol (number): max value you tolerate, anything above this is considered an outlier

    Returns:
    a list of 0: a (n_outliers, 2) np array of (row, column) index pairs,
    1: a np array of row indices, and 2: a np array of column indices,
    of every value above the tolerance
    """
    out_inds = np.where(arr > tol)
    row_inds = out_inds[0]
    col_inds = out_inds[1]
    # stack the two index arrays side by side; unlike zipping through a Python
    # list this keeps the (n, 2) shape even when there are no outliers
    out_inds_arr = np.column_stack((row_inds, col_inds))

    return [out_inds_arr, row_inds, col_inds]


def plot_hist(arr, bin_num, xlab='Value', ylab='Frequency', title='Histogram'):
    """
    draws and shows a histogram of a np array

    Parameters:
    arr (np array): array you're interested in
    bin_num (int): amount of bins you want plotted
    xlab (str): x-axis label
    ylab (str): y-axis label
    title (str): title label

    Returns:
    None
    """
    plt.hist(arr, bins=bin_num)
    # label the axes, then title the figure
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.title(title)
    plt.show()


def make_outs_df(arr, tol, targets=None):
    """
    builds a dataframe describing every value of a 2D np array that is above a tolerance

    Parameters:
    arr (np array): 2D array you're interested in
    tol (number): max value you tolerate, anything above this is considered an outlier
    targets (pd dataframe): metadata table with the columns 'description',
        'system_slims', 'cell_slims', 'organ_slims' and 'developmental_slims';
        column j of arr is looked up as row j (by position) of this table.
        Defaults to the notebook-level variable df_targets.

    Returns:
    pd dataframe with one row per outlier and the columns 'track', 'bin', 'diff',
    'system_slims', 'cell_slims', 'organ_slims', 'developmental_slims',
    sorted by ascending 'diff'
    """
    if targets is None:
        # fall back to the metadata table defined at notebook level
        targets = df_targets

    # get outliers and their inds (both come back in the same order)
    outs = get_outliers(arr, tol)
    _, row_inds, col_inds = get_outlier_inds(arr, tol)

    # the outlier's column picks the track metadata, its row is the bin
    data = {
        'track': targets['description'].iloc[col_inds],
        'bin': row_inds,
        'diff': outs,
    }
    for slim_col in ('system_slims', 'cell_slims', 'organ_slims', 'developmental_slims'):
        data[slim_col] = targets[slim_col].iloc[col_inds]

    # sort by ascending difference
    return pd.DataFrame(data).sort_values('diff')


def get_bin_info(df):
    """
    gets the bin of every row in a df and
    plots a bar graph of which bins show up and how often

    Parameters:
    df (pd dataframe): dataframe from make_outs_df() you're interested in

    Returns (tuple): a list of every bin (one entry per row of df), and a
    dictionary of every bin and its count, ordered from most to least frequent
    """
    bins = df['bin'].tolist()

    # most_common() orders by count, largest first; ties keep first-seen order
    bin_counts_sorted = dict(Counter(bins).most_common())

    # plot bar graph of bins
    plt.bar(bin_counts_sorted.keys(), bin_counts_sorted.values())
    plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
    plt.xlabel('Bin')
    plt.ylabel('Count')
    plt.title('Bin Counts')
    plt.show()

    return (bins, bin_counts_sorted)


def get_spec_track_info(df, substr):
    """
    gets the tracks of a specified type from a df and
    plots a bar graph of which ones show up and how often
    (can be used to isolate CAGE tracks, DNase tracks, etc.)

    Parameters:
    df (pd dataframe): dataframe from make_outs_df() you're interested in
    substr (str): pattern a track name must contain (ex: 'CAGE')
        NOTE: pandas treats this as a regular expression by default, so
        characters such as '(' or '+' carry their regex meaning

    Returns (tuple): a list of every track matching the pattern (one entry per
    row of df), and a dictionary of every such track and its count, ordered
    from most to least frequent
    """
    track_col = df['track']
    # rows with a missing track name never match (na=False)
    is_match = track_col.str.contains(substr, na=False)
    matching_tracks = track_col[is_match].tolist()

    # most_common() orders by count, largest first; ties keep first-seen order
    counts_sorted = dict(Counter(matching_tracks).most_common())

    # plot bar graph of the matching tracks
    plt.bar(counts_sorted.keys(), counts_sorted.values())
    plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
    plt.xlabel('Track')
    plt.ylabel('Count')
    plt.title('Track Counts')
    plt.show()

    return (matching_tracks, counts_sorted)


def _plot_slim_counts(df, column, label):
    """
    shared worker for get_system_info(), get_cell_info(), get_organ_info()
    and get_developmental_info()

    Parameters:
    df (pd dataframe): dataframe from make_outs_df()
    column (str): name of the df column holding ", "-separated terms
    label (str): name used for the x-axis label and, followed by ' Counts', the title

    Returns (tuple): a list of every term found (with repeats), and a dictionary
    of every term and its count, ordered from most to least frequent
    """
    # expand the ", "-separated entries into one flat list of terms
    expanded = []
    for entry in df[column].tolist():
        # missing values (NaN, None) are not strings and have nothing to split
        if not isinstance(entry, str):
            continue
        expanded.extend(entry.split(", "))

    # most_common() orders by count, largest first; ties keep first-seen order
    counts_sorted = dict(Counter(expanded).most_common())

    # plot bar graph of the counts
    plt.bar(counts_sorted.keys(), counts_sorted.values())
    plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
    plt.xlabel(label)
    plt.ylabel('Count')
    plt.title(f'{label} Counts')
    plt.show()

    return (expanded, counts_sorted)


def get_system_info(df):
    """
    counts every system listed in the 'system_slims' column of a df and
    plots a bar graph of the counts

    Parameters:
    df (pd dataframe): dataframe from make_outs_df() you're interested in

    Returns (tuple): the full expanded system list, and a dictionary of every
    system and its count, ordered from most to least frequent
    """
    return _plot_slim_counts(df, 'system_slims', 'System')


def get_cell_info(df):
    """
    counts every cell listed in the 'cell_slims' column of a df and
    plots a bar graph of the counts

    Parameters:
    df (pd dataframe): dataframe from make_outs_df() you're interested in

    Returns (tuple): the full expanded cell list, and a dictionary of every
    cell and its count, ordered from most to least frequent
    """
    return _plot_slim_counts(df, 'cell_slims', 'Cell')


def get_organ_info(df):
    """
    counts every organ listed in the 'organ_slims' column of a df and
    plots a bar graph of the counts

    Parameters:
    df (pd dataframe): dataframe from make_outs_df() you're interested in

    Returns (tuple): the full expanded organ list, and a dictionary of every
    organ and its count, ordered from most to least frequent
    """
    return _plot_slim_counts(df, 'organ_slims', 'Organ')


def get_developmental_info(df):
    """
    counts every developmental stage listed in the 'developmental_slims'
    column of a df and plots a bar graph of the counts

    Parameters:
    df (pd dataframe): dataframe from make_outs_df() you're interested in

    Returns (tuple): the full expanded developmental stage list, and a
    dictionary of every stage and its count, ordered from most to least frequent
    """
    return _plot_slim_counts(df, 'developmental_slims', 'Developmental Stage')

