In [None]:
import pandas as pd
import numpy as np
from compositions import clr, multiplicative_aitchison, aitchison
from skbio.stats.composition import multiplicative_replacement
from scipy.spatial.distance import euclidean, braycurtis
import sys
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Colours of the active seaborn colour cycle, indexed by position.
cb_palette = sns.color_palette(as_cmap=True)

# Every tuple lists the labels (long and short spellings) that share one
# colour; the tuple's position is the index into cb_palette.
_alias_groups = (
    ("Expected", "expected"),
    ("woltka", "wol"),
    ("jams",),
    ("wgsa", "wgsa2"),
    ("biobakery3", "bio3"),
    ("biobakery4", "bio4"),
)

# Label -> colour lookup used as the default palette for the plots below.
color_palette = {
    label: cb_palette[position]
    for position, labels in enumerate(_alias_groups)
    for label in labels
}

In [None]:
# Load the debug table: the first CSV column becomes the index and only the
# next two columns are kept (read below as 'x' and 'y').
df = pd.read_csv('woltka_debug.csv', index_col=0, usecols=[0, 1, 2])

# Pseudocount: one hundredth of the smallest strictly positive value in x.
minimum = df['x'].values
minimum = minimum[minimum > 0].min() / 100
# print(minimum)

# Zero-replace both vectors with the same pseudocount (derived from x only).
x = multiplicative_replacement(df['x'].values, delta=minimum)
y = multiplicative_replacement(df['y'].values, delta=minimum)

# Tabulate the replaced values next to their clr() transforms, then take the
# square root of the summed squared differences between the two transforms.
new_df = pd.DataFrame({'x': x, 'y': y})
new_df['x_clr'] = clr(x)
new_df['y_clr'] = clr(y)
new_df['(o-e)^2'] = new_df['x_clr'].sub(new_df['y_clr']).pow(2)
clr_value = np.sqrt(new_df['(o-e)^2'].sum())

# Debugging aids, left disabled:
# print(clr_value)
# print(multiplicative_aitchison(x, y))
# display(df)
# display(new_df)
# new_df.to_csv("replaced.csv")

In [None]:
def clr_df(df: pd.DataFrame):
    """Apply ``clr`` to the adjusted abundance columns of ``df``.

    The ``RA_exp_adj`` and ``RA_obs_adj`` columns are overwritten in place
    with the transformed values (expected first, then observed), and the same
    frame object is returned.
    """
    for column in ('RA_exp_adj', 'RA_obs_adj'):
        df[column] = clr(df[column].values)
    return df

In [None]:
# Load the bivariate test table; the first CSV column becomes the index.
# The cells below read its RA_expected, RA_observed and Source_observed columns.
data = pd.read_csv('test_data/bivariate.csv', index_col=0)

def calc_minimum(df, col, divisor=10):
    """Return a pseudocount derived from the smallest positive value in a column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``col``.
    col : str
        Name of the numeric column to scan.
    divisor : float, optional
        The smallest strictly positive value is divided by this number.
        Defaults to 10, the value that was previously hard-coded.

    Returns
    -------
    float
        The smallest strictly positive entry of ``df[col]`` divided by
        ``divisor``. Zeros, negative values and NaN are ignored.

    Raises
    ------
    ValueError
        If ``divisor`` is not positive, or if ``col`` holds no strictly
        positive value (there is then nothing to derive a pseudocount from).
    """
    if divisor <= 0:
        raise ValueError(f"divisor must be positive, got {divisor!r}")

    values = df[col].values
    # NaN compares False against 0, so missing values are dropped here too.
    positives = values[values > 0]
    if len(positives) == 0:
        raise ValueError(f"column {col!r} contains no positive values")
    return positives.min() / divisor

def replace_df_values(df, minimum):
    """Add zero-replaced versions of the abundance columns to ``df``.

    ``RA_expected`` and ``RA_observed`` are each passed through
    ``multiplicative_replacement`` with ``delta=minimum``; the results are
    stored in ``RA_exp_adj`` and ``RA_obs_adj`` respectively. ``df`` is
    modified in place and the same object is returned.
    """
    column_pairs = (
        ("RA_expected", "RA_exp_adj"),
        ("RA_observed", "RA_obs_adj"),
    )
    for raw_col, adjusted_col in column_pairs:
        df[adjusted_col] = multiplicative_replacement(df[raw_col].values, delta=minimum)
    return df

def simulate_left_join(df: pd.DataFrame):
    """Return the rows of ``df`` whose ``RA_expected`` is strictly positive.

    Rows with a zero, negative or missing ``RA_expected`` are dropped.
    Presumably this mimics left-joining observations onto the expected
    table, as the name suggests.

    The result is an independent copy rather than a slice of ``df``, so
    callers can add or overwrite columns on it without triggering pandas'
    ``SettingWithCopyWarning`` and without any ambiguity about whether the
    input frame is affected.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an ``RA_expected`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
        New frame holding only the retained rows, original index preserved.
    """
    return df[df['RA_expected'] > 0].copy()

def linear_plot(input_df: pd.DataFrame, hue_category: str = 'Source_observed', colors: dict = color_palette):
    """Draw one log-log expected-vs-observed scatter plot per group.

    ``input_df`` is split on ``hue_category``. Each group is zero-replaced
    with ``replace_df_values`` (using that group's own ``calc_minimum`` of
    ``RA_expected``), displayed, plotted with ``sns.lmplot`` on log-log axes
    with a dashed identity line, and annotated with two distances between the
    adjusted expected and observed columns.

    Parameters
    ----------
    input_df : pd.DataFrame
        Must contain ``RA_expected``, ``RA_observed`` and the
        ``hue_category`` column. It is not modified; every group is copied
        before columns are added to it.
    hue_category : str
        Column used to split the data and to colour/facet each plot.
    colors : dict
        Mapping from ``hue_category`` values to colours, forwarded to seaborn
        as ``palette``.

    Returns
    -------
    None
        One seaborn figure per group is created as a side effect.
    """
    # Vertical position (axes coordinates) of the annotation text; it is
    # lowered after every group.
    pipeline_offset = -0.1
    for heading, group in input_df.groupby(hue_category):
        # Work on a copy: replace_df_values adds columns in place, and writing
        # to a frame handed out by groupby can raise SettingWithCopyWarning.
        group = replace_df_values(group.copy(), calc_minimum(group, 'RA_expected'))
        display(group)

        # Upper end of the identity line: the largest adjusted value on either axis.
        max_val = max(group['RA_exp_adj'].max(), group['RA_obs_adj'].max())

        # lmplot creates its own figure, so no figure is opened beforehand.
        grid = sns.lmplot(x="RA_exp_adj", y="RA_obs_adj", hue=hue_category, col=hue_category, col_wrap=2, data=group, fit_reg=False, height=7, aspect=1, ci=None, palette=colors)
        grid.set(xscale="log", yscale="log")

        # Dashed y = x reference line on every axes of the figure just created.
        for axis in plt.gcf().axes:
            axis.plot([0, max_val + 0.01], [0, max_val + 0.01], ls="--", c=".3")

        # Plain Euclidean distance between the zero-replaced columns. No
        # log-ratio transform is applied, so this is not an Aitchison distance
        # and is labelled accordingly.
        euclidean_dist = euclidean(group['RA_exp_adj'], group['RA_obs_adj'])
        plt.text(0.7, pipeline_offset, f'Euclidean = {euclidean_dist:.4f} for {heading}', transform=plt.gca().transAxes)
        try:
            # NOTE(review): scipy's braycurtis returns the Bray-Curtis
            # dissimilarity; confirm that the '1-BC' label uses the intended convention.
            bray_curtis = braycurtis(group['RA_exp_adj'].values, group['RA_obs_adj'].values)
            plt.text(0.3, pipeline_offset, f'1-BC = {bray_curtis:.4f} for {heading}', transform=plt.gca().transAxes)
        except ValueError:
            plt.text(0.3, pipeline_offset, f'1-BC = N/A for {heading}', transform=plt.gca().transAxes)

        pipeline_offset -= 0.05

# Plot the full table. A copy is used so that `data` itself stays untouched.
replace_df = data.copy()
# replace_df_values(replace_df, calc_minimum(data, 'RA_expected'))
# display(replace_df)

# linear_plot performs the zero replacement itself, separately for every
# Source_observed group, so no replacement is needed beforehand.
linear_plot(replace_df, hue_category='Source_observed', colors=color_palette)

In [None]:
# Same plot, restricted to the rows kept by simulate_left_join
# (those with RA_expected > 0).
left_df = data.copy()
left_df = simulate_left_join(left_df)
# Adds RA_exp_adj / RA_obs_adj using a pseudocount computed over the whole
# filtered table.
# NOTE(review): linear_plot recomputes both columns per Source_observed group,
# so the values written here are not the ones plotted. Confirm whether this
# call is still needed.
left_df = replace_df_values(left_df, calc_minimum(left_df, 'RA_expected'))
# display(left_df)
# left_df = clr_df(left_df)
# display(left_df)
linear_plot(left_df, hue_category='Source_observed', colors=color_palette)