In [None]:
import pandas as pd
import skbio
from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa
import altair as alt

In [None]:
# Load the combined Emu table (it carries a `tax_id` column) and the SILVA
# taxonomy lookup, whose first column becomes the index used for tax_id lookups.
df = pd.read_csv("../data/processed/emu/emu-combined-tax_id.tsv", sep="\t")
df_tax = pd.read_csv("../resources/database/silva/taxonomy_split.tsv", sep="\t", index_col=0)
tax_columns = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']


def _rank_of(tax_id, rank):
    """Return the `rank` entry of `df_tax` for `tax_id`, or None when it cannot be resolved."""
    if tax_id == "unassigned":
        return None
    key = int(tax_id)
    if key not in df_tax.index:
        return None
    return df_tax.at[key, rank]


# Attach one column per taxonomic rank to the abundance table.
for col in tax_columns:
    df[col] = df['tax_id'].apply(_rank_of, args=(col,))

In [None]:
def visualize_abundance(df, df_tax, level="tax_id"):
    """Show a stacked bar chart of relative abundance per barcode at one taxonomic level.

    Rows of ``df`` are summed per value of ``level`` before plotting, so the
    function works for any rank and not only for ``"tax_id"``. The ten most
    abundant taxa (summed over all barcodes) each get a distinct colour; every
    other taxon is drawn in grey.

    Parameters
    ----------
    df : pandas.DataFrame
        Abundance table. Must contain the ``level`` column. Every column that
        is not one of the taxonomy columns (the seven ranks plus ``tax_id``)
        is treated as a barcode holding abundances. The frame is not modified.
    df_tax : pandas.DataFrame
        Taxonomy lookup indexed by integer tax_id with one column per rank.
        Only consulted when ``level == "tax_id"``.
    level : str, default "tax_id"
        One of ``superkingdom``, ``phylum``, ``class``, ``order``, ``family``,
        ``genus``, ``species`` or ``tax_id``.

    Returns
    -------
    None
        The chart is displayed through ``chart.show()``.

    Raises
    ------
    ValueError
        If ``level`` is not one of the supported taxonomy columns.
    KeyError
        If ``df`` has no ``level`` column.
    """
    rank_columns = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    full_tax_columns = rank_columns + ['tax_id']
    if level not in full_tax_columns:
        raise ValueError(f"level must be one of {full_tax_columns}, got {level!r}")

    # Everything that is not a taxonomy column is a barcode column. Selecting by
    # exclusion also works when `df` has not been annotated with every rank.
    sample_columns = [col for col in df.columns if col not in full_tax_columns]

    # Rows without a label at this level would silently vanish from a groupby;
    # keep them under an explicit "unassigned" label so the bars stay complete.
    labels = df[level]
    if labels.isna().any():
        labels = labels.astype(object).where(labels.notna(), "unassigned")

    # Aggregate to one row per taxon. min_count=1 keeps "no measurement at all"
    # as NaN (dropped below) instead of turning it into a zero-height segment;
    # sort=False preserves input order and avoids comparing mixed-type labels.
    abundance = df[sample_columns].groupby(labels, sort=False).sum(min_count=1)

    # Identify the top 10 taxa by total abundance across all barcodes
    total_abundance = abundance.sum(axis=1)
    top_10_labels = total_abundance.nlargest(10).index

    # Define colors for the top 10 taxa; everything else falls back to grey
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
              '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
    color_map = dict(zip(top_10_labels, colors))

    # Long format for Altair: one row per (taxon, barcode) with a measured value
    df_melted = (
        abundance.reset_index()
        .melt(id_vars=[level], var_name='barcode', value_name='relative_abundance')
        .dropna(subset=['relative_abundance'])
        .assign(color=lambda frame: frame[level].map(lambda label: color_map.get(label, 'grey')))
    )

    # Build a per-taxon lineage table (indexed by the level's labels) for the tooltip
    if level == 'tax_id':
        def _lookup(tax_id, rank):
            # Unknown or unassigned ids have no lineage in df_tax
            if tax_id == "unassigned":
                return None
            key = int(tax_id)
            return df_tax.at[key, rank] if key in df_tax.index else None

        lineage_columns = rank_columns
        lineage = pd.DataFrame(
            {rank: [_lookup(tax_id, rank) for tax_id in abundance.index] for rank in rank_columns},
            index=abundance.index,
        )
    else:
        # Only ranks above `level` are well defined for an aggregated taxon; take
        # them from the annotated input (first non-null value within each group).
        higher_ranks = rank_columns[:rank_columns.index(level)]
        lineage_columns = [rank for rank in higher_ranks if rank in df.columns]
        lineage = (
            df[lineage_columns].groupby(labels, sort=False).first()
            if lineage_columns else None
        )

    if lineage is not None:
        df_melted = df_melted.join(lineage, on=level)

    # Only reference columns that exist in the plotted data
    tooltip_columns = lineage_columns + [level]
    title = (
        'Relative Distribution of Taxa ID'
        if level == 'tax_id'
        else f'Relative Distribution of Taxa ({level})'
    )

    # Create the Altair chart
    chart = alt.Chart(df_melted).mark_bar().encode(
        x='barcode:N',
        y='relative_abundance:Q',
        color=alt.Color('color:N', scale=None),
        tooltip=tooltip_columns + ['relative_abundance']
    ).properties(
        title=title
    )

    # Display the chart
    chart.show()

# Render the per-barcode abundance chart at tax_id resolution.
visualize_abundance(df=df, df_tax=df_tax, level="tax_id")

In [None]:
# Abundance matrix indexed by tax_id: strip the taxonomy annotation columns so
# only the per-sample abundance columns remain. `drop` already returns a new
# frame, so `df` itself is left untouched.
df_matrix = (
    df
    .drop(columns=[
        'superkingdom',
        'phylum',
        'class',
        'order',
        'family',
        'genus',
        'species',
    ])
    .set_index("tax_id")
)

In [None]:
# Display the tax_id-indexed abundance matrix as this cell's output.
df_matrix

In [None]:
# Bray-Curtis dissimilarity between samples. Missing abundances are treated as
# zero, and the matrix is transposed so that each sample (column) becomes a row.
df_matrix = df_matrix.fillna(0)
distance_matrix = beta_diversity('braycurtis', df_matrix.T, ids=df_matrix.columns)

# Ordinate the samples with PCoA and flatten the coordinates for plotting;
# reset_index moves the sample ids into a regular column for the tooltip.
pcoa_results = pcoa(distance_matrix)
pcoa_df = pcoa_results.samples.reset_index()

# Axis titles report the share of variance carried by each of the first two axes.
explained = pcoa_results.proportion_explained
pc1_title = f'PC1 ({explained.iloc[0]:.2%} variance)'
pc2_title = f'PC2 ({explained.iloc[1]:.2%} variance)'

# Scatter plot of the samples in the PC1/PC2 plane.
base = (
    alt.Chart(pcoa_df)
    .mark_circle(size=100)
    .encode(
        x=alt.X('PC1', title=pc1_title),
        y=alt.Y('PC2', title=pc2_title),
        tooltip=['index', 'PC1', 'PC2'],
    )
    .properties(
        title='Principal Coordinate Analysis (PCoA)',
        width=600,
        height=600,
    )
)

# Red reference rules through the origin (x=0 and y=0).
vline = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='red').encode(x='x:Q')
hline = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='red').encode(y='y:Q')

# Layer the scatter plot with both reference rules and display the result.
chart = base + vline + hline
chart.show()

In [None]:
# Display the per-axis proportions that the PCoA chart's axis titles report as variance.
pcoa_results.proportion_explained