## Notebook to generate Circos plots for age regression results

In [None]:
# Run the shell `date` command so the notebook output records when it was executed
!date

#### import libraries

#### set notebook variables

In [None]:
# parameters
modality = "GEX"  # either "GEX" or "ATAC"
category = "cluster_name"  # "curated_type" selects broad, "cluster_name" selects specific
REGRESSION_TYPE = "rlm"  # one of "glm", "glm_tweedie", "rlm"

In [None]:
# parameters
# pandas is imported here because this cell configures a pandas display option
# and runs before the notebook's later `import pandas as pd` cell.
import pandas as pd

project = 'aging_phase2'
# Map the chosen annotation column to the short prefix used for it
if category == 'curated_type':
    prefix_type = 'broad'
elif category == 'cluster_name':
    prefix_type = 'specific'

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
quants_dir = f'{wrk_dir}/quants'
results_dir = f'{wrk_dir}/results'
figures_dir = f'{wrk_dir}/figures'

# in files

# out files

# constants
DEBUG = True
# Fixed: this was `pd_set_option(...)`, an undefined name that raised NameError.
pd.set_option('display.max_rows', 500)

In [None]:
from itertools import combinations

import pandas as pd
from holoviews import Chord, extension

# Load the Bokeh backend before building the plot: HoloViews needs an active
# plotting backend to validate the options passed to `.opts(...)`.
extension("bokeh")

# Sample data (replace with your actual data): result-set name -> genes in that set
data = {
    "result_set1": ["GeneA", "GeneB", "GeneC", "GeneD"],
    "result_set2": ["GeneB", "GeneC", "GeneE", "GeneF"],
    "result_set3": ["GeneC", "GeneD", "GeneG", "GeneH"]
}

# Convert data to a pandas DataFrame, one column per result set.
# Wrapping each list in a Series pads shorter result sets with NaN, so the
# result sets are not required to have the same number of genes.
df = pd.DataFrame({name: pd.Series(genes) for name, genes in data.items()})

# Count how often each gene occurs across all result sets
gene_counts = df.stack().value_counts()

# Calculate the number of shared genes between each pair of result sets.
# Each unordered pair is visited once; pairs with no overlap are skipped
# because they would contribute no chord to the diagram.
gene_sets = {name: set(genes) for name, genes in data.items()}
chord_data = []
for set_i, set_j in combinations(gene_sets, 2):
    shared_genes = len(gene_sets[set_i] & gene_sets[set_j])
    if shared_genes:
        chord_data.append({"source": set_i, "target": set_j, "count": shared_genes})

# Explicit columns keep the frame well-formed even when no pair overlaps
chord_df = pd.DataFrame(chord_data, columns=["source", "target", "count"])

# Create the chord diagram
chord = Chord(chord_df).opts(
    aspect="equal",  # Maintain a circular shape
    labels="index",  # Label each node with its id, i.e. the result-set name
)

# Display the chord diagram
chord


## example from the Medium post [Probably the Best Data Visualisation for Showing Many-to-Many Proportion In Python](https://towardsdatascience.com/probably-the-best-data-visualisation-for-showing-many-to-many-proportion-in-python-40bdd24365d5) by Christopher Tao on using Circos from pycirclize to plot chord diagrams

In [None]:
from pycirclize import Circos

# Sector name -> value handed to Circos, and the colour passed as `fc` to each sector's track axis
sectors = dict(A=100, B=200, C=150)
sector_colors = dict(A="red", B="blue", C="green")
circos = Circos(sectors, space=5)

# One outer track per sector: coloured axis, white label, ticks every 10 units
for sector in circos.sectors:
    track = sector.add_track((95, 100))
    track.axis(fc=sector_colors[sector.name])
    track.text(f"Sector {sector.name}", color="white", size=12)
    track.xticks_by_interval(10)

# (sector, start, end) region pairs to connect, drawn in this order
link_regions = [
    (("A", 0, 20), ("B", 50, 70)),
    (("A", 20, 40), ("C", 30, 50)),
    (("B", 80, 100), ("A", 40, 60)),
    (("C", 100, 120), ("B", 150, 170)),
]
for region_from, region_to in link_regions:
    circos.link(region_from, region_to)

fig = circos.plotfig()

In [None]:
from pycirclize import Circos
import pandas as pd

# Matrix for the diagram: rows are cities, columns are the three named categories
row_names = ["Sydney", "Melbourne", "Brisbane"]
col_names = ["Property", "Life", "Automobile"]
data = [
    [100, 150, 200],
    [80, 120, 160],
    [60, 90, 130],
]
df = pd.DataFrame(data, index=row_names, columns=col_names)

# Appearance settings, collected in one place and forwarded to pycirclize
matrix_plot_options = {
    "space": 5,  # Space between sectors
    "ticks_interval": 50,  # Ticks every 50
    "r_lim": (93, 100),  # Radius limits for sectors
    "cmap": "tab10",  # Built-in colour map
    "label_kws": {"r": 94, "size": 12, "color": "white"},  # Font of the sector labels
    "link_kws": {"ec": "black", "lw": 0.5},  # Style of the links
}

# Build the Circos diagram (sectors plus links) straight from the matrix
circos = Circos.initialize_from_matrix(df, **matrix_plot_options)

fig = circos.plotfig()

In [None]:
# Display the current `df` as the cell output
df

### human genome examples from pycirclize docs [4-2. Segmental Dups Link](https://moshi4.github.io/pyCirclize/circos_plot/#4-2-segmental-dups-link)

In [None]:
from pycirclize import Circos
from pycirclize.utils import ColorCycler, load_eukaryote_example_dataset

# hg38 example dataset (https://github.com/moshi4/pycirclize-data/tree/main/eukaryote/hg38)
chr_bed_file, cytoband_file, chr_links = load_eukaryote_example_dataset("hg38")

# Chromosome sectors come from the BED file; the cytoband track is layered on top
circos = Circos.initialize_from_bed(chr_bed_file, space=3)
circos.text("Homo sapiens\n(hg38)", deg=315, r=150, size=12)
circos.add_cytoband_tracks((95, 100), cytoband_file)

# One colour per chromosome, taken from the "hsv" colour map
ColorCycler.set_cmap("hsv")
chr_names = [s.name for s in circos.sectors]
colors = ColorCycler.get_color_list(len(chr_names))
chr_name2color = dict(zip(chr_names, colors))


def format_megabases(position):
    """Format a base-pair position as a whole number of megabases, e.g. '40 Mb'."""
    return f"{position / 1000000:.0f} Mb"


# Chromosome names outside the ring, and ticks every 40 Mb on the cytoband track
for sector in circos.sectors:
    sector.text(sector.name, r=120, size=10, color=chr_name2color[sector.name])
    cytoband_track = sector.get_track("cytoband")
    cytoband_track.xticks_by_interval(
        40000000,
        label_size=8,
        label_orientation="vertical",
        label_formatter=format_megabases,
    )

# Only draw links that start on one of these chromosomes and end on a different one
highlighted_chrs = ("chr1", "chr8", "chr16")
for link in chr_links:
    region1 = (link.query_chr, link.query_start, link.query_end)
    region2 = (link.ref_chr, link.ref_start, link.ref_end)
    color = chr_name2color[link.query_chr]
    if not (link.query_chr in highlighted_chrs and link.query_chr != link.ref_chr):
        continue
    circos.link(region1, region2, color=color)

fig = circos.plotfig()

In [None]:
# NOTE(review): `v` is never assigned at notebook scope in the visible cells (it only
# appears as a lambda parameter in the cell above), so this cell raises NameError on
# Restart & Run All. It looks like a stray keystroke; consider deleting the cell.
v

## rename output

In [None]:
import os

# Working directory of the project and the results folder beneath it
wrk_dir = "/labshare/raph/datasets/adrd_neuro/brain_aging/phase2"
results_dir = wrk_dir + "/results"

In [None]:
# Work out the old and new path for every '*.glm_age.csv' file in the results directory.
# The rename call itself is commented out, so no file is changed.
for filename in os.listdir(results_dir):
    if not filename.endswith('.glm_age.csv'):
        continue
    new_filename = filename.replace('.glm_age.csv', '.rlm.age.csv')
    old_path, new_path = (
        os.path.join(results_dir, name) for name in (filename, new_filename)
    )
    # os.rename(old_path, new_path)

## test if distribution is normal or Tweedie

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

def test_distribution(data):
    """
    Analyzes data for characteristics of Gaussian or Tweedie distribution.
    
    Args:
      data (np.ndarray): 1D array of data points.
    
    Prints:
      Analysis of distribution based on tests and visualizations.
    """
    
    # Shapiro-Wilk test for normality
    stat, pval = stats.shapiro(data)
    print(f"Shapiro-Wilk normality test: statistic = {stat:.4f}, p-value = {pval:.4f}")
    
    # QQ-plot for visual comparison
    plt.hist(data, density=True)
    stats.probplot(data, dist="norm")
    plt.title("QQ-plot for normality")
    plt.show()
    # Variance analysis (suggestive for Tweedie)
    variance = np.var(data)
    mean = np.mean(data)
    f = variance / mean**2
    print(f"Variance to mean ratio: {f:.4f}")
    
    # Print conclusion based on findings
    if pval > 0.05:
        print("Shapiro-Wilk test suggests normality (may not be conclusive).")
    else:
        print("Shapiro-Wilk test rejects normality.")
    if f > 1:
        print(f"High variance to mean ratio suggests potential for Tweedie distribution.")
    else:
        print(f"Variance to mean ratio inconclusive for Tweedie distribution.")
    
    print("\n**Note:** These tests provide insights but don't definitively confirm distributions. Consider domain knowledge and further exploration for a more robust conclusion.")

# Example usage
rng = np.random.default_rng(42)  # fixed seed so the simulated sample is the same on every run
data = rng.normal(size=100)  # Simulate Gaussian data
test_distribution(data)