In [1]:
import os
import time
import numpy as np
import pandas as pd
import itertools
from datetime import datetime
from typing import Iterable
from typing import List, Optional
import matplotlib.pyplot as plt  

# Functions

In [2]:
def detect_markers_type(df):
    """
    Classify the marker columns of ``df`` as microsatellites or SNPs.

    Every column after the first one is treated as a marker column. When at
    least one cell, rendered as a string, contains "/", the table is labelled
    "microsatellites"; otherwise it is labelled "SNPs".

    The label is printed, stored in the module-level variable
    ``markers_type`` and returned.
    """
    global markers_type  # create/update the module-level variable

    # Message to print for each possible label.
    messages = {
        "microsatellites": "Type of markers detected: Microsatellites",
        "SNPs": "Type of markers detected: SNPs",
    }

    # Scan the marker cells and stop at the first one holding a slash.
    label = "SNPs"
    for cell in df[df.columns[1:]].stack():
        if "/" in str(cell):
            label = "microsatellites"
            break

    markers_type = label
    print(messages[label])

    return markers_type


In [228]:
import re
import numpy as np
import pandas as pd
from collections import Counter

# IUPAC ambiguity codes for two-base combinations, each mapped to the pair of
# nucleotides it stands for. In the code visible here only the keys are used
# (membership test when encoding IUPAC-coded columns).
_IUPAC = {
    'R': ('A', 'G'), 'Y': ('C', 'T'), 'S': ('G', 'C'),
    'W': ('A', 'T'), 'K': ('G', 'T'), 'M': ('A', 'C')
}

def normalize_snp_alleles(genotype):
    """
    Reduce a diploid SNP genotype to its two nucleotide letters.

    The value is upper-cased and every A/C/G/T character is collected in
    order, so any separator or quoting is ignored: ``AA``, ``A/A``, ``a-t``,
    ``A:T``, ``A;T``, ``A|T`` and ``"A;A"`` are all accepted.

    Returns:
        A 2-tuple such as ``('A', 'T')`` when exactly two nucleotides are
        found; ``None`` for missing values or any other nucleotide count.
    """
    if pd.isna(genotype):
        return None

    # Keep only valid nucleotide letters; everything else is a separator.
    bases = tuple(ch for ch in str(genotype).upper() if ch in "ACGT")
    return bases if len(bases) == 2 else None


def detect_and_convert_markers_to_012(markers_df, sample_size=1000):
    """
    Detect the marker format of a genotype table and encode SNP-like columns as 0/1/2.

    The first column is kept untouched as the sample ID; every other column is
    treated as a marker. Up to ``sample_size`` cells are drawn (fixed seed) and
    classified; the dominant category among the NON-MISSING sampled cells
    decides the marker type, so sparse tables are still recognised.

    Parameters
    ----------
    markers_df : pandas.DataFrame
        Genotype table (ID column first, then one column per marker).
    sample_size : int
        Maximum number of cells inspected for format detection.

    Returns
    -------
    pandas.DataFrame
        A copy of ``markers_df``. For the types 'microsatellites', 'unknown'
        and 'presence_absence' the copy is returned unconverted. Otherwise
        each marker column is replaced by floats, column by column:

        - VCF GT ("0/1", "1|1"): sum of the two allele indices.
        - Numeric 0/1/2: the same values as floats.
        - Allelic genotypes ("AA", "A/T", ...): number of copies of the second
          most common allele of the column; the most common allele is the
          reference (0). A monomorphic column is therefore encoded as all 0.
        - IUPAC ambiguity codes: 1.0.
        - Anything else: numeric coercion keeping only 0, 1 and 2.

        Missing cells (NaN/None, blank strings, "nan", "none") become NaN.

    Raises
    ------
    TypeError
        If ``markers_df`` is not a DataFrame.
    ValueError
        If ``markers_df`` has no columns.

    Side effects
    ------------
    Prints the detected type and sets the module-level ``markers_type``.
    """

    if not isinstance(markers_df, pd.DataFrame):
        raise TypeError("markers_df must be a pandas DataFrame")

    if markers_df.shape[1] == 0:
        raise ValueError("Empty dataframe (no columns).")

    data_cols = list(markers_df.columns[1:])

    if not data_cols:
        print("No marker columns found (only ID column). Nothing to convert.")
        globals()['markers_type'] = None
        return markers_df.copy()

    # The same definition of "missing" is used for detection and conversion.
    missing_tokens = ('', 'nan', 'none')

    def is_missing(value):
        """True for NaN/None and for blank, 'nan' or 'none' strings."""
        return bool(pd.isna(value)) or str(value).strip().lower() in missing_tokens

    # ---- sample cells for detection ----
    stacked = markers_df[data_cols].astype(str).stack()
    if len(stacked) > sample_size:
        stacked = stacked.sample(n=sample_size, random_state=1)
    samples = stacked.tolist()

    # ---- regex patterns ----
    re_numeric_012 = re.compile(r'^[0-2]$')
    re_numeric_01 = re.compile(r'^[01]$')
    re_vcf_gt = re.compile(r'^[01][\/|][01]$')
    re_slash_numeric = re.compile(r'^\s*\d+\s*\/\s*\d+\s*$')
    re_iupac = re.compile(r'^[RYSWKMryswkm]$')

    # ---- classify sampled cells ----
    counts = Counter()

    for v in samples:
        s = str(v).strip()

        if is_missing(s):
            counts['missing'] += 1
            continue

        alleles = re.findall(r'[ACGTacgt]', s)

        if len(alleles) == 2:
            counts['allelic_diploid'] += 1
        elif re_vcf_gt.match(s):
            counts['vcf_gt'] += 1
        elif re_slash_numeric.match(s):
            counts['microsatellite_like'] += 1
        elif re_numeric_012.match(s):
            counts['numeric_012'] += 1
        elif re_numeric_01.match(s):
            # NOTE(review): never reached, because '0' and '1' already match
            # re_numeric_012 above; 'presence_absence' is therefore never
            # detected. Left as-is because 0/1 data cannot be told apart from
            # 0/1/2 SNP data without more information.
            counts['numeric_01'] += 1
        elif re_iupac.match(s):
            counts['iupac'] += 1
        else:
            counts['other'] += 1

    # Missing cells must not outvote real genotypes, so they are excluded
    # when looking for the dominant category.
    top_count = max(
        (n for kind, n in counts.items() if kind != 'missing'),
        default=0
    )

    # ---- decide marker type (first dominant category in priority order) ----
    priority = [
        ('microsatellite_like', 'microsatellites', "Detected numeric allele sizes (e.g. '150/152')."),
        ('vcf_gt', 'vcf_gt', "Detected VCF GT format (e.g. '0/1')."),
        ('numeric_012', 'numeric_012', "Detected numeric SNP encoding (0/1/2)."),
        ('allelic_diploid', 'allelic_diploid', "Detected diploid SNPs with arbitrary separators."),
        ('numeric_01', 'presence_absence', "Detected presence/absence markers (0/1)."),
        ('iupac', 'iupac', "Detected IUPAC ambiguity codes."),
    ]

    markers_type = 'unknown'
    explanation = "Could not confidently determine marker format."
    for category, type_name, text in priority:
        if counts[category] and counts[category] >= top_count:
            markers_type = type_name
            explanation = text
            break

    pretty = {
        'microsatellites': 'Microsatellites',
        'vcf_gt': 'VCF GT (0/1)',
        'numeric_012': 'Numeric SNPs (0/1/2)',
        'allelic_diploid': 'Allelic SNPs (any separator)',
        'presence_absence': 'Presence/Absence (0/1)',
        'iupac': 'IUPAC codes',
        'unknown': 'Unknown'
    }

    print(f"Markers type detected: {pretty.get(markers_type, markers_type)}")
    print("Details:", explanation)

    globals()['markers_type'] = markers_type

    # ---- formats not auto-converted ----
    if markers_type in ('microsatellites', 'unknown', 'presence_absence'):
        print("No automatic conversion performed for this marker type.")
        return markers_df.copy()

    # ---- cell-level encoders that do not depend on the column ----
    def map_vcf(x):
        if is_missing(x):
            return np.nan
        a, b = re.split(r'[\/|]', str(x).strip())
        return int(a) + int(b)

    def map_numeric(x):
        return np.nan if is_missing(x) else float(str(x).strip())

    # ---- column-wise conversion ----
    df_out = markers_df.copy()

    for col in data_cols:
        col_series = markers_df[col]
        # Stripped string form of every non-missing cell of this column.
        present = [str(v).strip() for v in col_series if not is_missing(v)]

        if not present:
            df_out[col] = np.nan
            continue

        # ---- VCF GT ----
        if all(re_vcf_gt.match(s) for s in present):
            df_out[col] = col_series.map(map_vcf).astype(float)
            continue

        # ---- numeric 0/1/2 ----
        if all(re_numeric_012.match(s) for s in present):
            df_out[col] = col_series.map(map_numeric).astype(float)
            continue

        # ---- diploid SNPs ----
        allele_list = []
        for g in present:
            norm = normalize_snp_alleles(g)
            if norm:
                allele_list.extend(norm)

        if allele_list:
            ranked = Counter(allele_list).most_common(2)
            # The most common allele is the reference (coded 0). A monomorphic
            # column has no alternate allele, so every genotype must encode
            # as 0 rather than 2.
            alt = ranked[1][0] if len(ranked) > 1 else None

            def map_snp(g):
                norm = normalize_snp_alleles(g)  # None for missing/invalid cells
                if not norm:
                    return np.nan
                return int(norm[0] == alt) + int(norm[1] == alt)

            df_out[col] = col_series.map(map_snp).astype(float)
            continue

        # ---- IUPAC ----
        if all(re_iupac.match(s) for s in present):
            df_out[col] = col_series.map(
                lambda x: 1.0 if str(x).strip().upper() in _IUPAC else np.nan
            ).astype(float)
            continue

        # ---- fallback ----
        coerced = pd.to_numeric(col_series, errors='coerce')
        coerced = coerced.where(coerced.isin([0, 1, 2]), np.nan)
        df_out[col] = coerced.astype(float)
        print(f"Column '{col}': fallback numeric coercion.")

    print("Conversion complete.")
    print("Global markers_type:", markers_type)

    return df_out


In [229]:
def micro_heterozygosity(df, count_nans_as_invalid=True):
    """
    Compute observed and expected heterozygosity per locus and the inbreeding
    coefficient F for microsatellite genotypes written as "A/B".

    - The FIRST column of df is treated as the sample/individual ID and ignored.
    - Observed heterozygosity (Ho) of a locus is the number of cells whose two
      alleles differ, divided by the locus denominator (see below). Cells
      without "/" are never heterozygous but still count in the denominator.
    - Expected heterozygosity (He) is 1 - sum(p_i ** 2), where p_i are the
      ALLELE frequencies over all cells containing "/". He is NaN when a locus
      has no such cell.
    - F = 1 - mean(Ho) / mean(He), rounded to 2 decimals (NaN when mean(He)
      is 0 or NaN).

    Parameters:
      df: DataFrame, ID column first, then one column per locus.
      count_nans_as_invalid: if True (default) NaN cells are dropped before
        counting, so the denominator is the number of non-NaN cells. If False
        NaN cells are kept as empty strings and the denominator is the total
        number of rows.

    Returns:
      DataFrame with rows ['observed_heterozygosity', 'expected_heterozygosity'],
      one column per locus and a column 'average' (mean across loci).

    Side effects:
      Sets the module-level variables F and heterozygosity_table and prints F.
    """

    # First column is always treated as ID; all others are genotype columns.
    id_col = df.columns[0]
    cols = [c for c in df.columns if c != id_col]

    def split_genotype(cell):
        """Return the two stripped alleles of an 'A/B' string, or None without '/'."""
        if '/' not in cell:
            return None
        left, right = cell.split('/', 1)
        return left.strip(), right.strip()

    observed = {}
    expected = {}

    for col in cols:
        series_raw = df[col]

        # handle missing values
        if count_nans_as_invalid:
            cells = series_raw.dropna().astype(str)
        else:
            cells = series_raw.fillna('').astype(str)

        total = len(cells)
        if total == 0:
            observed[col] = np.nan
            expected[col] = np.nan
            continue

        genotypes = [g for g in (split_genotype(c) for c in cells) if g is not None]

        # observed heterozygosity (Ho): left allele != right allele
        n_het = sum(1 for left, right in genotypes if left != right)
        observed[col] = n_het / total

        # expected heterozygosity (He): based on allele frequencies, not on
        # the frequencies of homozygous genotypes.
        if genotypes:
            allele_counts = pd.Series(
                [allele for pair in genotypes for allele in pair]
            ).value_counts()
            allele_freqs = allele_counts / allele_counts.sum()
            expected[col] = 1 - float((allele_freqs ** 2).sum())
        else:
            expected[col] = np.nan

    # build result table
    result = pd.DataFrame(
        [observed, expected],
        index=["observed_heterozygosity", "expected_heterozygosity"]
    )
    result["average"] = result.mean(axis=1)

    # compute global averages and F
    Ho_avg = result.loc["observed_heterozygosity", cols].mean()
    He_avg = result.loc["expected_heterozygosity", cols].mean()
    F_value = 1 - (Ho_avg / He_avg) if (He_avg is not None and not np.isnan(He_avg) and He_avg != 0) else np.nan
    F_value = round(F_value, 2)

    # global vars
    globals()['F'] = F_value
    globals()['heterozygosity_table'] = result

    # print only F
    print("Inbreeding coefficient F:", F_value)
    print("")

    return result


In [230]:
def micro_pairwise_relatedness(df=None, decimals: int = 4) -> pd.DataFrame:
    """
    For every pair of rows in df, compute relatedness = matches / total_possible_elements.

    Each genotype cell is split into its elements (alleles) on "/" — the
    microsatellite format used elsewhere in this file — or on ",". Elements of
    the two individuals are compared position by position within each locus,
    so the comparison is order-sensitive ("150/152" vs "152/150" shares no
    position). A locus contributes max(len_i, len_j) possible elements; a
    missing cell contributes no matches.

    Behavior:
    - If df is None, the module-level df_gen is used; a ValueError is raised
      when it does not exist.
    - If df has a column named 'individual', it is used as the individual ID.
      Otherwise, the first column is assumed to contain individual IDs.
    - Pair labels are "ind_i_ind_j".
    - The individual ID column is NOT used for the relatedness calculation.

    Returns:
      DataFrame with columns 'pair' and 'relatedness' (rounded to `decimals`);
      empty, but with both columns, when df has fewer than two rows.

    Side effects:
      Sets the module-level variables r and relatedness_table.
    """

    # Default to df_gen if no df was passed
    if df is None:
        df = globals().get('df_gen')
    if df is None:
        raise ValueError("No dataframe provided and global df_gen is not defined.")

    # Determine which column contains individual identifiers
    id_col = "individual" if "individual" in df.columns else df.columns[0]

    # Columns used for relatedness calculation (exclude ID column)
    data_cols = [c for c in df.columns if c != id_col]

    def split_cell(cell: object) -> list:
        if pd.isna(cell):
            return []
        text = str(cell).strip()
        if text == "":
            return []
        # Accept both "A/B" and "A,B" by normalising the separator first.
        return [part.strip() for part in text.replace("/", ",").split(",")]

    # Split every cell once, up front, instead of re-reading rows per pair.
    cell_matrix = df[data_cols].to_numpy(dtype=object)
    split_rows = [[split_cell(value) for value in row] for row in cell_matrix]

    # Prepare individual names
    individual_names = df[id_col].astype(str).tolist()

    results = []
    for i, j in itertools.combinations(range(len(df)), 2):
        total_matches = 0
        total_possible = 0

        for elems_i, elems_j in zip(split_rows[i], split_rows[j]):
            total_possible += max(len(elems_i), len(elems_j))
            # zip stops at the shorter list: unmatched positions never count.
            total_matches += sum(1 for a, b in zip(elems_i, elems_j) if a == b)

        relatedness = (total_matches / total_possible) if total_possible > 0 else 0.0
        results.append({
            "pair": f"{individual_names[i]}_{individual_names[j]}",
            "relatedness": round(relatedness, decimals)
        })

    # Export global relatedness (explicit columns keep an empty result usable)
    table = pd.DataFrame(results, columns=["pair", "relatedness"])
    globals()['r'] = table
    globals()['relatedness_table'] = table
    return table


In [231]:
def snp_heterozygosity(df, count_nans_as_invalid=True):
    """
    Computes observed and expected heterozygosity per column and the inbreeding
    coefficient F.

    Accepts SNP genotypes encoded as:
      0 -> homozygote REF/REF
      1 -> heterozygote REF/ALT
      2 -> homozygote ALT/ALT

    Values may be numbers or numeric strings; anything that is not exactly
    0, 1 or 2 (for example 1.5 or 'AA') is treated as an invalid genotype: it
    is never counted as a heterozygote and is left out of the allele counts.

    PARAMETERS:
      df: DataFrame with the data; the first column is the sample ID and is
          ignored.
      count_nans_as_invalid: if True, NaNs are excluded from the Ho
          denominator; if False the denominator is the total number of rows.

    RETURNS:
      DataFrame with rows 'observed_heterozygosity' and
      'expected_heterozygosity', one column per locus and an 'average' column.
      Ho = heterozygotes / denominator. He = 1 - (p**2 + q**2) from the allele
      frequencies of the valid genotypes (NaN when there are none).

    It also creates:
      - global variable F  (1 - mean(Ho) / mean(He), rounded to 2 decimals)
      - global variable heterozygosity_table
    """

    def to_genotype(x):
        """Return 0, 1 or 2 for an exactly matching value, otherwise NaN."""
        try:
            value = float(x)
        except (TypeError, ValueError):
            return np.nan
        # Exact comparison: 1.5 must not be truncated into a heterozygote.
        return int(value) if value in (0.0, 1.0, 2.0) else np.nan

    # The first column is the ID column and is ignored
    id_col = df.columns[0]
    cols = [c for c in df.columns if c != id_col]

    observed = {}
    expected = {}

    for col in cols:
        series_raw = df[col]
        series_valid = series_raw.dropna()

        # Denominator based on parameter count_nans_as_invalid
        total = len(series_valid) if count_nans_as_invalid else len(series_raw)

        if total == 0:
            observed[col] = np.nan
            expected[col] = np.nan
            continue

        genotypes_valid = series_valid.map(to_genotype).dropna().astype(int)

        n0 = (genotypes_valid == 0).sum()  # homozygote REF/REF
        n1 = (genotypes_valid == 1).sum()  # heterozygote REF/ALT
        n2 = (genotypes_valid == 2).sum()  # homozygote ALT/ALT
        n_valid = n0 + n1 + n2  # number of valid genotypes

        # Observed heterozygosity (Ho): proportion of heterozygotes (1)
        observed[col] = n1 / total

        # Expected heterozygosity (He): 2*p*q
        if n_valid == 0:
            He = np.nan
        else:
            # Allele counts: REF allele = 2*n0 + n1 ; ALT allele = 2*n2 + n1
            denom = 2 * n_valid  # Total number of alleles
            p = (2 * n0 + n1) / denom
            q = (2 * n2 + n1) / denom
            He = 1 - (p**2 + q**2)  # equivalent to 2*p*q

        expected[col] = He

    # Build final result table
    result = pd.DataFrame(
        [observed, expected],
        index=["observed_heterozygosity", "expected_heterozygosity"]
    )
    result["average"] = result.mean(axis=1)

    # Compute global averages and F
    Ho_avg = result.loc["observed_heterozygosity", cols].mean()
    He_avg = result.loc["expected_heterozygosity", cols].mean()
    F_value = 1 - (Ho_avg / He_avg) if (He_avg is not None and not np.isnan(He_avg) and He_avg != 0) else np.nan
    F_value = round(F_value, 2)

    # Create global variable F
    globals()['F'] = F_value

    # Create global variable heterozygosity_table
    globals()['heterozygosity_table'] = result

    print("Inbreeding coefficient F:", F_value)
    print("")

    return result

In [232]:
def snp_pairwise_relatedness(df=None, decimals: int = 4) -> pd.DataFrame:
    """
    Compute pairwise relatedness as Pearson correlation between rows (individuals).

    Notes:
      - If df is None the module-level df_gen is used; a ValueError is raised
        when neither is available.
      - First column is treated as the sample ID.
      - The remaining columns are SNPs coded as 0/1/2; values that cannot be
        parsed as numbers become NaN.
      - Uses pairwise-complete Pearson correlation; undefined correlations
        (NaN) are reported as 0.0.
      - Pairs are addressed by row position, so duplicated sample IDs are
        supported (their pair label simply repeats the ID).
      - Produces global variables r and relatedness_table.

    Returns:
      DataFrame with columns 'pair' ("id_i_id_j") and 'relatedness' (rounded
      to `decimals`); empty when df has fewer than two rows.
    """

    print("Step 1/6: Checking input dataframe...")
    if df is None:
        # .get avoids a NameError when df_gen was never created.
        df = globals().get('df_gen')

    if df is None:
        raise ValueError("No dataframe provided and global df_gen is not defined.")

    nrows = len(df)
    if nrows < 2:
        print("Not enough rows to compute pairwise relatedness.")
        r = pd.DataFrame(columns=["pair", "relatedness"])
        globals()['r'] = r
        globals()['relatedness_table'] = r
        return r

    print("Step 2/6: Extracting individual IDs (first column)...")
    id_col = df.columns[0]
    ids = df[id_col].astype(str).tolist()

    print("Step 3/6: Converting SNP data to numeric format...")
    # Convert remaining columns to numeric (0/1/2), invalid -> NaN
    data = df.drop(columns=[id_col]).apply(pd.to_numeric, errors='coerce')

    print("Step 4/6: Computing correlation matrix (this may take a while)...")
    # Transpose so columns = individuals; labels are not needed because the
    # matrix is read by position below.
    corr_matrix = data.T.corr(method='pearson')

    print("Step 5/6: Cleaning correlation matrix (replacing NaN with 0.0)...")
    corr_values = corr_matrix.fillna(0.0).to_numpy()

    print("Step 6/6: Building pairwise relatedness output...")
    results = []
    for i, j in itertools.combinations(range(nrows), 2):
        results.append({
            "pair": f"{ids[i]}_{ids[j]}",
            "relatedness": round(float(corr_values[i, j]), decimals)
        })

    r = pd.DataFrame(results)
    globals()['r'] = r
    globals()['relatedness_table'] = r

    print("Done! Pairwise relatedness successfully computed.")
    return r


In [233]:
def phenotypic_similarity(
    df: Optional[pd.DataFrame] = None,
    trait_cols: Optional[List[str]] = None,
    decimals: int = 4,
) -> pd.DataFrame:
    """
    Compute bounded phenotypic similarity in [0,1] for each trait and each pair of rows,
    using R_k = max_k - min_k (range) for normalization.

    similarity_k(i,j) = (R_k - abs(x_i - x_j)) / R_k

    Behavior:
    - If df is None, the function will try to use a global DataFrame named `mor`.
      If `mor` is not found, a ValueError is raised.
    - If trait_cols is None, numeric columns are auto-selected excluding the FIRST
      column (treated as the sample/individual ID).
    - If R_k == 0 (all values identical) similarity is set to 1 for that trait.
    - If either x_i or x_j is NaN → similarity is NaN for that trait.
    - Output: DataFrame with a "pair" column ("id_i_id_j", built from the values
      of the FIRST column) and one column per trait. With fewer than two rows
      the result is empty but keeps these columns.
    - Final DataFrame is assigned to the global variables `z` and
      `phenotypic_similarity_table` and also returned.

    Parameters
    ----------
    df : pd.DataFrame or None
        Input dataframe containing phenotypic trait columns. If None, uses global `mor`.
    trait_cols : list[str] | None
        List of trait column names to use. If None, auto-select numeric columns
        (excluding the first column).
    decimals : int
        Number of decimals to round similarity values.

    Returns
    -------
    pd.DataFrame
        DataFrame named `z` with columns: 'pair' and one column per selected trait.

    Raises
    ------
    ValueError
        No df and no global `mor`; df without columns; the ID column requested
        as a trait; or no selectable trait.
    TypeError
        df is not a DataFrame; trait_cols is not a list/tuple; or a requested
        trait is not numeric.
    KeyError
        A requested trait column does not exist.
    """
    # fallback to global 'mor' if df not provided
    if df is None:
        if 'mor' not in globals():
            raise ValueError(
                "df not provided and no global DataFrame named 'mor' found. "
                "Please pass df or create 'mor'."
            )
        df = globals()['mor']

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame or None.")

    cols = list(df.columns)
    if len(cols) == 0:
        raise ValueError("Provided DataFrame has no columns.")

    # The FIRST column is treated as the ID column and excluded from trait selection
    id_col = cols[0]
    data_cols = cols[1:]  # candidate trait columns

    # select traits
    if trait_cols is not None:
        if not isinstance(trait_cols, (list, tuple)):
            raise TypeError("trait_cols must be a list/tuple of column names or None.")
        missing = [c for c in trait_cols if c not in cols]
        if missing:
            raise KeyError(f"The following requested columns do not exist in the DataFrame: {missing}")
        # disallow using the id column as a trait
        if id_col in trait_cols:
            raise ValueError(f"The first column ('{id_col}') is treated as ID and cannot be used as a trait.")
        non_numeric = [c for c in trait_cols if not pd.api.types.is_numeric_dtype(df[c])]
        if non_numeric:
            raise TypeError(f"The following columns are not numeric: {non_numeric}")
        selected_traits = list(trait_cols)
    else:
        # auto-select numeric columns excluding the first column
        selected_traits = [c for c in data_cols if pd.api.types.is_numeric_dtype(df[c])]

    if not selected_traits:
        numeric_cols = [c for c in data_cols if pd.api.types.is_numeric_dtype(df[c])]
        raise ValueError(
            "No selectable traits found (selected_traits is empty). "
            f"Numeric columns available (excluding first column '{id_col}'): {numeric_cols}."
        )

    # Pull each trait out once as a float array (missing -> NaN) and compute
    # its range, instead of re-reading DataFrame rows for every pair.
    trait_values = {}
    trait_range = {}
    for col in selected_traits:
        values = df[col].to_numpy(dtype=float, na_value=np.nan)
        observed = values[~np.isnan(values)]
        trait_values[col] = values
        trait_range[col] = (observed.max() - observed.min()) if observed.size else np.nan

    # pair labels come from the first column
    id_values = df.iloc[:, 0].astype(str).tolist()

    results = []
    for i, j in itertools.combinations(range(len(df)), 2):
        row = {"pair": f"{id_values[i]}_{id_values[j]}"}
        for col in selected_traits:
            xi = trait_values[col][i]
            xj = trait_values[col][j]
            R = trait_range[col]

            if np.isnan(xi) or np.isnan(xj):
                sim = float("nan")
            elif np.isnan(R) or R == 0:
                sim = 1.0
            else:
                # clip to [0,1]
                sim = min(1.0, max(0.0, (R - abs(xi - xj)) / R))

            row[col] = round(sim, decimals) if pd.notna(sim) else sim

        results.append(row)

    # create and export global z (explicit columns keep an empty result usable)
    global z, phenotypic_similarity_table
    z = pd.DataFrame(results, columns=["pair"] + list(selected_traits))
    phenotypic_similarity_table = z
    return z


In [234]:
def heritability(
    r_df: Optional[pd.DataFrame] = None,
    z_df: Optional[pd.DataFrame] = None,
    r_col: str = "relatedness",
    key: str = "pair",
    F: Optional[float] = None,
    ddof: int = 0,
    n_bootstrap: int = 1000,
    ci: float = 0.95,
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """
    Computes per-trait heritability with bootstrap standard errors and confidence intervals.

    For each trait the estimate is
        h2 = cov(relatedness, similarity) / (2 * var(relatedness)) * (1 + F)
    where covariance and variance use the same `ddof`.

    Behavior:
      - If r_df is None, tries to use global variable `r`.
      - If z_df is None, tries to use global variable `z`.
      - If F is provided by the user, that value is always used.
      - If F is None, tries to use global variable `F`.
      - r_df and z_df are inner-joined on `key`; every column of z_df other
        than `key` is treated as a trait.
      - Pairs are resampled with replacement `n_bootstrap` times; replicates
        in which relatedness has zero variance are discarded.

    Parameters:
      r_df, z_df: relatedness and phenotypic-similarity tables.
      r_col: name of the relatedness column in r_df.
      key: name of the pair-label column shared by both tables.
      F: inbreeding coefficient.
      ddof: delta degrees of freedom for variance and covariance.
      n_bootstrap: number of bootstrap replicates.
      ci: confidence level, strictly between 0 and 1.
      random_state: seed for the bootstrap random generator.

    Returns a DataFrame (rounded to 4 decimals) with columns:
      - trait
      - heritability (point estimate)
      - standard_error (bootstrap SE)
      - ci_lower (bootstrap percentile CI)
      - ci_upper

    Raises:
      ValueError: a required input is missing, `ci` is out of range, the merge
        is empty, there are too few pairs for `ddof`, relatedness has zero
        variance, or non-numeric/NaN values are present.
      RuntimeError: every bootstrap replicate was invalid.

    Also prints the table and assigns it to:
      - global `heritability_table`
      - global `heritability_table_F_<F>`, where "." in F becomes "_" and a
        leading "-" becomes "neg_" so the name is a valid identifier.
    """

    # --- fallback to globals if not provided ---
    if r_df is None:
        if 'r' in globals():
            r_df = globals()['r']
        else:
            raise ValueError("r_df not provided and no global 'r' found. Please pass r_df or create global 'r'.")

    if z_df is None:
        if 'z' in globals():
            z_df = globals()['z']
        else:
            raise ValueError("z_df not provided and no global 'z' found. Please pass z_df or create global 'z'.")

    # If F is explicitly provided by the user, use it.
    # Otherwise, fall back to the global F if available.
    if F is None:
        if 'F' in globals():
            F = globals()['F']
        else:
            raise ValueError("F not provided and no global 'F' found. Please pass F or run heterozygosity() first.")
    F = float(F)

    if not 0 < ci < 1:
        raise ValueError("ci must be strictly between 0 and 1 (e.g. 0.95).")

    # ------------ Merge ----------------------
    merged = pd.merge(r_df[[key, r_col]], z_df, on=key, how="inner")
    if merged.empty:
        raise ValueError("Merge resulted in empty dataframe. Check keys in r_df and z_df.")

    n_pairs = merged.shape[0]

    # Divisor shared by variance and covariance so their ratio is unbiased
    # for any ddof.
    divisor = n_pairs - ddof
    if divisor <= 0:
        raise ValueError("Not enough pairs to compute variance with the requested ddof.")

    # ------------ Extract and validate r ------
    r_values = pd.to_numeric(merged[r_col], errors="coerce")
    if r_values.isna().any():
        raise ValueError(f"Non-numeric values found in relatedness column '{r_col}'.")

    var_r = r_values.var(ddof=ddof)
    if var_r == 0 or np.isclose(var_r, 0):
        raise ValueError("Variance of relatedness is zero; cannot compute heritability.")

    # ------------ Trait columns ---------------
    trait_cols = [c for c in merged.columns if c not in {key, r_col}]
    if not trait_cols:
        raise ValueError("No trait columns found after merging r_df and z_df.")

    # ------------ Center data -----------------
    r_centered = r_values - r_values.mean()
    z_values = merged[trait_cols].apply(pd.to_numeric, errors="coerce")
    if z_values.isna().any().any():
        raise ValueError("Non-numeric values detected in trait columns after conversion.")
    z_centered = z_values - z_values.mean()

    # ------------ Point estimates --------------
    covariances = (r_centered.values.reshape(-1, 1) * z_centered.values).sum(axis=0) / divisor
    h2_unadjusted = covariances / (2.0 * var_r)
    h2_point = h2_unadjusted * (1.0 + F)

    # ------------ Bootstrap --------------------
    rng = np.random.default_rng(random_state)
    B = int(n_bootstrap)
    boot_estimates = np.full((B, len(trait_cols)), np.nan)

    r_array = r_values.values
    z_array = z_values.values

    for b in range(B):
        idx = rng.integers(0, n_pairs, size=n_pairs)
        r_s = r_array[idx]
        z_s = z_array[idx, :]

        var_r_s = np.var(r_s, ddof=ddof)
        if var_r_s == 0 or np.isclose(var_r_s, 0):
            continue  # replicate carries no information; row stays NaN

        r_s_centered = r_s - r_s.mean()
        z_s_centered = z_s - z_s.mean(axis=0)

        cov_s = (r_s_centered.reshape(-1, 1) * z_s_centered).sum(axis=0) / divisor
        boot_estimates[b, :] = cov_s / (2.0 * var_r_s) * (1.0 + F)

    # Remove invalid bootstrap iterations
    valid = ~np.all(np.isnan(boot_estimates), axis=1)
    boot_valid = boot_estimates[valid]

    if boot_valid.shape[0] == 0:
        raise RuntimeError("All bootstrap replicates invalid (variance of r was zero in all replicates).")

    # ------------ Standard errors + CI ---------
    se = np.nanstd(boot_valid, axis=0, ddof=1)

    alpha = (1 - ci) / 2
    ci_lower = np.nanpercentile(boot_valid, 100 * alpha, axis=0)
    ci_upper = np.nanpercentile(boot_valid, 100 * (1 - alpha), axis=0)

    # ------------ Final output -----------------
    out = pd.DataFrame({
        "trait": trait_cols,
        "heritability": h2_point,
        "standard_error": se,
        "ci_lower": ci_lower,
        "ci_upper": ci_upper
    }).round(4)

    # Always keep the default global name (backward compatibility)
    globals()['heritability_table'] = out

    # Also assign a global name that includes the value of F. "-" and "." are
    # replaced so the name stays a valid Python identifier for negative F.
    F_label = str(F).replace('-', 'neg_').replace('.', '_')
    globals()[f'heritability_table_F_{F_label}'] = out

    print('n_bootstrap:', n_bootstrap)
    print('')
    print(f'heritability_table_F_{F}:')
    print('')
    print('='*70)
    print(out)
    print('-'*70)

    return out


In [236]:
def save_run():
    """
    Save results into a timestamped folder created in the current directory.

    Written when the corresponding module-level variable exists:
      - inbreeding_coefficient_F.csv (global F)
      - heritability_table.csv, heterozygosity_table.csv,
        relatedness_table.csv, phenotypic_similarity_table.csv
        (a table that is not available is skipped with a printed notice
        instead of aborting the save half-way)
      - one CSV per global DataFrame whose name starts with
        "heritability_table_F_"
      - h2_markers_input.csv / h2_traits_input.csv (inputs passed to h2())
      - h2_metadata.csv (marker type and total computation time)

    heterozygosity_table is written WITH its index, because the index holds
    the row labels (observed/expected) that give the values their meaning;
    all other tables are written without index.

    Returns:
      The name of the folder that was created.
    """
    env = globals()

    # Create folder name with timestamp
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    folder_name = f"heritable_run_{timestamp}"
    os.makedirs(folder_name, exist_ok=True)

    def target(file_name):
        return os.path.join(folder_name, file_name)

    # Save the current/global F value (if present)
    if 'F' in env:
        pd.DataFrame({"F": [env['F']]}).to_csv(
            target("inbreeding_coefficient_F.csv"),
            index=False
        )

    # Save main result tables: (global name, whether the index is meaningful)
    main_tables = [
        ("heritability_table", False),
        ("heterozygosity_table", True),
        ("relatedness_table", False),
        ("phenotypic_similarity_table", False),
    ]
    for name, keep_index in main_tables:
        table = env.get(name)
        if not isinstance(table, pd.DataFrame):
            print(f"Skipping {name}: not available in the current session.")
            continue
        if keep_index:
            table.to_csv(target(f"{name}.csv"), index=True, index_label="statistic")
        else:
            table.to_csv(target(f"{name}.csv"), index=False)

    # Save all heritability tables generated with explicit F values
    # (list() takes a snapshot so the namespace is not iterated live)
    for name, obj in list(env.items()):
        if name.startswith("heritability_table_F_") and isinstance(obj, pd.DataFrame):
            obj.to_csv(target(f"{name}.csv"), index=False)

    # Save h2() inputs
    if '_h2_markers_input' in env:
        env['_h2_markers_input'].to_csv(target("h2_markers_input.csv"), index=False)

    if '_h2_traits_input' in env:
        env['_h2_traits_input'].to_csv(target("h2_traits_input.csv"), index=False)

    # Save h2() metadata
    metadata = {}

    if '_h2_markers_type' in env:
        metadata['markers_type'] = env['_h2_markers_type']

    if '_h2_runtime_seconds' in env:
        metadata['runtime_seconds'] = round(env['_h2_runtime_seconds'], 4)

    if metadata:
        pd.DataFrame([metadata]).to_csv(target("h2_metadata.csv"), index=False)

    print(f"Files successfully saved in: {folder_name}")
    print("Included all heritability tables, h2() inputs, and metadata.")

    return folder_name


# Compilation

In [245]:
def h2(markers, traits):
    """
    High-level pipeline to compute:
      1. Heterozygosity
      2. Pairwise relatedness
      3. Phenotypic similarity
      4. Heritability (h²)

    Parameters
    ----------
    markers : pandas.DataFrame
        Genotype table; `detect_markers_type` treats every column after the
        first one as a marker column.
    traits : pandas.DataFrame
        Trait table, handed unchanged to `phenotypic_similarity`.

    Returns
    -------
    None
        Nothing is returned. The function stores copies of both inputs, the
        detected marker type and the elapsed time in the module-level names
        `_h2_markers_input`, `_h2_traits_input`, `_h2_markers_type` and
        `_h2_runtime_seconds`. The result tables are presumably published as
        globals by the step functions (see the hints printed at the end).

    Raises
    ------
    TypeError
        If `markers` or `traits` is not a pandas DataFrame.
    ValueError
        If the detected marker type is neither 'microsatellites' nor 'SNPs'.
    """

    # Fail early with a clear message instead of an AttributeError on .copy()
    if not isinstance(markers, pd.DataFrame):
        raise TypeError("markers must be a pandas DataFrame")
    if not isinstance(traits, pd.DataFrame):
        raise TypeError("traits must be a pandas DataFrame")

    # Forget metadata left over from a previous run, so that a run failing
    # half-way cannot leave old metadata next to the new inputs stored below.
    for stale_key in ('_h2_markers_type', '_h2_runtime_seconds'):
        globals().pop(stale_key, None)

    # Start timing (perf_counter is monotonic; time.time() can jump when the
    # system clock is adjusted and is not meant for measuring durations)
    start_time = time.perf_counter()

    # Store copies of the original inputs internally
    # Independent of user variable names
    globals()['_h2_markers_input'] = markers.copy()
    globals()['_h2_traits_input'] = traits.copy()

    # Detect marker type
    markers_type = detect_markers_type(markers)
    globals()['_h2_markers_type'] = markers_type

    # Branch depending on marker type
    if markers_type == 'microsatellites':
        micro_heterozygosity(markers)
        micro_pairwise_relatedness(markers)

    elif markers_type == 'SNPs':
        snp_heterozygosity(markers)
        snp_pairwise_relatedness(markers)

    else:
        # Defensive guard: only reachable if detect_markers_type ever returns
        # a label other than the two handled above.
        raise ValueError(
            f"Unrecognized markers_type '{markers_type}'.\n\n"
            "Valid marker types and required formats:\n"
            "  • microsatellites → cells contain '/', e.g. '150/158'\n"
            "  • SNPs            → numeric or allele codes without '/', e.g. 0, 1, 2 or 'AA'\n\n"
            "Please check your markers DataFrame format."
        )

    # Compute phenotypic similarity
    phenotypic_similarity(traits)

    # Compute heritability
    heritability()

    # Stop timing
    elapsed_time = time.perf_counter() - start_time

    # Store computation time (seconds)
    globals()['_h2_runtime_seconds'] = elapsed_time

    # Print runtime information
    print('')
    print('h2 pipeline completed successfully.')
    print(f'Total computation time: {elapsed_time:.2f} seconds')
    print('')
    print('To recalculate heritability with a different F value, type: heritability(F=value); e.g. heritability(F=0.5)')
    print('')
    print('To access more results, type: F, heterozygosity_table, relatedness_table, phenotypic_similarity_table, heritability_table.')
    print('To save all data, type:  save_run()')


# Tests

### Chili - microsatellites

In [246]:
markers = pd.read_csv('./chili/gen.csv')
traits = pd.read_csv('./chili/mor.csv')

In [247]:
markers

Unnamed: 0,individual,Bd12,Ng10,Ng18,Ng20,Ng6,Ng7,Ng8
0,ind_1,2/2,1/1,4/4,1/1,2/2,0/0,2/2
1,ind_2,2/2,1/1,4/4,1/1,2/2,0/0,2/2
2,ind_3,2/2,1/1,3/3,1/1,2/2,2/2,2/2
3,ind_4,2/2,1/1,4/4,1/1,2/2,2/2,2/2
4,ind_5,2/2,1/1,4/4,1/1,2/2,2/2,2/2
...,...,...,...,...,...,...,...,...
109,ind_110,2/2,2/2,2/2,1/1,1/1,3/3,1/1
110,ind_111,0/0,2/2,2/2,1/1,1/1,3/3,1/1
111,ind_112,1/1,2/2,2/2,1/1,1/1,3/3,1/1
112,ind_113,1/1,2/2,2/2,1/1,1/1,3/3,1/1


In [248]:
traits

Unnamed: 0,individual,stem_width,stem_height,first_flower_day,fruits_number,fruit_length,fruit_diameter,fruit_weight,seeds_number,yield
0,ind_1,1.52,78.0,44.0,13,23.18,3.39,48.08,135.00,624.98
1,ind_2,1.36,74.0,45.0,9,25.05,3.08,48.10,164.00,432.90
2,ind_3,1.43,69.0,63.0,8,17.75,2.03,19.05,47.00,152.40
3,ind_4,1.53,68.0,44.0,12,21.50,3.04,46.58,156.00,558.90
4,ind_5,1.34,84.0,44.0,6,24.88,3.38,54.05,148.33,324.30
...,...,...,...,...,...,...,...,...,...,...
109,ind_110,1.29,45.0,83.0,0,3.69,2.59,8.25,51.64,0.00
110,ind_111,0.77,33.0,57.0,0,3.69,2.59,8.25,51.64,0.00
111,ind_112,1.52,59.0,69.0,0,3.69,2.59,8.25,51.64,0.00
112,ind_113,1.04,37.5,52.0,10,3.29,2.53,6.70,49.00,67.00


In [249]:
h2(markers,traits)

Type of markers detected: Microsatellites
Inbreeding coefficient F: 1.0

n_bootstrap: 1000

heritability_table_F_1.0:

              trait  heritability  standard_error  ci_lower  ci_upper
0        stem_width        0.0911          0.0073    0.0769    0.1046
1       stem_height        0.1483          0.0072    0.1339    0.1621
2  first_flower_day        0.2005          0.0075    0.1853    0.2151
3     fruits_number        0.1388          0.0105    0.1188    0.1593
4      fruit_length        0.9343          0.0087    0.9170    0.9504
5    fruit_diameter        0.4437          0.0105    0.4230    0.4645
6      fruit_weight        0.7902          0.0088    0.7733    0.8071
7      seeds_number        0.6425          0.0081    0.6263    0.6580
8             yield        0.1907          0.0089    0.1719    0.2073
----------------------------------------------------------------------

h2 pipeline completed successfully.
Total computation time: 7.63 seconds

To recalculate heritability with a 

In [179]:
heritability(F=0)

n_bootstrap: 1000

heritability_table_F_0.0:

              trait  heritability  standard_error  ci_lower  ci_upper
0        stem_width        0.0455          0.0037    0.0384    0.0529
1       stem_height        0.0742          0.0035    0.0675    0.0813
2  first_flower_day        0.1003          0.0038    0.0927    0.1073
3     fruits_number        0.0694          0.0053    0.0592    0.0799
4      fruit_length        0.4672          0.0046    0.4584    0.4762
5    fruit_diameter        0.2219          0.0053    0.2115    0.2324
6      fruit_weight        0.3951          0.0047    0.3862    0.4046
7      seeds_number        0.3212          0.0041    0.3133    0.3299
8             yield        0.0954          0.0043    0.0870    0.1033
----------------------------------------------------------------------


In [180]:
heritability(F = 0.35)

n_bootstrap: 1000

heritability_table_F_0.35:

              trait  heritability  standard_error  ci_lower  ci_upper
0        stem_width        0.0615          0.0049    0.0520    0.0711
1       stem_height        0.1001          0.0046    0.0914    0.1089
2  first_flower_day        0.1353          0.0050    0.1256    0.1455
3     fruits_number        0.0937          0.0068    0.0800    0.1067
4      fruit_length        0.6307          0.0059    0.6184    0.6425
5    fruit_diameter        0.2995          0.0068    0.2861    0.3126
6      fruit_weight        0.5334          0.0061    0.5210    0.5452
7      seeds_number        0.4337          0.0055    0.4226    0.4442
8             yield        0.1287          0.0059    0.1172    0.1404
----------------------------------------------------------------------


In [181]:
save_run()

Files successfully saved in: heritable_run_2025-12-22_15-01-28
Included all heritability tables, h2() inputs, and metadata.


'heritable_run_2025-12-22_15-01-28'

## Maize - SNPs

In [182]:
markers = pd.read_csv('./maize/maize_markers.csv')
traits = pd.read_csv('./maize/maize_mean_pheno.csv')

In [183]:
h2(markers,traits)

Type of markers detected: SNPs
Inbreeding coefficient F: 0.97

Step 1/6: Checking input dataframe...
Step 2/6: Extracting individual IDs (first column)...
Step 3/6: Converting SNP data to numeric format...
Step 4/6: Computing correlation matrix (this may take a while)...
Step 5/6: Cleaning correlation matrix (replacing NaN with 0.0)...
Step 6/6: Building pairwise relatedness output...
Done! Pairwise relatedness successfully computed.
n_bootstrap: 1000

heritability_table_F_0.97:

           trait  heritability  standard_error  ci_lower  ci_upper
0    grain.yield        0.2160          0.0077    0.2012    0.2308
1   grain.number        0.1712          0.0081    0.1549    0.1863
2      seed.size        0.1142          0.0066    0.1010    0.1261
3       anthesis        0.2045          0.0088    0.1874    0.2213
4        silking        0.2146          0.0082    0.1994    0.2307
5   plant.height        0.0468          0.0075    0.0318    0.0615
6  tassel.height        0.0677          0.0090

# Mice - SNPs

In [19]:
markers = pd.read_csv('./mice/mice_markers.csv')
traits = pd.read_csv('./mice/mice_traits.csv')

In [20]:
h2(markers,traits)

Type of markers detected: SNPs
Inbreeding coefficient F: 0.026500146367681388

Step 1/6: Checking input dataframe...
Step 2/6: Extracting individual IDs (first column)...
Step 3/6: Converting SNP data to numeric format...
Step 4/6: Computing correlation matrix (this may take a while)...
Step 5/6: Cleaning correlation matrix (replacing NaN with 0.0)...
Step 6/6: Building pairwise relatedness output...
Done! Pairwise relatedness successfully computed.
n_bootstrap: 1000

heritability_table:

                trait  heritability  standard_error  ci_lower  ci_upper
0  Obesity.BodyLength         0.002          0.0007    0.0005    0.0035

To access more results, type: F, heterozygosity_table, relatedness_table, phenotypic_similarity_table, heritability_table.
To save all data, type:  save_run()


# Wild chili

### Data merge

In [100]:
traits = pd.read_csv('./chili/wild/traits.csv')
micros = pd.read_csv('./chili/wild/micros.csv')
snps = pd.read_csv('./chili/wild/snps.csv')

In [63]:
# Join traits with the microsatellite table on the shared 'sample' key.
# pd.merge defaults to how='inner', so only samples present in both tables are kept.
traits_micro = pd.merge(traits, micros, on='sample')
# Keep the first 7 columns by position — presumably 'sample' plus the trait columns; TODO confirm.
traits_in_micro = traits_micro.iloc[:, :7]
# index=False keeps the pandas row index out of the file, so 'sample' stays the
# first column when the CSV is read back (same layout as the other exports).
traits_in_micro.to_csv('./chili/wild/traits_in_micro.csv', index=False)

In [85]:
# Extend the traits+microsatellites table with the SNP columns; the default inner
# join on 'sample' keeps only samples present in both tables.
traits_micro_snps = traits_micro.merge(snps, on='sample')

In [79]:
# Trait block of the three-way merge: the first 7 columns by position
# (presumably 'sample' plus the trait columns — TODO confirm).
traits_merged = traits_micro_snps.iloc[:, :7]
# to_csv documents `index` as a boolean; use index=False rather than relying on None being falsy.
traits_merged.to_csv('./chili/wild/traits_merged.csv', index=False)

In [81]:
# Microsatellite block of the three-way merge: column 0 (the ID column) plus positions 7-16.
# NOTE(review): the SNP export cell starts at position 18, so position 17 ends up in
# neither output file — confirm that column is meant to be dropped.
micros_merged = pd.concat(
    [traits_micro_snps.iloc[:, [0]], traits_micro_snps.iloc[:, 7:17]],
    axis=1,
)
# to_csv documents `index` as a boolean; use index=False rather than relying on None being falsy.
micros_merged.to_csv('./chili/wild/micros_merged.csv', index=False)

In [84]:
# SNP block of the three-way merge: column 0 (the ID column) plus everything from position 18 on.
# NOTE(review): the microsatellite export cell stops at position 16 (slice 7:17), so
# position 17 ends up in neither output file — confirm that column is meant to be dropped.
snps_merged = pd.concat(
    [traits_micro_snps.iloc[:, [0]], traits_micro_snps.iloc[:, 18:]],
    axis=1,
)
# to_csv documents `index` as a boolean; use index=False rather than relying on None being falsy.
snps_merged.to_csv('./chili/wild/snps_merged.csv', index=False)

### Data load

In [270]:
# Reload the merged wild-chili tables from disk for the h2() runs below.
traits = pd.read_csv('./chili/wild/traits_merged.csv')
micros = pd.read_csv('./chili/wild/micros_merged.csv')
# NOTE(review): the merge cells in this section write 'snps_merged.csv', not
# 'snps_merged_012.csv'; presumably this is a 0/1/2-encoded copy produced
# elsewhere — confirm where it is generated.
snps = pd.read_csv('./chili/wild/snps_merged_012.csv')

In [271]:
# Drop the last column of the merged traits table before running h2().
# The slice is taken from a freshly read frame rather than from `traits` itself,
# so re-running this cell cannot strip one more column on every execution.
traits = pd.read_csv('./chili/wild/traits_merged.csv').iloc[:, :-1]
traits

Unnamed: 0,sample,flower_diameter,fruit_weight,fruit_length,fruit_width,fruit_height
0,B-4,17.98,1.9,9.87,9.24,37.06
1,B-7,14.63,5.9,20.2,19.58,55.5
2,B-9B,15.27,3.96,14.52,15.12,51.22
3,C-1,15.48,0.92,10.22,10.36,16.66
4,C-2,10.54,0.12,5.09,4.56,8.04
5,C-alfa,13.04,1.9,8.84,9.09,52.48
6,C-G,13.42,2.38,15.82,10.27,38.58
7,C-H,15.71,1.48,10.93,11.0,25.0
8,C-K,12.86,0.86,8.82,9.06,19.88
9,D-5,10.69,0.16,5.54,5.49,9.18


In [280]:
h2(micros,traits)

Type of markers detected: Microsatellites
Inbreeding coefficient F: 1.0

n_bootstrap: 1000

heritability_table_F_1.0:

             trait  heritability  standard_error  ci_lower  ci_upper
0  flower_diameter        0.1229          0.0518    0.0157    0.2206
1     fruit_weight       -0.0757          0.0677   -0.2144    0.0518
2     fruit_length        0.0307          0.0684   -0.1142    0.1546
3      fruit_width        0.0040          0.0695   -0.1419    0.1333
4     fruit_height        0.3056          0.0686    0.1735    0.4387
----------------------------------------------------------------------

h2 pipeline completed successfully.
Total computation time: 0.36 seconds

To recalculate heritability with a different F value, type: heritability(F=value); e.g. heritability(F=0.5)

To access more results, type: F, heterozygosity_table, relatedness_table, phenotypic_similarity_table, heritability_table.
To save all data, type:  save_run()


In [281]:
r_micro = relatedness_table

In [282]:
h2(snps,traits)

Type of markers detected: SNPs
Inbreeding coefficient F: 0.74

Step 1/6: Checking input dataframe...
Step 2/6: Extracting individual IDs (first column)...
Step 3/6: Converting SNP data to numeric format...
Step 4/6: Computing correlation matrix (this may take a while)...
Step 5/6: Cleaning correlation matrix (replacing NaN with 0.0)...
Step 6/6: Building pairwise relatedness output...
Done! Pairwise relatedness successfully computed.
n_bootstrap: 1000

heritability_table_F_0.74:

             trait  heritability  standard_error  ci_lower  ci_upper
0  flower_diameter        0.1114          0.0157    0.0814    0.1419
1     fruit_weight        0.0530          0.0232    0.0076    0.0975
2     fruit_length        0.0957          0.0218    0.0530    0.1383
3      fruit_width        0.0739          0.0223    0.0300    0.1158
4     fruit_height        0.2197          0.0179    0.1830    0.2540
----------------------------------------------------------------------

h2 pipeline completed success

In [283]:
r_snps = relatedness_table

In [284]:
r_micro

Unnamed: 0,pair,relatedness
0,B-4_B-7,0.5
1,B-4_B-9B,0.6
2,B-4_C-1,0.7
3,B-4_C-2,0.4
4,B-4_C-alfa,0.8
...,...,...
346,G-21_H-33,0.6
347,G-21_C-9,0.3
348,H-29_H-33,0.3
349,H-29_C-9,0.4


In [285]:
r_snps

Unnamed: 0,pair,relatedness
0,B-4_B-7,-0.1604
1,B-4_B-9B,-0.1869
2,B-4_C-1,0.7179
3,B-4_C-2,-0.0688
4,B-4_C-alfa,-0.1577
...,...,...
346,G-21_H-33,0.8083
347,G-21_C-9,-0.2733
348,H-29_H-33,-0.2878
349,H-29_C-9,0.8849
