In [1]:
import re
from collections import Counter
import numpy as np
import pandas as pd

In [2]:
_IUPAC = {
    'R': ('A', 'G'), 'Y': ('C', 'T'), 'S': ('G', 'C'),
    'W': ('A', 'T'), 'K': ('G', 'T'), 'M': ('A', 'C')
}

def normalize_snp_alleles(genotype):
    """
    Normalize any diploid SNP genotype to a tuple of two alleles.
    Accepts any separator or compact form:
        AA, A/A, A-T, A:T, A;T, A|T, "A;A"
    Returns:
        tuple('A','T') or None if not interpretable as diploid SNP
    """
    if pd.isna(genotype):
        return None
    s = str(genotype).upper()
    # Extract only valid nucleotide alleles, ignore separators
    alleles = re.findall(r'[ACGT]', s)
    if len(alleles) == 2:
        return tuple(alleles)
    return None

In [3]:
def detect_and_convert_markers_to_012(markers_df, sample_size=1000):
    """
    Detect marker format and convert SNP-like columns to 0/1/2 encoding when possible.

    Behavior
    --------
    - Keeps first column as ID.
    - Detects marker type from data content.
    - Prints detected marker type and explanation.
    - Sets global variable `markers_type`.
    - RETURNS ONLY the converted DataFrame.
    """

    if not isinstance(markers_df, pd.DataFrame):
        raise TypeError("markers_df must be a pandas DataFrame")

    if markers_df.shape[1] == 0:
        raise ValueError("Empty dataframe (no columns).")

    id_col = markers_df.columns[0]
    data_cols = list(markers_df.columns[1:])

    if not data_cols:
        print("No marker columns found (only ID column). Nothing to convert.")
        globals()['markers_type'] = None
        return markers_df.copy()

    # ---- sampling helper ----
    def sample_values():
        stacked = markers_df[data_cols].astype(str).stack()
        if stacked.empty:
            return []
        if len(stacked) <= sample_size:
            return stacked.tolist()
        return stacked.sample(n=sample_size, random_state=1).tolist()

    samples = sample_values()

    # ---- regex patterns ----
    re_numeric_012 = re.compile(r'^[0-2]$')
    re_numeric_01 = re.compile(r'^[01]$')
    re_vcf_gt = re.compile(r'^[01][\/|][01]$')
    re_slash_numeric = re.compile(r'^\s*\d+\s*\/\s*\d+\s*$')
    re_iupac = re.compile(r'^[RYSWKMryswkm]$')

    # ---- detect marker types ----
    counts = Counter()

    for v in samples:
        s = str(v).strip()

        if s == '' or s.lower() in ('nan', 'none'):
            counts['missing'] += 1
            continue

        alleles = re.findall(r'[ACGTacgt]', s)

        if len(alleles) == 2:
            counts['allelic_diploid'] += 1
        elif re_vcf_gt.match(s):
            counts['vcf_gt'] += 1
        elif re_slash_numeric.match(s):
            counts['microsatellite_like'] += 1
        elif re_numeric_012.match(s):
            counts['numeric_012'] += 1
        elif re_numeric_01.match(s):
            counts['numeric_01'] += 1
        elif re_iupac.match(s):
            counts['iupac'] += 1
        else:
            counts['other'] += 1

    most_common_count = counts.most_common(1)[0][1] if counts else 0

    # ---- decide marker type ----
    if counts['microsatellite_like'] >= most_common_count and counts['microsatellite_like'] > 0:
        markers_type = 'microsatellites'
        explanation = "Detected numeric allele sizes (e.g. '150/152')."
    elif counts['vcf_gt'] >= most_common_count and counts['vcf_gt'] > 0:
        markers_type = 'vcf_gt'
        explanation = "Detected VCF GT format (e.g. '0/1')."
    elif counts['numeric_012'] >= most_common_count and counts['numeric_012'] > 0:
        markers_type = 'numeric_012'
        explanation = "Detected numeric SNP encoding (0/1/2)."
    elif counts['allelic_diploid'] >= most_common_count and counts['allelic_diploid'] > 0:
        markers_type = 'allelic_diploid'
        explanation = "Detected diploid SNPs with arbitrary separators."
    elif counts['numeric_01'] >= most_common_count and counts['numeric_01'] > 0:
        markers_type = 'presence_absence'
        explanation = "Detected presence/absence markers (0/1)."
    elif counts['iupac'] >= most_common_count and counts['iupac'] > 0:
        markers_type = 'iupac'
        explanation = "Detected IUPAC ambiguity codes."
    else:
        markers_type = 'unknown'
        explanation = "Could not confidently determine marker format."

    print(f"Markers type detected: {markers_type}")
    print("Details:", explanation)

    globals()['markers_type'] = markers_type

    # ---- formats that should not be auto-converted ----
    if markers_type in ('microsatellites', 'presence_absence', 'unknown'):
        print("No automatic conversion performed for this marker type.")
        return markers_df.copy()

    # ---- prepare output ----
    df_out = markers_df.copy()

    # ---- column-wise conversion ----
    for col in data_cols:
        col_series = markers_df[col]
        non_null = col_series.dropna().astype(str).str.strip()

        if non_null.empty:
            df_out[col] = np.nan
            continue

        # ---- VCF GT ----
        if non_null.str.match(re_vcf_gt).all():
            def map_vcf(x):
                if pd.isna(x):
                    return np.nan
                a, b = re.split(r'[\/|]', str(x))
                return int(a) + int(b)

            df_out[col] = col_series.map(map_vcf).astype(float)
            continue

        # ---- numeric 0/1/2 ----
        if non_null.str.fullmatch(r'[0-2]').all():
            df_out[col] = pd.to_numeric(col_series, errors='coerce').astype(float)
            continue

        # ---- diploid SNPs ----
        allele_list = []
        for g in non_null:
            norm = normalize_snp_alleles(g)
            if norm:
                allele_list.extend(norm)

        if allele_list:
            allele_counts = Counter(allele_list)
            ref = allele_counts.most_common(1)[0][0] 
            if len(allele_counts) == 1:              
                df_out[col] = 0.0
                continue
            alt = allele_counts.most_common()[-1][0]

            def map_snp(g):
                if pd.isna(g):
                    return np.nan
                norm = normalize_snp_alleles(g)
                if not norm:
                    return np.nan
                return int(norm[0] == alt) + int(norm[1] == alt)

            df_out[col] = col_series.map(map_snp).astype(float)
            continue

        # ---- IUPAC ----
        if non_null.str.fullmatch(r'[RYSWKMryswkm]').all():
            df_out[col] = col_series.map(
                lambda x: 1.0 if str(x).upper() in _IUPAC else np.nan
            ).astype(float)
            continue

        # ---- fallback ----
        coerced = pd.to_numeric(col_series, errors='coerce')
        coerced = coerced.where(coerced.isin([0, 1, 2]), np.nan)
        df_out[col] = coerced.astype(float)
        print(f"Column '{col}': fallback numeric coercion.")

    print("Conversion complete.")
    print("Global markers_type:", markers_type)

    return df_out


# Data load

In [4]:
micros = pd.read_csv('./data_type_test/microsatellites.csv')
micros

Unnamed: 0,ID,M1,M2,M3
0,Ind1,150/152,198/204,100/102
1,Ind2,148/150,200/200,102/104
2,Ind3,150/150,198/198,100/104
3,Ind4,152/152,204/204,104/104


In [5]:
snps_012 = pd.read_csv('./data_type_test/snps_012.csv')
snps_012

Unnamed: 0,ID,SNP1,SNP2,SNP3
0,Ind1,0,1,2
1,Ind2,1,1,0
2,Ind3,2,0,1
3,Ind4,1,2,2


In [6]:
snps_allelic = pd.read_csv('./data_type_test/snps_allelic_twoletter.csv')
snps_allelic

Unnamed: 0,ID,SNP1,SNP2,SNP3
0,Ind1,AA,AG,GG
1,Ind2,AG,GG,AA
2,Ind3,GG,AG,AG
3,Ind4,AA,AA,GG


In [7]:
snps_vcf = pd.read_csv('./data_type_test/snps_vcf.csv')
snps_vcf

Unnamed: 0,ID,SNP1,SNP2,SNP3
0,Ind1,0/0,0/1,1/1
1,Ind2,0/1,0/1,0/0
2,Ind3,1/1,0/0,0/1
3,Ind4,0/0,1/1,1/1


In [8]:
snp_slash_format = pd.read_csv('./data_type_test/snp_slash_format.csv')
snp_slash_format

Unnamed: 0,sample,SNP1,SNP2,SNP3
0,ind1,A/A,C/C,G/G
1,ind2,A/G,C/T,G/A
2,ind3,G/G,T/T,A/A


In [9]:
snp_variable_format = pd.read_csv('./data_type_test/snps_semicolon_format.csv')

In [10]:
snp_variable_format = snp_variable_format.drop('pop', axis=1)
snp_variable_format.head(3)

Unnamed: 0.1,Unnamed: 0,100372750-33-A/G,100352026-35-G/A,100398603-66-T/A,100282003-7-C/A,100268131-40-A/G,100346786-19-C/T,100385025-37-A/G,100454812-47-G/T,100249878-21-T/C,...,100472466-7-G/A,100269708-12-A/G,100247673-6-C/T,100400729-21-T/A,100249247-8-C/A,100493546-6-A/T,100250134-41-G/A,100247596-22-T/C,100401377-8-T/C,100318416-9-C/G
0,Cly 1,G;G,G;G,A;A,A;A,A;A,C;C,G;G,G;G,T;T,...,G;G,G;G,T;T,A;A,A;A,T;T,G;G,T;T,C;C,C;C
1,J-9,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G
2,H-29,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G


In [11]:
snps_wild = pd.read_csv('./chili/wild/snps_merged.csv')
snps_wild.head(3)

Unnamed: 0,sample,100372750-33-A/G,100352026-35-G/A,100398603-66-T/A,100282003-7-C/A,100268131-40-A/G,100346786-19-C/T,100385025-37-A/G,100454812-47-G/T,100249878-21-T/C,...,100472466-7-G/A,100269708-12-A/G,100247673-6-C/T,100400729-21-T/A,100249247-8-C/A,100493546-6-A/T,100250134-41-G/A,100247596-22-T/C,100401377-8-T/C,100318416-9-C/G
0,B-4,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G
1,B-7,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G
2,B-9B,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G


# Tests

In [12]:
detect_and_convert_markers_to_012(micros)

Markers type detected: microsatellites
Details: Detected numeric allele sizes (e.g. '150/152').
No automatic conversion performed for this marker type.


Unnamed: 0,ID,M1,M2,M3
0,Ind1,150/152,198/204,100/102
1,Ind2,148/150,200/200,102/104
2,Ind3,150/150,198/198,100/104
3,Ind4,152/152,204/204,104/104


In [13]:
detect_and_convert_markers_to_012(snps_012)

Markers type detected: numeric_012
Details: Detected numeric SNP encoding (0/1/2).
Conversion complete.
Global markers_type: numeric_012


Unnamed: 0,ID,SNP1,SNP2,SNP3
0,Ind1,0.0,1.0,2.0
1,Ind2,1.0,1.0,0.0
2,Ind3,2.0,0.0,1.0
3,Ind4,1.0,2.0,2.0


In [14]:
detect_and_convert_markers_to_012(snps_allelic)

Markers type detected: allelic_diploid
Details: Detected diploid SNPs with arbitrary separators.
Conversion complete.
Global markers_type: allelic_diploid


Unnamed: 0,ID,SNP1,SNP2,SNP3
0,Ind1,0.0,1.0,0.0
1,Ind2,1.0,2.0,2.0
2,Ind3,2.0,1.0,1.0
3,Ind4,0.0,0.0,0.0


In [15]:
detect_and_convert_markers_to_012(snps_vcf)

Markers type detected: vcf_gt
Details: Detected VCF GT format (e.g. '0/1').
Conversion complete.
Global markers_type: vcf_gt


Unnamed: 0,ID,SNP1,SNP2,SNP3
0,Ind1,0.0,1.0,2.0
1,Ind2,1.0,1.0,0.0
2,Ind3,2.0,0.0,1.0
3,Ind4,0.0,2.0,2.0


In [16]:
detect_and_convert_markers_to_012(snp_slash_format)

Markers type detected: allelic_diploid
Details: Detected diploid SNPs with arbitrary separators.
Conversion complete.
Global markers_type: allelic_diploid


Unnamed: 0,sample,SNP1,SNP2,SNP3
0,ind1,0.0,0.0,0.0
1,ind2,1.0,1.0,1.0
2,ind3,2.0,2.0,2.0


In [17]:
detect_and_convert_markers_to_012(snp_variable_format)

Markers type detected: allelic_diploid
Details: Detected diploid SNPs with arbitrary separators.
Conversion complete.
Global markers_type: allelic_diploid


Unnamed: 0.1,Unnamed: 0,100372750-33-A/G,100352026-35-G/A,100398603-66-T/A,100282003-7-C/A,100268131-40-A/G,100346786-19-C/T,100385025-37-A/G,100454812-47-G/T,100249878-21-T/C,...,100472466-7-G/A,100269708-12-A/G,100247673-6-C/T,100400729-21-T/A,100249247-8-C/A,100493546-6-A/T,100250134-41-G/A,100247596-22-T/C,100401377-8-T/C,100318416-9-C/G
0,Cly 1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,...,2.0,0.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0,2.0
1,J-9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,H-29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,D-7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,I-5B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65,Vin 34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,CVLC-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67,E-18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
snps_wild

Unnamed: 0,sample,100372750-33-A/G,100352026-35-G/A,100398603-66-T/A,100282003-7-C/A,100268131-40-A/G,100346786-19-C/T,100385025-37-A/G,100454812-47-G/T,100249878-21-T/C,...,100472466-7-G/A,100269708-12-A/G,100247673-6-C/T,100400729-21-T/A,100249247-8-C/A,100493546-6-A/T,100250134-41-G/A,100247596-22-T/C,100401377-8-T/C,100318416-9-C/G
0,B-4,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G
1,B-7,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G
2,B-9B,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G
3,C-1,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G
4,C-2,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G
5,C-alfa,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G
6,C-G,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G
7,C-H,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G
8,C-K,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G
9,D-5,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G


In [19]:
detect_and_convert_markers_to_012(snps_wild)

Markers type detected: allelic_diploid
Details: Detected diploid SNPs with arbitrary separators.
Conversion complete.
Global markers_type: allelic_diploid


Unnamed: 0,sample,100372750-33-A/G,100352026-35-G/A,100398603-66-T/A,100282003-7-C/A,100268131-40-A/G,100346786-19-C/T,100385025-37-A/G,100454812-47-G/T,100249878-21-T/C,...,100472466-7-G/A,100269708-12-A/G,100247673-6-C/T,100400729-21-T/A,100249247-8-C/A,100493546-6-A/T,100250134-41-G/A,100247596-22-T/C,100401377-8-T/C,100318416-9-C/G
0,B-4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,B-7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,B-9B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,C-2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,C-alfa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,C-G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,C-H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,C-K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,D-5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Errors load

In [20]:
mixed_types_error = pd.read_csv('./data_type_test/mixed_types_error.csv')
mixed_types_error

Unnamed: 0,ID,SNP1,SNP2
0,A,0,150/152
1,B,AA,0/1
2,C,1,GG
3,D,AG,1


In [21]:
presence_absence = pd.read_csv('./data_type_test/presence_absence.csv')
presence_absence

Unnamed: 0,ID,M1,M2,M3
0,Ind1,1.0,0,1.0
1,Ind2,0.0,0,1.0
2,Ind3,1.0,1,
3,Ind4,,1,0.0


In [22]:
malformed_entries = pd.read_csv('./data_type_test/malformed_entries.csv')
malformed_entries

Unnamed: 0,ID,L1,L2
0,s1,A/A,0/0
1,s2,A;G,0|1
2,s3,??,1/1
3,s4,A/G,not_known


In [23]:
microsatellite_malformed = pd.read_csv('./data_type_test/microsatellite_malformed.csv')
microsatellite_malformed

Unnamed: 0,ID,M1,M2
0,i1,150/152,200/200
1,i2,148/150,198/198
2,i3,150/A,-
3,i4,152/152,204/204


# Errors Test

In [24]:
detect_and_convert_markers_to_012(mixed_types_error)

Markers type detected: numeric_012
Details: Detected numeric SNP encoding (0/1/2).
Conversion complete.
Global markers_type: numeric_012


Unnamed: 0,ID,SNP1,SNP2
0,A,,0.0
1,B,0.0,0.0
2,C,,0.0
3,D,1.0,0.0


In [25]:
detect_and_convert_markers_to_012(presence_absence)

Markers type detected: unknown
Details: Could not confidently determine marker format.
No automatic conversion performed for this marker type.


Unnamed: 0,ID,M1,M2,M3
0,Ind1,1.0,0,1.0
1,Ind2,0.0,0,1.0
2,Ind3,1.0,1,
3,Ind4,,1,0.0


In [26]:
detect_and_convert_markers_to_012(malformed_entries)

Markers type detected: vcf_gt
Details: Detected VCF GT format (e.g. '0/1').
Column 'L2': fallback numeric coercion.
Conversion complete.
Global markers_type: vcf_gt


Unnamed: 0,ID,L1,L2
0,s1,0.0,
1,s2,1.0,
2,s3,,
3,s4,1.0,


In [27]:
detect_and_convert_markers_to_012(microsatellite_malformed)

Markers type detected: microsatellites
Details: Detected numeric allele sizes (e.g. '150/152').
No automatic conversion performed for this marker type.


Unnamed: 0,ID,M1,M2
0,i1,150/152,200/200
1,i2,148/150,198/198
2,i3,150/A,-
3,i4,152/152,204/204
