In [1]:
import re
from collections import Counter
import numpy as np
import pandas as pd

In [2]:


_IUPAC = {
    'R': ('A', 'G'), 'Y': ('C', 'T'), 'S': ('G', 'C'),
    'W': ('A', 'T'), 'K': ('G', 'T'), 'M': ('A', 'C')
}

def detect_and_convert_markers_to_012(markers_df, sample_size=1000):
    """
    Detect marker format and convert SNP-like columns to 0/1/2 encoding when possible.
    - Keeps first column as ID.
    - Prints detected type and explanation.
    - Sets global `markers_type` (string).
    - Returns (converted_df, markers_type).
    """
    if not isinstance(markers_df, pd.DataFrame):
        raise TypeError("markers_df must be a pandas DataFrame")

    if markers_df.shape[1] == 0:
        raise ValueError("Empty dataframe (no columns).")

    id_col = markers_df.columns[0]
    data_cols = list(markers_df.columns[1:])
    if not data_cols:
        print("No marker columns found (only ID column). Nothing to convert.")
        globals()['markers_type'] = None
        return markers_df.copy(), None

    # sampling helper
    def sample_values():
        stacked = markers_df[data_cols].astype(str).stack()
        if len(stacked) == 0:
            return []
        if len(stacked) <= sample_size:
            return stacked.tolist()
        return stacked.sample(n=sample_size, random_state=1).tolist()

    samples = sample_values()

    # regex patterns
    re_numeric_012 = re.compile(r'^[0-2]$')
    re_numeric_01 = re.compile(r'^[01]$')
    re_vcf_gt = re.compile(r'^[01][\/|][01]$')   # 0/0 0/1 1/1 or phased
    re_two_letters = re.compile(r'^[ACGTacgt]{2}$')
    re_slash_numeric = re.compile(r'^\s*\d+\s*\/\s*\d+\s*$')
    re_slash_alleles = re.compile(r'^[ACGTacgt]\s*\/\s*[ACGTacgt]$')
    re_single_letter = re.compile(r'^[ACGTacgt]$')
    re_iupac = re.compile(r'^[RYSWKMryswkm]$')

    from collections import Counter
    counts = Counter()
    for v in samples:
        s = str(v).strip()
        if s == '' or s.lower() == 'nan' or s.lower() == 'none':
            counts['missing'] += 1
            continue
        if '/' in s:
            if re_vcf_gt.match(s):
                counts['vcf_gt'] += 1
            elif re_slash_numeric.match(s):
                counts['microsatellite_like'] += 1
            elif re_slash_alleles.match(s):
                counts['slash_allelic'] += 1
            else:
                counts['slash_other'] += 1
        elif re_numeric_012.match(s):
            counts['numeric_012'] += 1
        elif re_numeric_01.match(s):
            counts['numeric_01'] += 1
        elif re_two_letters.match(s):
            counts['two_letter_alleles'] += 1
        elif re_single_letter.match(s):
            counts['single_letter'] += 1
        elif re_iupac.match(s):
            counts['iupac'] += 1
        else:
            counts['other'] += 1

    # Decide overall type (simple priority rules)
    most_common_count = counts.most_common(1)[0][1] if counts else 0

    if counts['microsatellite_like'] and counts['microsatellite_like'] >= most_common_count:
        markers_type = 'microsatellites'
        explanation = "Detected numeric-slash entries (e.g. '150/152') -> microsatellites."
    elif counts['vcf_gt'] and counts['vcf_gt'] >= most_common_count:
        markers_type = 'vcf_gt'
        explanation = "Detected VCF GT style (e.g. '0/1','1/1')."
    elif counts['numeric_012'] and counts['numeric_012'] >= most_common_count:
        markers_type = 'numeric_012'
        explanation = "Detected numeric genotypes (0/1/2)."
    elif counts['two_letter_alleles'] and counts['two_letter_alleles'] >= most_common_count:
        markers_type = 'allelic_two_letter'
        explanation = "Detected two-letter allelic genotypes (AA/AG/GG)."
    elif counts['slash_allelic'] and counts['slash_allelic'] >= most_common_count:
        markers_type = 'slash_allelic'
        explanation = "Detected allelic genotypes with slash (A/A, A/G)."
    elif counts['numeric_01'] and counts['numeric_01'] >= most_common_count:
        markers_type = 'presence_absence'
        explanation = "Detected presence/absence (0/1). Not auto-converted."
    elif counts['iupac'] and counts['iupac'] >= most_common_count:
        markers_type = 'iupac'
        explanation = "Detected IUPAC codes (R/Y...)."
    else:
        markers_type = 'unknown'
        explanation = "Could not confidently determine marker format."

    pretty = {
        'microsatellites': 'Microsatellites',
        'vcf_gt': 'VCF GT (0/1)',
        'numeric_012': 'Numeric SNPs (0/1/2)',
        'allelic_two_letter': 'Allelic (AA/AG/GG)',
        'slash_allelic': 'Allelic with slash (A/A, A/G)',
        'presence_absence': 'Presence/Absence (0/1)',
        'iupac': 'IUPAC codes',
        'unknown': 'Unknown'
    }
    print(f"Markers type detected: {pretty.get(markers_type, markers_type)}")
    print("Details:", explanation)
    globals()['markers_type'] = markers_type

    # If microsatellites or unknown -> do not convert automatically
    if markers_type in ('microsatellites', 'unknown'):
        print("No automatic conversion performed for microsatellites/unknown formats.")
        return markers_df.copy(), markers_type

    if markers_type == 'presence_absence':
        print("Presence/absence data detected. No automatic conversion performed (user requested to think).")
        return markers_df.copy(), markers_type

    # Prepare output DataFrame
    df_out = markers_df.copy()

    # helper patterns reused
    vcf_re = re_vcf_gt
    two_letter_re = re_two_letters
    slash_alleles_re = re_slash_alleles

    # Convert column by column
    for col in data_cols:
        col_series = markers_df[col]
        non_null = col_series.dropna().astype(str).str.strip()
        if non_null.empty:
            df_out[col] = np.nan
            continue

        # VCF GT: 0/0 0/1 1/1  -> map to sum of alleles
        if non_null.str.match(vcf_re).all():
            def map_vcf_val(x):
                if pd.isna(x):
                    return np.nan
                s = str(x).strip()
                if not vcf_re.match(s):
                    return np.nan
                a, b = re.split(r'[\/|]', s)
                return int(a) + int(b)
            df_out[col] = col_series.map(map_vcf_val).astype('float')
            continue

        # numeric 0/1/2
        if non_null.str.fullmatch(r'[0-2]').all():
            df_out[col] = pd.to_numeric(col_series, errors='coerce').astype('float')
            continue

        # two-letter AA/AG/GG
        if non_null.str.fullmatch(r'[ACGTacgt]{2}').all():
            # count alleles across column to pick REF = most frequent allele
            allele_list = []
            for g in non_null:
                gup = g.upper()
                allele_list.append(gup[0])
                allele_list.append(gup[1])
            allele_counts = Counter(allele_list)
            if not allele_counts:
                df_out[col] = np.nan
                continue
            allele_order = [a for a,_ in allele_counts.most_common()]
            ref = allele_order[0]
            alt = allele_order[1] if len(allele_order) > 1 else allele_order[0]
            # mapping: count how many ALT alleles present (0/1/2) where REF is zero
            def map_two_letter(g):
                if pd.isna(g):
                    return np.nan
                gup = str(g).upper()
                if len(gup) != 2:
                    return np.nan
                a1, a2 = gup[0], gup[1]
                return (int(a1 == alt) + int(a2 == alt))
            df_out[col] = col_series.map(map_two_letter).astype('float')
            print(f"Column '{col}': detected two-letter genotypes; REF='{ref}', ALT='{alt}'")
            continue

        # slash alleles A/A or A/G
        if non_null.str.match(slash_alleles_re).all():
            # collect alleles across column
            allele_list = []
            for g in non_null:
                s = str(g).strip().upper()
                parts = [p.strip() for p in s.split('/')]
                if len(parts) == 2:
                    allele_list.extend(parts)
            allele_counts = Counter(allele_list)
            if not allele_counts:
                df_out[col] = np.nan
                continue
            allele_order = [a for a,_ in allele_counts.most_common()]
            ref = allele_order[0]
            alt = allele_order[1] if len(allele_order) > 1 else allele_order[0]
            def map_slash(g):
                if pd.isna(g):
                    return np.nan
                s = str(g).strip().upper()
                if '/' not in s:
                    return np.nan
                a,b = [x.strip() for x in s.split('/')]
                return (int(a == alt) + int(b == alt))
            df_out[col] = col_series.map(map_slash).astype('float')
            print(f"Column '{col}': detected slash-alleles; REF='{ref}', ALT='{alt}'")
            continue

        # IUPAC -> conservative map to heterozygote (1)
        if non_null.str.fullmatch(r'[RYSWKMryswkm]').all():
            def map_iupac(x):
                if pd.isna(x):
                    return np.nan
                s = str(x).upper()
                if s in _IUPAC:
                    return 1.0
                return np.nan
            df_out[col] = col_series.map(map_iupac).astype('float')
            continue

        # Fallback: try to coerce numeric
        coerced = pd.to_numeric(col_series, errors='coerce')
        if coerced.dropna().empty:
            # leave as original but convert obvious 'nan' strings to NaN
            df_out[col] = col_series.replace({'nan': np.nan})
            print(f"Column '{col}': fallback - left unchanged (non-numeric and not recognized).")
        else:
            coerced = coerced.where(coerced.isin([0,1,2]), np.nan)
            df_out[col] = coerced.astype('float')
            print(f"Column '{col}': coerced to numeric 0/1/2 where possible.")

    print("Conversion complete where possible. Global markers_type:", markers_type)
    return df_out, markers_type


In [27]:
_IUPAC = {
    'R': ('A', 'G'), 'Y': ('C', 'T'), 'S': ('G', 'C'),
    'W': ('A', 'T'), 'K': ('G', 'T'), 'M': ('A', 'C')
}

import pandas as pd
import numpy as np
import re
from collections import Counter

# --- Helper: normalize SNP alleles in any format ---
def normalize_snp_alleles(genotype):
    """
    Normalize any SNP diploid representation to two alleles.
    Examples:
        'AA', 'A/A', 'A-T', 'A:T', 'A;T', 'A|T' -> ('A','A') or ('A','T')
    Returns None if not recognized.
    """
    if pd.isna(genotype):
        return None
    g = str(genotype).upper()
    # Split by common separators or use as two consecutive letters
    alleles = re.split(r'[/|:;\-]', g)
    alleles = [a.strip() for a in alleles if a.strip()]
    if len(alleles) == 2:
        return tuple(alleles)
    elif len(alleles) == 1 and len(alleles[0]) == 2:
        return tuple(alleles[0])
    elif len(g) == 2 and all(c in 'ACGT' for c in g):
        return tuple(g)
    else:
        return None

def detect_and_convert_markers_to_012(markers_df, sample_size=1000):
    """
    Detect marker format and convert SNP-like columns to 0/1/2 encoding when possible.
    - Keeps first column as ID.
    - Prints detected type and explanation.
    - Sets global `markers_type` (string).
    - Returns (converted_df, markers_type).
    """
    if not isinstance(markers_df, pd.DataFrame):
        raise TypeError("markers_df must be a pandas DataFrame")

    if markers_df.shape[1] == 0:
        raise ValueError("Empty dataframe (no columns).")

    id_col = markers_df.columns[0]
    data_cols = list(markers_df.columns[1:])
    if not data_cols:
        print("No marker columns found (only ID column). Nothing to convert.")
        globals()['markers_type'] = None
        return markers_df.copy(), None

    # sampling helper
    def sample_values():
        stacked = markers_df[data_cols].astype(str).stack()
        if len(stacked) == 0:
            return []
        if len(stacked) <= sample_size:
            return stacked.tolist()
        return stacked.sample(n=sample_size, random_state=1).tolist()

    samples = sample_values()

    # regex patterns
    re_numeric_012 = re.compile(r'^[0-2]$')
    re_numeric_01 = re.compile(r'^[01]$')
    re_vcf_gt = re.compile(r'^[01][\/|][01]$')   # 0/0 0/1 1/1 or phased
    re_two_letters = re.compile(r'^[ACGTacgt]{2}$')
    re_slash_numeric = re.compile(r'^\s*\d+\s*\/\s*\d+\s*$')
    # modified to detect A/T, A-T, A:T, A;T etc.
    re_slash_alleles = re.compile(r'^[ACGTacgt]\s*[/|:;\-]?\s*[ACGTacgt]$')
    re_single_letter = re.compile(r'^[ACGTacgt]$')
    re_iupac = re.compile(r'^[RYSWKMryswkm]$')

    counts = Counter()
    for v in samples:
        s = str(v).strip()
        if s == '' or s.lower() == 'nan' or s.lower() == 'none':
            counts['missing'] += 1
            continue
        if '/' in s:
            if re_vcf_gt.match(s):
                counts['vcf_gt'] += 1
            elif re_slash_numeric.match(s):
                counts['microsatellite_like'] += 1
            elif re_slash_alleles.match(s):
                counts['slash_allelic'] += 1
            else:
                counts['slash_other'] += 1
        elif re_numeric_012.match(s):
            counts['numeric_012'] += 1
        elif re_numeric_01.match(s):
            counts['numeric_01'] += 1
        elif re_two_letters.match(s):
            counts['two_letter_alleles'] += 1
        elif re_single_letter.match(s):
            counts['single_letter'] += 1
        elif re_iupac.match(s):
            counts['iupac'] += 1
        else:
            counts['other'] += 1

    # Decide overall type (simple priority rules)
    most_common_count = counts.most_common(1)[0][1] if counts else 0

    if counts['microsatellite_like'] and counts['microsatellite_like'] >= most_common_count:
        markers_type = 'microsatellites'
        explanation = "Detected numeric-slash entries (e.g. '150/152') -> microsatellites."
    elif counts['vcf_gt'] and counts['vcf_gt'] >= most_common_count:
        markers_type = 'vcf_gt'
        explanation = "Detected VCF GT style (e.g. '0/1','1/1')."
    elif counts['numeric_012'] and counts['numeric_012'] >= most_common_count:
        markers_type = 'numeric_012'
        explanation = "Detected numeric genotypes (0/1/2)."
    elif counts['two_letter_alleles'] and counts['two_letter_alleles'] >= most_common_count:
        markers_type = 'allelic_two_letter'
        explanation = "Detected two-letter allelic genotypes (AA/AG/GG)."
    elif counts['slash_allelic'] and counts['slash_allelic'] >= most_common_count:
        markers_type = 'slash_allelic'
        explanation = "Detected allelic genotypes with slash or other separators (A/A, A-T, A:T, A;T)."
    elif counts['numeric_01'] and counts['numeric_01'] >= most_common_count:
        markers_type = 'presence_absence'
        explanation = "Detected presence/absence (0/1). Not auto-converted."
    elif counts['iupac'] and counts['iupac'] >= most_common_count:
        markers_type = 'iupac'
        explanation = "Detected IUPAC codes (R/Y...)."
    else:
        markers_type = 'unknown'
        explanation = "Could not confidently determine marker format."

    pretty = {
        'microsatellites': 'Microsatellites',
        'vcf_gt': 'VCF GT (0/1)',
        'numeric_012': 'Numeric SNPs (0/1/2)',
        'allelic_two_letter': 'Allelic (AA/AG/GG)',
        'slash_allelic': 'Allelic with separators (A/A, A-T, A:T, A;T)',
        'presence_absence': 'Presence/Absence (0/1)',
        'iupac': 'IUPAC codes',
        'unknown': 'Unknown'
    }
    print(f"Markers type detected: {pretty.get(markers_type, markers_type)}")
    print("Details:", explanation)
    globals()['markers_type'] = markers_type

    # If microsatellites or unknown -> do not convert automatically
    if markers_type in ('microsatellites', 'unknown'):
        print("No automatic conversion performed for microsatellites/unknown formats.")
        return markers_df.copy(), markers_type

    if markers_type == 'presence_absence':
        print("Presence/absence data detected. No automatic conversion performed (user requested to think).")
        return markers_df.copy(), markers_type

    df_out = markers_df.copy()

    # helper patterns reused
    vcf_re = re_vcf_gt

    # Convert column by column
    for col in data_cols:
        col_series = markers_df[col]
        non_null = col_series.dropna().astype(str).str.strip()
        if non_null.empty:
            df_out[col] = np.nan
            continue

        # VCF GT: 0/0 0/1 1/1  -> map to sum of alleles
        if non_null.str.match(vcf_re).all():
            def map_vcf_val(x):
                if pd.isna(x):
                    return np.nan
                s = str(x).strip()
                if not vcf_re.match(s):
                    return np.nan
                a, b = re.split(r'[\/|]', s)
                return int(a) + int(b)
            df_out[col] = col_series.map(map_vcf_val).astype('float')
            continue

        # numeric 0/1/2
        if non_null.str.fullmatch(r'[0-2]').all():
            df_out[col] = pd.to_numeric(col_series, errors='coerce').astype('float')
            continue

        # SNP with two letters or any separator
        if non_null.str.fullmatch(r'[ACGTacgt]{2}').all() or non_null.str.match(re_slash_alleles).all():
            # count alleles across column to pick REF = most frequent allele
            allele_list = []
            for g in non_null:
                norm = normalize_snp_alleles(g)
                if norm:
                    allele_list.extend(norm)
            allele_counts = Counter(allele_list)
            if not allele_counts:
                df_out[col] = np.nan
                continue
            allele_order = [a for a,_ in allele_counts.most_common()]
            ref = allele_order[0]
            alt = allele_order[1] if len(allele_order) > 1 else allele_order[0]

            def map_snp(g):
                if pd.isna(g):
                    return np.nan
                norm = normalize_snp_alleles(g)
                if not norm:
                    return np.nan
                a1, a2 = norm
                return int(a1 == alt) + int(a2 == alt)

            df_out[col] = col_series.map(map_snp).astype('float')
            print(f"Column '{col}': detected SNP genotypes; REF='{ref}', ALT='{alt}'")
            continue

        # IUPAC -> conservative map to heterozygote (1)
        if non_null.str.fullmatch(r'[RYSWKMryswkm]').all():
            def map_iupac(x):
                if pd.isna(x):
                    return np.nan
                s = str(x).upper()
                if s in _IUPAC:
                    return 1.0
                return np.nan
            df_out[col] = col_series.map(map_iupac).astype('float')
            continue

        # Fallback: try to coerce numeric
        coerced = pd.to_numeric(col_series, errors='coerce')
        if coerced.dropna().empty:
            df_out[col] = col_series.replace({'nan': np.nan})
            print(f"Column '{col}': fallback - left unchanged (non-numeric and not recognized).")
        else:
            coerced = coerced.where(coerced.isin([0,1,2]), np.nan)
            df_out[col] = coerced.astype('float')
            print(f"Column '{col}': coerced to numeric 0/1/2 where possible.")

    print("Conversion complete where possible. Global markers_type:", markers_type)
    return df_out, markers_type


# Data load

In [28]:
micros = pd.read_csv('./data_type_test/microsatellites.csv')
micros

Unnamed: 0,ID,M1,M2,M3
0,Ind1,150/152,198/204,100/102
1,Ind2,148/150,200/200,102/104
2,Ind3,150/150,198/198,100/104
3,Ind4,152/152,204/204,104/104


In [29]:
snps_012 = pd.read_csv('./data_type_test/snps_012.csv')
snps_012

Unnamed: 0,ID,SNP1,SNP2,SNP3
0,Ind1,0,1,2
1,Ind2,1,1,0
2,Ind3,2,0,1
3,Ind4,1,2,2


In [30]:
snps_allelic = pd.read_csv('./data_type_test/snps_allelic_twoletter.csv')
snps_allelic

Unnamed: 0,ID,SNP1,SNP2,SNP3
0,Ind1,AA,AG,GG
1,Ind2,AG,GG,AA
2,Ind3,GG,AG,AG
3,Ind4,AA,AA,GG


In [31]:
snps_vcf = pd.read_csv('./data_type_test/snps_vcf.csv')
snps_vcf

Unnamed: 0,ID,SNP1,SNP2,SNP3
0,Ind1,0/0,0/1,1/1
1,Ind2,0/1,0/1,0/0
2,Ind3,1/1,0/0,0/1
3,Ind4,0/0,1/1,1/1


In [32]:
snp_slash_format = pd.read_csv('./data_type_test/snp_slash_format.csv')
snp_slash_format

Unnamed: 0,sample,SNP1,SNP2,SNP3
0,ind1,A/A,C/C,G/G
1,ind2,A/G,C/T,G/A
2,ind3,G/G,T/T,A/A


In [41]:
snp_variable_format = pd.read_csv('./data_type_test/snps_semicolon_format.csv')

In [45]:
snp_variable_format = snp_variable_format.drop('pop', axis=1)
snp_variable_format.head(3)

Unnamed: 0.1,Unnamed: 0,100372750-33-A/G,100352026-35-G/A,100398603-66-T/A,100282003-7-C/A,100268131-40-A/G,100346786-19-C/T,100385025-37-A/G,100454812-47-G/T,100249878-21-T/C,...,100472466-7-G/A,100269708-12-A/G,100247673-6-C/T,100400729-21-T/A,100249247-8-C/A,100493546-6-A/T,100250134-41-G/A,100247596-22-T/C,100401377-8-T/C,100318416-9-C/G
0,Cly 1,G;G,G;G,A;A,A;A,A;A,C;C,G;G,G;G,T;T,...,G;G,G;G,T;T,A;A,A;A,T;T,G;G,T;T,C;C,C;C
1,J-9,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G
2,H-29,A;A,A;A,T;T,C;C,G;G,T;T,A;A,T;T,T;T,...,A;A,G;G,C;C,T;T,C;C,A;A,A;A,T;T,T;T,G;G


# Tests

In [46]:
detect_and_convert_markers_to_012(micros)

Markers type detected: Microsatellites
Details: Detected numeric-slash entries (e.g. '150/152') -> microsatellites.
No automatic conversion performed for microsatellites/unknown formats.


(     ID       M1       M2       M3
 0  Ind1  150/152  198/204  100/102
 1  Ind2  148/150  200/200  102/104
 2  Ind3  150/150  198/198  100/104
 3  Ind4  152/152  204/204  104/104,
 'microsatellites')

In [47]:
detect_and_convert_markers_to_012(snps_012)

Markers type detected: Numeric SNPs (0/1/2)
Details: Detected numeric genotypes (0/1/2).
Conversion complete where possible. Global markers_type: numeric_012


(     ID  SNP1  SNP2  SNP3
 0  Ind1   0.0   1.0   2.0
 1  Ind2   1.0   1.0   0.0
 2  Ind3   2.0   0.0   1.0
 3  Ind4   1.0   2.0   2.0,
 'numeric_012')

In [48]:
detect_and_convert_markers_to_012(snps_allelic)

Markers type detected: Allelic (AA/AG/GG)
Details: Detected two-letter allelic genotypes (AA/AG/GG).
Column 'SNP1': detected SNP genotypes; REF='A', ALT='G'
Column 'SNP2': detected SNP genotypes; REF='A', ALT='G'
Column 'SNP3': detected SNP genotypes; REF='G', ALT='A'
Conversion complete where possible. Global markers_type: allelic_two_letter


(     ID  SNP1  SNP2  SNP3
 0  Ind1   0.0   1.0   0.0
 1  Ind2   1.0   2.0   2.0
 2  Ind3   2.0   1.0   1.0
 3  Ind4   0.0   0.0   0.0,
 'allelic_two_letter')

In [49]:
detect_and_convert_markers_to_012(snps_vcf)

Markers type detected: VCF GT (0/1)
Details: Detected VCF GT style (e.g. '0/1','1/1').
Conversion complete where possible. Global markers_type: vcf_gt


(     ID  SNP1  SNP2  SNP3
 0  Ind1   0.0   1.0   2.0
 1  Ind2   1.0   1.0   0.0
 2  Ind3   2.0   0.0   1.0
 3  Ind4   0.0   2.0   2.0,
 'vcf_gt')

In [50]:
detect_and_convert_markers_to_012(snp_slash_format)

Markers type detected: Allelic with separators (A/A, A-T, A:T, A;T)
Details: Detected allelic genotypes with slash or other separators (A/A, A-T, A:T, A;T).
Column 'SNP1': detected SNP genotypes; REF='A', ALT='G'
Column 'SNP2': detected SNP genotypes; REF='C', ALT='T'
Column 'SNP3': detected SNP genotypes; REF='G', ALT='A'
Conversion complete where possible. Global markers_type: slash_allelic


(  sample  SNP1  SNP2  SNP3
 0   ind1   0.0   0.0   0.0
 1   ind2   1.0   1.0   1.0
 2   ind3   2.0   2.0   2.0,
 'slash_allelic')

In [52]:
detect_and_convert_markers_to_012(snp_variable_format)

Markers type detected: Unknown
Details: Could not confidently determine marker format.
No automatic conversion performed for microsatellites/unknown formats.


(   Unnamed: 0 100372750-33-A/G 100352026-35-G/A 100398603-66-T/A  \
 0       Cly 1              G;G              G;G              A;A   
 1         J-9              A;A              A;A              T;T   
 2        H-29              A;A              A;A              T;T   
 3        C-10              A;A              A;A              T;T   
 4         D-7              A;A              A;A              T;T   
 ..        ...              ...              ...              ...   
 64       I-5B              A;A              A;A              T;T   
 65     Vin 34              A;A              A;A              T;T   
 66    CVLC-23              A;A              A;A              T;T   
 67       E-18              A;A              A;A              T;T   
 68       F-18              A;A              A;A              T;T   
 
    100282003-7-C/A 100268131-40-A/G 100346786-19-C/T 100385025-37-A/G  \
 0              A;A              A;A              C;C              G;G   
 1              C;C   

# Errors load

In [13]:
mixed_types_error = pd.read_csv('./data_type_test/mixed_types_error.csv')
mixed_types_error

Unnamed: 0,ID,SNP1,SNP2
0,A,0,150/152
1,B,AA,0/1
2,C,1,GG
3,D,AG,1


In [14]:
presence_absence = pd.read_csv('./data_type_test/presence_absence.csv')
presence_absence

Unnamed: 0,ID,M1,M2,M3
0,Ind1,1.0,0,1.0
1,Ind2,0.0,0,1.0
2,Ind3,1.0,1,
3,Ind4,,1,0.0


In [15]:
malformed_entries = pd.read_csv('./data_type_test/malformed_entries.csv')
malformed_entries

Unnamed: 0,ID,L1,L2
0,s1,A/A,0/0
1,s2,A;G,0|1
2,s3,??,1/1
3,s4,A/G,not_known


In [16]:
microsatellite_malformed = pd.read_csv('./data_type_test/microsatellite_malformed.csv')
microsatellite_malformed

Unnamed: 0,ID,M1,M2
0,i1,150/152,200/200
1,i2,148/150,198/198
2,i3,150/A,-
3,i4,152/152,204/204


# Errors Test

In [17]:
detect_and_convert_markers_to_012(mixed_types_error)

Markers type detected: Numeric SNPs (0/1/2)
Details: Detected numeric genotypes (0/1/2).
Column 'SNP1': coerced to numeric 0/1/2 where possible.
Column 'SNP2': coerced to numeric 0/1/2 where possible.
Conversion complete where possible. Global markers_type: numeric_012


(  ID  SNP1  SNP2
 0  A   0.0   NaN
 1  B   NaN   NaN
 2  C   1.0   NaN
 3  D   NaN   1.0,
 'numeric_012')

In [18]:
detect_and_convert_markers_to_012(presence_absence)

Markers type detected: Unknown
Details: Could not confidently determine marker format.
No automatic conversion performed for microsatellites/unknown formats.


(     ID   M1  M2   M3
 0  Ind1  1.0   0  1.0
 1  Ind2  0.0   0  1.0
 2  Ind3  1.0   1  NaN
 3  Ind4  NaN   1  0.0,
 'unknown')

In [19]:
detect_and_convert_markers_to_012(malformed_entries)

Markers type detected: Unknown
Details: Could not confidently determine marker format.
No automatic conversion performed for microsatellites/unknown formats.


(   ID   L1         L2
 0  s1  A/A        0/0
 1  s2  A;G        0|1
 2  s3   ??        1/1
 3  s4  A/G  not_known,
 'unknown')

In [20]:
detect_and_convert_markers_to_012(microsatellite_malformed)

Markers type detected: Microsatellites
Details: Detected numeric-slash entries (e.g. '150/152') -> microsatellites.
No automatic conversion performed for microsatellites/unknown formats.


(   ID       M1       M2
 0  i1  150/152  200/200
 1  i2  148/150  198/198
 2  i3    150/A        -
 3  i4  152/152  204/204,
 'microsatellites')