# Encode SNPs for ML
- Distinguish homozygous (like AA, GG) from heterozygous (like AG, CT)
- Remove ambiguous or no-call entries (such as "--" or any other invalid genotype)
- Create a wide binary/categorical feature representation (e.g., rs12345_AA = 1, rs12345_AG = 0, etc.)


## Fix the file loading
Parse the CSV with the correct column headers and field separator.

In [None]:
import pandas as pd

In [4]:
def load_and_clean_dna(filepath):
    """Read a raw SNP export and keep only rows with a valid two-base genotype.

    The file is parsed as comma-separated text with '#' as the comment
    marker. Header names are stripped of surrounding whitespace, rows that
    contain any missing value are dropped, and a row survives only when its
    RESULT is exactly two characters drawn from A/T/C/G (so no-calls such as
    '--' are removed).

    Args:
        filepath: Passed straight to ``pd.read_csv``.

    Returns:
        The filtered DataFrame, re-indexed from 0.
    """
    valid_bases = {'A', 'T', 'C', 'G'}

    def is_clean_call(value):
        # Compare on the string form so non-string cells are simply rejected.
        text = str(value)
        return len(text) == 2 and set(text).issubset(valid_bases)

    raw = pd.read_csv(filepath, comment='#', sep=',')

    # Headers may carry stray spaces around the column names.
    raw.columns = raw.columns.str.strip()

    # Discard incomplete rows first, then ambiguous / invalid genotype calls.
    complete = raw.dropna()
    keep = complete['RESULT'].apply(is_clean_call)
    cleaned = complete[keep]

    print("✅ Cleaned SNP data:", cleaned.shape)
    return cleaned.reset_index(drop=True)

In [None]:
# Load the raw export and keep only the rows that pass load_and_clean_dna's
# missing-value and genotype filters.
df = load_and_clean_dna("data/raw/MyHeritage_raw_dna_data.csv")
# Preview the first rows of the cleaned table.
df.head()

## Encode SNPs (Wide format)
Encode each SNP as its own column like rs1234_AA, rs1234_AG, etc.

I. Binary presence (1) for the observed genotype

II. Multi-hot encoding, or a numeric encoding such as:
- 1 = Homozygous (e.g., AA, GG)
- 0.5 = Heterozygous (e.g., AG, CT)

### Wide binary (one-hot per genotype)

In [8]:
def wide_one_hot_encoding(df):
    """Build a single-row one-hot table with one column per observed genotype.

    Each row of ``df`` contributes a column named ``<RSID>_<RESULT>`` whose
    value is 1. The input frame is left untouched: the key and value columns
    are built on a separate frame so helper columns do not leak into the
    caller's data. Repeated RSID/genotype pairs are collapsed so the output
    has unique column labels, matching what the numeric encoder does.

    Args:
        df: DataFrame with ``RSID`` and ``RESULT`` columns.

    Returns:
        A DataFrame with one row labelled ``VALUE`` and one column per
        distinct ``<RSID>_<RESULT>`` key.
    """
    # Cast to str so the concatenation works even for non-string columns.
    geno_keys = df['RSID'].astype(str) + '_' + df['RESULT'].astype(str)

    encoded = pd.DataFrame({'GENO_KEY': geno_keys})
    encoded['VALUE'] = 1

    # Duplicate keys would otherwise become duplicate feature columns.
    wide_df = encoded.drop_duplicates().set_index('GENO_KEY').T
    print("✅ Encoded shape (wide format):", wide_df.shape)
    return wide_df

In [None]:
# Build the single-row one-hot table keyed by "<RSID>_<RESULT>".
wide_encoded_df = wide_one_hot_encoding(df)

### Numeric encoding (1 for homozygous, 0.5 for heterozygous)

In [10]:
def encode_genotype_numeric(df):
    """Encode each SNP by zygosity: 1.0 if homozygous, 0.5 if heterozygous.

    The encoding is computed on a private copy of the two needed columns, so
    the caller's frame does not gain an ``ENCODED`` column.

    Note: each RESULT is indexed at positions 0 and 1, so it must be a string
    of at least two characters. Any value whose first two characters match
    (including an uncleaned no-call like '--') encodes as 1.0, so pass
    already-cleaned genotypes.

    Args:
        df: DataFrame with ``RSID`` and ``RESULT`` columns.

    Returns:
        A DataFrame with one row labelled ``ENCODED`` and one column per
        RSID. Exact duplicate (RSID, value) pairs are collapsed; an RSID
        that appears with two different values keeps both columns.
    """
    def encode_geno(gt):
        return 1.0 if gt[0] == gt[1] else 0.5

    # Work on a copy so the input frame is not modified in place.
    work = df[['RSID', 'RESULT']].copy()
    work['ENCODED'] = work['RESULT'].apply(encode_geno)

    numeric_df = work[['RSID', 'ENCODED']].drop_duplicates().set_index('RSID').T
    print("✅ Encoded shape (numeric):", numeric_df.shape)
    return numeric_df

In [None]:
# Build the single-row zygosity table (1.0 homozygous, 0.5 heterozygous) keyed by RSID.
numeric_encoded_df = encode_genotype_numeric(df)

# Normalize RSIDs, Chromosomes, and Positions
- Remove quotes from all fields (e.g., "1" → 1)
- Ensure RSID, CHROMOSOME, POSITION, and RESULT are in clean, properly typed columns
- Remove whitespace, fix datatypes

## Normalize SNP Identifiers


In [12]:
def normalize_snp_fields(df):
    """Return a copy of ``df`` with clean, properly typed SNP columns.

    RSID, CHROMOSOME, POSITION and RESULT are each cast to string, have
    double quotes removed and surrounding whitespace stripped. POSITION is
    then cast to integer. The input frame is not modified.

    Args:
        df: DataFrame with ``RSID``, ``CHROMOSOME``, ``POSITION`` and
            ``RESULT`` columns.

    Returns:
        A new DataFrame with the normalized columns.

    Raises:
        ValueError: If a cleaned POSITION value cannot be parsed as an int.
    """
    df = df.copy()

    # Treat every column the same way (RSID included) so that non-string
    # columns do not fail on the .str accessor. Quotes are removed before
    # stripping so whitespace that sat inside the quotes is also removed.
    for column in ('RSID', 'CHROMOSOME', 'POSITION', 'RESULT'):
        df[column] = (
            df[column].astype(str).str.replace('"', '', regex=False).str.strip()
        )

    # Cast POSITION to integer
    df['POSITION'] = df['POSITION'].astype(int)

    print("✅ Normalized SNP data")
    print(df.head())
    return df

In [None]:
# Strip quotes/whitespace from the SNP columns and cast POSITION to int.
df_normalized = normalize_snp_fields(df)

In [None]:
# Inspect the column dtypes after normalization (POSITION is cast to int there).
print(df_normalized.dtypes)

## Save Clean SNP Data

In [None]:
# Create the output folder if needed. The CSV is written into
# "data/processed", so that full path must exist, not just "data";
# makedirs creates the intermediate "data" directory as well.
import os
os.makedirs("data/processed", exist_ok=True)

# Save clean SNP data (index=False: the positional index is not written)
df_normalized.to_csv("data/processed/personal_snp_clean.csv", index=False)
print("📁 Saved to: data/processed/personal_snp_clean.csv")

In [None]:
# Write the one-hot table. The index is written too (no index=False), so the
# row label is kept. Assumes the data/processed/ directory already exists.
wide_encoded_df.to_csv("data/processed/personal_snp_onehot.csv")
print("📁 Saved one-hot encoded SNPs to: data/processed/personal_snp_onehot.csv")

In [None]:
# Write the numeric zygosity table. The index is written too (no index=False),
# so the row label is kept. Assumes the data/processed/ directory already exists.
numeric_encoded_df.to_csv("data/processed/personal_snp_numeric.csv")
print("📁 Saved numeric SNP encoding to: data/processed/personal_snp_numeric.csv")