In [6]:
import pandas as pd

_base_complement = {"A":"T", "C":"G", "G":"C", "T":"A"}

def _complement(seq):
    return "".join([_base_complement[b] for b in seq])

def canonic_id(chri, bp, a1, a2):
    """Build an identifier for a SNP that ignores strand and allele order.

    The allele pair is expanded into its four equivalent spellings (as given,
    complemented, swapped, and complemented + swapped). The alphabetically
    smallest spelling is used in the ID, so A/G, G/A, T/C and C/T all yield
    "<chri>:<bp>_AG".

    Returns None when either allele is not exactly one of 'A', 'T', 'C', 'G',
    or when the four spellings are not all distinct (identical alleles, or
    the self-complementary pairs A/T and C/G).
    """
    valid_bases = ('A', 'T', 'C', 'G')
    if a1 not in valid_bases or a2 not in valid_bases:
        return None

    pair = a1 + a2
    flipped = _complement(pair)
    spellings = {pair, flipped, pair[::-1], flipped[::-1]}
    # Fewer than four distinct spellings: the pair has no unique orientation.
    if len(spellings) != 4:
        return None
    return f"{chri}:{bp}_{min(spellings)}"

# Lower-cased chromosome labels (after the 'chr' prefix is removed) -> numeric
# code, kept as strings until the final numeric cast. The codes match the PLINK
# numeric convention: 23 = X, 24 = Y, 25 = pseudo-autosomal region of X,
# 26 = mitochondrial; -9 marks a missing chromosome.
_chr_codes = {
    'x': '23', 'x_nonpar': '23',
    'y': '24',
    'par': '25', 'par1': '25', 'par2': '25', 'x_par1': '25', 'x_par2': '25', 'xy': '25',
    'm': '26', 'mt': '26',
    'na': '-9', 'nan': '-9', 'none': '-9', '<na>': '-9', '': '-9',
}

def format_chr(chrvec):
    """Convert a Series of chromosome labels to integer chromosome codes.

    Labels are compared case-insensitively, with any 'chr' substring removed
    and surrounding whitespace stripped. Sex, pseudo-autosomal and
    mitochondrial labels are translated through `_chr_codes`; missing values
    (NaN/None, and the strings 'NA', 'nan', 'None', '' or blanks) become -9.
    Everything else must already be numeric text such as '21' or '21.0'.

    Parameters
    ----------
    chrvec : pandas.Series
        Chromosome labels; may hold strings, numbers or missing values.

    Returns
    -------
    pandas.Series
        Integer codes with the same index as `chrvec`. The input is not
        modified.

    Raises
    ------
    ValueError
        If a label is neither in `_chr_codes` nor parseable as a number.
    """
    # Record real missing values before the string conversion; how they are
    # spelled afterwards ('nan', 'None', '<NA>' or still missing) depends on
    # the dtype and the pandas version.
    missing = chrvec.isna()
    labels = (
        chrvec.astype('str')
        .str.lower()
        .str.replace('chr', '', regex=False)  # literal match, not a regex
        .str.strip()
    )
    # Whole-value replacement: only labels exactly equal to a key are changed.
    labels = labels.replace(_chr_codes)
    labels[missing] = '-9'
    # Go through float first so numeric text like '21.0' is accepted.
    return labels.astype('float').astype('int')
        
# Load the .bim variant table: tab-separated, no header row, six columns.
bim = pd.read_csv(
    'chr21.bim',
    sep='\t',
    header=None,
    names=['CHR', 'SNP', 'GP', 'BP', 'A1', 'A2'],
)
# Upper-case both alleles so they match the A/T/C/G check inside canonic_id.
bim['A1'] = bim['A1'].str.upper()
bim['A2'] = bim['A2'].str.upper()
# Replace chromosome labels with integer codes.
bim['CHR'] = format_chr(bim['CHR'])
# One canonical ID per variant; None where canonic_id rejects the allele pair.
bim['ID'] = [
    canonic_id(*variant)
    for variant in zip(bim['CHR'], bim['BP'], bim['A1'], bim['A2'])
]


In [8]:
# Canonical ID for every ordered pair of bases, using dummy chromosome 0 and
# position 0, to show which pairs collapse together and which are rejected.
alleles = ['A', 'T', 'C', 'G']
alleles2 = [(first, second) for first in alleles for second in alleles]
{first + second: canonic_id(0, 0, first, second) for first, second in alleles2}

{'AA': None,
 'AT': None,
 'AC': '0:0_AC',
 'AG': '0:0_AG',
 'TA': None,
 'TT': None,
 'TC': '0:0_AG',
 'TG': '0:0_AC',
 'CA': '0:0_AC',
 'CT': '0:0_AG',
 'CC': None,
 'CG': None,
 'GA': '0:0_AG',
 'GT': '0:0_AC',
 'GC': None,
 'GG': None}