In [1]:
import pandas as pd
import os
my_heritage = pd.read_csv(os.path.join("FakeGenome", "MyHeritage_raw_dna_data_fake.csv"), header=12, names=[
    "rsid", "chromosome", "position", "result"
], dtype={
    "rsid": str, "chromosome": str, "position": int, "result": str
})
my_heritage["allele1"] = my_heritage["result"].str[:1]
my_heritage["allele2"] = my_heritage["result"].str[1:]
my_heritage

Unnamed: 0,rsid,chromosome,position,result,allele1,allele2
0,rs72631887,1,835092,AG,A,G
1,rs4970383,1,838555,AA,A,A
2,rs4475691,1,846808,AA,A,A
3,rs72631889,1,851390,GG,G,G
4,rs7537756,1,854250,GG,G,G
...,...,...,...,...,...,...
609329,rs796720893,Y,58994369,TT,T,T
609330,rs4893683,Y,58995039,CC,C,C
609331,rs796479460,Y,58995204,TT,T,T
609332,rs79602159,Y,58997068,GG,G,G


In [2]:
ancestry = pd.read_csv(os.path.join("FakeGenome", "AncestryDNA_fake.txt"), sep="\t", header=18, dtype={
    "rsid": str, "chromosome": str, "position": int, "allele1": str, "allele2": str
})
ancestry["result"] = ancestry["allele1"] + ancestry["allele2"]
ancestry

Unnamed: 0,rsid,chromosome,position,allele1,allele2,result
0,rs3131972,1,752721,G,T,GT
1,rs114525117,1,759036,G,G,GG
2,rs4040617,1,779322,A,A,AA
3,rs141175086,1,780397,C,C,CC
4,rs115093905,1,787173,A,A,AA
...,...,...,...,...,...,...
677859,rs41534744,26,16129,G,G,GG
677860,rs41419246,26,16145,T,T,TT
677861,rs41466049,26,16162,T,T,TT
677862,rs41355449,26,16327,G,G,GG


In [3]:
fake23andme = pd.read_csv(os.path.join("FakeGenome", "23andme_fake.txt"), sep="\t", header=19, names=[
    "rsid", "chromosome", "position", "result"
], dtype={
    "rsid": str, "chromosome": str, "position": int, "genotype": str
})
fake23andme["allele1"] = fake23andme["result"].str[:1]
fake23andme["allele2"] = fake23andme["result"].str[1:]
fake23andme

Unnamed: 0,rsid,chromosome,position,result,allele1,allele2
0,rs4477212,1,82154,AA,A,A
1,rs3094315,1,752566,CC,C,C
2,rs3131972,1,752721,GG,G,G
3,rs12124819,1,776546,AA,A,A
4,rs11240777,1,798959,GT,G,T
...,...,...,...,...,...,...
963034,i701671,MT,16526,GT,G,T
963035,i4990307,MT,16527,AA,A,A
963036,i4000756,MT,16540,CT,C,T
963037,i4000755,MT,16548,AG,A,G


In [4]:
my_heritage["rsid"].isin(ancestry["rsid"]).value_counts()

False    405901
True     203433
Name: rsid, dtype: int64

In [5]:
my_heritage["rsid"].isin(fake23andme["rsid"]).value_counts()

False    379753
True     229581
Name: rsid, dtype: int64

In [6]:
ancestry["rsid"].isin(fake23andme["rsid"]).value_counts()

True     467876
False    209988
Name: rsid, dtype: int64

In [7]:
filtered_df = my_heritage[~my_heritage["rsid"].isin(ancestry["rsid"])]
filtered_df[~filtered_df["rsid"].isin(fake23andme["rsid"])]

Unnamed: 0,rsid,chromosome,position,result,allele1,allele2
0,rs72631887,1,835092,AG,A,G
3,rs72631889,1,851390,GG,G,G
6,rs376747791,1,863130,AG,A,G
8,rs148327885,1,878331,CC,C,C
9,rs143853699,1,879911,CC,C,C
...,...,...,...,...,...,...
609329,rs796720893,Y,58994369,TT,T,T
609330,rs4893683,Y,58995039,CC,C,C
609331,rs796479460,Y,58995204,TT,T,T
609332,rs79602159,Y,58997068,GG,G,G


In [8]:
filtered_df = ancestry[~ancestry["rsid"].isin(my_heritage["rsid"])]
filtered_df[~filtered_df["rsid"].isin(fake23andme["rsid"])]

Unnamed: 0,rsid,chromosome,position,allele1,allele2,result
1,rs114525117,1,759036,G,G,GG
2,rs4040617,1,779322,A,A,AA
3,rs141175086,1,780397,C,C,CC
4,rs115093905,1,787173,A,A,AA
7,rs4422948,1,835499,A,G,AG
...,...,...,...,...,...,...
677857,rs199474699,26,15990,A,A,AA
677858,rs386420030,26,16086,G,G,GG
677859,rs41534744,26,16129,G,G,GG
677862,rs41355449,26,16327,G,G,GG


In [9]:
filtered_df = fake23andme[~fake23andme["rsid"].isin(my_heritage["rsid"])]
filtered_df[~filtered_df["rsid"].isin(ancestry["rsid"])]

Unnamed: 0,rsid,chromosome,position,result,allele1,allele2
0,rs4477212,1,82154,AA,A,A
1,rs3094315,1,752566,CC,C,C
3,rs12124819,1,776546,AA,A,A
14,rs28415373,1,893981,CC,C,C
24,rs9697457,1,934345,AA,A,A
...,...,...,...,...,...,...
963034,i701671,MT,16526,GT,G,T
963035,i4990307,MT,16527,AA,A,A
963036,i4000756,MT,16540,CT,C,T
963037,i4000755,MT,16548,AG,A,G


In [10]:
my_heritage["ancestry_overlap"] = my_heritage["rsid"].isin(ancestry["rsid"])
my_heritage["23andme_overlap"] = my_heritage["rsid"].isin(fake23andme["rsid"])
my_heritage["full_overlap"] = my_heritage["ancestry_overlap"] & my_heritage["23andme_overlap"]
my_heritage["full_overlap"].value_counts()

False    438324
True     171010
Name: full_overlap, dtype: int64

In [11]:
my_heritage.query("full_overlap")["rsid"]

2          rs4475691
4          rs7537756
5         rs13302982
7          rs1110052
11         rs3748597
             ...    
609124    rs17842518
609157    rs13303755
609162    rs16980610
609247     rs2268588
609266     rs9786795
Name: rsid, Length: 171010, dtype: object

In [12]:
my_heritage = pd.read_csv(os.path.join("FakeGenome", "MyHeritage_raw_dna_data_fake.csv"), header=12, names=[
    "rsid", "chromosome", "position", "result"
], dtype={
    "rsid": str, "chromosome": str, "position": int, "result": str
})
my_heritage["allele1"] = my_heritage["result"].str[:1]
my_heritage["allele2"] = my_heritage["result"].str[1:]
rsid_indexed_my_heritage = my_heritage.set_index("rsid")
ancestry = pd.read_csv(os.path.join("FakeGenome", "AncestryDNA_fake.txt"), sep="\t", header=18, dtype={
    "rsid": str, "chromosome": str, "position": int, "allele1": str, "allele2": str
})
ancestry["result"] = ancestry["allele1"] + ancestry["allele2"]
rsid_indexed_ancestry = ancestry.set_index("rsid")
fake23andme = pd.read_csv(os.path.join("FakeGenome", "23andme_fake.txt"), sep="\t", header=19, names=[
    "rsid", "chromosome", "position", "result"
], dtype={
    "rsid": str, "chromosome": str, "position": int, "genotype": str
})
fake23andme["allele1"] = fake23andme["result"].str[:1]
fake23andme["allele2"] = fake23andme["result"].str[1:]
rsid_indexed_23andme = fake23andme.set_index("rsid")
joined_df = rsid_indexed_my_heritage.join(
    rsid_indexed_23andme, how="inner", lsuffix="_my_heritage", rsuffix="_23andme").join(
    rsid_indexed_ancestry, how="inner")
joined_df = joined_df.rename(columns={
    "chromosome": "chromosome_ancestry", "position": "position_ancestry", 
    "result": "result_ancestry",
    "allele1": "allele1_ancestry", "allele2": "allele2_ancestry"
})
for tester in ["_my_heritage", "_ancestry", "_23andme"]:
    joined_df["result" + tester] = (joined_df["allele1" + tester] + joined_df["allele2" + tester]).apply(lambda x: x if len(x) < 2 or x < x[1] + x[0] else x[1] + x[0])
    joined_df["result" + tester] = joined_df["result" + tester].apply(lambda x: x + x if len(x) < 2 else x)
joined_df["allele_match"] = ((joined_df["result_ancestry"] == joined_df["result_23andme"]) & 
                             (joined_df["result_ancestry"] == joined_df["result_my_heritage"]) & 
                             (joined_df["result_23andme"] == joined_df["result_my_heritage"]))
print(joined_df["allele_match"].value_counts())
joined_df.query("allele_match == False").to_csv(os.path.join("FakeGenome", "error_table.csv"), sep="\t", header=True)
joined_df.query("allele_match == False")

False    166945
True       4065
Name: allele_match, dtype: int64


Unnamed: 0_level_0,chromosome_my_heritage,position_my_heritage,result_my_heritage,allele1_my_heritage,allele2_my_heritage,chromosome_23andme,position_23andme,result_23andme,allele1_23andme,allele2_23andme,chromosome_ancestry,position_ancestry,allele1_ancestry,allele2_ancestry,result_ancestry,allele_match
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
rs4475691,1,846808,AA,A,A,1,846808,CT,C,T,1,846808,C,T,CT,False
rs7537756,1,854250,GG,G,G,1,854250,GG,G,G,1,854250,A,G,AG,False
rs13302982,1,861808,AG,A,G,1,861808,AG,A,G,1,861808,T,T,TT,False
rs1110052,1,873558,CT,C,T,1,873558,AG,A,G,1,873558,T,T,TT,False
rs3748597,1,888659,TT,T,T,1,888659,AG,A,G,1,888659,C,C,CC,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rs17842518,Y,23443971,AG,A,G,Y,23443971,CC,C,C,24,23443971,A,A,AA,False
rs13303755,Y,23612197,AA,A,A,Y,23612197,GG,G,G,24,23612197,A,G,AG,False
rs16980610,Y,23634362,TT,T,T,Y,23634362,TT,T,T,24,23634362,C,C,CC,False
rs2268588,Y,24510581,AA,A,A,Y,24510581,GG,G,G,24,24510581,G,G,GG,False


In [13]:
print(joined_df["result_ancestry"].value_counts())
print(joined_df["result_23andme"].value_counts())
print(joined_df["result_my_heritage"].value_counts())

GG    29960
CC    29747
TT    27023
AA    26791
AG    23707
CT    23356
AC     5193
GT     5018
CG       75
00       51
AT       51
II       34
DD        4
Name: result_ancestry, dtype: int64
CC    29863
GG    29640
AA    26883
TT    26807
AG    23589
CT    23531
AC     5242
GT     4901
--      377
CG       87
AT       45
II       40
DD        5
Name: result_23andme, dtype: int64
CC    29417
GG    29297
TT    26985
AA    26926
AG    23726
CT    23537
AC     5110
GT     5002
--      818
CG       89
AT       52
II       48
DD        3
Name: result_my_heritage, dtype: int64


In [14]:
joined_df["bad_error"] = (
    (joined_df["allele1_ancestry"].isin(["A", "C", "G", "T"]) & 
     joined_df["allele2_ancestry"].isin(["A", "C", "G", "T"]) &
     joined_df["allele1_23andme"].isin(["A", "C", "G", "T"]) & 
     joined_df["allele2_23andme"].isin(["A", "C", "G", "T"]) &
     joined_df["result_ancestry"] != joined_df["result_23andme"]) & 
    (joined_df["allele1_ancestry"].isin(["A", "C", "G", "T"]) & 
     joined_df["allele2_ancestry"].isin(["A", "C", "G", "T"]) &
     joined_df["allele1_my_heritage"].isin(["A", "C", "G", "T"]) & 
     joined_df["allele2_my_heritage"].isin(["A", "C", "G", "T"]) &
     joined_df["result_ancestry"] != joined_df["result_my_heritage"]) & 
    (joined_df["allele1_23andme"].isin(["A", "C", "G", "T"]) & 
     joined_df["allele2_23andme"].isin(["A", "C", "G", "T"]) &
     joined_df["allele1_my_heritage"].isin(["A", "C", "G", "T"]) & 
     joined_df["allele2_my_heritage"].isin(["A", "C", "G", "T"]) &
     joined_df["result_23andme"] == joined_df["result_my_heritage"])
    )
joined_df["bad_error"].value_counts()

False    171010
Name: bad_error, dtype: int64

In [15]:
my_heritage = pd.read_csv(os.path.join("FakeGenome", "MyHeritage_raw_dna_data_fake.csv"), header=12, names=[
    "rsid", "chromosome", "position", "result"
], dtype={
    "rsid": str, "chromosome": str, "position": int, "result": str
})
my_heritage["allele1"] = my_heritage["result"].str[:1]
my_heritage["allele2"] = my_heritage["result"].str[1:]
rsid_indexed_my_heritage = my_heritage.set_index("rsid")
second_sample_my_heritage = pd.read_csv(os.path.join("FakeGenome", "MyHeritage_raw_dna_data_fake.csv"), header=12, names=[
    "rsid", "chromosome", "position", "result"
], dtype={
    "rsid": str, "chromosome": str, "position": int, "result": str
})
second_sample_my_heritage["allele1"] = second_sample_my_heritage["result"].str[:1]
second_sample_my_heritage["allele2"] = second_sample_my_heritage["result"].str[1:]
rsid_indexed_second_sample_my_heritage = second_sample_my_heritage.set_index("rsid")

bl = ["A", "C", "G", "T"]
def join_and_clean(first_sample, second_sample):
    df = first_sample.join(second_sample, how="outer", lsuffix="_first_sample", rsuffix="_second_sample")
    print((df["chromosome_first_sample"] == df["chromosome_second_sample"]).value_counts())
    df = df.rename(columns={"chromosome_first_sample": "chromosome"}).drop(columns=["chromosome_second_sample"])
    print((df["position_first_sample"] == df["position_second_sample"]).value_counts())
    df = df.rename(columns={"position_first_sample": "position"}).drop(columns=["position_second_sample"])
    for person in ["_first_sample", "_second_sample"]:
        df["result" + person] = (df["allele1" + person] + df["allele2" + person]).apply(lambda x: x if len(x) < 2 or x < x[1] + x[0] else x[1] + x[0])
        df["result" + person] = df["result" + person].apply(lambda x: x + x if len(x) < 2 else x)
    df["results_match"] = (df["result_first_sample"] == df["result_second_sample"])
    print(df["results_match"].value_counts())
    df["result"] = (df["result_first_sample"] + df["result_second_sample"]).apply(
        lambda x: x[0:2] if x[2] not in bl or x[3] not in bl else x[2:4])
    return df

my_heritage = join_and_clean(rsid_indexed_my_heritage, rsid_indexed_second_sample_my_heritage)
my_heritage.query("results_match == False").to_csv(os.path.join("FakeGenome", "errors_my_heritage.csv"), sep="\t", header=True)
my_heritage = my_heritage.drop(columns=["result_first_sample", "allele1_first_sample", "allele2_first_sample", "result_second_sample", "allele1_second_sample", "allele2_second_sample", "results_match"])
my_heritage

True    609334
dtype: int64
True    609334
dtype: int64
True    609334
Name: results_match, dtype: int64


Unnamed: 0_level_0,chromosome,position,result
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rs72631887,1,835092,AG
rs4970383,1,838555,AA
rs4475691,1,846808,AA
rs72631889,1,851390,GG
rs7537756,1,854250,GG
...,...,...,...
rs796720893,Y,58994369,TT
rs4893683,Y,58995039,CC
rs796479460,Y,58995204,TT
rs79602159,Y,58997068,GG


In [16]:
ancestry = pd.read_csv(os.path.join("FakeGenome", "AncestryDNA_fake.txt"), sep="\t", header=18, dtype={
    "rsid": str, "chromosome": str, "position": int, "allele1": str, "allele2": str
})
ancestry["result"] = ancestry["allele1"] + ancestry["allele2"]
rsid_indexed_ancestry = ancestry.set_index("rsid")
second_sample_ancestry = pd.read_csv(os.path.join("FakeGenome", "AncestryDNA_fake.txt"), sep="\t", header=18, dtype={
    "rsid": str, "chromosome": str, "position": int, "allele1": str, "allele2": str
})
second_sample_ancestry["result"] = second_sample_ancestry["allele1"] + second_sample_ancestry["allele2"]
rsid_indexed_second_sample_ancestry = second_sample_ancestry.set_index("rsid")
ancestry = join_and_clean(rsid_indexed_ancestry, rsid_indexed_second_sample_ancestry)
ancestry.query("results_match == False").to_csv(os.path.join("FakeGenome", "errors_ancestry.csv"), sep="\t", header=True)
ancestry = ancestry.drop(columns=["result_first_sample", "allele1_first_sample", "allele2_first_sample", "result_second_sample", "allele1_second_sample", "allele2_second_sample", "results_match"])
ancestry

True    677864
dtype: int64
True    677864
dtype: int64
True    677864
Name: results_match, dtype: int64


Unnamed: 0_level_0,chromosome,position,result
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rs3131972,1,752721,GT
rs114525117,1,759036,GG
rs4040617,1,779322,AA
rs141175086,1,780397,CC
rs115093905,1,787173,AA
...,...,...,...
rs41534744,26,16129,GG
rs41419246,26,16145,TT
rs41466049,26,16162,TT
rs41355449,26,16327,GG


In [17]:
fake23andme = pd.read_csv(os.path.join("FakeGenome", "23andme_fake.txt"), sep="\t", header=19, names=[
    "rsid", "chromosome", "position", "result"
], dtype={
    "rsid": str, "chromosome": str, "position": int, "genotype": str
})
fake23andme["allele1"] = fake23andme["result"].str[:1]
fake23andme["allele2"] = fake23andme["result"].str[1:]
df23andme = fake23andme.set_index("rsid")
df23andme["result"] = (df23andme["allele1"] + df23andme["allele2"]).apply(lambda x: x if len(x) < 2 or x < x[1] + x[0] else x[1] + x[0])
df23andme["result"] = df23andme["result"].apply(lambda x: x + x if len(x) < 2 else x)
df23andme = df23andme.drop(columns=["allele1", "allele2"])
df23andme

Unnamed: 0_level_0,chromosome,position,result
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rs4477212,1,82154,AA
rs3094315,1,752566,CC
rs3131972,1,752721,GG
rs12124819,1,776546,AA
rs11240777,1,798959,GT
...,...,...,...
i701671,MT,16526,GT
i4990307,MT,16527,AA
i4000756,MT,16540,CT
i4000755,MT,16548,AG


In [18]:
def merge_tables(a, b):
    m = a.join(b, how="outer", lsuffix="_a", rsuffix="_b")
    m["chromosome"] = m["chromosome_a"].fillna(m["chromosome_b"])
    m["position"] = m["position_a"].fillna(m["position_b"])
    m["position"] = m["position"].astype(int)
    m["result_a"] = m["result_a"].fillna(m["result_b"])
    m["result_b"] = m["result_b"].fillna(m["result_a"])
    m["result"] = (m["result_a"] + m["result_b"]).apply(
        lambda x: x[0:2] if x[2] not in bl or x[3] not in bl else x[2:4])
    return m.drop(columns=["chromosome_a", "chromosome_b", "position_a", "position_b", "result_a", "result_b"])

master = merge_tables(merge_tables(my_heritage, ancestry), df23andme)
master["chromosome"] = master["chromosome"].apply(
    lambda x: "0" + x if x in [str(i) for i in range(10)] else x)
master = master.sort_values(by=["chromosome", "position"])
master.to_csv(os.path.join("FakeGenome", "master.csv"), sep="\t", header=True)
master

Unnamed: 0_level_0,chromosome,position,result
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rs4477212,01,82154,AA
rs3094315,01,752566,CC
rs3131972,01,752721,GG
rs114525117,01,759036,GG
rs12124819,01,776546,AA
...,...,...,...
rs6568298,Y,59029728,TT
rs4047343,Y,59030922,CC
rs6568295,Y,59031513,TT
rs2334083,Y,59032331,CT


In [19]:
import numpy as np
counts = [29721, 29690, 27025, 26949, 23724, 23505, 5174, 4994, 80, 52, 53, 39, 6]
probs = [x/sum(counts) for x in counts]
ancestry_n = 677864
ancestry_fake = ancestry.copy()
ancestry_fake["result"] = ancestry["result"].apply(lambda _: np.random.choice(
    ["GG", "CC", "TT", "AA", "AG", "CT", "AC", "GT", "CG", "00", "AT", "II", "DD"], 
    size=1, replace=True, p=probs)[0])
ancestry_fake["allele1"] = ancestry_fake["result"].str[:1]
ancestry_fake["allele2"] = ancestry_fake["result"].str[1:]
ancestry_fake = ancestry_fake.drop(columns=["result"])
ancestry_fake.to_csv(os.path.join("FakeGenome", "AncestryDNA_fake.csv"), sep="\t", header=True)
ancestry_fake

Unnamed: 0_level_0,chromosome,position,allele1,allele2
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rs3131972,1,752721,C,C
rs114525117,1,759036,C,T
rs4040617,1,779322,G,G
rs141175086,1,780397,G,G
rs115093905,1,787173,T,T
...,...,...,...,...
rs41534744,26,16129,C,T
rs41419246,26,16145,A,G
rs41466049,26,16162,G,G
rs41355449,26,16327,G,G


In [20]:
import csv
counts = [29520, 29524, 26948, 26877, 23619, 23410, 5140, 4968, 819, 99, 48, 39, 5]
probs = [x/sum(counts) for x in counts]
my_heritage_fake = my_heritage.copy()
my_heritage_fake["result"] = my_heritage_fake["result"].apply(lambda _: np.random.choice(
    ["GG", "CC", "TT", "AA", "AG", "CT", "AC", "GT", "--", "CG", "AT", "II", "DD"], 
    size=1, replace=True, p=probs)[0])
my_heritage_fake = my_heritage_fake.rename(columns={
    "rsid": "RSID", "chromosome": "CHROMOSOME","position": "POSITION", "result": "RESULT"})
my_heritage_fake.to_csv(
    os.path.join("FakeGenome", "MyHeritage_raw_dna_data_fake.csv"), 
    sep=",", header=True, quotechar="\"", quoting=csv.QUOTE_ALL)
my_heritage_fake

Unnamed: 0_level_0,CHROMOSOME,POSITION,RESULT
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rs72631887,1,835092,GG
rs4970383,1,838555,AG
rs4475691,1,846808,CC
rs72631889,1,851390,GG
rs7537756,1,854250,TT
...,...,...,...
rs796720893,Y,58994369,TT
rs4893683,Y,58995039,GG
rs796479460,Y,58995204,AG
rs79602159,Y,58997068,CT


In [21]:
counts = [29680, 29632, 26986, 26930, 23637, 23465, 5149, 4973, 373, 89, 52, 39, 7]
probs = [x/sum(counts) for x in counts]
df23andme_fake = df23andme.copy()
df23andme_fake["result"] = df23andme_fake["result"].apply(lambda _: np.random.choice(
    ["GG", "CC", "TT", "AA", "AG", "CT", "AC", "GT", "--", "CG", "AT", "II", "DD"], 
    size=1, replace=True, p=probs)[0])
df23andme_fake = df23andme_fake.rename(columns={"result": "genotype"})
df23andme_fake.to_csv(
    os.path.join("FakeGenome", "23andme_fake.csv"), sep="\t", header=True)
df23andme_fake

Unnamed: 0_level_0,chromosome,position,genotype
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rs4477212,1,82154,AG
rs3094315,1,752566,AA
rs3131972,1,752721,TT
rs12124819,1,776546,GG
rs11240777,1,798959,AG
...,...,...,...
i701671,MT,16526,TT
i4990307,MT,16527,GG
i4000756,MT,16540,AC
i4000755,MT,16548,CT
