# Obesity Data Preprocessing

## Import packages

In [88]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Metadata consideration

In [89]:
# Never truncate wide tables: render every column of a DataFrame
pd.options.display.max_columns = None
# Read the SRA run table (one metadata row per sequencing run)
obs = pd.read_csv('../data/obesity/SraRunTable.txt')
obs.head()

Unnamed: 0,Run,Age,Assay Type,AssemblyName,AvgSpotLen,Bases,BIOMATERIAL_PROVIDER,BioProject,BioSample,BioSampleModel,Bytes,Center Name,Consent,DATASTORE filetype,DATASTORE provider,DATASTORE region,Experiment,Instrument,Isolate,Library Name,LibraryLayout,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,Sample Name,sex,SRA Study,tissue
0,SRR6996662,46.619178,AMPLICON,GCA_000001405.13,162,16736251,"Chang\, SC",PRJNA449974,SAMN08924187,Human,11018904,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931452,Ion Torrent PGM,CGMH,OBL_067,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_067,male,SRP139885,Blood
1,SRR6996663,47.221918,AMPLICON,GCA_000001405.13,164,12417372,"Chang\, SC",PRJNA449974,SAMN08924186,Human,8309201,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931451,Ion Torrent PGM,CGMH,OBL_066,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_066,male,SRP139885,Blood
2,SRR6996664,57.441096,AMPLICON,GCA_000001405.13,153,15245589,"Chang\, SC",PRJNA449974,SAMN08924185,Human,10191254,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931450,Ion Torrent PGM,CGMH,OBL_065,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_065,male,SRP139885,Blood
3,SRR6996665,49.950685,AMPLICON,GCA_000001405.13,156,22546458,"Chang\, SC",PRJNA449974,SAMN08924184,Human,14776552,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931449,Ion Torrent PGM,CGMH,OBL_064,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_064,male,SRP139885,Blood
4,SRR6996666,50.906849,AMPLICON,GCA_000001405.13,155,20378605,"Chang\, SC",PRJNA449974,SAMN08924183,Human,13410438,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931448,Ion Torrent PGM,CGMH,OBL_063,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_063,male,SRP139885,Blood


In [90]:
# Tally the two groups from the library names:
# names containing 'OBL' are counted as non-obese, the remaining names
# containing 'OBH' as obese (same precedence as an if/elif chain).
lib_name = obs['Library Name']
obl = sum(1 for name in lib_name if 'OBL' in name)
obh = sum(1 for name in lib_name if 'OBL' not in name and 'OBH' in name)
print('nonobesity, obl:', obl, '; obisity, obh:', obh)

nonobesity, obl: 64 ; obisity, obh: 75


## Generate data for training

In [91]:
# Read a vcf file
def read_vcf(vcf_path):
    """Load the variant records of a plain-text VCF file into a DataFrame.

    Column names are taken from the ``#CHROM`` header line. All lines that
    start with ``#`` (meta-information and the header itself) are skipped
    when the records are parsed.

    Parameters
    ----------
    vcf_path : str or path-like
        Path to an uncompressed VCF file.

    Returns
    -------
    pandas.DataFrame
        One row per variant record and one column per header field
        (``#CHROM``, ``POS``, ..., followed by the sample columns).

    Raises
    ------
    ValueError
        If the file contains no ``#CHROM`` header line.
    """
    vcf_names = None
    # The context manager closes the file; no explicit close() is needed.
    with open(vcf_path, "rt") as ifile:
        for line in ifile:
            if line.startswith("#CHROM"):
                # Strip the line terminator before splitting, otherwise the
                # last sample column is named with a trailing newline.
                vcf_names = line.rstrip("\r\n").split('\t')
                break
    if vcf_names is None:
        raise ValueError("No '#CHROM' header line found in VCF file: {}".format(vcf_path))
    # sep=r'\s+' is the non-deprecated equivalent of delim_whitespace=True.
    data = pd.read_csv(vcf_path, comment='#', sep=r'\s+', header=None, names=vcf_names)
    return data

In [92]:
# Load genotype data after implementing GATK, then show its
# (rows, columns) shape and preview the first records
ori_geno = read_vcf('../data/obesity/imputation_output/biallelic_snps.vcf')
print(ori_geno.shape)
ori_geno.head()

(625, 148)


Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SRR6996662,SRR6996663,SRR6996664,SRR6996665,SRR6996666,SRR6996667,SRR6996668,SRR6996669,SRR6996670,SRR6996671,SRR6996672,SRR6996673,SRR6996674,SRR6996675,SRR6996676,SRR6996677,SRR6996678,SRR6996679,SRR6996680,SRR6996681,SRR6996682,SRR6996683,SRR6996684,SRR6996685,SRR6996686,SRR6996687,SRR6996688,SRR6996689,SRR6996690,SRR6996691,SRR6996692,SRR6996693,SRR6996694,SRR6996695,SRR6996696,SRR6996697,SRR6996698,SRR6996699,SRR6996700,SRR6996701,SRR6996702,SRR6996703,SRR6996704,SRR6996705,SRR6996706,SRR6996707,SRR6996708,SRR6996709,SRR6996710,SRR6996711,SRR6996712,SRR6996713,SRR6996714,SRR6996715,SRR6996716,SRR6996717,SRR6996718,SRR6996719,SRR6996720,SRR6996721,SRR6996722,SRR6996723,SRR6996724,SRR6996725,SRR6996726,SRR6996727,SRR6996728,SRR6996729,SRR6996730,SRR6996731,SRR6996732,SRR6996733,SRR6996734,SRR6996735,SRR6996736,SRR6996737,SRR6996738,SRR6996739,SRR6996740,SRR6996741,SRR6996742,SRR6996743,SRR6996744,SRR6996745,SRR6996746,SRR6996747,SRR6996748,SRR6996749,SRR6996750,SRR6996751,SRR6996752,SRR6996753,SRR6996754,SRR6996755,SRR6996756,SRR6996757,SRR6996758,SRR6996759,SRR6996760,SRR6996761,SRR6996762,SRR6996763,SRR6996764,SRR6996765,SRR6996766,SRR6996767,SRR6996768,SRR6996769,SRR6996770,SRR6996771,SRR6996772,SRR6996773,SRR6996774,SRR6996775,SRR6996776,SRR6996777,SRR6996778,SRR6996779,SRR6996780,SRR6996781,SRR6996782,SRR6996783,SRR6996784,SRR6996785,SRR6996786,SRR6996787,SRR6996788,SRR6996789,SRR6996790,SRR6996791,SRR6996792,SRR6996793,SRR6996794,SRR6996795,SRR6996796,SRR6996797,SRR6996798,SRR6996799,SRR6996800\n
0,chr1,1152303,rs9442380,T,C,159.11,PASS,AC=2;AF=0.25;AN=8;DP=11;ExcessHet=0.3218;FS=0;...,GT:AD:DP:GQ:PL,"./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:1,0:1:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","1/1:0,2:2:6:68,6,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:1,0:1:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","0/0:1,0:1:3:0,3,28","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","0/0:1,0:1:3:0,3,25","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:1,0:1:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:1,0:1:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","0/0:1,0:1:
3:0,3,23","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:1,0:1:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:1,0:1:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0"
1,chr1,4918530,.,A,C,192.98,PASS,AC=2;AF=1;AN=2;DP=2;ExcessHet=3.0103;FS=0;Inbr...,GT:AD:DP:GQ:PL,"./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0",".
/.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","1/1:0,2:2:6:67,6,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0"
2,chr1,10379664,.,T,C,191.98,PASS,AC=2;AF=1;AN=2;DP=2;ExcessHet=3.0103;FS=0;Inbr...,GT:AD:DP:GQ:PL,"./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","
./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","1/1:0,2:2:6:66,6,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0"
3,chr1,18236545,rs6660120,A,G,152.39,PASS,AC=2;AF=1;AN=2;DP=12;ExcessHet=3.0103;FS=0;Inb...,GT:AD:DP:GQ:PL,"./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:2,0:2:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:2,0:2:.:0,0,0","./.:1,0:1:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:1,0:1:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:1,0:1:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:
0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","1/1:0,1:1:3:35,3,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:1,0:1:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:2,0:2:.:0,0,0"
4,chr1,18236600,.,T,C,129.11,PASS,AC=1;AF=0.5;AN=2;BaseQRankSum=-0.674;DP=2;Exce...,GT:AD:DP:GQ:PL,"./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","
./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","0/1:1,1:2:23:23,0,31","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0","./.:0,0:0:.:0,0,0"


In [93]:
# Load an imputed target data for training
geno = read_vcf('../data/obesity/biallelic_for_training.vcf')

# Add rsID to the data.
# The ID column of the imputed file is used as a CHROM_POS_REF_ALT key, so
# index the original calls by the same key to look their rsIDs up.
new_ID = (
    ori_geno['#CHROM'].astype(str) + '_' + ori_geno['POS'].astype(str)
    + '_' + ori_geno['REF'] + '_' + ori_geno['ALT']
)
ori_geno = ori_geno.assign(INDEX=new_ID).set_index('INDEX')
geno = geno.set_index('ID')
geno_id = list(geno.index)

# reindex() returns NaN for keys that are absent from the original calls;
# .loc with a label list would raise KeyError for them instead.
rsid = ori_geno['ID'].reindex(geno_id)

# Replace variants without rsID by CHROM_POS_REF_ALT.
# "Without rsID" means the lookup missed (NaN) or the original file stored
# the '.' placeholder. Only the looked-up ID is inspected, so a missing value
# in some other column can never overwrite a valid rsID.
no_rsid = (rsid.isna() | rsid.eq('.')).to_numpy()
geno['ID'] = np.where(no_rsid, geno.index.to_numpy(), rsid.to_numpy())

geno = geno.set_index('ID')
print(geno.shape)
geno.head()

(135, 147)


Unnamed: 0_level_0,#CHROM,POS,REF,ALT,QUAL,FILTER,INFO,FORMAT,SRR6996662,SRR6996663,SRR6996664,SRR6996665,SRR6996666,SRR6996667,SRR6996668,SRR6996669,SRR6996670,SRR6996671,SRR6996672,SRR6996673,SRR6996674,SRR6996675,SRR6996676,SRR6996677,SRR6996678,SRR6996679,SRR6996680,SRR6996681,SRR6996682,SRR6996683,SRR6996684,SRR6996685,SRR6996686,SRR6996687,SRR6996688,SRR6996689,SRR6996690,SRR6996691,SRR6996692,SRR6996693,SRR6996694,SRR6996695,SRR6996696,SRR6996697,SRR6996698,SRR6996699,SRR6996700,SRR6996701,SRR6996702,SRR6996703,SRR6996704,SRR6996705,SRR6996706,SRR6996707,SRR6996708,SRR6996709,SRR6996710,SRR6996711,SRR6996712,SRR6996713,SRR6996714,SRR6996715,SRR6996716,SRR6996717,SRR6996718,SRR6996719,SRR6996720,SRR6996721,SRR6996722,SRR6996723,SRR6996724,SRR6996725,SRR6996726,SRR6996727,SRR6996728,SRR6996729,SRR6996730,SRR6996731,SRR6996732,SRR6996733,SRR6996734,SRR6996735,SRR6996736,SRR6996737,SRR6996738,SRR6996739,SRR6996740,SRR6996741,SRR6996742,SRR6996743,SRR6996744,SRR6996745,SRR6996746,SRR6996747,SRR6996748,SRR6996749,SRR6996750,SRR6996751,SRR6996752,SRR6996753,SRR6996754,SRR6996755,SRR6996756,SRR6996757,SRR6996758,SRR6996759,SRR6996760,SRR6996761,SRR6996762,SRR6996763,SRR6996764,SRR6996765,SRR6996766,SRR6996767,SRR6996768,SRR6996769,SRR6996770,SRR6996771,SRR6996772,SRR6996773,SRR6996774,SRR6996775,SRR6996776,SRR6996777,SRR6996778,SRR6996779,SRR6996780,SRR6996781,SRR6996782,SRR6996783,SRR6996784,SRR6996785,SRR6996786,SRR6996787,SRR6996788,SRR6996789,SRR6996790,SRR6996791,SRR6996792,SRR6996793,SRR6996794,SRR6996795,SRR6996796,SRR6996797,SRR6996798,SRR6996799,SRR6996800\n
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1
rs6660120,chr1,18236545,A,G,.,PASS,AF=1,GT,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1
rs1993709,chr1,72372846,A,G,.,PASS,AF=0.989209,GT,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,0|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,0|1,1|1,0|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1
rs4322186,chr1,72372878,G,T,.,PASS,AF=0.0611511,GT,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|1,0|0,0|0,0|0,0|1,0|0,0|0,0|0,0|0,0|0,0|0,0|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|1,1|0,0|0,1|1,1|0,0|0,0|0,0|0,0|0,0|1,0|0,0|1,0|0,0|0,0|0,0|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,1|1,0|0,0|0,0|0,0|0,0|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|1,0|0,0|0,0|0,0|0
rs2797098,chr1,72559191,G,A,.,PASS,AF=1,GT,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1
rs6424534,chr1,73198389,T,G,.,PASS,AF=1,GT,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1,1|1


In [94]:
# Encode bi-allelic genotypes by 0, 1, 2
def get_genotype(data):
    """Return a copy of ``data`` with phased genotype strings recoded as integers.

    ``'0|0'`` becomes 0, ``'0|1'`` and ``'1|0'`` become 1, and ``'1|1'``
    becomes 2. Every other value is left as it is, and the input frame is
    not modified.
    """
    codes = {'0|0': 0, '0|1': 1, '1|0': 1, '1|1': 2}
    encoded = data.replace(list(codes.keys()), list(codes.values()))
    return encoded
# Encode the genotypes, keep everything from the 9th column onward
# (i.e. skip the first 8 columns) and transpose the result
data = get_genotype(geno).iloc[:, 8:].T
print(data.shape)
data.head()

(139, 135)


ID,rs6660120,rs1993709,rs4322186,rs2797098,rs6424534,rs524219,rs2091510,rs12021920,rs984222,rs543874,rs10913469,rs2605100,rs28370041,rs939582,rs2867124,rs11127485,rs6548238,rs6745262,rs4854343,rs4854344,rs4854345,rs117879374,chr2_24935057_C_T,rs887913,rs17049906,rs702910,rs1026401,rs187205547,rs140440965,rs6784615,rs6445486,rs4687621,rs6795735,rs186470486,rs13078807,rs9816226,rs10938397,rs34749134,rs7687015,rs1800592,rs2099013,rs2112347,rs115287355,rs1042713,rs6556096,rs6861681,rs206936,rs6905288,rs987237,chr6_119696197_A_G,rs3814478,rs2948267,rs4727419,rs4729494,rs1404957,rs1639519,rs17150703,rs13278851,rs516175,rs76951015,rs4994,rs4735692,rs187224014,rs7079574,rs10508504,rs151269914,rs7081678,rs88032,rs4929949,rs112376394,rs3817334,rs142529541,rs7120548,rs10838738,rs660339,rs886538,rs657538,rs7138803,rs11109072,rs4500588,rs17089410,rs1957893,rs79090609,rs1957894,rs78628000,rs2531995,rs10163244,rs11860225,rs147340331,rs10500331,rs11643187,rs1946127,rs11077019,rs79774828,rs9937775,rs8049439,rs142922470,rs117430099,rs117696251,rs1421085,rs1558902,rs193231717,rs141115189,rs7190492,rs8044769,rs74884695,rs9929160,rs1424234,rs1424233,rs2317124,rs4783224,rs1993831,rs9890502,rs7503807,rs4798616,rs4797306,rs1805081,rs558699,rs73447671,rs571312,rs79912123,rs17782313,rs476828,rs78022601,rs12970134,rs146856830,rs79224334,rs75223771,rs4450508,rs29941,rs442398,rs435942,rs11084753,rs6081769,rs13041126
SRR6996662,2,2,0,2,2,2,2,0,1,1,0,1,0,2,2,2,2,2,2,2,2,0,0,1,0,2,2,0,0,2,2,2,1,0,0,1,1,0,1,1,2,1,0,1,2,0,2,2,0,2,0,2,2,2,2,2,2,2,1,0,0,2,0,2,0,0,0,0,0,0,1,0,1,1,2,2,2,0,0,2,0,2,0,2,0,0,0,0,0,0,0,1,1,0,2,0,0,0,0,1,1,0,0,2,2,0,2,1,1,2,2,2,1,0,2,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,0
SRR6996663,2,2,0,2,2,2,2,0,0,0,1,1,0,2,2,2,2,1,2,2,2,0,0,0,0,2,2,0,0,2,2,2,1,0,0,1,1,0,0,2,2,1,0,1,2,0,1,1,0,2,0,2,2,2,2,2,0,0,0,0,1,2,0,2,0,0,0,0,1,0,1,0,0,1,2,2,2,0,0,2,1,1,0,1,0,2,1,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,1,1,0,2,0,0,2,2,2,0,0,2,2,0,2,0,1,0,1,1,0,1,0,0,0,1,1,1,1,1,2,0
SRR6996664,2,2,0,2,2,2,2,0,2,1,1,2,0,2,2,2,2,0,2,2,2,0,0,1,0,2,2,0,0,2,2,2,2,0,0,1,0,0,1,1,2,1,0,0,2,0,2,1,2,2,0,2,2,2,2,2,0,0,1,0,0,1,0,2,1,0,0,0,1,0,2,0,0,2,1,2,2,1,0,2,0,2,0,2,0,1,1,0,0,1,0,2,1,0,2,1,0,0,0,0,0,0,0,1,1,0,2,1,1,2,2,2,1,1,2,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1
SRR6996665,2,2,0,2,2,2,2,1,0,0,0,2,0,2,2,2,2,0,2,2,2,1,0,1,0,2,2,0,0,2,2,2,2,0,0,2,0,0,0,2,2,0,0,1,2,0,0,1,0,2,0,2,2,2,2,2,1,1,1,0,1,1,0,2,0,0,0,0,1,0,1,0,1,1,2,2,2,2,1,2,1,1,0,1,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,2,1,0,2,1,1,2,2,2,0,1,2,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1
SRR6996666,2,2,0,2,2,2,2,1,1,1,0,1,0,2,2,2,2,1,2,2,2,0,0,0,1,2,2,0,0,2,2,2,2,0,0,1,0,0,0,0,2,1,0,0,2,0,1,1,0,2,0,2,2,2,2,2,1,1,2,1,0,1,0,2,0,0,0,0,0,0,1,0,0,1,0,2,2,0,0,2,0,1,1,1,0,0,1,0,0,0,0,1,0,2,2,0,0,0,0,1,1,0,0,2,2,0,2,1,1,2,2,2,1,1,2,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0


In [95]:
# Match the metadata to the genotype rows by run accession instead of by
# position, so a different row order in the run table cannot mislabel samples.
# Whitespace is stripped because a sample name taken from the end of the VCF
# header line may carry a trailing newline.
sample_ids = pd.Index(data.index).astype(str).str.strip()
meta = obs.set_index('Run')
if not meta.index.is_unique:
    raise ValueError("Duplicate 'Run' accessions in the metadata table")
unmatched = sample_ids.difference(meta.index)
if len(unmatched) > 0:
    raise ValueError("Samples without metadata: {}".format(list(unmatched)))
meta = meta.loc[sample_ids]

# Collect sex data: male -> 0, female -> 1, anything else -> -9
se = meta['sex'].map({'male': 0, 'female': 1}).fillna(-9).astype(int).tolist()

# Collect phenotype data: 'OBL' in the sample name -> 0, 'OBH' -> 1.
# A name carrying neither or both tags cannot be labelled, so fail loudly
# rather than shifting or duplicating labels.
sample_names = meta['Sample Name'].astype(str)
is_obl = sample_names.str.contains('OBL', regex=False)
is_obh = sample_names.str.contains('OBH', regex=False)
unlabelled = sample_names[is_obl == is_obh]
if len(unlabelled) > 0:
    raise ValueError("Cannot derive a phenotype for: {}".format(unlabelled.tolist()))
pheno = is_obh.astype(int).tolist()

data['SEX'] = se
data['PHENOTYPE'] = pheno

In [96]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# Every column except the last is a feature; the last column is the target.
features, target = data.iloc[:, :-1], data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)
# Write each split to its own CSV file
for split, csv_path in (
    (X_train, '../data/obesity/X_train.csv'),
    (X_test, '../data/obesity/X_test.csv'),
    (y_train, '../data/obesity/y_train.csv'),
    (y_test, '../data/obesity/y_test.csv'),
):
    split.to_csv(csv_path)