# Obesity Data Preprocessing

## Import packages

In [17]:
!pip install bed-reader



In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from bed_reader import to_bed, tmp_path, open_bed

## Meta data consideration

In [19]:
# Lift the column cap so wide DataFrames are rendered in full
pd.options.display.max_columns = None

In [20]:
# Read the run metadata table (parsed with read_csv's default comma separator)
obs = pd.read_csv(filepath_or_buffer='../data/obesity/SraRunTable.txt')

In [21]:
# Preview the first five metadata rows
obs.head(n=5)

Unnamed: 0,Run,Age,Assay Type,AssemblyName,AvgSpotLen,Bases,BIOMATERIAL_PROVIDER,BioProject,BioSample,BioSampleModel,Bytes,Center Name,Consent,DATASTORE filetype,DATASTORE provider,DATASTORE region,Experiment,Instrument,Isolate,Library Name,LibraryLayout,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,Sample Name,sex,SRA Study,tissue
0,SRR6996662,46.619178,AMPLICON,GCA_000001405.13,162,16736251,"Chang\, SC",PRJNA449974,SAMN08924187,Human,11018904,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931452,Ion Torrent PGM,CGMH,OBL_067,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_067,male,SRP139885,Blood
1,SRR6996663,47.221918,AMPLICON,GCA_000001405.13,164,12417372,"Chang\, SC",PRJNA449974,SAMN08924186,Human,8309201,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931451,Ion Torrent PGM,CGMH,OBL_066,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_066,male,SRP139885,Blood
2,SRR6996664,57.441096,AMPLICON,GCA_000001405.13,153,15245589,"Chang\, SC",PRJNA449974,SAMN08924185,Human,10191254,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931450,Ion Torrent PGM,CGMH,OBL_065,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_065,male,SRP139885,Blood
3,SRR6996665,49.950685,AMPLICON,GCA_000001405.13,156,22546458,"Chang\, SC",PRJNA449974,SAMN08924184,Human,14776552,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931449,Ion Torrent PGM,CGMH,OBL_064,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_064,male,SRP139885,Blood
4,SRR6996666,50.906849,AMPLICON,GCA_000001405.13,155,20378605,"Chang\, SC",PRJNA449974,SAMN08924183,Human,13410438,CHANGSC137'S SHARED SUBMISSIONS,public,"bam,sra","gs,ncbi,s3","gs.US,ncbi.public,s3.us-east-1",SRX3931448,Ion Torrent PGM,CGMH,OBL_063,SINGLE,PCR,GENOMIC,Homo sapiens,ION_TORRENT,2018-04-17T00:00:00Z,OBL_063,male,SRP139885,Blood


In [22]:
# Tally library names tagged OBL (nonobesity) and OBH (obesity).
# A name carrying both tags is counted as OBL only.
lib_name = obs['Library Name']
obl = sum(1 for name in lib_name if 'OBL' in name)
obh = sum(1 for name in lib_name if 'OBL' not in name and 'OBH' in name)
print('nonobesity, obl:', obl, '; obisity, obh:', obh)

nonobesity, obl: 64 ; obisity, obh: 75


## Write NGS genotype-phenotype data to PLINK

In [23]:
# Load genotype data after implementing GATK.
# The file is whitespace-delimited and has no header row; the separator is a
# raw string so that "\s" is passed to the regex engine untouched instead of
# being treated as an (invalid) Python string escape.
geno = pd.read_csv('../data/obesity/geno_snps', sep=r'\s+', header=None)

In [24]:
# Preview the first five rows of the raw genotype table
geno.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143
0,chr1,1152303,rs9442380,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/0,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/0,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/0,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
1,chr1,4918530,.,A,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
2,chr1,10379664,.,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
3,chr1,18236545,rs6660120,A,G,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
4,chr1,18236600,.,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.


In [25]:
# Name the columns: five variant-annotation fields followed by one column per run accession
cl = ['CHR', 'POS', 'rsID', 'Allele_1', 'Allele_2', *obs['Run']]
df = geno.set_axis(cl, axis='columns')
df.head()

Unnamed: 0,CHR,POS,rsID,Allele_1,Allele_2,SRR6996662,SRR6996663,SRR6996664,SRR6996665,SRR6996666,SRR6996667,SRR6996668,SRR6996669,SRR6996670,SRR6996671,SRR6996672,SRR6996673,SRR6996674,SRR6996675,SRR6996676,SRR6996677,SRR6996678,SRR6996679,SRR6996680,SRR6996681,SRR6996682,SRR6996683,SRR6996684,SRR6996685,SRR6996686,SRR6996687,SRR6996688,SRR6996689,SRR6996690,SRR6996691,SRR6996692,SRR6996693,SRR6996694,SRR6996695,SRR6996696,SRR6996697,SRR6996698,SRR6996699,SRR6996700,SRR6996701,SRR6996702,SRR6996703,SRR6996704,SRR6996705,SRR6996706,SRR6996707,SRR6996708,SRR6996709,SRR6996710,SRR6996711,SRR6996712,SRR6996713,SRR6996714,SRR6996715,SRR6996716,SRR6996717,SRR6996718,SRR6996719,SRR6996720,SRR6996721,SRR6996722,SRR6996723,SRR6996724,SRR6996725,SRR6996726,SRR6996727,SRR6996728,SRR6996729,SRR6996730,SRR6996731,SRR6996732,SRR6996733,SRR6996734,SRR6996735,SRR6996736,SRR6996737,SRR6996738,SRR6996739,SRR6996740,SRR6996741,SRR6996742,SRR6996743,SRR6996744,SRR6996745,SRR6996746,SRR6996747,SRR6996748,SRR6996749,SRR6996750,SRR6996751,SRR6996752,SRR6996753,SRR6996754,SRR6996755,SRR6996756,SRR6996757,SRR6996758,SRR6996759,SRR6996760,SRR6996761,SRR6996762,SRR6996763,SRR6996764,SRR6996765,SRR6996766,SRR6996767,SRR6996768,SRR6996769,SRR6996770,SRR6996771,SRR6996772,SRR6996773,SRR6996774,SRR6996775,SRR6996776,SRR6996777,SRR6996778,SRR6996779,SRR6996780,SRR6996781,SRR6996782,SRR6996783,SRR6996784,SRR6996785,SRR6996786,SRR6996787,SRR6996788,SRR6996789,SRR6996790,SRR6996791,SRR6996792,SRR6996793,SRR6996794,SRR6996795,SRR6996796,SRR6996797,SRR6996798,SRR6996799,SRR6996800
0,chr1,1152303,rs9442380,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/0,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/0,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/0,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
1,chr1,4918530,.,A,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
2,chr1,10379664,.,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
3,chr1,18236545,rs6660120,A,G,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,1/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.
4,chr1,18236600,.,T,C,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,0/1,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.,./.


In [26]:
# Encode bi-allelic genotypes by 0, 1, 2
def get_genotype(data):
    """Encode bi-allelic genotype calls as counts of the '1' allele.

    The first five columns of ``data`` are carried over unchanged; every
    column from position 5 onwards is treated as a genotype column whose
    cells are VCF-style calls, unphased ("0/1") or phased ("0|1").

    Encoding:
        "./.", ".|."  -> NaN (missing call)
        "0/0", "0|0"  -> 0
        "0/1", "0|1"  -> 1
        "1/1", "1|1"  -> 2

    Rows containing any other value in a genotype column (for example a
    multi-allelic call such as "1/2") are dropped entirely.

    Parameters
    ----------
    data : pandas.DataFrame
        Variant table. It is NOT modified, so the function can be re-run on
        the same frame and always gives the same result.

    Returns
    -------
    pandas.DataFrame
        New frame with the retained rows (original index labels preserved),
        the first five columns as in ``data`` and the genotype columns as
        floats (0, 1, 2 or NaN).
    """
    genotype_codes = {
        "./.": np.nan, ".|.": np.nan,
        "0/0": 0, "0|0": 0,
        "0/1": 1, "0|1": 1,
        "1/1": 2, "1|1": 2,
    }

    # Work on a flattened copy of the genotype cells so the whole table is
    # encoded in one vectorised pass instead of one .iloc access per cell.
    calls = data.iloc[:, 5:].to_numpy(dtype=object)
    flat_calls = pd.Series(calls.ravel(), dtype=object)

    # A mapped value of NaN is ambiguous (missing call vs. unknown token),
    # so recognised tokens are tracked separately from the encoded values.
    recognised = flat_calls.isin(list(genotype_codes)).to_numpy().reshape(calls.shape)
    encoded = flat_calls.map(genotype_codes).to_numpy(dtype=float).reshape(calls.shape)

    # Keep only rows in which every genotype cell was a recognised token.
    # Selection is positional, so it is correct for any index labels.
    keep = recognised.all(axis=1)

    annotations = data.iloc[keep, :5].reset_index(drop=True)
    genotypes = pd.DataFrame(encoded[keep], columns=data.columns[5:])
    df = pd.concat([annotations, genotypes], axis=1)
    df.index = data.index[keep]
    return df

In [27]:
# Variant table with the genotype columns encoded numerically
data = get_genotype(data=df)

In [28]:
# Collect sex data: 1 = male, 2 = female, anything else is written as -9
# NOTE(review): PLINK documents 0 as the unknown-sex code — confirm -9 is intended.
sex_codes = {'male': 1, 'female': 2}
se = [sex_codes.get(x, -9) for x in obs['sex']]

# Collect phenotype data: OBL -> 1, OBH -> 2, neither -> -9.
# Each sample gets exactly one code so that the list stays aligned with the
# sample ids below; skipping an unlabelled sample (or adding two codes for a
# doubly-labelled one) would shift every following phenotype onto the wrong sample.
pheno = [
    1 if 'OBL' in x else 2 if 'OBH' in x else -9
    for x in obs['Sample Name']
]

# Collect rsID and idd
rsid = list(data['rsID'])
idd = list(obs['Run'])
n = len(idd)
l = len(rsid)

# Write NGS data to PLINK files
# Genotype columns are transposed so rows are samples and columns are variants.
val = data.iloc[:, 5:].T.values.astype(np.float32)
# NOTE(review): val holds copies of the '1' allele of each call, while to_bed
# is understood to count allele_1 by default (count_A1=True) — confirm that
# Allele_1/Allele_2 are passed in the matching order.
properties = {
   "fid": idd,
   "iid": idd,
   "father": [0]*n,
   "mother": [0]*n,
   "sex": se,
   "pheno": pheno,
   "chromosome": list(data['CHR']),
   "sid": rsid,
   "cm_position": [0]*l,
   "bp_position": list(data['POS']),
   "allele_1": list(data['Allele_1']),
   "allele_2": list(data['Allele_2']),
}
to_bed("../data/obesity/obs_ngs.bed", val, properties=properties)

## Subsequent data preprocessing

#### Quality control ([refer to README](../README.md))

cd ../data/obesity

plink \
    --bfile obs_ngs \
    --maf 0.05 \
    --hwe 1e-6 \
    --geno 0.1 \
    --write-snplist \
    --make-bed \
    --out obs_ngs.QC

In [30]:
# Load bed files after QC
# open_bed returns a handle on the QC-filtered .bed file; read() pulls its
# genotype values into an in-memory array.
bed_qc = open_bed('../data/obesity/obs_ngs.QC.bed')
val_qc = bed_qc.read()
# Last expression of the cell, so the array is displayed as the cell output
val_qc

array([[0., 0., 0., ..., 1., 0., 2.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       ...,
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 2., 0., 1.],
       [1., 0., 0., ..., 0., 2., 1.]], dtype=float32)

In [31]:
# Report the matrix dimensions and the distinct values it contains
print('Size of data: ', val_qc.shape)
print('Elements of data: ', np.unique(val_qc))

Size of data:  (139, 135)
Elements of data:  [ 0.  1.  2. nan]


In [32]:
# Load the bim file after QC.
# Whitespace-delimited, no header row; the separator is a raw string so that
# "\s" reaches the regex engine instead of being read as a Python string escape.
ngs_bim = pd.read_csv('../data/obesity/obs_ngs.QC.bim', sep=r'\s+', header=None)
ngs_bim.head()

Unnamed: 0,0,1,2,3,4,5
0,1,rs11208659,0,65513597,T,C
1,1,rs3101337,0,72285451,T,C
2,1,rs3101336,0,72285502,C,T
3,1,rs9425089,0,72299399,A,C
4,1,rs2568958,0,72299433,A,G


In [33]:
# Write the second .bim column (the SNP identifiers) to a plain list file,
# one per line, without header or index
snp_ids = ngs_bim.iloc[:, 1]
snp_ids.to_csv('../data/obesity/rsID.filename', header=None, index=False)

## Prepare data for training model

Convert the PLINK files to a single VCF file:
plink --bfile obs_ngs.QC --recode vcf --out obs_ngs.finish (refer to the [README](../README.md))

In [60]:
# Read a vcf file
def read_vcf(vcf_path):
    """Load the variant records of a VCF file into a DataFrame.

    The column names are taken from the tab-separated "#CHROM" header line;
    every other line starting with "#" is skipped. Record lines are split on
    runs of whitespace.

    Parameters
    ----------
    vcf_path : str or path-like
        Location of the uncompressed VCF file.

    Returns
    -------
    pandas.DataFrame
        One row per variant record, one column per header field.

    Raises
    ------
    ValueError
        If the file contains no "#CHROM" header line.
    """
    vcf_names = None
    with open(vcf_path, "rt") as ifile:
        for line in ifile:
            if line.startswith("#CHROM"):
                # Strip the line terminator first, otherwise it ends up
                # inside the name of the last sample column.
                vcf_names = line.rstrip("\r\n").split("\t")
                break
    if vcf_names is None:
        raise ValueError(f"no '#CHROM' header line found in {vcf_path}")
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True.
    data = pd.read_csv(vcf_path, comment='#', sep=r'\s+', header=None, names=vcf_names)
    return data

In [61]:
# Parse the VCF exported after QC and preview its first five records
obs_data = read_vcf(vcf_path='../data/obesity/obs_ngs.finish.vcf')
obs_data.head(n=5)

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,SRR6996662_SRR6996662,SRR6996663_SRR6996663,SRR6996664_SRR6996664,SRR6996665_SRR6996665,SRR6996666_SRR6996666,SRR6996667_SRR6996667,SRR6996668_SRR6996668,SRR6996669_SRR6996669,SRR6996670_SRR6996670,SRR6996671_SRR6996671,SRR6996672_SRR6996672,SRR6996673_SRR6996673,SRR6996674_SRR6996674,SRR6996675_SRR6996675,SRR6996676_SRR6996676,SRR6996677_SRR6996677,SRR6996678_SRR6996678,SRR6996679_SRR6996679,SRR6996680_SRR6996680,SRR6996681_SRR6996681,SRR6996682_SRR6996682,SRR6996683_SRR6996683,SRR6996684_SRR6996684,SRR6996685_SRR6996685,SRR6996686_SRR6996686,SRR6996687_SRR6996687,SRR6996688_SRR6996688,SRR6996689_SRR6996689,SRR6996690_SRR6996690,SRR6996691_SRR6996691,SRR6996692_SRR6996692,SRR6996693_SRR6996693,SRR6996694_SRR6996694,SRR6996695_SRR6996695,SRR6996696_SRR6996696,SRR6996697_SRR6996697,SRR6996698_SRR6996698,SRR6996699_SRR6996699,SRR6996700_SRR6996700,SRR6996701_SRR6996701,SRR6996702_SRR6996702,SRR6996703_SRR6996703,SRR6996704_SRR6996704,SRR6996705_SRR6996705,SRR6996706_SRR6996706,SRR6996707_SRR6996707,SRR6996708_SRR6996708,SRR6996709_SRR6996709,SRR6996710_SRR6996710,SRR6996711_SRR6996711,SRR6996712_SRR6996712,SRR6996713_SRR6996713,SRR6996714_SRR6996714,SRR6996715_SRR6996715,SRR6996716_SRR6996716,SRR6996717_SRR6996717,SRR6996718_SRR6996718,SRR6996719_SRR6996719,SRR6996720_SRR6996720,SRR6996721_SRR6996721,SRR6996722_SRR6996722,SRR6996723_SRR6996723,SRR6996724_SRR6996724,SRR6996725_SRR6996725,SRR6996726_SRR6996726,SRR6996727_SRR6996727,SRR6996728_SRR6996728,SRR6996729_SRR6996729,SRR6996730_SRR6996730,SRR6996731_SRR6996731,SRR6996732_SRR6996732,SRR6996733_SRR6996733,SRR6996734_SRR6996734,SRR6996735_SRR6996735,SRR6996736_SRR6996736,SRR6996737_SRR6996737,SRR6996738_SRR6996738,SRR6996739_SRR6996739,SRR6996740_SRR6996740,SRR6996741_SRR6996741,SRR6996742_SRR6996742,SRR6996743_SRR6996743,SRR6996744_SRR6996744,SRR6996745_SRR6996745,SRR6996746_SRR6996746,SRR6996747_SRR6996747,SRR6996748_SRR6996748,SRR6996749_SRR6996749,SRR6996
750_SRR6996750,SRR6996751_SRR6996751,SRR6996752_SRR6996752,SRR6996753_SRR6996753,SRR6996754_SRR6996754,SRR6996755_SRR6996755,SRR6996756_SRR6996756,SRR6996757_SRR6996757,SRR6996758_SRR6996758,SRR6996759_SRR6996759,SRR6996760_SRR6996760,SRR6996761_SRR6996761,SRR6996762_SRR6996762,SRR6996763_SRR6996763,SRR6996764_SRR6996764,SRR6996765_SRR6996765,SRR6996766_SRR6996766,SRR6996767_SRR6996767,SRR6996768_SRR6996768,SRR6996769_SRR6996769,SRR6996770_SRR6996770,SRR6996771_SRR6996771,SRR6996772_SRR6996772,SRR6996773_SRR6996773,SRR6996774_SRR6996774,SRR6996775_SRR6996775,SRR6996776_SRR6996776,SRR6996777_SRR6996777,SRR6996778_SRR6996778,SRR6996779_SRR6996779,SRR6996780_SRR6996780,SRR6996781_SRR6996781,SRR6996782_SRR6996782,SRR6996783_SRR6996783,SRR6996784_SRR6996784,SRR6996785_SRR6996785,SRR6996786_SRR6996786,SRR6996787_SRR6996787,SRR6996788_SRR6996788,SRR6996789_SRR6996789,SRR6996790_SRR6996790,SRR6996791_SRR6996791,SRR6996792_SRR6996792,SRR6996793_SRR6996793,SRR6996794_SRR6996794,SRR6996795_SRR6996795,SRR6996796_SRR6996796,SRR6996797_SRR6996797,SRR6996798_SRR6996798,SRR6996799_SRR6996799,SRR6996800_SRR6996800\n
0,1,65513597,rs11208659,C,T,.,.,PR,GT,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1
1,1,72285451,rs3101337,C,T,.,.,PR,GT,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/1,0/0,1/1,0/1,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0,0/0,0/0,0/1,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0,0/1,0/1,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0
2,1,72285502,rs3101336,T,C,.,.,PR,GT,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/1,0/0,1/1,0/1,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0,0/0,0/0,0/1,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0,0/1,0/1,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0
3,1,72299399,rs9425089,C,A,.,.,PR,GT,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/1,0/0,1/1,0/1,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0
4,1,72299433,rs2568958,G,A,.,.,PR,GT,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/1,0/0,1/1,0/1,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0,0/0,0/0,0/1,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0,0/1,0/1,0/1,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/0,0/0,0/0


In [62]:
# Reconstruct data: one row per sample, one column per SNP ID.
# Columns from position 9 onwards hold the per-sample calls; column 2 holds
# the SNP IDs. An explicit copy is taken so nothing is assigned onto a slice
# of obs_data, and a new name is used so the metadata frame `obs` loaded
# earlier is not overwritten.
calls_by_snp = obs_data.iloc[:, 9:].copy()
calls_by_snp.index = pd.Index(obs_data.iloc[:, 2], name='ID')

# Sample column names look like "<id>_<id>"; keep the text before the first underscore
sample_ids = [e.split("_")[0] for e in calls_by_snp.columns]
calls_by_sample = calls_by_snp.T
calls_by_sample.index = pd.Index(sample_ids, name='sample')

# Encode genotype to 0, 1, 2
# Note that missing calls ('./.') are encoded as 0, the same as '0/0'.
# Values outside the table are passed through unchanged.
genotype_codes = {'0/0': 0, './.': 0, '0/1': 1, '1/1': 2}
pre_obs = calls_by_sample.apply(
    lambda calls: calls.map(lambda g: genotype_codes.get(g, g))
)
pre_obs.head()

ID,rs11208659,rs3101337,rs3101336,rs9425089,rs2568958,rs2815752,rs10789336,rs4322186,rs1514176,rs1514175,rs12410097,rs1555543,rs12021920,rs984222,rs1011731,rs543874,rs10913469,rs2605100,rs939582,rs2867124,rs6548238,rs6745262,rs4854343,rs4854344,rs4854345,rs7561317,rs713586,rs887913,rs17049906,rs2943650,rs4684846,rs1822825,rs6795735,rs7647305,rs9816226,rs10938397,rs7687015,rs1800592,rs2112347,rs6861681,rs1294421,rs206936,rs6905288,rs987237,rs2800710,rs9491696,rs1055144,rs17150703,rs13278851,rs13252210,rs516175,rs545854,rs4994,rs4735692,rs58104805,rs10968576,rs10508504,rs4929949,rs4074134,rs4923461,rs925946,rs10501087,rs6265,rs10767664,rs3817334,rs7120548,rs10838738,rs564343,rs660339,rs5443,rs718314,rs1948149,rs7138803,rs1443512,rs4759309,rs11109072,rs7316835,rs2074356,rs17089410,rs7989336,rs1957893,rs79090609,rs1957894,rs2241423,rs2531995,rs10163244,rs11860225,rs4786083,rs10500331,rs8052357,rs11643187,rs11646906,rs12924838,rs1946127,rs11077019,rs8049439,rs4788102,rs7498665,rs7359397,rs6499640,rs9939973,rs9940128,rs1421085,rs1558902,rs1121980,rs72803680,rs7193144,rs8050136,rs8051591,rs9935401,rs3751812,rs9926289,rs9939609,rs7190492,rs9930501,rs9930506,rs9932754,rs8044769,rs1424233,rs9890502,rs7503807,rs1805081,rs571480,rs571312,rs17782313,rs476828,rs12970134,rs477181,rs502933,rs4450508,rs29941,rs442398,rs11084753,rs13041126,rs4823006
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1
SRR6996662,0,0,0,0,0,0,0,0,1,1,1,0,0,1,0,1,0,1,0,0,0,2,0,0,0,0,2,1,0,0,2,1,1,1,1,1,1,1,1,0,1,0,0,0,0,1,1,2,2,2,1,0,0,0,0,0,0,0,2,2,0,0,0,0,1,1,1,1,2,0,1,0,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,2,0,1,0,1,1,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,2
SRR6996663,0,0,0,0,0,0,0,0,1,1,0,0,0,2,0,0,1,1,0,0,0,1,0,0,0,0,2,0,0,0,0,1,1,1,1,1,0,0,1,0,0,1,1,0,0,1,2,0,0,0,0,2,1,0,0,1,0,1,1,1,0,1,1,1,1,0,1,1,2,1,1,1,0,0,0,0,0,0,1,0,1,0,1,2,2,1,0,0,0,2,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,0,1
SRR6996664,0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,1,1,0,1,1,1,0,0,0,1,2,1,1,1,0,0,1,1,1,0,1,0,0,1,1,0,0,0,2,2,2,2,0,2,0,1,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,1,1,0,1,1,1,0,0,1,0,1,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1
SRR6996665,0,0,0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,2,0,0,2,1,0,1,1,1,1,1,1,1,0,1,1,0,2,0,1,1,2,0,0,0,0,1,1,1,0,2,1,1,0,2,0,0,1,1,2,1,1,1,0,1,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
SRR6996666,0,0,0,0,0,0,0,0,2,2,1,0,1,1,0,1,0,1,0,0,0,1,0,0,0,0,2,0,1,0,2,0,0,1,1,0,0,2,1,0,0,1,1,0,1,1,0,1,1,2,2,0,0,1,1,0,0,0,2,2,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,1,2,0,1,1,1,1,1,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [65]:
# Read fam file after QC (whitespace-delimited, no header row).
# The separator is a raw string so that "\s" reaches the regex engine instead
# of being read as a Python string escape.
obs_fam = pd.read_csv('../data/obesity/obs_ngs.QC.fam', header=None, sep=r'\s+')

# Add sex and phenotype from the fifth and sixth fam columns.
# Values are attached by position, not by sample id.
# NOTE(review): assumes the .fam rows are in the same sample order as the rows of pre_obs — confirm.
pre_obs['Sex'] = list(obs_fam.iloc[:, 4])
pre_obs['Phenotype'] = list(obs_fam.iloc[:, 5])

In [66]:
# Hold out 20% of the samples for testing. The last column is the target and
# every other column is a feature; the seed is fixed so the split is repeatable.
features = pre_obs.iloc[:, :-1]
labels = pre_obs.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Persist each partition as CSV
for partition, destination in (
    (X_train, '../data/obesity/X_train.csv'),
    (X_test, '../data/obesity/X_test.csv'),
    (y_train, '../data/obesity/y_train.csv'),
    (y_test, '../data/obesity/y_test.csv'),
):
    partition.to_csv(destination)