## Estimating SNP heritability

### Set up the environment

In [1]:
%matplotlib inline
# Silence FutureWarning messages so they do not clutter the cell outputs below.
# The filter is installed before the remaining imports are executed.
from warnings import simplefilter 
simplefilter(action='ignore', category=FutureWarning)

import sys
import numpy as np
import scipy as sp
import scipy.stats as st
import pylab as pl
import pandas as pd
import h5py
# Seed NumPy's global random number generator so that any stochastic step in
# the notebook is repeatable. `np.random` is called directly because
# `sp.random` was only a re-export of `numpy.random` and is deprecated in
# newer SciPy releases.
np.random.seed(0)

from limix.stats import linear_kinship, gower_norm
from limix.vardec import VarianceDecomposition
import limix.util as lmx_util


### Load the data
Load the *Arabidopsis* data, which are stored in an HDF5 file.

To process your own data, use the limix command line binary (see [here](http://nbviewer.jupyter.org/github/limix/limix-tutorials/blob/master/preprocessing_QC/loading_files.ipynb) for an example).

In [2]:
# Put the parent directory on the import path so the tutorial's shared `data`
# helper module can be imported; the append must happen before the import.
sys.path.append('./..')
import data as tutorial_data
# Ask the helper for the 'arab107' data set; the returned value is passed to
# h5py.File() in the next cell.
file_name = tutorial_data.get_file('arab107')

### Set up the data object
The HDF5 file holds both the genotype and phenotype data.

In [3]:
# Open the HDF5 container read-only. The handle `f` is deliberately left open:
# the genotype cell further down reads from it as well.
f = h5py.File(file_name, 'r')

# Phenotype columns analysed in this notebook.
phenotype_names = ['5_FT10','6_FT16','7_FT22']

pheno_group = f['phenotype']
# Column labels are run through np.char.decode before use; the sample IDs are
# used as the row index unchanged.
pheno_columns = np.char.decode(pheno_group['col_header']['phenotype_ID'][:])
pheno_samples = pheno_group['row_header']['sample_ID'][:]
all_pheno_df = pd.DataFrame(pheno_group['matrix'][:],
                            index=pheno_samples,
                            columns=pheno_columns)

# Restrict the full phenotype table to the columns of interest.
pheno_df = all_pheno_df[phenotype_names]

In [4]:
# Preview the first rows of the selected phenotype columns.
pheno_df.head()

Unnamed: 0,5_FT10,6_FT16,7_FT22
5837,57.0,50.0,43.0
6008,60.0,41.0,24.0
6009,98.0,151.0,250.0
6016,75.0,101.0,113.0
6040,71.0,92.0,87.0


In [5]:
# Summary statistics per phenotype; the differing counts reveal missing values.
pheno_df.describe()

Unnamed: 0,5_FT10,6_FT16,7_FT22
count,194.0,193.0,193.0
mean,63.969072,64.647237,74.719689
std,17.821556,40.496864,71.747296
min,41.0,26.0,23.3
25%,49.0,41.0,30.0
50%,59.25,50.25,44.0
75%,71.0,70.0,75.0
max,121.0,252.0,250.0


In [6]:
geno_group = f['genotype']

# Keep only every 10th SNP. The same stride is applied to the chromosome
# labels, the positions and the matrix columns so that they stay aligned.
chromosomes = geno_group['col_header']['chrom'][::10]
positions = geno_group['col_header']['pos'][::10]
geno_samples = geno_group['row_header']['sample_ID'][:]
geno_thinned = geno_group['matrix'][:, ::10]

# NOTE(review): columns are labelled by position only (`chromosomes` is not
# used here), so labels may repeat across chromosomes - confirm if that matters.
geno_df = pd.DataFrame(geno_thinned,
                       index=geno_samples,
                       columns=positions,
                       dtype='float64')

In [7]:
# Preview the first rows of the thinned genotype matrix (samples x SNP positions).
geno_df.head()

Unnamed: 0,657,7601,13045,25365,31926,41427,48118,55684,62259,69311,...,26931730,26934779,26937502,26941289,26945443,26951228,26957649,26964341,26969880,26973598
6122,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
6121,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6116,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
6115,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [8]:
# Flowering phenotypes: name of the second selected phenotype column.
phenotype_ID = pheno_df.columns[1]

# Drop every sample (row) that has a missing value in any phenotype, then keep
# the sample IDs that are present in both the genotype and phenotype tables.
filtered_pheno_df = pheno_df.dropna(axis=0, how='any')
sample_idx = geno_df.index.intersection(filtered_pheno_df.index)

# Print a summary of the filtered phenotype table.
filtered_pheno_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 5837 to 100000
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   5_FT10  192 non-null    float64
 1   6_FT16  192 non-null    float64
 2   7_FT22  192 non-null    float64
dtypes: float64(3)
memory usage: 6.0 KB


In [9]:
# Align phenotypes and genotypes on the shared samples, in the same order.
pheno_matched = filtered_pheno_df.loc[sample_idx]
snps = geno_df.loc[sample_idx]

# Presumably rank-transforms each phenotype column to a standard normal
# distribution - confirm against the limix documentation.
phenotype_std = lmx_util.preprocess.rankStandardizeNormal(pheno_matched.values)

# N samples, S SNPs, P phenotypes.
N, S = snps.shape
P = pheno_df.shape[1]
print("loaded %d samples, %d phenotypes, %s snps" % (N, P, S))

loaded 192 samples, 3 phenotypes, 21456 snps


### Use the SNPs to estimate sample similarity/relatedness (also known as kinship)

In [10]:
# Sample-by-sample relatedness (kinship) matrix computed from the SNP matrix.
# `linear_kinship` and `gower_norm` are imported together with the other limix
# imports in the setup cell at the top of the notebook.
sample_relatedness_unnormalized = linear_kinship(snps.values)

# Apply limix's Gower normalisation before the matrix is used as the
# covariance of the genetic random effect.
sample_relatedness = gower_norm(sample_relatedness_unnormalized)

100%|██████████| 100/100 [00:00<00:00, 2117.57it/s]


In [11]:
# Inspect the top-left 5x5 corner of the normalised relatedness matrix.
sample_relatedness[0:5, 0:5]

array([[ 1.08575548, -0.00220991,  0.01914943, -0.0065966 , -0.02605002],
       [-0.00220991,  0.98639869, -0.01216375, -0.01691818, -0.01916793],
       [ 0.01914943, -0.01216375,  1.04626177, -0.0221134 ,  0.01371893],
       [-0.0065966 , -0.01691818, -0.0221134 ,  1.31148388, -0.01127501],
       [-0.02605002, -0.01916793,  0.01371893, -0.01127501,  0.93731567]])

### Estimating heritability


In [12]:
# Fit one variance-component model per phenotype and report the fraction of
# variance attributed to the genetic (kinship) term and to the noise term.
for idx, pheno_name in enumerate(phenotype_names):
    print("Estimating the pseudo/narrow-sense heritability of: " + pheno_name)
    # Column `idx` of the standardised phenotype matrix matches `pheno_name`
    # because both follow the order of `phenotype_names`.
    y = phenotype_std[:,idx]

    vc = VarianceDecomposition(y)
    vc.addFixedEffect()                        # called with defaults; presumably an intercept - confirm
    vc.addRandomEffect(K=sample_relatedness)   # genetic term, covariance given by the kinship matrix
    vc.addRandomEffect(is_noise=True)          # residual noise term
    vc.optimize()

    # Normalise the variance components to fractions of the total variance.
    # The division is done out of place so the array handed back by the model
    # is not modified, in case it aliases the model's internal state.
    _var = vc.getVarianceComps().ravel()
    _var = _var / _var.sum()
    # Assumes components are reported in the order the random effects were
    # added above (genetic first, noise second).
    print('genetic: %.2f, noise: %.2f' % (_var[0],_var[1]))

Estimating the pseudo/narrow-sense heritability of: 5_FT10
genetic: 0.90, noise: 0.10
Estimating the pseudo/narrow-sense heritability of: 6_FT16
genetic: 0.97, noise: 0.03
Estimating the pseudo/narrow-sense heritability of: 7_FT22
genetic: 1.00, noise: 0.00


Although flowering time is believed to be highly heritable, these estimates (90–100%) are likely to be overestimates. Note that SNP-heritability estimates also capture confounding due to population structure, which can inflate them.