# Principal Component Analysis

In [1]:
import hail as hl
# Start Hail with a 10 GB Spark driver heap and a user-owned scratch directory.
hl.init(
    tmp_dir='/home/olavur/tmp',
    spark_conf={'spark.driver.memory': '10g'},
)

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-848846b477-48ks9:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/qc/hail-20210513-1426-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Render Bokeh figures inline in the notebook when `show(...)` is called.
output_notebook()

In [3]:
import pandas as pd

In [4]:
# Root directory of this experiment; input paths in the notebook are built from it.
BASE_DIR = "/home/olavur/experiments/2020-11-13_fargen1_exome_analysis"
# Directory of non-FarGen resources.
RESOURCES_DIR = "/non-fargen/resources"

## Load FarGen exome data

Load the filtered, high-quality variants.

In [5]:
# Location of the filtered matrix table inside the experiment directory.
high_quality_mt_path = BASE_DIR + '/data/mt/high_quality_variants.mt/'
fargen_mt = hl.read_matrix_table(high_quality_mt_path)

In [6]:
# MatrixTable.count() yields (rows, columns), unpacked here as (variants, samples).
n_variants, n_samples = fargen_mt.count()
print('Number of variants: ', n_variants, sep='')
print('Number of samples: ', n_samples, sep='')

Number of variants: 1332013
Number of samples: 474


## Impute sex

We impute the sex of the samples by computing the inbreeding coefficient ($F$) on the X chromosome. This inbreeding coefficient is calculated as $F = \frac{O - E}{N - E}$, where $O$ is the observed number of homozygotes, $E$ is the expected number of homozygotes, and $N$ is the number of non-missing genotype calls. Each variant with a non-missing call contributes $1 - 2 f (1 - f)$ to $E$, where $f$ is the minor-allele frequency of that variant, so $E = \sum_i \left(1 - 2 f_i (1 - f_i)\right)$ summed over those variants.

NOTE: the sex imputation method requires diallelic sites.

In [7]:
# Keep only sites with exactly two alleles, as required by the sex imputation.
has_two_alleles = hl.len(fargen_mt.alleles) == 2
fargen_mt = fargen_mt.filter_rows(has_two_alleles)

In [8]:
# First pass with Hail's default thresholds; its F statistics are plotted next
# to choose cohort-specific cut-offs.
genotype_calls = fargen_mt.GT
imputed_sex_ht = hl.impute_sex(genotype_calls)

2021-05-13 14:26:22 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'


Below we've plotted the inbreeding coefficient, and there is a quite clear clustering of individuals.

In [9]:
# Histogram of the per-sample F statistic, enlarged for readability.
hist_title = 'Inbreeding coefficient (F) computed on the X chromosome'
p = hl.plot.histogram(imputed_sex_ht.f_stat, title=hist_title)
p.plot_height = 500
p.plot_width = 800
show(p)

Based on the plot above, we define new $F$ thresholds for male and female, and do the imputation again.

In [10]:
# F cut-offs chosen from the histogram: samples with F below the first are called
# female, samples with F above the second are called male.
female_f_threshold = 0.3
male_f_threshold = 0.4
imputed_sex_ht = hl.impute_sex(
    fargen_mt.GT,
    female_threshold=female_f_threshold,
    male_threshold=male_f_threshold,
)
imputed_sex_ht.show(n=10)

s,is_female,f_stat,n_called,expected_homs,observed_homs
str,bool,float64,int64,float64,int64
"""FN000001""",False,0.717,38690,37400.0,38334
"""FN000002""",False,0.66,38690,37400.0,38262
"""FN000009""",False,0.627,38690,37400.0,38220
"""FN000011""",True,-0.0956,38690,37400.0,37310
"""FN000012""",True,-0.0217,38690,37400.0,37403
"""FN000014""",True,-0.0178,38690,37400.0,37408
"""FN000015""",False,0.619,38690,37400.0,38210
"""FN000016""",True,0.189,38690,37400.0,37668
"""FN000017""",True,-0.0178,38690,37400.0,37408
"""FN000018""",True,0.0013,38690,37400.0,37432


## Load sex data

In [11]:
# Comma-delimited metadata table; without type imputation every field is read as a string.
sex_metadata_path = BASE_DIR + '/data/metadata/fargen_indi-gen.csv'
actual_sex_ht = hl.import_table(sex_metadata_path, delimiter=',')

2021-05-13 14:26:34 Hail: INFO: Reading table without type imputation
  Loading field 'IndividualName' as type str (not specified)
  Loading field 'Gender' as type str (not specified)


In [12]:
# Key the metadata by sample name so it can be joined on the sample ID.
actual_sex_ht = actual_sex_ht.key_by('IndividualName')

In [13]:
# Gender is a string field; '0' is treated as female here
# (assumed encoding — TODO confirm against the metadata source).
gender_is_female = actual_sex_ht.Gender == '0'
actual_sex_ht = actual_sex_ht.annotate(is_female=gender_is_female)

## Compare imputed and actual sex

In [14]:
# Look up each sample's reported sex by sample ID; a sample without a metadata
# row gets a missing value.
reported_sex = actual_sex_ht[imputed_sex_ht.s]
sex_ht = imputed_sex_ht.annotate(is_female_actual=reported_sex.is_female)

In [15]:
# Preview the first ten samples with imputed and reported sex side by side.
sex_ht.show(n=10)

2021-05-13 14:26:37 Hail: INFO: Coerced sorted dataset
2021-05-13 14:26:37 Hail: INFO: Coerced sorted dataset


s,is_female,f_stat,n_called,expected_homs,observed_homs,is_female_actual
str,bool,float64,int64,float64,int64,bool
"""FN000001""",False,0.717,38690,37400.0,38334,False
"""FN000002""",False,0.66,38690,37400.0,38262,False
"""FN000009""",False,0.627,38690,37400.0,38220,False
"""FN000011""",True,-0.0956,38690,37400.0,37310,True
"""FN000012""",True,-0.0217,38690,37400.0,37403,True
"""FN000014""",True,-0.0178,38690,37400.0,37408,True
"""FN000015""",False,0.619,38690,37400.0,38210,False
"""FN000016""",True,0.189,38690,37400.0,37668,True
"""FN000017""",True,-0.0178,38690,37400.0,37408,True
"""FN000018""",True,0.0013,38690,37400.0,37432,True


In [16]:
# Collect both columns with a single Hail action: the lazy pipeline is evaluated
# once, and the imputed and actual values are guaranteed to come from the same rows.
sex_rows = sex_ht.select('is_female', 'is_female_actual').collect()
imputed = pd.Series([row.is_female for row in sex_rows], name='Imputed')
actual = pd.Series([row.is_female_actual for row in sex_rows], name='Actual')

# Calculate confusion matrix. Samples with a missing value in either column are
# left out by pd.crosstab, so the grand total can be smaller than the sample count.
confusion_table = pd.crosstab(imputed, actual, margins=True, margins_name='Sum')

2021-05-13 14:26:40 Hail: INFO: Coerced sorted dataset
2021-05-13 14:26:40 Hail: INFO: Coerced sorted dataset


In [17]:
# Display the imputed-vs-actual confusion matrix, including the 'Sum' margins.
confusion_table

Actual,False,True,Sum
Imputed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,187,4,191
True,6,276,282
Sum,193,280,473


## Inspect incorrectly imputed

As we have seen, the males and females cluster very nicely w.r.t. the inbreeding coefficient, so we can be confident that we are correctly imputing the sex. Below we show the imputation data for the samples where the imputed and actual sex mismatch. Most likely, either these samples have been swapped, or the participants did not report their biological sex in the questionnaires.

In [18]:
# True when imputed and reported sex agree; missing when either value is missing.
sexes_agree = sex_ht.is_female == sex_ht.is_female_actual
sex_ht = sex_ht.annotate(sex_check=sexes_agree)

In [19]:
# Show the samples whose imputed sex contradicts the reported sex.
mismatched_ht = sex_ht.filter(~sex_ht.sex_check)
mismatched_ht.show()

2021-05-13 14:26:43 Hail: INFO: Coerced sorted dataset
2021-05-13 14:26:43 Hail: INFO: Coerced sorted dataset


s,is_female,f_stat,n_called,expected_homs,observed_homs,is_female_actual,sex_check
str,bool,float64,int64,float64,int64,bool,bool
"""FN000187""",True,0.14,38690,37400.0,37607,False,False
"""FN000861""",True,0.0616,38689,37400.0,37507,False,False
"""FN000871""",True,0.0743,38690,37400.0,37524,False,False
"""FN000884""",False,0.74,38687,37400.0,38359,True,False
"""FN000902""",False,0.664,38690,37400.0,38267,True,False
"""FN000909""",True,-0.124,38690,37400.0,37274,False,False
"""FN000940""",True,-0.337,38690,37400.0,37006,False,False
"""FN000957""",False,0.563,38689,37400.0,38139,True,False
"""FN001018""",True,0.0987,38689,37400.0,37554,False,False
"""FN001127""",False,0.752,38690,37400.0,38377,True,False


**NOTE:** There is one sample that we don't have gender data on because they were deleted from FarGen. This is the sample below.

In [20]:
# Samples for which the comparison could not be made (sex_check is missing).
unchecked_ht = sex_ht.filter(hl.is_missing(sex_ht.sex_check))
unchecked_ht.show()

2021-05-13 14:26:45 Hail: INFO: Coerced sorted dataset
2021-05-13 14:26:45 Hail: INFO: Coerced sorted dataset


s,is_female,f_stat,n_called,expected_homs,observed_homs,is_female_actual,sex_check
str,bool,float64,int64,float64,int64,bool,bool
"""FN000781""",False,0.61,38690,37400.0,38199,,
