In [26]:
import pandas as pd
import itertools
from scipy import stats

### Read in the categorical phenotypes

In [27]:
pop_file = pd.read_csv("pop.phe", sep='\s+', header=None)
pop_file.head()


Unnamed: 0,0,1,2
0,HCB181,1,1
1,HCB182,1,1
2,HCB183,1,1
3,HCB184,1,1
4,HCB185,1,1


In [28]:
# getting 
# IDs are the first column
# The phenotype encodings are the second column.
ids = pop_file[0]
pop =  pop_file[2]

### Duplicate SNP IDs
Since each SNP has to values (one for each chromosome copy) we need to provide each SNP ID with a extra character so we know what SNP we are looking at. Here we add _1 and _2 to each SNP ID.

In [29]:
snp_ids = pd.read_csv("hapmap1.map", sep='\s+', header=None)[1]
snp_ids_diploid = [(x+"_1", x+"_2") for x in list(snp_ids)]
snp_ids_diploid = list(itertools.chain(*snp_ids_diploid))
snp_ids_diploid[0:10]

['rs6681049_1',
 'rs6681049_2',
 'rs4074137_1',
 'rs4074137_2',
 'rs7540009_1',
 'rs7540009_2',
 'rs1891905_1',
 'rs1891905_2',
 'rs9729550_1',
 'rs9729550_2']

In [30]:
genotype = pd.read_csv("hapmap1.ped", sep="\s+", header=None)
genotype['populations'] = pop
genotype.iloc[:10, :10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,HCB181,1,0,0,1,1,2,2,2,2
1,HCB182,1,0,0,1,1,2,2,1,2
2,HCB183,1,0,0,1,2,2,2,1,2
3,HCB184,1,0,0,1,1,2,2,1,2
4,HCB185,1,0,0,1,1,2,2,1,2
5,HCB186,1,0,0,1,1,2,2,2,2
6,HCB187,1,0,0,1,1,2,2,2,2
7,HCB188,1,0,0,1,1,2,2,1,2
8,HCB189,1,0,0,1,1,2,2,2,2
9,HCB190,1,0,0,1,1,2,2,2,2


We now need to adjust the column names to include the individual IDS, phenotypes, and the SNP IDs we just made.

In [31]:
col_names = ["individual", "X1", "X2", "X3", "X4", "phenotype"] + snp_ids_diploid + ['populations']

In [32]:
genotype.columns = col_names
genotype.iloc[:10, :10]

Unnamed: 0,individual,X1,X2,X3,X4,phenotype,rs6681049_1,rs6681049_2,rs4074137_1,rs4074137_2
0,HCB181,1,0,0,1,1,2,2,2,2
1,HCB182,1,0,0,1,1,2,2,1,2
2,HCB183,1,0,0,1,2,2,2,1,2
3,HCB184,1,0,0,1,1,2,2,1,2
4,HCB185,1,0,0,1,1,2,2,1,2
5,HCB186,1,0,0,1,1,2,2,2,2
6,HCB187,1,0,0,1,1,2,2,2,2
7,HCB188,1,0,0,1,1,2,2,1,2
8,HCB189,1,0,0,1,1,2,2,2,2
9,HCB190,1,0,0,1,1,2,2,2,2


### Rearrange the columns and set the index of the dataframe to be the individual IDs.

In [33]:
#### Here re-sort the column headers so that:
# 1- Individual is your index
# 2- pop and phenotype are first columns, followed by snp_ids
# 3- remove the x columns if there are not needed or rename them to their proper names if they are needed.
re_col_names = ['individual', 'populations', 'phenotype', 'X1', 'X2', 'X3', 'X4'] + snp_ids_diploid
genotype = genotype[re_col_names]
genotype = genotype.set_index('individual')
genotype.head()

Unnamed: 0_level_0,populations,phenotype,X1,X2,X3,X4,rs6681049_1,rs6681049_2,rs4074137_1,rs4074137_2,...,rs2269380_1,rs2269380_2,rs6151412_1,rs6151412_2,rs11912064_1,rs11912064_2,rs1001469_1,rs1001469_2,rs756638_1,rs756638_2
individual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCB181,1,1,1,0,0,1,2,2,2,2,...,2,2,2,2,2,2,1,2,2,2
HCB182,1,1,1,0,0,1,2,2,1,2,...,2,2,2,2,2,2,1,2,2,2
HCB183,1,2,1,0,0,1,2,2,1,2,...,1,1,2,2,2,2,2,2,1,2
HCB184,1,1,1,0,0,1,2,2,1,2,...,1,2,2,2,2,2,1,1,1,2
HCB185,1,1,1,0,0,1,2,2,1,2,...,1,2,2,2,2,2,1,2,2,2


The above restructring allows us to access individual data by using the individual's ID along with the attribute we want. For example, if we want to access individual HCB181 and look at there phenotype, population, and allele for SNP rs6681049 we could do the following:

In [34]:
genotype.loc['HCB181'][['populations', 'phenotype', 'rs6681049_1', 'rs6681049_2']]

populations    1
phenotype      1
rs6681049_1    2
rs6681049_2    2
Name: HCB181, dtype: int64

Here we can see that individual HCB181 belongs to population 1, has phenotype code 1, and the SNP has a homozygous genotype for the second variant.

In [35]:
from collections import Counter

We can now filter by each population and obtain SNP frequencies. 

In [36]:
# Filter by population 1 and 2
genotypes_pop_1 = genotype[genotype['populations'] == 1]
genotypes_pop_2 = genotype[genotype['populations'] == 2]

In [37]:
# Define SNP ID we want to look at
snp_id = 'rs6681049'

# Grab SNP values within each population
snp_genotype_pop_1 = genotypes_pop_1[snp_id+"_1"].astype('str') +  + genotypes_pop_1[snp_id+"_2"].astype('str')
snp_genotype_pop_2 = genotypes_pop_2[snp_id+"_1"].astype('str') +  + genotypes_pop_2[snp_id+"_2"].astype('str')

In [38]:
# Count the occurrences of each allele and obtain frequencies
genotype_counts_pop_1 = Counter(list(snp_genotype_pop_1))
genotype_counts_pop_2 = Counter(list(snp_genotype_pop_2))

count_aa_pop_1 = genotype_counts_pop_1['22']
count_bb_pop_1 = genotype_counts_pop_1['11']
count_ab_pop_1 = genotype_counts_pop_1['12']

count_aa_pop_2 = genotype_counts_pop_2['22']
count_bb_pop_2 = genotype_counts_pop_2['11']
count_ab_pop_2 = genotype_counts_pop_2['12']

In [39]:
# We then define the number of occurrences by twice the total since we are counting by pairs of alleles
total = count_aa_pop_1 + count_bb_pop_1 + count_ab_pop_1
A1_frequency = (2*count_aa_pop_1 + count_ab_pop_1) / (2*total)
A2_frequency = (2*count_bb_pop_1 + count_ab_pop_1) / (2*total)
print('Population 1 Allele 1 freq: ', A1_frequency)
print('Population 1 Allele 2 freq: ', A2_frequency)

Population 1 Allele 1 freq:  0.7666666666666667
Population 1 Allele 2 freq:  0.23333333333333334


In [40]:
A1_frequency = (2*count_aa_pop_2 + count_ab_pop_2) / (2*total)
A2_frequency = (2*count_bb_pop_2 + count_ab_pop_2) / (2*total)
print('Population 2 Allele 1 freq: ', A1_frequency)
print('Population 2 Allele 2 freq: ', A2_frequency)

Population 2 Allele 1 freq:  0.7888888888888889
Population 2 Allele 2 freq:  0.18888888888888888


When considering the frequency of SNPs we only consider the minor allele frequency which is the frequency of the allele that occurs less. Thus, in population 1 we see a minor allele frequency of 0.233 and for population 2 we see a minor allele frequency of 0.188 for this particular SNP.

get_by_pheno will filter the given dataframe by the two phenotypes in the dataset: affected and unaffected.

In [41]:
def get_by_pheno(df, snp_id):
    GT_uf = df[df['phenotype'] == 1][[snp_id+'_1', snp_id+'_2']]
    GT_af = df[df['phenotype'] == 2][[snp_id+'_1', snp_id+'_2']]
    return GT_uf, GT_af
    
GT_uf, GT_af = get_by_pheno(genotype, snp_id)

get_snp_genotypes takes in the two separate populations and obtains the genotypes as a string for each individual.

In [42]:
def get_snp_genotypes(unaffected, affected, snp_id):
    snp_GT_uf = unaffected[snp_id+"_1"].astype('str') + + unaffected[snp_id+"_2"].astype('str')
    snp_GT_af = affected[snp_id+"_1"].astype('str') +  + affected[snp_id+"_2"].astype('str')
    return snp_GT_uf, snp_GT_af

snp_GT_uf, snp_GT_af = get_snp_genotypes(GT_uf, GT_af, snp_id)

get_allele_counts functions each look at the genotypes of the unaffected population and the affected population. The counts of each allele is obtained by counting the genotypes and then counting the occurrences of each allele in each genotype.

In [43]:
def get_allele_counts_uf(uf_genotypes):
    genotype_counts_uf = Counter(list(uf_genotypes))
    count_aa_GT_uf = genotype_counts_uf['22']
    count_bb_GT_uf = genotype_counts_uf['11']
    count_ab_GT_uf = genotype_counts_uf['12']
    uf_total = 2*(count_aa_GT_uf + count_bb_GT_uf + count_ab_GT_uf)
    
    return count_aa_GT_uf, count_bb_GT_uf, count_ab_GT_uf, uf_total

def get_allele_counts_af(af_genotypes):
    genotype_counts_af = Counter(list(af_genotypes))
    count_aa_GT_af = genotype_counts_af['22']
    count_bb_GT_af = genotype_counts_af['11']
    count_ab_GT_af = genotype_counts_af['12']
    af_total = 2*(count_aa_GT_af + count_bb_GT_af + count_ab_GT_af)
    
    return count_aa_GT_af, count_bb_GT_af, count_ab_GT_af, af_total

count_aa_GT_uf, count_bb_GT_uf, count_ab_GT_uf, uf_total = get_allele_counts_uf(snp_GT_uf)
count_aa_GT_af, count_bb_GT_af, count_ab_GT_af, af_total = get_allele_counts_af(snp_GT_af)

get_freqs counts the alleles of the counted genotypes and returns frequencies.

In [44]:
def get_freqs(a, b, ab, total):
    a_count = (2*a + ab) / total
    b_count = (2*b + ab) / total
    return a_count, b_count

total = uf_total + af_total
a_uf, b_uf = get_freqs(count_aa_GT_uf, count_bb_GT_uf, count_ab_GT_uf, total)
a_af, b_af = get_freqs(count_aa_GT_af, count_bb_GT_af, count_ab_GT_af, total)

event_probs gets the probabililities of each possible event (allele 1, allele 2, affected or unaffected).

In [45]:
def event_probs(a_af, b_af, a_uf, b_uf):
    p_aff = a_af + b_af
    p_unaff = a_uf + b_uf
    p_a = a_af + a_uf
    p_b = b_af + b_uf
    return p_aff, p_unaff, p_a, p_b
p_aff, p_unaff, p_a, p_b = event_probs(a_af, b_af, a_uf, b_uf)

get_expected gets the probabilities of the paired events under the assumption of independence.

In [46]:
def get_expected(p_aff, p_unaff, p_a, p_b, total):
    exp_a_af = p_a * p_aff * (total)
    exp_a_uf = p_a * p_unaff * (total)
    exp_b_af = p_b * p_aff * (total)
    exp_b_uf = p_b * p_unaff * (total)
    return exp_a_af, exp_a_uf, exp_b_af, exp_b_uf
exp_a_af, exp_a_uf, exp_b_af, exp_b_uf = get_expected(p_aff, p_unaff, p_a, p_b, total)

get_counts gets the actual counts of each allele.

In [47]:
def get_counts(a_uf, b_uf, a_af, b_af, total):
    a_count_uf = a_uf * total
    a_count_af = a_af * total
    b_count_uf = b_uf * total
    b_count_af = b_af * total
    return a_count_uf, a_count_af, b_count_uf, b_count_af
a_count_uf, a_count_af, b_count_uf, b_count_af = get_counts(a_uf, b_uf, a_af, b_af, total)

When combining the values from above we can do a basic association test to test for significance of this SNP.

In [48]:
# Perform Basic Association Test for rs6681049
# association test
test = stats.chisquare([a_count_uf, b_count_uf, a_count_af, b_count_af], 
                [exp_a_uf, exp_b_uf, exp_a_af, exp_b_af],
                ddof=1,
                axis=0)

# get p-value
print("CHISQ: ", test[0])
print("P-VAL: ", test[1])

CHISQ:  3.066637047163362
P-VAL:  0.21581827959014507


Now we can combine everything from above to compute the significance of each SNP in the dataset.
DO NOT RUN THIS CELL. The file chi_stats is a csv file that contains the computation below. Read that file into a pandas DataFrame.

In [49]:
snp_chi = []
snp_pval = []
for i in range(len(snp_ids)):
    # get the genotypes of the SNP
    genotype_uf, genotype_af = get_by_pheno(genotype, snp_ids[i])
    snp_gt_uf, snp_gt_af = get_snp_genotypes(genotype_uf, genotype_af, snp_ids[i])
    
    # count genotypes and get frequencies
    count_aa_GT_uf, count_bb_GT_uf, count_ab_GT_uf, uf_total = get_allele_counts_uf(snp_gt_uf)
    count_aa_GT_af, count_bb_GT_af, count_ab_GT_af, af_total = get_allele_counts_af(snp_gt_af)
    total = uf_total + af_total
    a_uf, b_uf = get_freqs(count_aa_GT_uf, count_bb_GT_uf, count_ab_GT_uf, total)
    a_af, b_af = get_freqs(count_aa_GT_af, count_bb_GT_af, count_ab_GT_af, total)
    
    # compute probabilities and get expected values
    p_aff, p_unaff, p_a, p_b = event_probs(a_af, b_af, a_uf, b_uf)
    exp_a_af, exp_a_uf, exp_b_af, exp_b_uf = get_expected(p_aff, p_unaff, p_a, p_b, total)
    
    # get actual counts for test
    a_count_uf, a_count_af, b_count_uf, b_count_af = get_counts(a_uf, b_uf, a_af, b_af, total)
    
    # perform test
    test = stats.chisquare([a_count_uf, b_count_uf, a_count_af, b_count_af], 
                [exp_a_uf, exp_b_uf, exp_a_af, exp_b_af],
                ddof=1,
                axis=0)
    
    snp_chi.append(test[0])
    snp_pval.append(test[1])
    if (i % 1000) == 0:
        print(i)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000


This dataframe now contains the SNPs with their chi-square statistics and p-values.

In [59]:
d = {'SNP_ID': snp_ids, 'CHISQ': snp_chi, 'PVAL': snp_pval}
chi_stats = pd.DataFrame(d)
chi_stats.head()

Unnamed: 0,SNP_ID,CHISQ,PVAL
0,rs6681049,3.066637,0.215818
1,rs4074137,0.001919,0.999041
2,rs7540009,,
3,rs1891905,0.015266,0.992396
4,rs9729550,2.630876,0.268357


### Other (adjusted) significance values

### Mean Chi Square, genomic inflation factor

### Genotypic and other association models

### Stratification Analysis
Here we will perform IBS (Inherited By Descent) clustering. Two individuals are IBS if they have identical nucleotide bases for a particular segment.