In [1]:
import simuOpt
simuOpt.setOptions(alleleType='short', quiet=True, numThreads=4)
import simuPOP as sim
import numpy as np
import pandas as pd
import random
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, export_svgs
from scipy import linalg
from saegus import analyze, operators, parameters
np.set_printoptions(suppress=True, precision=5)

In [2]:
# Load the previously saved simuPOP population from disk.
example_pop = sim.loadPopulation('example_pop.pop')

In [3]:
# Register the per-individual information fields used in the rest of the
# notebook: identifier/pedigree fields plus 'g' and 'p' (presumably the
# genotypic and phenotypic values filled in by the saegus operators below —
# confirm against saegus.operators).
example_pop.addInfoFields(['ind_id', 'mother_id', 'father_id', 'g', 'p'])

In [4]:
# Give every individual an ID (simuPOP's tagID writes to 'ind_id' by default).
sim.tagID(example_pop)

In [5]:
# Ask simuPOP for the segregating sites across all loci; the result is stored
# in the population's variables under 'segSites'.
sim.stat(example_pop, numOfSegSites=sim.ALL_AVAIL, vars=['segSites'])

In [6]:
# Locus indices of the segregating sites computed by the previous cell.
segregating_loci = example_pop.dvars().segSites

In [7]:
# Seed Python's `random` module so the QTL draw below (and later draws made
# through `random`, e.g. the random.expovariate allele effects) is repeatable
# under Restart Kernel -> Run All. Change QTL_SEED to obtain a different draw.
QTL_SEED = 1
N_QTL = 20  # number of segregating loci promoted to QTL
random.seed(QTL_SEED)

# Pick N_QTL distinct segregating loci and keep them in ascending locus order.
qtl = sorted(random.sample(segregating_loci, N_QTL))

In [8]:
# saegus helper object used below to build the allele-effect structures.
trait = parameters.Trait()

In [9]:
# Build the allele-effects table for the chosen QTL. random.expovariate and the
# argument 1 are handed through to saegus (presumably the distribution and its
# rate parameter used to draw each effect — confirm against
# saegus.parameters.Trait).
allele_effects_table = trait.construct_allele_effects_table(example_pop, qtl, random.expovariate, 1)

In [10]:
# Convert the table into the array form passed to operators.calculate_g below.
allele_effects_array = trait.construct_ae_array(allele_effects_table, qtl)

In [11]:
# Heritability value handed to calculate_error_variance below.
heritability = 0.7

In [12]:
# NOTE(review): presumably writes each individual's genotypic value into the
# 'g' info field — confirm against saegus.operators.
operators.calculate_g(example_pop, allele_effects_array)

In [13]:
# NOTE(review): presumably derives the error variance implied by `heritability`
# and stores it on the population — confirm.
operators.calculate_error_variance(example_pop, heritability)

In [14]:
# NOTE(review): presumably fills the 'p' info field from 'g' plus random error
# — confirm. These three calls are order-dependent; run them in sequence.
operators.calculate_p(example_pop)

In [None]:
# Analysis helper bound to the population and its segregating loci. 'example'
# is presumably a run-name / file prefix — confirm against saegus.analyze.GWAS.
gwas = analyze.GWAS(example_pop, segregating_loci, 'example')

In [None]:
# Individual labels held by the GWAS object; later cells use them as the row
# index of the structure table and as the genotype column names of the HapMap
# table.
print(gwas.individual_names)

In [None]:
# Presumably writes the simulated trait values to 'example_trait.txt' — confirm
# the output format against saegus.analyze.GWAS.trait_formatter.
gwas.trait_formatter(trait_file_name='example_trait.txt')

In [None]:
# Compute allele frequencies at every locus; simuPOP stores the result in the
# population's variables.
sim.stat(example_pop, alleleFreq=sim.ALL_AVAIL)

In [None]:
# Per-locus allele summary from saegus (presumably built from the allele
# frequencies computed in the previous cell — confirm).
allele_states = analyze.gather_allele_data(example_pop)

In [None]:
# Column 3 is taken to be the minor allele at each locus (assumed from the
# variable name — TODO confirm the column layout of gather_allele_data).
minor_alleles = np.array(allele_states[:, 3], dtype=np.int8)

In [None]:
# Keep only the entries belonging to the segregating loci.
segregating_minor_alleles = minor_alleles[segregating_loci]

In [None]:
# Count matrix as produced by the saegus GWAS helper, stored as int8.
# NOTE(review): a later cell re-allocates `count_matrix` and fills it by hand,
# overwriting this result — presumably the two are meant to agree; confirm.
count_matrix = np.array(gwas.calculate_count_matrix(segregating_minor_alleles, segregating_loci), dtype=np.int8)

In [None]:
# Dimensions of the count matrix.
count_matrix.shape

In [None]:
print(count_matrix)

Background (singular value decomposition, e.g. `u, s, v = np.linalg.svd(a)`): the rows of `v` are the eigenvectors of $a^T a$, and the columns of `u` are the eigenvectors of $a a^T$. For row `i` of `v` and column `i` of `u`, the corresponding eigenvalue is `s[i] ** 2`.

In [None]:
# Allocate the (individuals x segregating loci) matrix that the next cell
# fills with minor-allele counts. The shape is taken from the population and
# the locus list rather than hard-coded, so the notebook keeps working when
# the population or its set of segregating loci changes.
# Note: this replaces any `count_matrix` computed earlier in the notebook.
count_matrix = np.zeros((example_pop.popSize(), len(segregating_loci)), dtype=np.int8)

In [None]:
# Fill one row per individual: for each segregating locus, count how many of
# the individual's two genome copies carry the minor allele (0, 1 or 2).
for row, individual in enumerate(example_pop.individuals()):
    matches_per_copy = [
        (segregating_minor_alleles
         == np.array(individual.genotype(ploidy=homolog), dtype=np.int8)[segregating_loci]
         ).astype(np.int8)
        for homolog in (0, 1)
    ]
    count_matrix[row, :] = matches_per_copy[0] + matches_per_copy[1]

In [None]:
# Mean minor-allele count at each locus (one value per column). The vectorised
# reduction gives the same values as applying np.mean column by column, without
# a Python-level call per locus.
column_means = count_matrix.mean(axis=0)

In [32]:
# Show the per-locus means.
# NOTE(review): this cell carries an out-of-sequence execution count and a
# saved NameError, i.e. it was last run before `column_means` existed in the
# kernel. Re-run the notebook top to bottom to refresh the output.
print(column_means)

NameError: name 'column_means' is not defined

In [None]:
print(count_matrix)

In [None]:
shifted = np.array([count_matrix[:, i] - column_means[i] for i in range(42837)]).T

In [None]:
print(shifted)

In [None]:
P = column_means/2

In [None]:
scale = np.sqrt(P*(1-P))

In [None]:
M = np.matrix(np.array([shifted[:, i] / scale[i] for i in range(42837)]).T)

In [None]:
print(M)

In [None]:
X = (1/42837)*(M * M.T)

In [None]:
print(X)

In [None]:
# X is real and symmetric (M * M^T scaled by a constant), so use the symmetric
# solver: unlike linalg.eig it returns real eigenvalues and orthonormal
# eigenvectors. eigh sorts eigenvalues in ascending order; reverse both outputs
# so that index 0 is the largest eigenvalue, which is what the later cells
# assume when they take components 0 and 1.
# `eigendata` keeps the (eigenvalues, eigenvectors) layout; column k of the
# eigenvector array belongs to eigenvalue k.
ascending_values, ascending_vectors = linalg.eigh(X)
eigendata = (ascending_values[::-1], ascending_vectors[:, ::-1])

In [None]:
eigenvalues = np.array(eigendata[0], dtype=np.float)

In [None]:
eigenvalues

In [None]:
eigenvectors = np.array(eigendata[1], dtype=np.float)

In [None]:
print(eigenvalues)

In [None]:
print(eigenvectors)

In [None]:
sum_eigen_values = np.sum(eigenvalues)

In [None]:
eigenvalues[0]/sum_eigen_values

In [None]:
# Two structure covariates per individual: eigenvectors 0 and 1, each scaled by
# its own eigenvalue, placed side by side as the columns of an (n, 2) array.
structure_covariates = np.column_stack(
    [eigenvalues[k] * eigenvectors[:, k] for k in range(2)]
)

In [None]:
output_matrix = pd.DataFrame(structure_covariates, 
                             index=gwas.individual_names)

In [None]:
with open('example_structure.txt', 'w') as f:
    f.write(structure_header)
    output_matrix.to_csv(f, sep='\t', index=True, header=False)

In [None]:
hapmap_columns = ['rs', 'alleles', 'chrom', 'pos',
                 'strand', 'assembly', 'center',
                 'center', 'protLSID', 'assayLSID',
                 'panelLSID', 'QCode'] + list(gwas.individual_names)

In [None]:
hapmap_columns

In [None]:
hapmap_matrix = pd.DataFrame(columns=hapmap_columns)

In [None]:
hapmap_matrix.rs = segregating_loci

In [None]:
hapmap_matrix.alleles = segregating_minor_alleles

In [None]:
chromosomes = np.array([example_pop.chromLocusPair(locus)[0] + 1
 for locus in segregating_loci], dtype=np.int8)

In [None]:
chromosomes

In [None]:
hapmap_matrix.chrom = chromosomes

In [None]:
hapmap_matrix.chrom

In [None]:
hapmap_matrix.pos = np.arange(42837)

In [None]:
hapmap_matrix.loc[:, 'strand':'QCode'] = np.core.defchararray.array(
    [['NA']*len(hapmap_matrix.pos)]*8).T

In [None]:
hapmap_matrix.loc[:, 'strand':'QCode'] = np.core.defchararray.array([['NA']*42837]*8).T

In [None]:
# Fill each individual's genotype column: at every segregating locus, convert
# the two allele codes to letters via gwas.int_to_snp_conversions and join them
# in sorted order so the two-letter call does not depend on which genome copy
# carried which allele.
for column_idx, individual in enumerate(example_pop.individuals()):
    copy_a = np.array(individual.genotype(ploidy=0))[segregating_loci]
    copy_b = np.array(individual.genotype(ploidy=1))[segregating_loci]

    calls = []
    for allele_a, allele_b in zip(copy_a, copy_b):
        letters = (gwas.int_to_snp_conversions[allele_a] +
                   gwas.int_to_snp_conversions[allele_b])
        calls.append(''.join(sorted(letters)))

    hapmap_matrix.loc[:, gwas.individual_names[column_idx]] = calls

In [None]:
print(np.array(hapmap_matrix))

In [None]:
with open('example_hapmap.txt', 'w') as hmp_file:
    hapmap_matrix.to_csv(hmp_file, sep='\t', index=False)