In [3]:
import pandas as pd
import numpy as np
import random, msprime, pyslim, tskit

In [22]:
# Load the tree sequence written by SLiM and simplify it. (Author's note: simplify
# removes the founder individual in each subpopulation and thus the support for
# fixed mutations.)
ts = tskit.load("./davide_intern/people/davidec/davide-intern/out.trees").simplify()

# Collect the nodes (chromosomes) of every female individual, i.e. every
# individual whose metadata has sex == 0.
# (Legacy alternative left by the author: pyslim.decode_individual(ind.metadata).sex == 0)
female_nodes = [
    node
    for ind in ts.individuals()
    if ind.metadata['sex'] == 0
    for node in ind.nodes
]


In [24]:
# Get the sample ids among the female chromosomes.
# The draw uses its own seeded generator so that Restart & Run All always picks
# the same chromosomes, without touching the global `random` state.
SAMPLE_SEED = 42      # change to obtain a different, still reproducible, sample
N_SAMPLE_NODES = 8    # number of chromosomes (nodes) to draw
sample_nodes = random.Random(SAMPLE_SEED).sample(female_nodes, k=N_SAMPLE_NODES)

In [25]:
# Overlay mutations on the simplified tree sequence.
# The rate is 1.5e-8 scaled by 31 (author's note: for humans, args.generationtime 31).
# A fixed seed makes the overlaid mutations identical on every run of the notebook.
MUTATION_SEED = 42
mutated_ts = msprime.mutate(ts, rate=1.5e-8*31, random_seed=MUTATION_SEED)

In [28]:
# Record the position of every site of the mutated tree sequence, in iteration order.
positions = []
for site in mutated_ts.sites():
    positions.append(site.position)

In [44]:
# Get the genotypes of the sampled chromosomes at every variant site.
# `isolated_as_missing=True` is the current spelling of the deprecated
# `impute_missing_data=False`: per the tskit documentation, isolated samples are
# then reported as missing data instead of being assigned the ancestral state.
variants = mutated_ts.variants(samples=sample_nodes, isolated_as_missing=True)
# One row per variant, holding that variant's genotype array for `sample_nodes`.
table = np.array([var.genotypes for var in variants])

In [51]:
# Build the sites frame: the genotype table cast to int8, plus a trailing 'pos'
# column holding the site positions.
df = pd.DataFrame(table, dtype='int8').assign(pos=positions)
# Store the sites frame as an HDF5 table, overwriting any existing file
# (author's note: args.sites_file output file).
df.to_hdf(
    './davide_intern/people/davidec/davide-intern/output.h5',
    key='df',
    format='table',
    mode='w',
)

In [None]:
# Write a VCF with one haploid genotype column per sampled chromosome.
genotypes_df = pd.DataFrame(table, dtype='int8')
# Keep only the rows in which at least one sampled chromosome has a non-zero genotype.
polymorphic = (genotypes_df != 0).any(axis=1)
# Explicit copy: the fixed VCF columns are inserted into this frame below.
vcf_df = genotypes_df.loc[polymorphic].copy()

# Re-derive the rounded site positions from the tree sequence instead of reading
# `positions`: this cell rebinds `positions` to the filtered Series, so reading it
# here made a second run of the cell fail on a boolean mask of the wrong length.
# NOTE(review): tskit coordinates start at 0 while VCF POS is 1-based; the plain
# rounding of the original is kept -- confirm whether a +1 shift is wanted.
all_positions = pd.Series([round(site.position) for site in mutated_ts.sites()])
positions = all_positions[polymorphic]

# The nine fixed VCF columns, in file order. Missing QUAL/FILTER values are
# written as '.', the VCF missing-value marker, rather than as empty fields.
fixed_columns = [
    ('#CHROM', '1'),
    ('POS', positions.values),
    ('ID', list(range(len(positions)))),
    ('REF', 'A'),
    ('ALT', 'T'),
    ('QUAL', '.'),
    ('FILTER', '.'),
    ('INFO', 'AA=A'),
    ('FORMAT', 'GT'),
]
for offset, (column_name, column_value) in enumerate(fixed_columns):
    vcf_df.insert(offset, column_name, column_value)

# A VCF must start with a '##fileformat' line; the INFO/FORMAT keys used above
# are declared as well. newline='' lets pandas control the line endings.
with open('./davide_intern/people/davidec/davide-intern/output.vcf', 'w', newline='') as vcf_file:
    vcf_file.write('##fileformat=VCFv4.2\n')
    vcf_file.write('##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral allele">\n')
    vcf_file.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
    vcf_df.to_csv(vcf_file, sep='\t', index=False)

In [56]:
# Display `positions`; after the VCF cell above has run, this is the Series of
# rounded positions for the rows kept in `vcf_df`.
positions

1      1535
2      2423
4      5509
6      6733
9     12011
10    12080
12    13393
15    18634
17    19214
23    21330
24    24230
26    26373
28    27905
dtype: int64

In [58]:
# Write a VCF where the haplotypes are artificially treated as unphased diploid genotypes.
# Take an explicit copy of the nine fixed VCF columns: the genotype columns are
# added to this frame below, and adding columns to a bare `iloc` slice of
# `vcf_df` raises SettingWithCopyWarning.
geno_vcf_df = vcf_df.iloc[:, 0:9].copy()
# Empty QUAL/FILTER fields are not valid VCF; use the '.' missing-value marker.
geno_vcf_df[['QUAL', 'FILTER']] = geno_vcf_df[['QUAL', 'FILTER']].replace('', '.')

# Pair consecutive haplotype columns (1st with 2nd, 3rd with 4th, ...) and join
# each pair as an unphased genotype 'a/b'. With an odd number of haplotype
# columns, zip drops the last one.
samples = vcf_df.columns.values[9:].tolist()
for i, (a, b) in enumerate(zip(samples[0::2], samples[1::2])):
    geno_vcf_df[i] = vcf_df[a].astype('str') + '/' + vcf_df[b].astype('str')

# A VCF must start with a '##fileformat' line; the INFO/FORMAT keys carried over
# from `vcf_df` are declared as well. newline='' lets pandas control line endings.
with open('./davide_intern/people/davidec/davide-intern/output_geno_file.vcf', 'w', newline='') as geno_vcf_file:
    geno_vcf_file.write('##fileformat=VCFv4.2\n')
    geno_vcf_file.write('##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral allele">\n')
    geno_vcf_file.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
    geno_vcf_df.to_csv(geno_vcf_file, sep='\t', index=False)