#### Set up the environment

In [5]:
import os
import numpy as np

#### Define a Bgen reader and read a few values from it (they are a 3D numpy array)

A value of [nan, nan, nan] means the distribution is "missing".

In [2]:
from pysnptools.distreader import Bgen
# Create a reader for example.bgen; printing it shows Bgen('example.bgen').
distreader = Bgen("example.bgen")

print(distreader)
# Slice to the first 2 individuals and first 4 SNPs, then read into memory.
# .val is a 3D array with 3 values per (individual, SNP) pair.
distreader[:2,:4].read().val

Bgen('example.bgen')


array([[[           nan,            nan,            nan],
        [5.06591937e-03, 2.07519974e-03, 9.92858881e-01],
        [9.92095944e-01, 3.05176014e-04, 7.59887952e-03],
        [4.88280947e-03, 2.83812969e-02, 9.66735894e-01]],

       [[2.78023628e-02, 8.63673794e-03, 9.63560899e-01],
        [9.64354910e-03, 2.68554967e-03, 9.87670901e-01],
        [1.22986045e-02, 9.81567363e-01, 6.13403227e-03],
        [9.90448002e-01, 9.27734003e-03, 2.74657970e-04]]])

#### Ask the distreader for the first 5 individual IDs, SNP IDs, and position info.

By default, it fills in the family ID with '0' and concatenates BGEN's SNPID and RSID with a comma.

In [8]:
# iid: one (family id, individual id) pair per individual.
print(distreader.iid[:5])
# sid: one string id per SNP.
print(distreader.sid[:5])
# pos: three numbers per SNP; column 0 is used as the chromosome elsewhere in this
# notebook (the other two are presumably genetic distance and bp position — confirm).
print(distreader.pos[:5])

[['0' 'sample_001']
 ['0' 'sample_002']
 ['0' 'sample_003']
 ['0' 'sample_004']
 ['0' 'sample_005']]
['SNPID_2,RSID_2' 'SNPID_3,RSID_3' 'SNPID_4,RSID_4' 'SNPID_5,RSID_5'
 'SNPID_6,RSID_6']
[[1.e+00 0.e+00 2.e+03]
 [1.e+00 0.e+00 3.e+03]
 [1.e+00 0.e+00 4.e+03]
 [1.e+00 0.e+00 5.e+03]
 [1.e+00 0.e+00 6.e+03]]


#### Convert distribution to expected value, on the fly, with 'as_snp'

In [12]:
# Wrap the distribution reader so reads return one expected value per (individual, SNP).
# NOTE(review): with max_weight=1 the recorded values fall in [0, 1]; confirm the exact
# weighting against the as_snp documentation.
expectedreader = distreader.as_snp(max_weight=1)
print(expectedreader)
# The result is 2D (individuals x SNPs); a missing distribution stays nan.
expectedreader[:2,:4].read().val

Bgen('example.bgen').as_snp()


array([[       nan, 0.99389648, 0.00775147, 0.98092654],
       [0.96787927, 0.98901368, 0.49691771, 0.00491333]])

#### We can also turn any SNP reader (such as 'Bed') into an on-the-fly distribution reader with 'as_dist'.

In [13]:
from pysnptools.snpreader import Bed
# Read SNP values from a Bed file.
snpreader = Bed("all.bed", count_A1=True)
print(snpreader[:2,:4].read().val)
# Wrap the SNP reader as a distribution reader. In the recorded output each value
# 0/1/2 becomes the one-hot triple [1,0,0] / [0,1,0] / [0,0,1].
snptodist = snpreader.as_dist()
print(snptodist)
snptodist[:2,:4].read().val

[[0. 0. 1. 2.]
 [0. 0. 1. 1.]]
Bed('all.bed',count_A1=True).as_dist()


array([[[1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]],

       [[1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.]]])

In [None]:
#!!!cmk If we have qctool installed and an environment variable set, we can write to BGEN.

In [15]:
# Write the distribution data to temp.bgen (16 bits, zlib compression requested).
# Bgen.write returns a reader on the new file, which prints as Bgen('temp.bgen').
tempbgen = Bgen.write('temp.bgen',distreader,bits=16,compression='zlib')
print(tempbgen)

Bgen('temp.bgen')


#### Convert a Bed reader into a distribution reader

In [3]:
# Repeats the as_dist() conversion of `snpreader` shown earlier in this notebook.
snptodist = snpreader.as_dist()
print(snptodist)
# First 2 individuals x first 4 SNPs, as one-hot probability triples.
snptodist[:2,:4].read().val

Bed('all.bed',count_A1=True).as_dist()


array([[[1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]],

       [[1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.]]])

#### Read a few values from the distribution reader. They are a 3D numpy array.

In [5]:
# Read the first 2 individuals x first 4 SNPs from the Bed-derived distribution
# reader into memory. This must be `snptodist`, not the Bgen `distreader`: the
# recorded output below (one-hot triples) and the 'cid...' individual ids later
# read back from sample.dist.memmap are the all.bed data, not example.bgen.
distdata = snptodist[:2,:4].read()
distdata.val

array([[[1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]],

       [[1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.]]])

#### Write the distribution values to a file.

In [6]:
from pysnptools.distreader import DistMemMap
# Write the in-memory distribution to a memory-mapped file; the call's return value
# is displayed as DistMemMap('sample.dist.memmap').
DistMemMap.write('sample.dist.memmap',distdata)

DistMemMap('sample.dist.memmap')

##### Read from the file, listing the individual IDs, the SNP IDs, and the values.

In [7]:
# Reopen the memory-mapped file written above and show its contents.
distmemmap = DistMemMap('sample.dist.memmap')
print(distmemmap.iid)  # (family id, individual id) pairs
print(distmemmap.sid)  # SNP ids
print(distmemmap.val)  # 3D array of probability triples

[['cid0P0' 'cid0P0']
 ['cid1P0' 'cid1P0']]
['snp625_m0_.03m1_.07' 'snp1750_m0_.02m1_.04' 'snp0_m0_.37m1_.24'
 'snp375_m0_.52m1_.68']
[[[1. 0. 0.]
  [1. 0. 0.]
  [0. 1. 0.]
  [0. 0. 1.]]

 [[1. 0. 0.]
  [1. 0. 0.]
  [0. 1. 0.]
  [0. 1. 0.]]]


#### Create an in-memory distribution
* Create via random
* Create a second distribution with the individuals reversed and with every 10th SNP.
* Print the start of the 2nd distribution's individual IDs, SNP IDs, and values.

In [8]:
from pysnptools.distreader import DistData
import numpy as np

# Seed the global NumPy generator so the random values are reproducible.
np.random.seed(0)
iid_count = 100
sid_count = 1000

# Three random weights per (individual, SNP), rescaled so each triple sums to 1.
val = np.random.random((iid_count, sid_count, 3))
val = val / val.sum(axis=2, keepdims=True)

distdata = DistData(
    val=val,
    iid=[('fam0', 'iid' + str(i)) for i in range(iid_count)],
    sid=['sid' + str(s) for s in range(sid_count)],
)

# Reverse the individuals, keep every 10th SNP, and read the selection into memory.
distdata2 = distdata[::-1, ::10].read()

# Show the first three ids of each kind and the matching 3x3 corner of the values.
for preview in (distdata2.iid[:3], distdata2.sid[:3], distdata2.val[:3, :3]):
    print(preview)

[['fam0' 'iid99']
 ['fam0' 'iid98']
 ['fam0' 'iid97']]
['sid0' 'sid10' 'sid20']
[[[0.20081495 0.4737261  0.32545895]
  [0.52343568 0.16311684 0.31344748]
  [0.26296867 0.5548358  0.18219553]]

 [[0.29093073 0.36418333 0.34488594]
  [0.12887058 0.64865356 0.22247586]
  [0.00325359 0.69403215 0.30271426]]

 [[0.02445881 0.66399687 0.31154432]
  [0.33538909 0.35483828 0.30977264]
  [0.40125371 0.25403278 0.34471351]]]


In [10]:
import os
# Show the working directory; the relative example.bgen path used below is resolved against it.
os.getcwd()

'/mnt/d/OneDrive/programs/pysnptools/doc/ipynb'

In [12]:
# test bgen reading
from pysnptools.distreader.bgen import Bgen
# Path to the example BGEN file, relative to the working directory.
bgenfile = '../../pysnptools/examples/example.bgen'
bgen = Bgen(bgenfile)
# shape is (individual count, SNP count).
bgen.shape

(500, 199)

In [14]:
#!!!cmk would be nice to have *MemMap writes know how to work from *Readers and have optional runners
# Copy the BGEN data into a memory-mapped file, requesting float32 values.
memmapfile = 'example32.dist.memmap'
memmap = DistMemMap.write(memmapfile,bgen,dtype=np.float32)

In [15]:
# Compare on-disk sizes in bytes.
bgensize = os.stat(bgenfile).st_size
memmapsize = os.stat(memmapfile).st_size
# Display (bgen bytes, memmap bytes, memmap/bgen ratio).
bgensize,memmapsize,memmapsize/bgensize

(665108, 1223028, 1.8388412107507353)

In [16]:
# Preview the first 5 individuals x first 5 SNPs, with all 3 values of each.
memmap.val[:5,:5,:]

memmap([[[           nan,            nan,            nan],
         [5.06591937e-03, 2.07519974e-03, 9.92858887e-01],
         [9.92095947e-01, 3.05176014e-04, 7.59887975e-03],
         [4.88280971e-03, 2.83812974e-02, 9.66735899e-01],
         [2.92977947e-03, 9.96612430e-01, 4.57778107e-04]],

        [[2.78023630e-02, 8.63673817e-03, 9.63560879e-01],
         [9.64354910e-03, 2.68554967e-03, 9.87670898e-01],
         [1.22986045e-02, 9.81567383e-01, 6.13403227e-03],
         [9.90447998e-01, 9.27734002e-03, 2.74657970e-04],
         [9.93530273e-01, 2.38037063e-03, 4.08936106e-03]],

        [[1.73650365e-02, 4.96841371e-02, 9.32950854e-01],
         [9.79705811e-01, 1.94701962e-02, 8.23974842e-04],
         [4.79125930e-03, 1.10778986e-02, 9.84130859e-01],
         [9.89319146e-01, 3.90613219e-03, 6.77469606e-03],
         [9.84344482e-01, 7.14111375e-03, 8.51440430e-03]],

        [[2.48717945e-02, 9.32830811e-01, 4.22973931e-02],
         [9.81903613e-01, 5.30989561e-03, 1.278650

In [17]:
# Flush the memory map (presumably pushes pending writes to disk — confirm against the DistMemMap docs).
memmap.flush()

#/mnt/f/backup/carlk4d/data/carlk/cachebio/genetics/onemil/id1000000.sid_1000000.seed0.byiid.bychrom/iid620000to630000.chrom1.bed
#F:\backup\carlk4d\data\carlk\cachebio\genetics\onemil\id1000000.sid_1000000.seed0.byiid.bychrom\iid620000to630000.chrom1.bed
209,629,000 bytes
#to 
#UKbio using 8 bits per probability, 
#Qctool conversion with 10K IID and 100K SNPs takes 3000 seconds
(py2) carlk@kadie2:/mnt/m/qctool$ build/release/qctool_v2.0.7 -g /mnt/f/backup/carlk4d/data/carlk/cachebio/genetics/onemil/id1000000.sid_1000000.seed0.byiid.bychrom/iid620000to630000.chrom1.bed -og /mnt/m/big.bgen
    
#-bgen-bits	For use when outputting BGEN files only. Tell QCTOOL to use this number of bits to store each probability.
#-bgen-compression	Specify what compression to use when outputting BGEN files only. This can be "none", "zlib", or "zstd".    

#https://bitbucket.org/gavinband/bgen/wiki/BGEN_in_the_UK_Biobank
8bit and zlib

(py2) carlk@kadie2:/mnt/m/qctool$ build/release/qctool_v2.0.7 -g /mnt/d/deldir/testsnps_1_10_50000_50000/data/chrom8.piece4of5.bed -og /mnt/m/1m.bgen -bgen-bits 8 -bgen-compression zlib
9827000 bytes, 50Kiid, 805 snps, conversion takes 1.5 minutes
5492 bgen

In [25]:
# Path prefixes used to build file names below; the trailing comments give the
# Windows drive-letter spellings.
mtop = '/mnt/m' # r'M:'
dtop = '/mnt/d' # r'D:'

In [20]:
from pysnptools.distreader.bgen import Bgen
# Open the BGEN file produced by the qctool conversion noted above.
bgenfile = mtop+'/1m.bgen'
bgen = Bgen(bgenfile)
# shape is (individual count, SNP count).
bgen.shape

(50000, 805)

In [22]:
# Copy the BGEN data into a memory-mapped file, requesting float32 values.
memmapfile = mtop+'/1m.dist.memmap'
memmap = DistMemMap.write(memmapfile,bgen,dtype=np.float32)

In [23]:
# Compare on-disk sizes in bytes; in the recorded run the memmap is about 86x larger.
bgensize = os.stat(bgenfile).st_size
memmapsize = os.stat(memmapfile).st_size
# Display (bgen bytes, memmap bytes, memmap/bgen ratio).
bgensize,memmapsize,memmapsize/bgensize

(5623459, 484272356, 86.11645537026233)

In [24]:
# Individual 1, SNPs 50-59; [nan, nan, nan] marks a missing distribution.
bgen[1,50:60].read().val

array([[[nan, nan, nan],
        [ 0.,  0.,  1.],
        [ 0.,  0.,  1.],
        [ 0.,  0.,  1.],
        [ 0.,  0.,  1.],
        [nan, nan, nan],
        [ 0.,  0.,  1.],
        [nan, nan, nan],
        [ 0.,  0.,  1.],
        [ 0.,  0.,  1.]]])

In [28]:
from pysnptools.snpreader import Bed
# Open the Bed file that was given to qctool in the notes above.
bed = Bed(dtop+'/deldir/testsnps_1_10_50000_50000/data/chrom8.piece4of5.bed',count_A1=True)
# First 20 individuals, SNPs 300 through 4999; nan marks a missing value.
bed[0:20,300:5000].read().val

array([[nan,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0., nan],
       ...,
       [nan, nan,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0., nan, nan],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [None]:
from pysnptools.distreader import DistGen
from pysnptools.distreader import Bgen

# Problem size: 500,000 individuals by 500 SNPs.
iid_count = 500 * 1000
sid_count = 500
_dims = (iid_count, sid_count)

# Every file is named "<iid_count>x<sid_count>.<ext>". Names without a suffix use
# the m:\deldir spelling; names ending in "2" use the /mnt/m/deldir spelling
# (presumably the same directory seen from Windows and from WSL — confirm).
gen_file = r'm:\deldir\{0}x{1}.gen'.format(*_dims)
bgen_file = r'm:\deldir\{0}x{1}.bgen'.format(*_dims)
sample_file2 = r'/mnt/m/deldir/{0}x{1}.sample'.format(*_dims)
gen_file2 = r'/mnt/m/deldir/{0}x{1}.gen'.format(*_dims)
bgen_file2 = r'/mnt/m/deldir/{0}x{1}.bgen'.format(*_dims)

In [None]:
# Generate synthetic distribution data and write it out in .gen format.
distgen = DistGen(seed=332, iid_count=iid_count, sid_count=sid_count)
Bgen.genwrite(gen_file, distgen.read(), decimal_places=5)

# Print (do not run) the qctool command that converts the .gen file to BGEN.
print(
    '/mnt/m/qctool/build/release/qctool_v2.0.7 -g {0} -s {1} -og {2} -bgen-bits 8 -bgen-compression zlib'.format(
        gen_file2, sample_file2, bgen_file2
    )
)

In [None]:
# Display the generator's per-SNP position array.
distgen.pos

In [None]:
# Display the BGEN path that the next cell opens.
bgen_file

In [None]:
# Open the generated BGEN file with metadata=False (the effect of this flag is an
# open question in the author's note on the same line).
bgen = Bgen(bgen_file,metadata=False) #!!!cmk metadata creation will fail, #!!!cmk False will create it?????
# Display the individual ids, SNP ids and positions together.
bgen.iid,bgen.sid,bgen.pos

In [None]:
# Sample every 100,000th individual and every 50th SNP.
bgen[::100000,::50].read().val

In [None]:
# Larger problem size: 500,000 individuals by 5,000,000 SNPs.
iid_count = 500*1000
sid_count = 5*1000*1000

from pysnptools.distreader import DistGen
from pysnptools.distreader import Bgen
distgen = DistGen(seed=332,iid_count=iid_count,sid_count=sid_count)
# Distinct values of pos column 0 (the chromosome), in ascending order.
chrom_list = sorted(set(distgen.pos[:,0]))
len(chrom_list)

In [None]:
import logging
logging.basicConfig(level=logging.INFO)

# Walk the chromosomes from last to first: write one .gen file per chromosome and
# print (do not run) the qctool command that would convert it to BGEN.
for chrom in chrom_list[::-1]:
    # Boolean mask on pos column 0 keeps only this chromosome's SNPs.
    chromgen = distgen[:, distgen.pos[:, 0] == chrom]
    print(chrom, chromgen.sid_count)

    # File names for this chromosome; the "2" variants use the /mnt/m spelling
    # that appears in the printed command.
    name = '{0}x{1}.chrom{2}'.format(iid_count, sid_count, int(chrom))
    gen_file = r'm:\deldir\{0}.gen'.format(name)
    bgen_file = r'm:\deldir\{0}.bgen'.format(name)
    sample_file2 = r'/mnt/m/deldir/{0}.sample'.format(name)
    gen_file2 = r'/mnt/m/deldir/{0}.gen'.format(name)
    bgen_file2 = r'/mnt/m/deldir/{0}.bgen'.format(name)

    print("about to read {0}x{1}".format(chromgen.iid_count, chromgen.sid_count))
    Bgen.genwrite(gen_file, chromgen, decimal_places=5)  # better in batches?
    print("done")

    print(
        '/mnt/m/qctool/build/release/qctool_v2.0.7 -g {0} -s {1} -og {2} -bgen-bits 8 -bgen-compression zlib'.format(
            gen_file2, sample_file2, bgen_file2
        )
    )

In [None]:
# Display the reader left over from the last loop iteration above.
chromgen

In [None]:
import logging
logging.basicConfig(level=logging.INFO)
from pysnptools.distreader import Bgen
# NOTE(review): no cell visible in this notebook generates a 500000x100 file (the
# cells above build 500000x500 and per-chromosome names) — confirm where it comes from.
bgen = Bgen(r'M:\deldir\500000x100.bgen',verbose=True)
print(bgen.shape)

In [None]:
%%time
# Time a small read: individual 0, SNPs 50-59.
bgen[0,50:60].read().val

In [None]:
import logging
import os
logging.basicConfig(level=logging.INFO)
from pysnptools.distreader import DistMemMap

# Reuse the memory-mapped copy when it is already on disk; otherwise build it from
# `bgen` as float32 (sid_batch_size=2 presumably converts 2 SNPs per batch — confirm).
distmammap_file = r'M:\deldir\500000x100.dist.memmap'
if os.path.exists(distmammap_file):
    distmemmap = DistMemMap(distmammap_file)
else:
    distmemmap = DistMemMap.write(distmammap_file, bgen, dtype='float32', sid_batch_size=2)
distmemmap

In [None]:
%%time
# Time a strided read: every 50,000th individual and every 10th SNP.
distmemmap.val[::50000,::10]

In [None]:
%%time
# Time opening the BGEN file (verbose) and asking for its shape.
from pysnptools.distreader import Bgen
bgen2 = Bgen(r'M:\deldir\1x1000000.bgen',verbose=True) #!!!cmk why does this keep re-"Mapping variants" after metadata file already there?
#!!!cmk why slow even after mapping is done?
print(bgen2.shape)

In [None]:
%%time
# Time a second shape lookup on the already-opened reader.
bgen2.shape

In [None]:
%%time
# Time reading 100 consecutive SNPs for the last individual.
# NOTE(review): start=2500000 is beyond the 1,000,000 SNPs that bgen2 reports in the
# recorded output further down — confirm this offset is intended.
start = 2500000
bgen2[-1,start:start+100].read().val

In [None]:
import logging
import os
logging.basicConfig(level=logging.INFO)
from pysnptools.distreader import DistMemMap

# Reuse the memory-mapped copy when it is already on disk; otherwise build it from
# `bgen2` as float32.
# NOTE(review): the file name says 1x5000000, but bgen2 is opened from
# 1x1000000.bgen elsewhere in this notebook — confirm the intended name.
distmammap_file2 = r'M:\deldir\1x5000000.dist.memmap'
if os.path.exists(distmammap_file2):
    distmemmap2 = DistMemMap(distmammap_file2)
else:
    distmemmap2 = DistMemMap.write(distmammap_file2, bgen2, dtype='float32', sid_batch_size=10)
distmemmap2

In [6]:
%%time
#/mnt/m/qctool/build/release/qctool_v2.0.7 -g /mnt/m/deldir/1x1000000.gen -s /mnt/m/deldir/1x1000000.sample -og /mnt/m/deldir/1x1000000.bgen -bgen-bits 8 -bgen-compression zlib
# Time opening the file with bgen_reader. From here on `bgen` is the object returned
# by read_bgen (indexed by string keys below), not a pysnptools Bgen reader.
from bgen_reader import read_bgen
filename = r'm:\deldir\1x1000000.bgen'
bgen = read_bgen(filename,verbose=True)

We will create the metafile `m:\deldir\1x1000000.bgen.metadata`. This file will speed up further
reads and only need to be created once. So, please, bear with me.


Mapping variants: 100%|███████████████████████████████████████████████████| 1000000/1000000 [00:16<00:00, 59736.24it/s]

Wall time: 22.5 s





In [4]:
%%time
# Time opening the same file with pysnptools, for comparison with read_bgen.
import logging
logging.basicConfig(level=logging.INFO)
from pysnptools.distreader import Bgen
bgen2 = Bgen(filename,verbose=True)
# NOTE: _run_once is a private method; it is called here so the metadata cost is
# included in this cell's timing.
bgen2._run_once() #Force it to create its metafile.npz now (does not use *.metadata)

Wall time: 10.9 s -- time=0:00:03.93, 990,000 of 1,000,000


In [7]:
%%time
# bgen_reader: fetch all variant ids (16.5 s wall time in the recorded run).
id_list = bgen['variants']['id']
len(id_list)

Wall time: 16.5 s


1000000

In [9]:
%%time
# pysnptools: fetch all SNP ids (reported as 0 ns wall time in the recorded run).
id_list = bgen2.sid
len(id_list)

Wall time: 0 ns


1000000

In [10]:
%%time
# bgen_reader: fetch the chromosome of every variant (18.4 s wall time in the recorded run).
chrom_list = bgen['variants']['chrom']
len(chrom_list)

Wall time: 18.4 s


1000000

In [11]:
%%time
# pysnptools: chromosome is column 0 of pos (reported as 0 ns wall time in the recorded run).
chrom_list = bgen2.pos[:,0]
len(chrom_list)

Wall time: 0 ns


1000000

In [12]:
%%time
# bgen_reader: compute the probabilities for variant 500000 (151 ms in the recorded run).
bgen["genotype"][500000].compute()['probs']

Wall time: 151 ms


array([[0.65490196, 0.2       , 0.14509804]])

In [13]:
%%time
# pysnptools: read SNP 500000 for all individuals (53.3 ms in the recorded run).
bgen2[:,500000].read()

Reading Metadata  -- time=0:00:00, 0 of 1Wall time: 53.3 ms


DistData(Bgen('m:\deldir\1x1000000.bgen')[:,500000])

In [29]:
%%time
# bgen_reader: read the first `stop` variants one at a time into a preallocated
# float32 array of shape (1, stop, 3), then print the nan-ignoring mean.
import numpy as np
stop = 1000
parray = np.zeros((1,stop,3),dtype='float32')
for i in range(stop):
    # One compute() call per variant (the recorded run took 21 s for 1000 variants).
    parray[0,i,:] =bgen["genotype"][i].compute()['probs']
print(np.nanmean(parray))

0.33333334
Wall time: 21 s


In [35]:
%%time
# pysnptools: read the same first 1000 SNPs in a single call (45 ms in the recorded run).
stop = 1000
parray = bgen2[:,:stop].read(dtype='float32')
print(np.nanmean(parray.val))

Reading Metadata  -- time=0:00:00.00, 0 of 1,0000.33333334
Wall time: 45 ms


In [39]:
%%time
# pysnptools: read all 1,000,000 SNPs in a single call (31.2 s in the recorded run).
stop = 1*1000*1000
parray = bgen2[:,:stop].read(dtype='float32')
print(parray.shape)
print(np.nanmean(parray.val))

(1, 1000000)data  -- time=0:00:31.14, 999,000 of 1,000,000
0.3333333
Wall time: 31.2 s


In [None]:
%%time
# Drop the existing bgen_reader object and time opening the file again.
del bgen
bgen = read_bgen(filename,verbose=True)

In [None]:
%%time
# Time the chromosome lookup again on the freshly reopened bgen_reader object.
chrom_list = bgen['variants']['chrom']
len(chrom_list)

In [1]:
from pysnptools.distreader import DistGen
# Small synthetic data set: 100 individuals by 1000 SNPs, fixed seed.
iid_count = 100
sid_count = 1000
distgen = DistGen(seed=332,iid_count=iid_count,sid_count=sid_count)
# The displayed repr also shows defaults such as chrom_count=22.
distgen

DistGen(seed=332,iid_count=100,sid_count=1000,chrom_count=22,sid_batch_size=1000,cache_file=None)

In [2]:
from pysnptools.distreader import Bgen
# Write the synthetic data to "<iid_count>x<sid_count>.bgen" under mtop, passing
# the location of the qctool executable explicitly.
mtop = '/mnt/m'
qctool_path='/mnt/m/qctool/build/release/qctool_v2.0.7'
bgen = Bgen.write(mtop+'/{0}x{1}.bgen'.format(iid_count,sid_count),distgen,qctool_path=qctool_path)#bits=32,
# Bgen.write returns a reader on the new file.
bgen

Bgen('/mnt/m/100x1000.bgen')

In [9]:
# First probability of the first individual/SNP after the round trip through BGEN
# (0.466804 in the recorded run; the generator's own value is 0.4668119).
Bgen('/mnt/m/100x1000.bgen')[0,:5].read(dtype='float32').val[0,0][0]

0.466804

In [8]:
# The same entry taken straight from the generator, for comparison with the value
# read back from the BGEN file.
distgen[0,:5].read(dtype='float32').val[0,0][0]

0.4668119

In [11]:
import math

# Smallest number of decimal digits d such that 10**d >= 2**bits.
bits = 23
decimal_places = math.ceil(math.log(pow(2, bits), 10))
print(decimal_places)

7


In [20]:
# NOTE: the recorded output (9.63...) is log10(2**32), so this cell was last run with
# bits=32 rather than the bits=23 assigned in the cell shown above it.
math.log(2**bits,10)

9.632959861247397

In [10]:
# NOTE: in Python `^` is bitwise XOR, not exponentiation, so this does not compute
# log10(2**bits); the recorded result of 2 reflects that. Use `**` for powers.
math.ceil(math.log(2^bits,10))

2

2^32

In [12]:
# `**` is exponentiation: 2**32 == 4294967296.
2**32

4294967296

In [14]:
# `^` is bitwise XOR, not a power: 2 ^ 32 == 34.
2^32

34

In [19]:
# `^` is bitwise XOR, not a power: 2 ^ 5 == 7.
2^5

7

In [25]:
import os
import shutil
from bgen_reader import read_bgen

filename1 = '/mnt/d/OneDrive/Shares/bgenreaderpy/abs_error1.bgen'
filename31 = '/mnt/d/OneDrive/Shares/bgenreaderpy/abs_error31.bgen'
filenameX = '/mnt/d/OneDrive/Shares/bgenreaderpy/abs_errorX.bgen'


def _report_first_probs(path):
    """Open `path` with read_bgen and print its size in bytes and the first
    'probs' row of variant 0, then drop the reader before returning."""
    reader = read_bgen(path, verbose=True)
    print(os.path.getsize(path), reader["genotype"][0].compute()['probs'][0])
    del reader


# Baseline: read each source file under its own name.
_report_first_probs(filename1)
_report_first_probs(filename31)

# Copy each source over the scratch file in turn and read the scratch file.
# In the recorded output the scratch copy of abs_error31 has the right size (9545)
# but prints [1. 0. 0.], unlike the direct read of abs_error31 above.
for source in (filename1, filename31):
    shutil.copy(source, filenameX)
    _report_first_probs(filenameX)

# Delete the metadata file that sits next to the scratch file and read once more.
os.remove(filenameX + '.metadata')
_report_first_probs(filenameX)


Mapping variants: 100%|██████████| 1/1 [00:00<00:00, 3994.58it/s]
Mapping variants: 100%|██████████| 1/1 [00:00<00:00, 5017.11it/s]
Mapping variants: 100%|██████████| 1/1 [00:00<00:00, 2205.21it/s]
Mapping variants: 100%|██████████| 1/1 [00:00<00:00, 2343.19it/s]
Mapping variants: 100%|██████████| 1/1 [00:00<00:00, 5405.03it/s]

6189 [1. 0. 0.]
9545 [0.99346924 0.0027771  0.00375366]
We will create the metafile `/mnt/d/OneDrive/Shares/bgenreaderpy/abs_errorX.bgen.metadata`. This file will speed up further
reads and only need to be created once. So, please, bear with me.
6189 [1. 0. 0.]
File `/mnt/d/OneDrive/Shares/bgenreaderpy/abs_errorX.bgen` has been modified after the creation of `/mnt/d/OneDrive/Shares/bgenreaderpy/abs_errorX.bgen.metadata`.
We will therefore recreate the metadata file. So, please, bear with me.
9545 [1. 0. 0.]
We will create the metafile `/mnt/d/OneDrive/Shares/bgenreaderpy/abs_errorX.bgen.metadata`. This file will speed up further
reads and only need to be created once. So, please, bear with me.
9545 [1. 0. 0.]



