In [3]:
# import
import twobitreader
from twobitreader import TwoBitFile
import numpy as np 
import torch
from sklearn.preprocessing import OneHotEncoder

print("have pytorch version {}".format(torch.__version__))
print("have numpy version {}".format(np.__version__))


have pytorch version 1.5.1
have numpy version 1.19.0


In [4]:
# get the genome file
hg19 = TwoBitFile('../../../../../../Data/Broad/Basset/TwoBitReader/hg19.2bit')

print("two bit file of type {}".format(type(hg19)))


two bit file of type <class 'twobitreader.TwoBitFile'>


In [5]:
# get the chrom
chromosome = hg19['chr8']

print("two bit chromosome of type {}".format(type(chromosome)))

two bit chromosome of type <class 'twobitreader.TwoBitSequence'>


In [7]:
# method to create string sequence from position
def create_sequence(position, offset, chromosome):
    sequence = chromosome[position - offset: position + offset]
    return sequence.upper()

seq1 = create_sequence(118184783, 5, chromosome)
seq2 = create_sequence(112233554, 5, chromosome)
seq3 = create_sequence(112117754, 5, chromosome)

print("the 1 sequence is of type {} of length {}".format(type(seq1), len(seq1)))
print("the 1 sequence is \n{}".format(seq1))
print("the 2 sequence is of type {} of length {}".format(type(seq2), len(seq2)))
print("the 2 sequence is \n{}".format(seq2))
print("the 3 sequence is of type {} of length {}".format(type(seq3), len(seq3)))
print("the 3 sequence is \n{}".format(seq3))

the 1 sequence is of type <class 'str'> of length 10
the 1 sequence is 
CAGCCGGGAC
the 2 sequence is of type <class 'str'> of length 10
the 2 sequence is 
TTACATTCAA
the 3 sequence is of type <class 'str'> of length 10
the 3 sequence is 
AGTACAGATT


In [23]:
# create the numpy array from the sequences
sequence_np = np.array(list(seq1))
sequence_np = np.vstack((sequence_np, np.array(list(seq2))))
sequence_np = np.vstack((sequence_np, np.array(list(seq3))))

print("the np sequence is of type {} and shape {}".format(type(sequence_np), sequence_np.shape))
print(sequence_np)

the np sequence is of type <class 'numpy.ndarray'> and shape (3, 10)
[['C' 'A' 'G' 'C' 'C' 'G' 'G' 'G' 'A' 'C']
 ['T' 'T' 'A' 'C' 'A' 'T' 'T' 'C' 'A' 'A']
 ['A' 'G' 'T' 'A' 'C' 'A' 'G' 'A' 'T' 'T']]


In [24]:
# use the numpy utility to replace the letters by numbers
sequence_np[sequence_np == 'A'] = int(0)
sequence_np[sequence_np == 'C'] = 1
sequence_np[sequence_np == 'G'] = 2
sequence_np[sequence_np == 'T'] = 3

print("the np sequence_num is of type {} and shape {}".format(type(sequence_np), sequence_np.shape))
print(sequence_np)

the np sequence_num is of type <class 'numpy.ndarray'> and shape (3, 10)
[['1' '0' '2' '1' '1' '2' '2' '2' '0' '1']
 ['3' '3' '0' '1' '0' '3' '3' '1' '0' '0']
 ['0' '2' '3' '0' '1' '0' '2' '0' '3' '3']]


In [30]:
# convert to float
sequence_np = sequence_np.astype(np.int)

print("the np sequence_num is of type {} and shape {}".format(type(sequence_np), sequence_np.shape))
print(sequence_np)

the np sequence_num is of type <class 'numpy.ndarray'> and shape (3, 10)
[[1 0 2 1 1 2 2 2 0 1]
 [3 3 0 1 0 3 3 1 0 0]
 [0 2 3 0 1 0 2 0 3 3]]


In [31]:
# one hot encode using numpy
number_classes = 4
sequence_np_one_hot = np.eye(number_classes)[sequence_np]

print("the np sequence one hot is of type {} and shape {}".format(type(sequence_np_one_hot), sequence_np_one_hot.shape))
print(sequence_np_one_hot)

the np sequence one hot is of type <class 'numpy.ndarray'> and shape (3, 10, 4)
[[[0. 1. 0. 0.]
  [1. 0. 0. 0.]
  [0. 0. 1. 0.]
  [0. 1. 0. 0.]
  [0. 1. 0. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [1. 0. 0. 0.]
  [0. 1. 0. 0.]]

 [[0. 0. 0. 1.]
  [0. 0. 0. 1.]
  [1. 0. 0. 0.]
  [0. 1. 0. 0.]
  [1. 0. 0. 0.]
  [0. 0. 0. 1.]
  [0. 0. 0. 1.]
  [0. 1. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]]

 [[1. 0. 0. 0.]
  [0. 0. 1. 0.]
  [0. 0. 0. 1.]
  [1. 0. 0. 0.]
  [0. 1. 0. 0.]
  [1. 0. 0. 0.]
  [0. 0. 1. 0.]
  [1. 0. 0. 0.]
  [0. 0. 0. 1.]
  [0. 0. 0. 1.]]]


In [19]:
# one hot the numpy array
onehot_encoder = OneHotEncoder(sparse=False)
sequence_np_one_hot = onehot_encoder.fit_transform(sequence_np.reshape(-1))

print("the np sequence one hot is of type {} and shape {}".format(type(sequence_np_one_hot), sequence_np_one_hot.shape))
print(sequence_np_one_hot)
print(sequence_np_one_hot)

the np sequence one hot is of type <class 'numpy.ndarray'> and shape (3, 26)
[[0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0.
  1. 0.]
 [0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1.
  0. 0.]
 [1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0.
  0. 1.]]
[[0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0.
  1. 0.]
 [0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1.
  0. 0.]
 [1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0.
  0. 1.]]


In [11]:
# modify the string from ACGT to 1234
sequence_one_hot = sequence.replace('A', '0')
sequence_one_hot = sequence_one_hot.replace('C', '1')
sequence_one_hot = sequence_one_hot.replace('G', '2')
sequence_one_hot = sequence_one_hot.replace('T', '3')

print("the one hot sequence is of type {} of length {}".format(type(sequence_one_hot), len(sequence_one_hot)))
print("the one hot sequence is \n{}".format(sequence_one_hot))

# get the allele at the middle position
allele_one_hot = sequence_one_hot[299:300]

print("the one hot allele is {}".format(allele_one_hot))

the one hot sequence is of type <class 'str'> of length 600
the one hot sequence is 
333110222133231311113311030230021311302200321102013110202030010232201020002023311103021201022210133321321013020233311113211332313232320032302132033031020210001232213311313202321113211313211110111102102231000201000230133200233220231020210231211103212323210031023213003131113232133133303100102102110211222010211002322331220202000332130002111330210000213330120321013101310110331020322003131102332011022011112013211333313232002011113232013021310231010112310233311100033320102211011331000103213213032102333132103103020000300220011000220020003310323103223210032101033330313033303330233110331011032002200202
the one hot allele is 1


In [12]:
# create the one hot numpy array
sequence_np = np.array(list(sequence_one_hot))

print("got np sequence of type {} and shape {}".format(type(sequence_np), sequence_np.shape))

got np sequence of type <class 'numpy.ndarray'> and shape (600,)


In [13]:
sequence_np

array(['3', '3', '3', '1', '1', '0', '2', '2', '2', '1', '3', '3', '2',
       '3', '1', '3', '1', '1', '1', '1', '3', '3', '1', '1', '0', '3',
       '0', '2', '3', '0', '0', '2', '1', '3', '1', '1', '3', '0', '2',
       '2', '0', '0', '3', '2', '1', '1', '0', '2', '0', '1', '3', '1',
       '1', '0', '2', '0', '2', '0', '3', '0', '0', '1', '0', '2', '3',
       '2', '2', '0', '1', '0', '2', '0', '0', '0', '2', '0', '2', '3',
       '3', '1', '1', '1', '0', '3', '0', '2', '1', '2', '0', '1', '0',
       '2', '2', '2', '1', '0', '1', '3', '3', '3', '2', '1', '3', '2',
       '1', '0', '1', '3', '0', '2', '0', '2', '3', '3', '3', '1', '1',
       '1', '1', '3', '2', '1', '1', '3', '3', '2', '3', '1', '3', '2',
       '3', '2', '3', '2', '0', '0', '3', '2', '3', '0', '2', '1', '3',
       '2', '0', '3', '3', '0', '3', '1', '0', '2', '0', '2', '1', '0',
       '0', '0', '1', '2', '3', '2', '2', '1', '3', '3', '1', '1', '3',
       '1', '3', '2', '0', '2', '3', '2', '1', '1', '1', '3', '2

In [17]:
# reshape the array for one hot encoding
sequence_np_reshape = sequence_np.reshape(-1, 1)

print("have reshaped array of type {} and shape {}".format(type(sequence_np_reshape), sequence_np_reshape.shape))

have reshaped array of type <class 'numpy.ndarray'> and shape (600, 1)


In [18]:
# one hot the numpy array
onehot_encoder = OneHotEncoder(sparse=False)
sequence_np_one_hot = onehot_encoder.fit_transform(sequence_np_reshape)
print(sequence_np_one_hot)



[[0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 ...
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]]


In [19]:
# check the new matrix shape
print("the new one hot encoded np tensor is of type {} and shape {}".format(type(sequence_np_one_hot), sequence_np_one_hot.shape))

the new one hot encoded np tensor is of type <class 'numpy.ndarray'> and shape (600, 4)


In [20]:
# convert numpy array to torch tensor
sequence_torch = torch.from_numpy(sequence_np_one_hot)

print("the torch sequence is of type {} and shape {}".format(type(sequence_torch), sequence_torch.shape))

the torch sequence is of type <class 'torch.Tensor'> and shape torch.Size([600, 4])
