# Chapter 4. Real-world data representation using tensors

## Imports

In [21]:
import imageio
import torch
import os
import string

import csv
import numpy as np

## Images

In [10]:
# Load the sample photo and report its array shape and Python type, one per line.
img_arr = imageio.imread("../data/dlwpt/ch4/image-dog/bobby.jpg")
print(img_arr.shape, type(img_arr), sep="\n")

(720, 1280, 3)
<class 'imageio.core.util.Array'>


  img_arr = imageio.imread("../data/dlwpt/ch4/image-dog/bobby.jpg")


In [6]:
# Wrap the NumPy image as a tensor, then move the channel axis first: (H, W, C) -> (C, H, W).
img = torch.from_numpy(img_arr)
out = img.permute(2, 0, 1) # pytorch image modules use (N, C, H (rows), W (cols)); the batch axis N is not added here

In [9]:
batch_size = 3
# Pre-allocated uint8 batch laid out as (N, C, H, W).
# Assumes every image on disk is 256 x 256 pixels — the slot assignment below fails otherwise.
batch = torch.zeros(batch_size, 3, 256, 256, dtype=torch.uint8)

data_dir = "../data/dlwpt/ch4/image-cats/"
# os.listdir returns entries in arbitrary order; sorting makes each image land in the same
# batch slot on every run.
filenames = sorted(name for name in os.listdir(data_dir) if os.path.splitext(name)[-1] == ".png")
# Cap at batch_size so a directory holding extra PNG files cannot index past the end of the batch.
# If there are fewer files than batch_size, the remaining slots stay zero.
for i, fname in enumerate(filenames[:batch_size]):
    img_arr = imageio.imread(os.path.join(data_dir, fname))
    img_t = torch.from_numpy(img_arr)
    img_t = img_t.permute(2, 0, 1)  # (H, W, C) -> (C, H, W)
    img_t = img_t[:3] # only keep the first three channels in case the image has additional channels, like transparency
    batch[i] = img_t

  img_arr = imageio.imread(os.path.join(data_dir, fname))


### Max norm

In [11]:
# Convert to float, then scale the 0-255 pixel values into [0, 1] in place.
batch = batch.float().div_(255.0)

### Z norm

In [12]:
# Standardize each channel to zero mean and unit standard deviation.
n_channels = batch.shape[1]
for c in range(n_channels):
    # generally, mean and std should be computed over the dataset, not the batch
    mean = batch[:, c].mean()
    std = batch[:, c].std()
    batch[:, c] = (batch[:, c] - mean) / std

## Volumetric data

In [13]:
# (N, C, D, H, W); D = depth

In [15]:
# Read the DICOM files found in the series directory into a single volume array,
# then report its shape and dtype.
dir_path = "../data/dlwpt/ch4/volumetric-dicom/2-LUNG 3.0  B70f-04083/"
vol_arr = imageio.volread(dir_path, "DICOM")
print(vol_arr.shape, vol_arr.dtype)

Reading DICOM (examining files): 1/99 files (1.0%99/99 files (100.0%)
  Found 1 correct series.
Reading DICOM (loading data): 99/99  (100.0%)
(99, 512, 512) int16


In [16]:
# Convert the volume to float and prepend a channel axis: (D, H, W) -> (1, D, H, W).
vol = torch.from_numpy(vol_arr).float().unsqueeze(0)
print(vol.shape)

torch.Size([1, 99, 512, 512])


## Tabular data

In [18]:
# Semicolon-separated table; the first row (the header) is skipped so only numbers are parsed.
wine_path = "../data/dlwpt/ch4/tabular-wine/winequality-white.csv"
wineq_numpy = np.loadtxt(
    fname=wine_path,
    skiprows=1,
    delimiter=";",
    dtype=np.float32,
)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [20]:
# Read only the header row to obtain the column names. The context manager closes the file
# handle deterministically; newline="" is the csv module's recommended way to open a file.
with open(wine_path, newline="") as csv_file:
    col_list = next(csv.reader(csv_file, delimiter=";"))

print(wineq_numpy.shape, col_list)

(4898, 12) ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']


In [21]:
# Wrap the NumPy table as a tensor and check its size and element type.
wineq = torch.from_numpy(wineq_numpy)
print(wineq.size(), wineq.dtype)

torch.Size([4898, 12]) torch.float32


In [23]:
# Feature matrix: every column except the last one.
data = wineq[..., :-1]
print(data, data.shape)

tensor([[ 7.0000,  0.2700,  0.3600,  ...,  3.0000,  0.4500,  8.8000],
        [ 6.3000,  0.3000,  0.3400,  ...,  3.3000,  0.4900,  9.5000],
        [ 8.1000,  0.2800,  0.4000,  ...,  3.2600,  0.4400, 10.1000],
        ...,
        [ 6.5000,  0.2400,  0.1900,  ...,  2.9900,  0.4600,  9.4000],
        [ 5.5000,  0.2900,  0.3000,  ...,  3.3400,  0.3800, 12.8000],
        [ 6.0000,  0.2100,  0.3800,  ...,  3.2600,  0.3200, 11.8000]]) torch.Size([4898, 11])


In [24]:
# Target vector: the last column of the table, still in floating point here.
target = wineq[..., -1]
print(target, target.shape)

tensor([6., 6., 6.,  ..., 6., 7., 6.]) torch.Size([4898])


In [25]:
# Same column again, this time converted to 64-bit integers so it can be used as an index.
target = wineq[:, -1].to(torch.long)
print(target, target.dtype)

tensor([6, 6, 6,  ..., 6, 7, 6]) torch.int64


In [28]:
# One row per sample and 10 columns; scatter_ writes a 1.0 into the column named by each target.
target_onehot = torch.zeros(len(target), 10)
target_onehot.scatter_(dim=1, index=target.unsqueeze(1), value=1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [29]:
# Targets as a single row of shape (1, N), in contrast to unsqueeze(1), which gives (N, 1).
target.unsqueeze(0)

tensor([[6, 6, 6,  ..., 6, 7, 6]])

In [35]:
# self[index[i][j]][j] = src[i][j]  # if dim == 0
# self[i][index[i][j]] = src[i][j]  # if dim == 1

# positions of self that no index entry points at keep their previous value (0 here, because self starts as zeros)

# target_onehot = torch.zeros(target.shape[0], 10)
# target_onehot.scatter_(1, target.unsqueeze(0), 1.0)

# target_onehot[0]

tensor([0., 0., 0., 1., 1., 1., 1., 1., 1., 1.])

In [38]:
# Per-column mean, reducing over the sample axis (dim 0).
data_mean = data.mean(dim=0)
print(data_mean)

tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
        1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01])


In [37]:
# Per-column variance, reducing over the sample axis (dim 0).
data_var = data.var(dim=0)
print(data_var)

tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
        1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00])


In [39]:
# Standardize each column: subtract its mean, divide by its standard deviation.
data_normalized = (data - data_mean) / data_var.sqrt()
print(data_normalized)

tensor([[ 1.7208e-01, -8.1761e-02,  2.1326e-01,  ..., -1.2468e+00,
         -3.4915e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7996e-02,  ...,  7.3995e-01,
          1.3422e-03, -8.2419e-01],
        [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
         -4.3677e-01, -3.3663e-01],
        ...,
        [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
         -2.6153e-01, -9.0545e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
         -9.6251e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
         -1.4882e+00,  1.0448e+00]])


In [40]:
# Boolean mask selecting the samples whose quality score is at most 3.
bad_indices = torch.le(target, 3)
print(bad_indices.shape, bad_indices.dtype, bad_indices.sum())

torch.Size([4898]) torch.bool tensor(20)


In [41]:
# Boolean-mask indexing keeps only the rows of data whose mask entry is True.
bad_data = data[bad_indices]
print(bad_data.shape)

torch.Size([20, 11])


In [42]:
# Split the samples into three quality groups: <= 3, 4-6, and >= 7.
bad_data = data[target <= 3]
mid_data = data[(target > 3) & (target < 7)]
good_data = data[target >= 7]

# Per-column mean of each group.
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)

# One table row per feature: index, column name, then the bad / mid / good means.
for i, (col_name, bad, mid, good) in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print("{:2} {:20} {:6.2f} {:6.2f} {:6.2f}".format(i, col_name, bad, mid, good))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [43]:
# Threshold on total sulfur dioxide; 141.83 is the mid-quality group mean from the table above.
total_sulfur_threshold = 141.83
total_sulfur_data = data[:, 6]  # sulfur data is in column index 6
# Samples below the threshold are the ones flagged by this simple rule.
predicted_indices = total_sulfur_data < total_sulfur_threshold

print(predicted_indices.shape, predicted_indices.dtype, predicted_indices.sum())

torch.Size([4898]) torch.bool tensor(2727)


In [44]:
# Ground-truth mask: samples whose quality score is above 5.
actual_indices = torch.gt(target, 5)
print(actual_indices.shape, actual_indices.dtype, actual_indices.sum())

torch.Size([4898]) torch.bool tensor(3258)


In [None]:
# Compare the threshold-based prediction with the actual quality mask.
# Tensor.item() takes no arguments: it converts a one-element tensor to a Python number.
n_matches = torch.sum(actual_indices & predicted_indices).item()
n_predicted = torch.sum(predicted_indices).item()
n_actual = torch.sum(actual_indices).item()

# Displayed as the cell result: match count, share of predictions that were right,
# and share of the actual positives that the rule found.
n_matches, n_matches / n_predicted, n_matches / n_actual

## Time series

In [4]:
# Load the hourly bike-sharing CSV as float32, skipping the header row.
# The converter for column 1 keeps only characters 8-9 of the field and parses them as a float
# (presumably the day of month taken from a YYYY-MM-DD date string — TODO confirm against the file).
bikes_np = np.loadtxt("../data/dlwpt/ch4/bike-sharing-dataset/hour-fixed.csv",
                      dtype=np.float32,
                      delimiter=",",
                      skiprows=1,
                      converters={1: lambda x: float(x[8:10])})
bikes = torch.from_numpy(bikes_np)
print(bikes, bikes.shape)

tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 3.0000e+00, 1.3000e+01,
         1.6000e+01],
        [2.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 8.0000e+00, 3.2000e+01,
         4.0000e+01],
        [3.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 5.0000e+00, 2.7000e+01,
         3.2000e+01],
        ...,
        [1.7377e+04, 3.1000e+01, 1.0000e+00,  ..., 7.0000e+00, 8.3000e+01,
         9.0000e+01],
        [1.7378e+04, 3.1000e+01, 1.0000e+00,  ..., 1.3000e+01, 4.8000e+01,
         6.1000e+01],
        [1.7379e+04, 3.1000e+01, 1.0000e+00,  ..., 1.2000e+01, 3.7000e+01,
         4.9000e+01]]) torch.Size([17520, 17])


In [5]:
# Shape and strides of the 2-D hourly table, for comparison with the reshaped views below.
bikes.shape, bikes.stride()

(torch.Size([17520, 17]), (17, 1))

In [6]:
# Regroup the hourly rows into days of 24 rows each: (days, 24 hours, columns).
daily_bikes = bikes.view(-1, 24, bikes.size(1))
print(daily_bikes.shape, daily_bikes.stride())

torch.Size([730, 24, 17]) (408, 17, 1)


In [7]:
# Swap the hour and column axes so the layout becomes (days, columns, hours).
daily_bikes = torch.transpose(daily_bikes, 1, 2)
print(daily_bikes.shape, daily_bikes.stride())

torch.Size([730, 17, 24]) (408, 1, 17)


In [8]:
# Integer copy of the first 24 rows of the hourly table.
first_day = bikes[:24].long()
# One row per selected hour and 4 columns for the one-hot encoding
# (presumably weather levels 1-4 — TODO confirm against the dataset description).
weather_onehot = torch.zeros(first_day.shape[0], 4)
# Column 9 is shown as the cell result; it is the column one-hot encoded in the next cell.
first_day[:, 9]

tensor([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2])

In [9]:
# Column 9 holds 1-based levels; subtract 1 to get 0-based column indices, shaped (24, 1).
weather_index = first_day[:, 9].unsqueeze(1).long() - 1
weather_onehot.scatter_(1, weather_index, 1.0)

tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]])

In [10]:
# Append the one-hot weather columns to the first 24 rows (along dim 1) and show only the first row.
torch.cat((bikes[:24], weather_onehot), 1)[:1]

tensor([[ 1.0000,  1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
          0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
         16.0000,  1.0000,  0.0000,  0.0000,  0.0000]])

In [12]:
# Shape (days, 4, hours): the 4 one-hot channels sit on dim=1 because we will cat along dim=1.
daily_weather_onehot = torch.zeros(daily_bikes.shape[0], 4, daily_bikes.shape[2])
# Channel 9 holds 1-based levels; subtract 1 for 0-based channel indices, shaped (days, 1, hours).
daily_weather_onehot.scatter_(
    dim=1,
    index=daily_bikes[:, 9, :].long().unsqueeze(1) - 1,
    value=1.0,
)

tensor([[[1., 1., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0

In [13]:
# Append the 4 one-hot weather channels along the channel axis (dim=1).
# Note: this reassigns daily_bikes, so running the cell twice appends the channels twice.
daily_bikes = torch.cat((daily_bikes, daily_weather_onehot), dim=1)

In [15]:
# Expect 17 original channels + 4 one-hot channels = 21 on dim 1.
print(daily_bikes.shape)

torch.Size([730, 21, 24])


## Representing text

In [16]:
# Read the whole text file into memory as one string, decoding it as UTF-8.
with open("../data/dlwpt/ch4/jane-austen/1342-0.txt", encoding="utf8") as f:
    text = f.read()

In [18]:
# Split the text on newlines and pick one line (index 200) as the working example.
lines = text.split("\n")
line = lines[200]
print(line)

“Impossible, Mr. Bennet, impossible, when I am not acquainted with him


In [19]:
# One row per character of the line and 128 columns, one per ASCII code point (ord < 128).
letter_t = torch.zeros(len(line), 128)
print(letter_t.shape)

torch.Size([70, 128])


In [20]:
# One-hot encode each lower-cased character; anything outside ASCII falls back to column 0.
for i, letter in enumerate(line.lower().strip()):
    code_point = ord(letter)
    letter_index = code_point if code_point < 128 else 0
    letter_t[i, letter_index] = 1

In [26]:
def clean_words(input_str):
    """Lower-case a string and split it into words without surrounding punctuation.

    Newlines are treated as spaces and the text is split on whitespace. ASCII
    punctuation and curly double quotes are stripped from both ends of each token.
    A token made up only of such characters becomes an empty string and stays in
    the result.

    Returns:
        list[str]: the cleaned tokens, in their original order.
    """
    strip_chars = string.punctuation + "“”"
    cleaned = []
    for token in input_str.lower().replace("\n", " ").split():
        cleaned.append(token.strip(strip_chars))
    return cleaned

# Apply the cleaner to the sample line and print the input next to the resulting word list.
words_in_line = clean_words(line)
print(line, words_in_line)

“Impossible, Mr. Bennet, impossible, when I am not acquainted with him ['impossible', 'mr', 'bennet', 'impossible', 'when', 'i', 'am', 'not', 'acquainted', 'with', 'him']


In [27]:
# Vocabulary: the unique cleaned words of the whole text in sorted order,
# each mapped to its position in that ordering.
word_list = sorted(set(clean_words(text)))
word2index_dict = dict(zip(word_list, range(len(word_list))))

print(len(word2index_dict), word2index_dict["impossible"])

7165 3326


In [28]:
# One row per word of the sample line, one column per vocabulary entry.
vocab_size = len(word2index_dict)
word_t = torch.zeros(len(words_in_line), vocab_size)
for i, word in enumerate(words_in_line):
    word_index = word2index_dict[word]
    word_t[i, word_index] = 1  # mark this word's vocabulary slot
    print('{:2} {:4} {}'.format(i, word_index, word))

print(word_t.shape)

 0 3326 impossible
 1 4225 mr
 2  758 bennet
 3 3326 impossible
 4 6986 when
 5 3247 i
 6  364 am
 7 4353 not
 8  188 acquainted
 9 7054 with
10 3148 him
torch.Size([11, 7165])
