## Image Data

In [20]:
import os

import torch
import imageio.v2 as iio

In [9]:
image_path = 'data/bobby.jpg'
img_arr = iio.imread(image_path)

# Axis order of the loaded array: (height, width, channels)
img_arr.shape

(720, 1280, 3)

In [10]:
# NumPy arrays are a convenient starting point for conversion to tensors;
# check the type of the object that imread returned
type(img_arr)

numpy.ndarray

In [13]:
# PyTorch needs shape (channels, height, width)
img = torch.from_numpy(img_arr)

# permute() takes the desired ordering of the dimensions.
# It does NOT modify img in place: it returns a new view that shares the same
# underlying storage, so changing values in img will also change out
out = img.permute(2, 0, 1)
out.shape

torch.Size([3, 720, 1280])

In [14]:
# Preallocate the stack: batch_size images, 3 channels, 256 x 256 pixels,
# stored as unsigned 8-bit integers and initialized with zeros
batch_size = 3
batch = torch.zeros((batch_size, 3, 256, 256), dtype=torch.uint8)

In [16]:
# Load data into the prepared tensor
data_dir = 'data/ch4/image-cats/'
# os.listdir() returns entries in arbitrary order, so sort them to get a reproducible batch
filenames = sorted(name for name in os.listdir(data_dir) if os.path.splitext(name)[-1] == '.png')
# Only load as many images as the batch has slots, to avoid indexing past its end
for i, filename in enumerate(filenames[:batch.shape[0]]):
    img_arr = iio.imread(os.path.join(data_dir, filename))
    img_t = torch.from_numpy(img_arr)
    img_t = img_t.permute(2, 0, 1)  # (height, width, channels) -> (channels, height, width)
    img_t = img_t[:3]   # Sometimes, images have an alpha channel
    batch[i] = img_t    # Add to batch

In [18]:
# Normalize: NNs generally work best with inputs between 0 and 1 or -1 and 1

# Option 1: convert to floating point and divide the 8-bit values by 255
batch = batch.float() / 255

# Option 2: standardize each channel (subtract its mean, divide by its standard deviation).
# Note that both options are executed here, so this runs on the already rescaled values
n_channels = batch.shape[1]
for c in range(n_channels):
    channel = batch[:, c]
    mean = channel.mean()
    std = channel.std()
    batch[:, c] = (channel - mean) / std

## Medical Data

Volumetric data from a CT scan

In [23]:
# Directory containing the DICOM files of the scan
dir_path = 'data/ch4/volumetric-dicom/2-LUNG 3.0  B70f-04083'
# volread() with the 'DICOM' format reads the files in the directory into a single array
vol_arr = iio.volread(dir_path, 'DICOM')
# Show the shape of the volume and the type of the returned object
vol_arr.shape, type(vol_arr)

Reading DICOM (examining files): 99/99 files (100.0%)
  Found 1 correct series.
Reading DICOM (loading data): 99/99  (100.0%)


((99, 512, 512), imageio.core.util.Array)

In [26]:
# Convert to a float tensor and insert a channel dimension in front (dim 0)
vol = torch.from_numpy(vol_arr).float().unsqueeze(0)
vol.shape

torch.Size([1, 99, 512, 512])

## Tabular Data

In [60]:
import csv
import numpy as np

wine_path = 'data/ch4/tabular-wine/winequality-white.csv'
# Semicolon-separated file; the first row is skipped here and read separately as column names
wine_np = np.loadtxt(
    wine_path,
    dtype=np.float32,
    delimiter=';',
    skiprows=1,
)
wine_np.shape

(4898, 12)

In [61]:
# Read only the first row of the CSV file to get the column names.
# The context manager closes the file handle again (a bare open() call would leave it open);
# newline='' is what the csv module recommends for files passed to csv.reader
with open(wine_path, newline='') as wine_file:
    col_list = next(csv.reader(wine_file, delimiter=';'))
col_list

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [62]:
# Wrap the NumPy array in a tensor; the float32 dtype chosen when loading is kept
wine_t = torch.from_numpy(wine_np)
wine_t.shape, wine_t.dtype

(torch.Size([4898, 12]), torch.float32)

In [63]:
# Split into inputs and targets
# Inputs: all rows, every column except the last one
data = wine_t[:, :-1]
data, data.shape

(tensor([[ 7.0000,  0.2700,  0.3600,  ...,  3.0000,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  0.3400,  ...,  3.3000,  0.4900,  9.5000],
         [ 8.1000,  0.2800,  0.4000,  ...,  3.2600,  0.4400, 10.1000],
         ...,
         [ 6.5000,  0.2400,  0.1900,  ...,  2.9900,  0.4600,  9.4000],
         [ 5.5000,  0.2900,  0.3000,  ...,  3.3400,  0.3800, 12.8000],
         [ 6.0000,  0.2100,  0.3800,  ...,  3.2600,  0.3200, 11.8000]]),
 torch.Size([4898, 11]))

In [64]:
# Targets: all rows, last column only ('quality' according to col_list)
target = wine_t[:, -1]
target, target.shape

(tensor([6., 6., 6.,  ..., 6., 7., 6.]), torch.Size([4898]))

In [65]:
# Display the target tensor (still floating point at this stage)
target

tensor([6., 6., 6.,  ..., 6., 7., 6.])

In [66]:
# We can treat targets like numbers for a regression
# long() converts the scores to integers; the integer tensor is also what is
# passed as the index to scatter_ in the one-hot encoding further down
target = target.long()

In [67]:
# Or as labels
# Onehot encoding is only required for inputs, targets can remain simply classes
n_samples = target.shape[0]
target_onehot = torch.zeros(n_samples, 10)  # one row per sample, 10 columns indexed by score

# scatter_ arguments:
#   dim:   dimension along which the indices are interpreted (1 = columns)
#   index: column tensor holding, for each row, the column to write to
#   value: the value to write
# The index tensor must have the same number of dimensions as target_onehot,
# so unsqueeze(1) turns the 1D target into a column
target_onehot.scatter_(dim=1, index=target.unsqueeze(1), value=1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [68]:
# Per-column statistics: reducing over dim 0 (the rows) leaves one value per column
data_mean = data.mean(dim=0)
data_std = data.std(dim=0)

# Standardize every column: subtract its mean, divide by its standard deviation
data_norm = (data - data_mean) / data_std

In [69]:
# Boolean mask marking the bad wines (quality score of 3 or lower)
bad_indexes = torch.le(target, 3)
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [70]:
# Indexing with the boolean mask keeps only the rows where the mask is True
bad_data = data[bad_indexes]
bad_data.shape

torch.Size([20, 11])

In [71]:
# Masks for the two remaining quality groups: scores 4-6 and scores 7 and above
mid_mask = (target > 3) & (target < 7)
good_mask = target >= 7

mid_data = data[mid_mask]
good_data = data[good_mask]

In [72]:
# Column-wise mean of every quality group
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)

In [73]:
# One line per input column: index, column name, then the bad / mid / good group means
for i, (name, bad, mid, good) in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, name, bad, mid, good))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [74]:
# Threshold: mean total sulfur dioxide of the mid-quality group (see the table printed above)
total_sulfur_threshold = 141.83
total_sulfur_data = data[:, 6]  # column 6 is 'total sulfur dioxide'
# Wines below the threshold are predicted to be good
predicted_indices = total_sulfur_data < total_sulfur_threshold
predicted_indices.shape, predicted_indices.dtype, predicted_indices.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [76]:
# Ground truth: wines with a quality score above 5 count as good
actual_indices = torch.gt(target, 5)
actual_indices.shape, actual_indices.dtype, actual_indices.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

In [77]:
# Wines that are both predicted good and actually good
n_matches = (actual_indices & predicted_indices).sum().item()
n_predicted = predicted_indices.sum().item()
n_actual = actual_indices.sum().item()
# Match count, share of the predictions that are correct, share of the good wines that were found
n_matches, n_matches / n_predicted, n_matches / n_actual

(2018, 0.74000733406674, 0.6193984039287906)

## Time-Series Data

In [2]:
import torch
import numpy as np

In [3]:
def _day_of_month(date_field):
    """Return the day part (characters 8-9) of a yyyy-mm-dd date field as a float."""
    return float(date_field[8:10])


bikes_numpy = np.loadtxt(
    'data/ch4/bike-sharing-dataset/hour-fixed.csv',
    dtype=np.float32,
    delimiter=',',
    skiprows=1,
    converters={1: _day_of_month},  # column 1 holds the date as yyyy-mm-dd
)
bikes = torch.from_numpy(bikes_numpy)

# Data is sorted by date first and time of day second
bikes.shape, bikes.stride()

(torch.Size([17520, 17]), (17, 1))

In [5]:
# view() takes the desired shape:
#   -1 lets PyTorch infer that dimension (the number of days) from the others
#   hours_per_day groups 24 consecutive hourly rows into one day
#   n_columns keeps the number of columns of the original table
# view() only changes shape and stride information; the memory stays the same -> very efficient
hours_per_day = 24
n_columns = bikes.shape[1]
daily_bikes = bikes.view(-1, hours_per_day, n_columns)
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 24, 17]), (408, 17, 1))

In [8]:
# Both tensors are contiguous after view()
# For daily_bikes: memory holds the 17 columns of the first hour, then the 17 columns
# of the next hour, and so on; after 24 such rows the next day begins
daily_bikes.is_contiguous(), bikes.is_contiguous()

(True, True)

In [16]:
# Desired ordering is (num_days, channels, hour_of_day), so swap dimensions 1 and 2.
# Only the strides change, which is why the result is no longer contiguous
daily_bikes = torch.transpose(daily_bikes, 1, 2)
daily_bikes.shape, daily_bikes.stride(), daily_bikes.is_contiguous()

(torch.Size([730, 17, 24]), (408, 1, 17), False)

In [19]:
# Handle categorical/ordinal variables
# Limit to just one day for now: the first 24 hourly rows, converted to integers
first_day = bikes[:24].long()
# One row per hour, one column per weather condition
weather_onehot = torch.zeros(first_day.shape[0], 4)

# Column 9 is the weather condition going from 1 to 4
first_day[:, 9]

tensor([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2])

In [21]:
# Compare the shape of the index column with the shape of the one-hot target tensor
first_day[:, 9].unsqueeze(1).shape, weather_onehot.shape

(torch.Size([24, 1]), torch.Size([24, 4]))

In [23]:
# For each row i of weather_onehot, write 1.0 into the column given by the weather condition.
# The conditions go from 1 to 4, so subtract 1 to get column indices 0 to 3:
# first_day[i, 9] == 1 sets row i, column 0.
# unsqueeze(1) gives the index the same number of dimensions as the target tensor
weather_index = first_day[:, 9].unsqueeze(1).long() - 1
weather_onehot.scatter_(
    dim=1,  # Axis along which to index, in this case the columns
    index=weather_index,
    value=1.0
)

tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]])

In [24]:
# Append the one-hot encoded weather as extra columns (concatenate along dim 1)
# and show only the first row of the result
first_day_with_weather = torch.cat((bikes[:24], weather_onehot), dim=1)
first_day_with_weather[:1]

tensor([[ 1.0000,  1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
          0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
         16.0000,  1.0000,  0.0000,  0.0000,  0.0000]])

In [26]:
# Same layout as daily_bikes, but with 4 channels (one per weather condition)
n_days = daily_bikes.shape[0]
n_hours = daily_bikes.shape[2]
daily_weather_onehot = torch.zeros(n_days, 4, n_hours)
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [28]:
# Channel 9 holds the weather condition (1 to 4); shift it to 0 to 3 and add a
# channel dimension so that the index has as many dimensions as the target
daily_weather_index = daily_bikes[:, 9, :].long().unsqueeze(1) - 1
daily_weather_onehot.scatter_(dim=1, index=daily_weather_index, value=1.0)

# Append the one-hot channels to the existing channels (dim 1).
# Re-running this cell appends them again, since daily_bikes is overwritten
daily_bikes = torch.cat((daily_bikes, daily_weather_onehot), dim=1)
daily_bikes.shape

torch.Size([730, 21, 24])

In [None]:
# Can also treat as ordinal and simply scale to between 0 and 1:
# conditions 1 to 4 are shifted to 0 to 3 and divided by 3
weather = daily_bikes[:, 9, :]
daily_bikes[:, 9, :] = (weather - 1) / 3.0

In [32]:
# For quantitative variables, it would be good to scale to [0, 1] or [-1, 1]
temp = daily_bikes[:, 10, :]    # slicing returns a view on daily_bikes, not a copy
temp_max = torch.max(temp)
temp_min = torch.min(temp)
# Min-max scaling to [0, 1]. The range has to be parenthesized: without the parentheses
# the expression divides by temp_max only and then subtracts temp_min a second time
daily_bikes[:, 10, :] = (daily_bikes[:, 10, :] - temp_min) / (temp_max - temp_min)

# Or standardize to zero mean and unit standard deviation
# (values end up centered around 0, but are not strictly bounded to [-1, 1]).
# Because temp is a view, these statistics are computed on the rescaled values from above
mean = temp.mean()
std = temp.std()
daily_bikes[:, 10, :] = (daily_bikes[:, 10, :] - mean) / std

## Text Data

In [33]:
# Read the whole book into one string; the file is decoded as UTF-8
with open('data/ch4/jane-austen/1342-0.txt', encoding='utf8') as f:
    text = f.read()

In [35]:
# The book starts out with one-hot encoding, which is not state-of-the-art anymore
# Split the text into lines and pick one of them as the running example
lines = text.split('\n')
line = lines[200]
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [37]:
# One row per character of the line, 128 columns (the encoding loop maps codes >= 128 to column 0)
letter_t = torch.zeros(len(line), 128)
letter_t.shape

torch.Size([70, 128])

In [None]:
# One hot encoding letters: row i gets a 1 in the column of the character's code point.
# Characters with a code point of 128 or above (e.g. the curly quotes) fall back to column 0
for i, letter in enumerate(line):
    code_point = ord(letter)
    letter_index = code_point if code_point < 128 else 0
    letter_t[i, letter_index] = 1

In [39]:
# One-hot encoding words
def clean_words(input_str):
    """Split a text into lower-case words without surrounding punctuation.

    The text is lower-cased and split on whitespace; punctuation characters are
    then stripped from both ends of every token. Punctuation inside a token is
    kept, and a token made up only of punctuation becomes an empty string.

    Args:
        input_str: The text to split.

    Returns:
        A list with one cleaned string per whitespace-separated token.
    """
    punctuation = '.,;:"!?_-”“'
    tokens = input_str.lower().replace('\n', ' ').split()
    cleaned = []
    for token in tokens:
        cleaned.append(token.strip(punctuation))
    return cleaned

# Apply the cleaning to the example line
words_in_line = clean_words(line)
words_in_line

['impossible',
 'mr',
 'bennet',
 'impossible',
 'when',
 'i',
 'am',
 'not',
 'acquainted',
 'with',
 'him']

In [41]:
# Vocabulary: every distinct cleaned word of the book, in sorted order
words = sorted(set(clean_words(text)))
# Map each word to its position in the sorted vocabulary
word2index_dict = dict(zip(words, range(len(words))))
len(word2index_dict), word2index_dict['impossible']

(8267, 3848)

In [44]:
# One row per word of the line, one column per vocabulary entry
n_words = len(words_in_line)
vocab_size = len(word2index_dict)
word_t = torch.zeros(n_words, vocab_size)
for i, word in enumerate(words_in_line):
    word_index = word2index_dict[word]
    word_t[i, word_index] = 1
    print('{:2} {:4} {}'.format(i, word_index, word))
word_t.shape

 0 3848 impossible
 1 4928 mr
 2  899 bennet
 3 3848 impossible
 4 8049 when
 5 3759 i
 6  449 am
 7 5081 not
 8  249 acquainted
 9 8126 with
10 3637 him


torch.Size([11, 8267])

## Exercises

### Exercise 1

In [50]:
import os
import torch
import imageio.v2 as iio

In [47]:
# Load a single photo to inspect its shape before allocating the batch
img_np = iio.imread('data/ch4/ex1/IMG_4856.jpg')
img_np.shape

In [66]:
batch_size = 3
batch = torch.zeros(batch_size, 3, 1600, 1200, dtype=torch.uint8)

image_dir = 'data/ch4/ex1'
# os.listdir() returns entries in arbitrary order and may include non-image entries
# (e.g. hidden files), so keep only JPEG files and sort them for a reproducible batch
image_names = sorted(
    name for name in os.listdir(image_dir)
    if os.path.splitext(name)[-1].lower() in ('.jpg', '.jpeg')
)
# Only load as many images as the batch has slots, to avoid indexing past its end
for i, image in enumerate(image_names[:batch_size]):
    image_file = os.path.join(image_dir, image)
    image_np = iio.imread(image_file)
    image_t = torch.from_numpy(image_np)
    image_v = image_t.permute(2, 0, 1)  # (height, width, channels) -> (channels, height, width)
    batch[i] = image_v
    # Index, file name and mean pixel value of the image
    print(f'{i:>2} {image:>3} {batch[i].float().mean():>5.4f}')

 0 IMG_4856.jpg 115.0545
 1 IMG_4857.jpg 103.5018
 2 IMG_4858.jpg 114.7122


In [74]:
# Mean pixel value per color channel, taken over all images in the batch
for channel in range(3):
    channel_pixels = batch[:, channel, :]
    channel_mean = channel_pixels.float().mean()
    # '>5.4f' (right-align, width 5, 4 decimals) matches the format used for the per-image means;
    # the previous spec '5>.4f' was parsed as fill character '5' with no width
    print(f'{channel:>2} {channel_mean:>5.4f}')

 0 116.9941
 1 122.6129
 2 93.6616


### Exercise 2

In [80]:
import re

# Read the Python source file into one string; the file is decoded as UTF-8
with open('data/ch4/ex2/data_utils.py', encoding='utf8') as f:
    python_text = f.read()

In [77]:
# Show the raw file content as a single string
python_text

'import os\nimport tarfile\nimport urllib\nimport pandas as pd\nimport numpy as np\nfrom zlib import crc32\n\nDOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"\nHOUSING_PATH = os.path.join("datasets", "housing")\nHOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"\n\ndef fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n    os.makedirs(housing_path, exist_ok=True)\n    tgz_path = os.path.join(housing_path, "housing.tgz")\n    urllib.request.urlretrieve(housing_url, tgz_path)\n    housing_tgz = tarfile.open(tgz_path)\n    housing_tgz.extractall(path=housing_path)\n    housing_tgz.close()\n\ndef load_housing_data(housing_path=HOUSING_PATH):\n    csv_path = os.path.join(housing_path, "housing.csv")\n    return pd.read_csv(csv_path)\n\ndef test_set_check(identifier, test_ratio):\n    """\n    Computes a has value for each instance\'s identifier and puts an instance in the test set, if the hash is lower\n    than 20 % of the maxim

In [83]:
def clean_code(input_str):
    """Split source code into its word-like tokens.

    A token is a maximal run of ASCII letters, digits and underscores; every
    other character acts as a separator. Unlike replacing the separators with
    spaces and splitting on ' ', this never yields empty-string tokens when the
    text starts or ends with a separator (e.g. a trailing newline).

    Args:
        input_str: The text to tokenize.

    Returns:
        A list of tokens in order of appearance; empty for input without tokens.
    """
    return re.findall(r'[a-zA-Z0-9_]+', input_str)

In [93]:
cleaned_text = clean_code(python_text)
# Number the distinct tokens in order of first appearance
word2index = {}
for word in cleaned_text:
    # setdefault only inserts when the token is new; the new index is the current dict size
    word2index.setdefault(word, len(word2index))

In [94]:
# One row per token of the text, one column per distinct token
n_tokens, vocab_size = len(cleaned_text), len(word2index)
word_onehot = torch.zeros(n_tokens, vocab_size)
for row, word in enumerate(cleaned_text):
    word_onehot[row, word2index[word]] = 1

In [95]:
# Display the one-hot encoded tokens
word_onehot

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]])