# Deep Learning with PyTorch


## Explore the Dataset

In [20]:
import pandas as pd

# load the training dataset (excluding rows with null values)
penguins = pd.read_csv('data/penguins.csv').dropna()

# Deep Learning models work best when features are on similar scales
# In a real solution, we'd implement some custom normalization for each feature, but to keep things simple
# we'll just rescale the FlipperLength and BodyMass so they're on a similar scale to the bill measurements
penguins['FlipperLength'] = penguins['FlipperLength']/10
penguins['BodyMass'] = penguins['BodyMass']/100

# The dataset is too small to be useful for deep learning
# So we'll oversample it to increase its size
for i in range(1,3):
    penguins = pd.concat([penguins, penguins], axis=0)

# Display a random sample of 10 observations
sample = penguins.sample(10)
sample

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
179,47.8,15.0,21.5,56.5,1
66,35.5,16.2,19.5,33.5,0
338,45.7,17.0,19.5,36.5,2
41,40.8,18.4,19.5,39.0,0
278,51.3,19.2,19.3,36.5,2
69,41.8,19.4,19.8,44.5,0
99,43.2,18.5,19.2,41.0,0
332,45.2,16.6,19.1,32.5,2
142,32.1,15.5,18.8,30.5,0
302,50.5,18.4,20.0,34.0,2


In [22]:
sample.shape

(10, 5)

In [18]:
penguin_classes = ['Adelie', 'Gentoo', 'Chinstrap']
print(sample.columns[0:5].values, 'SpeciesName')
for index, row in penguins.sample(10).iterrows():
    print('[',row[0], row[1], row[2],row[3], int(row[4]), ']',penguin_classes[int(row[-1])])

['CulmenLength' 'CulmenDepth' 'FlipperLength' 'BodyMass' 'Species'] SpeciesName
[ 46.4 17.8 19.1 37.0 2 ] Chinstrap
[ 55.8 19.8 20.7 40.0 2 ] Chinstrap
[ 40.6 18.8 19.3 38.0 0 ] Adelie
[ 50.5 15.9 22.2 55.5 1 ] Gentoo
[ 51.4 19.0 20.1 39.5 2 ] Chinstrap
[ 39.5 17.8 18.8 33.0 0 ] Adelie
[ 45.2 15.8 21.5 53.0 1 ] Gentoo
[ 36.0 17.9 19.0 34.5 0 ] Adelie
[ 40.8 18.4 19.5 39.0 0 ] Adelie
[ 46.3 15.8 21.5 50.5 1 ] Gentoo


  print('[',row[0], row[1], row[2],row[3], int(row[4]), ']',penguin_classes[int(row[-1])])


In [24]:
from sklearn.model_selection import train_test_split

features = ['CulmenLength','CulmenDepth','FlipperLength','BodyMass']
label = 'Species'
   
# Split data 70%-30% into training set and test set
x_train, x_test, y_train, y_test = train_test_split(penguins[features].values,
                                                    penguins[label].values,
                                                    test_size=0.30,
                                                    random_state=0)

print ('Training Set: %d, Test Set: %d \n' % (len(x_train), len(x_test)))
print("Sample of features and labels:")

# Take a look at the first 25 training features and corresponding labels
for n in range(0,24):
    print(x_train[n], y_train[n], '(' + penguin_classes[y_train[n]] + ')')

Training Set: 957, Test Set: 411 

Sample of features and labels:
[51.1 16.5 22.5 52.5] 1 (Gentoo)
[50.7 19.7 20.3 40.5] 2 (Chinstrap)
[49.5 16.2 22.9 58. ] 1 (Gentoo)
[39.3 20.6 19.  36.5] 0 (Adelie)
[42.5 20.7 19.7 45. ] 0 (Adelie)
[50.  15.3 22.  55.5] 1 (Gentoo)
[50.2  18.7  19.8  37.75] 2 (Chinstrap)
[50.7 19.7 20.3 40.5] 2 (Chinstrap)
[49.1  14.5  21.2  46.25] 1 (Gentoo)
[43.2 16.6 18.7 29. ] 2 (Chinstrap)
[38.8  17.6  19.1  32.75] 0 (Adelie)
[37.8 17.1 18.6 33. ] 0 (Adelie)
[45.8 14.2 21.9 47. ] 1 (Gentoo)
[43.8 13.9 20.8 43. ] 1 (Gentoo)
[36.  17.1 18.7 37. ] 0 (Adelie)
[43.3 13.4 20.9 44. ] 1 (Gentoo)
[36.  18.5 18.6 31. ] 0 (Adelie)
[41.1  19.   18.2  34.25] 0 (Adelie)
[33.1 16.1 17.8 29. ] 0 (Adelie)
[40.9 13.7 21.4 46.5] 1 (Gentoo)
[45.2 17.8 19.8 39.5] 2 (Chinstrap)
[48.4 14.6 21.3 58.5] 1 (Gentoo)
[43.6 13.9 21.7 49. ] 1 (Gentoo)
[38.5  17.9  19.   33.25] 0 (Adelie)


In [26]:
import torch
import torch.nn as nn
import torch.utils.data as td

# Set random seed for reproducability
torch.manual_seed(0)

print("Libraries imported - ready to use PyTorch", torch.__version__)

Libraries imported - ready to use PyTorch 2.4.1+cpu


## Prepare the data for PyTorch


In [29]:
# Create a dataset and loader for the training data and labels
train_x = torch.Tensor(x_train).float()
train_y = torch.Tensor(y_train).long()
train_ds = td.TensorDataset(train_x, train_y)
train_loader = td.DataLoader(train_ds, batch_size=20,
    shuffle=False, num_workers=1)

# Create a dataset and loader for the test data and labels
test_x = torch.Tensor(x_test).float()
test_y = torch.Tensor(y_test).long()
test_ds = td.TensorDataset(test_x,test_y)
test_loader = td.DataLoader(test_ds, batch_size=20,
    shuffle=False, num_workers=1)
print('Ready to load data')

Ready to load data


In [61]:
import numpy as np
import time

t = []
for i in range(3, 1000):
    arr1 = np.random.randint(size=i)
    arr2 = np.random.randint(size=i)
    arr3 = np.random.randint(size=i)

    st1 = time.time()
    st1 = np.sort(arr1, kind='quicksort')
    et1 = time.time()
    t1 = et1 - st1

    st2 = time.time()
    st2 = np.sort(arr2, kind='mergesort')
    et2 = time.time()
    t2 = et2 - st2

    st3 = time.time()
    st3 = np.sort(arr3, kind='heapsort')
    et3 = time.time()
    t3 = et3 - st3

    t.append([t1, t2, t3])
    


TypeError: randint() takes at least 1 positional argument (0 given)