In [71]:
import torch
import imageio
import os
import pandas as pd
import numpy as np

In [72]:
# Load an image from disk as a NumPy array.
# imageio.v2.imread is called explicitly: on recent imageio releases the bare
# imageio.imread alias emits a DeprecationWarning and is slated to switch to the
# v3 behaviour, so pinning v2 keeps the current result and silences the warning.
img_arr = imageio.v2.imread('images/rainbow_alaska.jpeg')
img_arr.shape  # three axes; the last one has length 3 for this image

  img_arr = imageio.imread('images/rainbow_alaska.jpeg') #Produces a numpy array


(630, 1200, 3)

In [73]:
# Create a tensor from the NumPy image array
img = torch.from_numpy(img_arr)

# Reorder the axes so the last axis (length 3 here) comes first
channels_first = (2, 0, 1)
out = img.permute(*channels_first)

# Original shape, permuted shape, and the shape after keeping the first 3 entries of dim 0
img.shape, out.shape, out[:3].shape

(torch.Size([630, 1200, 3]),
 torch.Size([3, 630, 1200]),
 torch.Size([3, 630, 1200]))

In [74]:
# Pre-allocate an all-zero uint8 batch that the images will be copied into
batch_size = 3  # number of image slots in the batch
batch = torch.zeros((batch_size, 3, 256, 256), dtype=torch.uint8)  # layout: images x 3 x 256 x 256

In [75]:
# Read the .png images found in <cwd>/images into the pre-allocated `batch`.
# NOTE(review): every image is assumed to be 256x256 with a trailing colour
# axis so that it fits a batch slot -- confirm, or resize beforehand.
image_directory = '/images'
image_path = os.getcwd() + image_directory  # absolute path of the image folder

# Sorted so that the slots are filled in a reproducible order
images = sorted(name for name in os.listdir(image_path) if os.path.splitext(name)[-1] == '.png')

# Never read more files than the batch has slots, otherwise batch[i] raises IndexError
for i, filename in enumerate(images[:batch.shape[0]]):
    # Join with image_path, not image_directory: the latter starts with '/', so
    # the join would point at '/images/<file>' in the filesystem root.
    img_arr = imageio.v2.imread(os.path.join(image_path, filename))
    img_t = torch.from_numpy(img_arr)
    img_t = img_t.permute(2, 0, 1)  # move the colour axis to dimension 0
    img_t = img_t[:3]  # keep only the first 3 channels (drops e.g. an alpha channel)
    batch[i] = img_t  # store the channels-first tensor, not the raw array



<h3>Normalising Data</h3>
<ul>
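<li>Neural networks tend to perform best when the input data lie roughly in the range [0,1] (or [-1,1])</li>
<li>The maximum representable value in unsigned 8-bit format is 255</li>
<li>One option is to divide each pixel value by 255; the next cell instead standardises each colour channel to zero mean and unit standard deviation</li>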
</ul>

In [76]:
# Switch to floating point so the statistics and the division are not done on uint8
batch = batch.float()

n_channels = batch.shape[1]  # size of dimension 1, i.e. the number of colour channels (should be 3)

# Standardise each colour channel using statistics taken over all images and all pixels
for c in range(n_channels):
    mean = torch.mean(batch[:, c])  # mean of this channel
    std = torch.std(batch[:, c])  # standard deviation of this channel
    # A constant channel (e.g. a batch that is still all zeros) has std == 0;
    # clamping the divisor keeps the result at 0 instead of filling it with NaN.
    batch[:, c] = (batch[:, c] - mean) / std.clamp(min=1e-8)

<h2>Wine Data</h2>
In the following section, we take a look at a csv file containing attributes of a number of wines (acidity, sulfur level, etc.)

In [77]:
# Load the white-wine table; fields in the file are separated by ';'
wine_csv = pd.read_csv('ch4data/winequality-white.csv', sep=';')

# Every column except the last is an attribute; the last column is the score
feature_frame = wine_csv.iloc[:, :-1]
score_column = wine_csv.iloc[:, -1]
data = torch.tensor(feature_frame.to_numpy())
target = torch.tensor(score_column.to_numpy())

# Names of the attribute columns (the score column is left out)
attribute_list = wine_csv.columns.to_numpy()[:-1]
attribute_list


array(['fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
      dtype=object)

In [78]:
# Per-attribute statistics: reducing over dim=0 collapses the rows (wines) and
# leaves one value per column (attribute); dim=1 would give one value per wine.
data_mean = data.mean(dim=0)
data_var = data.var(dim=0)

# Standardise every attribute to zero mean and unit standard deviation
data_normalized = (data - data_mean) / data_var.sqrt()

In [79]:
# Boolean masks over the score pick out the rows of each quality category
is_bad = target.le(3)          # score of 3 or less
is_good = target.ge(7)         # score of 7 or more
is_mid = ~(is_bad | is_good)   # strictly between 3 and 7

bad_data = data[is_bad]
mid_data = data[is_mid]
good_data = data[is_good]

bad_data.shape, mid_data.shape, good_data.shape

(torch.Size([20, 11]), torch.Size([3818, 11]), torch.Size([1060, 11]))

Last cell told us that there were 20 bad wines. We have their data in a new tensor called <code>bad_data</code>, with dimensions <code>20x11</code>- 20 bad wines with 11 attributes each.
There are 3818 "mid" wines, and 1060 "good" wines. Let's take a look at the <b>mean attribute values</b> of each group.

In [80]:
# Average every attribute over the wines of each category (reduce over the rows)
bad_mean, mid_mean, good_mean = (
    category.mean(dim=0) for category in (bad_data, mid_data, good_data)
)

bad_mean, mid_mean, good_mean  # three tensors, one entry per attribute

(tensor([7.6000e+00, 3.3325e-01, 3.3600e-01, 6.3925e+00, 5.4300e-02, 5.3325e+01,
         1.7060e+02, 9.9488e-01, 3.1875e+00, 4.7450e-01, 1.0345e+01],
        dtype=torch.float64),
 tensor([6.8869e+00, 2.8153e-01, 3.3644e-01, 6.7051e+00, 4.7841e-02, 3.5424e+01,
         1.4183e+02, 9.9447e-01, 3.1808e+00, 4.8707e-01, 1.0265e+01],
        dtype=torch.float64),
 tensor([6.7251e+00, 2.6535e-01, 3.2606e-01, 5.2615e+00, 3.8160e-02, 3.4550e+01,
         1.2525e+02, 9.9241e-01, 3.2151e+00, 5.0014e-01, 1.1416e+01],
        dtype=torch.float64))

In [81]:
# Collect the per-category attribute means into one table and save it as CSV.
# The tensors are converted to NumPy arrays explicitly: handing torch tensors
# straight to pandas relies on version-dependent coercion and can leave 0-d
# tensor objects in the cells, which are then written out as 'tensor(...)' text.
scoresheet = {
    'Attribute': attribute_list,
    'Bad Mean': bad_mean.numpy(),
    'Mid Mean': mid_mean.numpy(),
    'Good Mean': good_mean.numpy(),
}

scoresheet_df = pd.DataFrame(scoresheet)
scoresheet_df.to_csv('ch4data/scorecard.csv', index=False)  # index=False: do not write the row index

In the previous cell, we saved a "scorecard" showing the average attribute value for each category of wine. We can see by eye that bad wines typically have a higher acidity and sulfur dioxide content than good wines.

In [82]:
# Simple one-feature rule: flag a wine when its total sulfur dioxide is below
# the mean total sulfur dioxide of the mid wines
total_sulfur_threshold = mid_mean[6]  # column 6 is 'total sulfur dioxide'
total_sulfur_data = data[:, 6]        # that column for every wine (1-D tensor)

predicted_indexes = total_sulfur_data < total_sulfur_threshold  # boolean mask: below the threshold
actual_indexes = target.gt(5)                                   # boolean mask: score above 5

# Number of wines flagged by the threshold vs. number that actually score above 5
predicted_indexes.sum(), actual_indexes.sum()

(tensor(2727), tensor(3258))

<h1>Bike Sharing in Washington D.C.</h1>

In [83]:
def _day_of_month(date_field):
    """Convert characters 8-9 of the date field (the day of the month) to a float."""
    return float(date_field[8:10])


# Load the hourly table (one row per hour); the first row of the file is skipped
bikes_numpy = np.loadtxt(
    'ch4data/hour-fixed.csv',
    dtype=np.float32,
    delimiter=',',
    skiprows=1,
    converters={1: _day_of_month},  # column 1 holds the date
)

bikes = torch.from_numpy(bikes_numpy)
bikes.shape, bikes.stride()

(torch.Size([17520, 17]), (17, 1))

In [84]:
# Regroup the flat hourly table into blocks of 24 rows: (days, 24 hours, columns).
# view() reuses the storage of `bikes`, so no data is copied.
n_columns = bikes.shape[1]  # number of columns in the table (17 here)
daily_bikes = bikes.view(-1, 24, n_columns)

daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 24, 17]), (408, 17, 1))

In [85]:
# Swap the hour and column axes so the layout becomes days x columns x hours.
# The tensor is rebuilt from `bikes` instead of transposing `daily_bikes` onto
# itself: a self-referential transpose flips the axes back every second time
# the cell is run, which would silently break the cells that follow.
daily_bikes = bikes.view(-1, 24, bikes.shape[1]).transpose(1, 2)
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 17, 24]), (408, 1, 17))

In [86]:
# First 24 rows = the first day of data, cast to int64 so the values can be used as indices
first_day = bikes[:24].to(torch.int64)
weather_onehot = torch.zeros((len(first_day), 4))  # one row per hour, 4 columns
first_day[:, 9]  # column 9: weather status

tensor([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2])

In [87]:
# One-hot encode the weather status: each row gets a 1 in column (status - 1)
#   status 1 -> [1, 0, 0, 0]
#   status 2 -> [0, 1, 0, 0], and so on for the four categories
weather_index = first_day[:, 9:10].long() - 1  # one column of 0-based column indices
weather_onehot.scatter_(dim=1, index=weather_index, value=1.0)

weather_onehot  # 24 rows x 4 columns, one one-hot row vector per hour

tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]])

In [88]:
# Append the one-hot weather columns to the first day's rows and compare
# the first row before and after (the last four columns are the one-hot weather)
first_day_rows = bikes[:24]
first_day_with_weather = torch.cat((first_day_rows, weather_onehot), dim=1)

first_day_rows[:1], first_day_with_weather[:1]

(tensor([[ 1.0000,  1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
           0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
          16.0000]]),
 tensor([[ 1.0000,  1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
           0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
          16.0000,  1.0000,  0.0000,  0.0000,  0.0000]]))

So, we've taken the first 24hrs of data for the bike system, and added four columns at the side of the tensor- these four columns in each row represent the weather vector produced by one-hot encoding. There is another way of doing this in a bit of a shorter manner.

In [89]:
# Zero tensor that will hold the one-hot weather for every day:
#   dim 0 -> days (same length as daily_bikes dim 0)
#   dim 1 -> 4 slots for the one-hot vector
#   dim 2 -> hours (same length as daily_bikes dim 2)
daily_weather_onehot = torch.zeros((daily_bikes.size(0), 4, daily_bikes.size(2)))
daily_weather_onehot.shape


torch.Size([730, 4, 24])

In [90]:
# Write the ones along dim 1 (the 4 one-hot slots).
# Index 9 along dim 1 holds the weather status; 1 is subtracted because the slots are 0-indexed.
daily_weather_index = daily_bikes[:, 9:10, :].long() - 1  # slicing 9:10 keeps a size-1 dim 1
daily_weather_onehot.scatter_(dim=1, index=daily_weather_index, value=1.0)

daily_bikes[:, 9, :].shape  # days x hours

torch.Size([730, 24])

In [91]:
# Show the shape (days x one-hot slots x hours) followed by the tensor itself
daily_weather_onehot.size(), daily_weather_onehot

(torch.Size([730, 4, 24]),
 tensor([[[1., 1., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 1., 1.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[1., 1., 1.,  ..., 1., 1., 1.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         ...,
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[1., 1., 1.,  ..., 1., 1., 1.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0.,

In [92]:
# Append the 4 one-hot weather rows along dim 1.
# Only the original `bikes` columns are kept from daily_bikes before the
# concatenation (a no-op on the first run), so re-running this cell does not
# stack another 4 rows on top each time.
print(daily_bikes.shape)
daily_bikes = torch.cat((daily_bikes[:, :bikes.shape[1]], daily_weather_onehot), dim=1)
print(daily_bikes.shape)  # dim 1 grows by 4, the length of the one-hot weather vector

torch.Size([730, 17, 24])
torch.Size([730, 21, 24])


<b>Temperature data</b>

In [93]:
# Select index 10 along dim 1 (the temperature) for every day and hour
temp = daily_bikes[:, 10]
temp.shape  # one row per day, one temperature entry per hour

torch.Size([730, 24])

In [None]:
# Standardise the temperatures to zero mean and unit standard deviation,
# printing the tensor before and after for comparison
print(temp)
mu = temp.mean()
std = temp.std()
temp = (temp - mu) / std
print(temp)


tensor([[0.2400, 0.2200, 0.2200,  ..., 0.4000, 0.4000, 0.4600],
        [0.4600, 0.4400, 0.4200,  ..., 0.2600, 0.2400, 0.2200],
        [0.2200, 0.2000, 0.2000,  ..., 0.1800, 0.1400, 0.1800],
        ...,
        [0.2400, 0.2400, 0.2400,  ..., 0.2800, 0.2600, 0.2600],
        [0.2600, 0.2600, 0.2600,  ..., 0.2000, 0.2000, 0.2000],
        [0.1800, 0.1800, 0.1600,  ..., 0.2600, 0.2600, 0.2600]])
tensor([[-1.3213, -1.4248, -1.4248,  ..., -0.4932, -0.4932, -0.1827],
        [-0.1827, -0.2862, -0.3897,  ..., -1.2178, -1.3213, -1.4248],
        [-1.4248, -1.5284, -1.5284,  ..., -1.6319, -1.8389, -1.6319],
        ...,
        [-1.3213, -1.3213, -1.3213,  ..., -1.1143, -1.2178, -1.2178],
        [-1.2178, -1.2178, -1.2178,  ..., -1.5284, -1.5284, -1.5284],
        [-1.6319, -1.6319, -1.7354,  ..., -1.2178, -1.2178, -1.2178]])


<h1>Processing Text</h1>
Text can be broken down to the single-letter or single-word level. For this, we use a text file of Pride &amp; Prejudice.

In [None]:
# Read the whole text file into a single string
with open('ch4data/1342.txt', encoding='utf8') as book_file:
    text = book_file.read()

# Break the text into lines at every newline character
lines = text.split('\n')