<a href="https://colab.research.google.com/github/nupursjsu/Deep-Learning/blob/master/Ungraded_Assignment4/PyTorch_Basics_Ch3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PyTorch Basics chapter 3


## Importing necessary libraries

In [0]:
import numpy as np
import pandas as pd
import csv
import torch
torch.set_printoptions(edgeitems=2, precision=2)

## Loading wine quality dataset

In [0]:
# Code to read data file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [10]:
#Getting wine quality datset
link = 'https://drive.google.com/open?id=1bxf8g4P7XFvsT-GCrO91dQ74mCExlmjr'
fluff, id = link.split('=')
print (id) # Verify that you have everything after '='

downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('winequality-white.csv')

1bxf8g4P7XFvsT-GCrO91dQ74mCExlmjr


In [20]:
df = pd.read_csv('winequality-white.csv', encoding='utf-8', dtype=np.float32, delimiter=";", skiprows=1)
#Converting pandas dataframe to numpy array
df_numpy = df.values
df_numpy

array([[ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       [ 7.2 ,  0.23,  0.32, ...,  0.4 ,  9.9 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [22]:
columns = next(csv.reader(open('winequality-white.csv'), delimiter=';'))

#Displaying shape of array and column labels
df_numpy.shape, columns

((4897, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

## Converting Numpy array to PyTorch tensor and performing prediction

In [23]:
#Converting the NumPy array to a PyTorch tensor
wine_data = torch.from_numpy(df_numpy)
wine_data.shape, wine_data.type()

(torch.Size([4897, 12]), 'torch.FloatTensor')

In [24]:
#Selecting all rows and all columns except the last
final_data = wine_data[:, :-1] 
final_data, final_data.shape

(tensor([[ 6.30,  0.30,  ...,  0.49,  9.50],
         [ 8.10,  0.28,  ...,  0.44, 10.10],
         ...,
         [ 5.50,  0.29,  ...,  0.38, 12.80],
         [ 6.00,  0.21,  ...,  0.32, 11.80]]), torch.Size([4897, 11]))

In [25]:
#Selecting all rows and the last column
label = wine_data[:, -1] # <2>
label, label.shape

(tensor([6., 6.,  ..., 7., 6.]), torch.Size([4897]))

In [26]:
#Converting label tensor into integer values
label = wine_data[:, -1].long() 
label

tensor([6, 6,  ..., 7, 6])

In [27]:
#Onehot encoding the label column of categorical values
label_onehot = torch.zeros(label.shape[0], 10)
label_onehot.scatter_(1, label.unsqueeze(1), 1.0)

tensor([[0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.],
        ...,
        [0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.]])

In [29]:
#Adding an extra dummy dimension to label by using unsqueeze
label_unsqueezed = label.unsqueeze(1) 
label_unsqueezed

tensor([[6],
        [6],
        ...,
        [7],
        [6]])

In [30]:
#Finding means and standard deviations for each column
final_data_mean = torch.mean(final_data, dim=0) 
final_data_mean

tensor([6.85e+00, 2.78e-01, 3.34e-01, 6.39e+00, 4.58e-02, 3.53e+01, 1.38e+02,
        9.94e-01, 3.19e+00, 4.90e-01, 1.05e+01])

In [31]:
final_data_var = torch.var(final_data, dim=0) 
final_data_var

tensor([7.12e-01, 1.02e-02, 1.46e-02, 2.57e+01, 4.77e-04, 2.89e+02, 1.81e+03,
        8.94e-06, 2.28e-02, 1.30e-02, 1.51e+00])

In [32]:
#Normalizing the data by subtracting the mean and dividing by the standard deviation
final_data_normalized = (final_data - final_data_mean) / torch.sqrt(final_data_var) 
final_data_normalized

tensor([[-6.57e-01,  2.16e-01,  ...,  1.27e-03, -8.25e-01],
        [ 1.48e+00,  1.74e-02,  ..., -4.37e-01, -3.37e-01],
        ...,
        [-1.61e+00,  1.17e-01,  ..., -9.62e-01,  1.86e+00],
        [-1.01e+00, -6.77e-01,  ..., -1.49e+00,  1.04e+00]])

Normalizing will help in the learning process

In [33]:
#Using the torch.le function to determine which rows in target correspond to a score less than or equal to 3
bad_idx = torch.le(label, 3) 
bad_idx.shape, bad_idx.dtype, bad_idx.sum()

(torch.Size([4897]), torch.bool, tensor(20))

Note that only 20 of the bad_indexes entries are set to 1!

In [34]:
#Filtering data to be only items (or rows) that correspond to 1 in the indexing tensor
bad_final_data = final_data[bad_idx] 
bad_final_data.shape

torch.Size([20, 11])

In [0]:
#Getting information about wines grouped into good, middling, and bad categories. Take the .mean() of each column
bad_final_data = final_data[torch.le(label, 3)] 
mid_final_data = final_data[torch.gt(label, 3) & torch.lt(label, 7)] 
good_final_data = final_data[torch.ge(label, 7)]

bad_mean = torch.mean(bad_final_data, dim=0) 
mid_mean = torch.mean(mid_final_data, dim=0) 
good_mean = torch.mean(good_final_data, dim=0)

In [37]:
for i, args in enumerate(zip(columns, bad_mean, mid_mean, good_mean)):
  print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, *args))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.70   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.27  11.42


The bad wines seem to have higher total sulfur dioxide, among other differences. You could use a threshold on total sulfur dioxide as a crude criterion for discriminating good wines from bad ones.

In [39]:
#Getting the indexes in which the total sulfur dioxide column is below the midpoint you calculated earlier
threshold_sulphur = 141.83 
data_sulphur= final_data[:,6] 
pred_idxs = torch.lt(data_sulphur, threshold_sulphur)

pred_idxs.shape, pred_idxs.dtype, pred_idxs.sum()

(torch.Size([4897]), torch.bool, tensor(2727))

In [40]:
#Getting the indexes of the good wines
actual_idxs = torch.gt(label, 5)
actual_idxs.shape, actual_idxs.dtype, actual_idxs.sum()

(torch.Size([4897]), torch.bool, tensor(3257))

Because you have about 500 more good wines than your threshold predicted, you already have hard evidence that the threshold isn’t perfect.

In [41]:
#Seeing how well your predictions line up with the actual rankings.
total_matches = torch.sum(actual_idxs & pred_idxs).item() 
total_predicted = torch.sum(pred_idxs).item() 
total_actual = torch.sum(actual_idxs).item()

total_matches, total_matches / total_predicted, total_matches / total_actual

(2018, 0.74000733406674, 0.6195885784464231)

We got around 2,018 wines correct! Because we had 2,700 wines predicted, a 74 percent chance exists that if we predict a wine to be high-quality, it is. Unfortunately, we have 3,200 good wines and identified only 61 percent of them. The result is barely better than random.