In [1]:
# Setup
import numpy as np

filename = 'Ex_PC_data.csv'

In [2]:
# Question 1 part (a)

# Reading the data file and converting it into a matrix.
data = np.genfromtxt(filename, delimiter=',')
print(data)


# Since we're given that the last column is labels
features, labels = data[:, :-1], data[:, -1].astype(int)

[[      nan 1.710e+00 2.430e+00 ... 3.920e+00 1.065e+03 1.000e+00]
 [1.320e+01 1.780e+00 2.140e+00 ... 3.400e+00 1.050e+03 1.000e+00]
 [1.316e+01 2.360e+00 2.670e+00 ... 3.170e+00 1.185e+03 1.000e+00]
 ...
 [1.327e+01 4.280e+00 2.260e+00 ... 1.560e+00 8.350e+02 3.000e+00]
 [1.317e+01 2.590e+00 2.370e+00 ... 1.620e+00 8.400e+02 3.000e+00]
 [1.413e+01 4.100e+00 2.740e+00 ... 1.600e+00 5.600e+02 3.000e+00]]


In [3]:
# Question 1 part (b)

# Number of samples and features
n_samples, n_features = features.shape

print(f'Number of samples = {n_samples}')
print(f'Number of features = {n_features}')

Number of samples = 178
Number of features = 13


In [4]:
# Question 1 part (b) continued

classes = np.unique(labels)
n_classes = classes.shape[0]

print(f'Number of classes = {n_classes}')
print(f'The classes are:', ', '.join(map(str, classes)))

Number of classes = 3
The classes are: 1, 2, 3


In [5]:
# Question 1 part (b) continued

for i in classes:
    # Number of labels with class i
    count = np.sum(labels == i)
    
    # pdf of that class
    pdf = count / n_samples
    
    # rounded to 5 decimal places
    print(f'Class distribution for label {i} = {pdf:.5}')
    

Class distribution for label 1 = 0.33146
Class distribution for label 2 = 0.39888
Class distribution for label 3 = 0.26966


In [6]:
# Question 1 part (c)

# To impute, we need to first calculate the means of the features
feature_means = np.nanmean(features, axis=0)
print(f'Mean Vector for the features = \n{feature_means}')

Mean Vector for the features = 
[1.29936723e+01 2.33634831e+00 2.36651685e+00 1.95062500e+01
 9.97415730e+01 2.29511236e+00 2.03757062e+00 3.61073446e-01
 1.59536723e+00 5.05808989e+00 9.57471910e-01 2.61168539e+00
 7.46893258e+02]


In [7]:
# Question 1 part (c) continued

# Imputing nan values,
for i, m in enumerate(feature_means):
    # determining where the nan's are in each column
    nans = np.where(np.isnan(features[:,i]))[0]
    for j in nans:
        features[j, i] = m
print(features)

[[1.29936723e+01 1.71000000e+00 2.43000000e+00 ... 1.04000000e+00
  3.92000000e+00 1.06500000e+03]
 [1.32000000e+01 1.78000000e+00 2.14000000e+00 ... 1.05000000e+00
  3.40000000e+00 1.05000000e+03]
 [1.31600000e+01 2.36000000e+00 2.67000000e+00 ... 1.03000000e+00
  3.17000000e+00 1.18500000e+03]
 ...
 [1.32700000e+01 4.28000000e+00 2.26000000e+00 ... 5.90000000e-01
  1.56000000e+00 8.35000000e+02]
 [1.31700000e+01 2.59000000e+00 2.37000000e+00 ... 6.00000000e-01
  1.62000000e+00 8.40000000e+02]
 [1.41300000e+01 4.10000000e+00 2.74000000e+00 ... 6.10000000e-01
  1.60000000e+00 5.60000000e+02]]


In [8]:
# Question 1 part (d)

ratio = 0.8 # 80 / 20 split

# Number of elements in the training set
length = int(features.shape[0] * ratio)

# get a random permutation of the indices.
indices = np.random.permutation(features.shape[0])

train_features = features[indices[:length]]
train_labels   = labels[indices[:length]]
test_features  = features[indices[length:]]
test_labels    = labels[indices[length:]]

print("Shape of training dataset's features =", train_features.shape)
print("Shape of training dataset's labels   =", train_labels.shape)
print("Shape of testing dataset's features  =", test_features.shape)
print("Shape of testing dataset's labels    =", test_labels.shape)

Shape of training dataset's features = (142, 13)
Shape of training dataset's labels   = (142,)
Shape of testing dataset's features  = (36, 13)
Shape of testing dataset's labels    = (36,)


In [9]:
# Question 1 part (e)

max_vector = np.max(train_features, axis=0)
min_vector = np.min(train_features, axis=0)
print(max_vector)
print(min_vector)

[1.439e+01 5.800e+00 3.230e+00 3.000e+01 1.620e+02 3.880e+00 5.080e+00
 6.600e-01 3.580e+00 1.300e+01 1.710e+00 4.000e+00 1.680e+03]
[1.103e+01 7.400e-01 1.360e+00 1.060e+01 7.000e+01 9.800e-01 3.400e-01
 1.300e-01 4.100e-01 1.280e+00 4.800e-01 1.270e+00 2.780e+02]


In [10]:
# Question 1 part (e) continued

def scale(X: np.ndarray, start: float, end: float) -> np.ndarray:
    return (end - start) * (X - min_vector) / (max_vector - min_vector) + start

scaled_training_features = scale(train_features, -5, 5)
# print(f'scaled_training_features\n{scaled_training_features}')

scaled_testing_features = scale(test_features, -5, 5)
# print(f'scaled_testing_features\n{scaled_testing_features}')

print('Max in scaled testing,', np.max(scaled_testing_features))
print('Min in scaled testing,', np.min(scaled_testing_features))

Max in scaled testing, 6.309523809523808
Min in scaled testing, -4.926739926739927


In [11]:
# Question 1 part (f)

mean_scaled = np.mean(scaled_training_features, axis=0)
std_scaled = np.std(scaled_training_features, axis=0)

def normalize(X: np.ndarray) -> np.ndarray:
    return (X - mean_scaled) / std_scaled

n_s_train = normalize(scaled_training_features)
n_s_test = normalize(scaled_testing_features)

# print('Scaled and Normalized Training Features,', n_s_train, sep='\n')
# print('Scaled and Normalized Testing Features,', n_s_test, sep='\n')

_mean = np.mean(n_s_test, axis=0)
print('Testing Features mean, ', _mean)
_std = np.std(n_s_test, axis=0)
print('Testing Features std, ', _std)

Testing Features mean,  [-0.13751286 -0.22719036  0.20036156  0.13058728 -0.19928962 -0.07675987
  0.14512074 -0.13111013  0.08270551 -0.26286111  0.26101634 -0.0426573
 -0.15247943]
Testing Features std,  [0.98297416 0.71760859 0.94781102 1.15158323 0.90993945 0.89348215
 0.81246996 0.96095066 0.92824342 0.93399986 1.02408764 0.86194867
 0.94288846]


In [12]:
# Question 1 part (g)

from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
for train_index, test_index in kf.split(n_s_train):
    X_train = n_s_train[train_index]
    y_train = train_labels[train_index]

for i in classes:
    # Number of labels with class i
    count = np.sum(y_train == i)
    
    # pdf of that class
    pdf = count / y_train.shape[0]
    
    # rounded to 5 decimal places
    print(f'Class distribution for label {i} = {pdf:.5}')

Class distribution for label 1 = 0.32632
Class distribution for label 2 = 0.37895
Class distribution for label 3 = 0.29474
