# AI from scratch

In [2]:
!pip install chelo

Collecting chelo
  Downloading chelo-0.0.4-py3-none-any.whl.metadata (7.2 kB)
Downloading chelo-0.0.4-py3-none-any.whl (25 kB)
Installing collected packages: chelo
Successfully installed chelo-0.0.4


In [6]:
from chelo import DatasetRegistry
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load and prepare the dataset
dataset = DatasetRegistry.get_dataset("WineQualityDataset", wine_type="white")
dataset.load_data()

Downloading 'winequality-white.csv' for dataset 'wine_quality' from https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv...


Downloading winequality-white.csv: 264kB [00:00, 515kB/s]

File downloaded and saved at '/root/.chelo/wine_quality/winequality-white.csv'.





Let's load the data:

In [8]:
X, y = dataset.to_numpy()
print(X.shape, y.shape)

(4898, 11) (4898, 1)


First we need to have a train-test split:

In [9]:
print(y)

[[6]
 [6]
 [6]
 ...
 [6]
 [7]
 [6]]


In [10]:
X_train = X[:4000, :]
y_train = y[:4000]
X_test = X[4000:, :]
y_test = y[4000:]

It is a good practice to validate that we can correctly loaded our dataset:

In [11]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4000, 11) (4000, 1)
(898, 11) (898, 1)


What is the simplest algorithm that you can think of for predicting the unknown values for X_test?

In [36]:
import numpy as np

def predict_sample_1nn(X_train, y_train, x):
    # Calculate the distance between x and each sample in X_train

    distance = (X_train - x)**2
    distance = np.mean(distance, axis=1)
    idx = np.argmin(distance)
    return y_train[idx][0]

In [37]:
predict_sample_1nn(X_train, y_train, X_test[1, :])

6

Let's calculate the predictions for all the samples:

In [54]:
y_test_predictions = []
for i in range(X_test.shape[0]):
  prediction = predict_sample_1nn(X_train, y_train, X_test[i, :])
  y_test_predictions.append(prediction)
print(y_test_predictions)

[6, 6, 7, 7, 7, 7, 7, 6, 7, 6, 7, 7, 6, 6, 5, 7, 6, 6, 5, 7, 4, 7, 6, 6, 6, 7, 6, 5, 7, 7, 6, 6, 6, 6, 6, 5, 5, 6, 6, 3, 6, 7, 6, 6, 6, 6, 5, 7, 7, 6, 6, 6, 7, 6, 5, 6, 5, 6, 7, 6, 6, 6, 8, 6, 9, 7, 6, 5, 6, 6, 6, 5, 6, 4, 8, 6, 8, 5, 5, 6, 6, 6, 6, 7, 5, 7, 6, 7, 6, 6, 6, 6, 6, 5, 7, 5, 5, 6, 6, 6, 7, 6, 6, 6, 7, 6, 6, 5, 6, 5, 7, 7, 8, 6, 6, 5, 5, 5, 6, 6, 7, 5, 6, 7, 6, 7, 7, 5, 6, 4, 6, 5, 5, 6, 6, 5, 8, 5, 6, 6, 5, 5, 6, 5, 5, 6, 6, 6, 5, 6, 5, 6, 5, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 7, 7, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 7, 6, 6, 5, 6, 5, 7, 6, 6, 6, 6, 6, 6, 5, 5, 6, 7, 6, 7, 5, 6, 6, 6, 7, 7, 6, 6, 6, 5, 4, 6, 6, 6, 6, 6, 6, 7, 7, 6, 5, 7, 7, 5, 7, 7, 6, 5, 8, 7, 6, 5, 6, 6, 7, 6, 7, 6, 6, 6, 6, 5, 5, 6, 6, 6, 5, 8, 6, 8, 7, 5, 5, 5, 7, 6, 7, 7, 5, 5, 6, 7, 7, 5, 5, 5, 6, 6, 5, 5, 5, 5, 5, 6, 5, 6, 7, 5, 6, 6, 7, 7, 7, 7, 6, 7, 7, 6, 5, 6, 5, 6, 5, 6, 8, 5, 6, 3, 6, 6, 7, 7, 7, 6, 6, 6, 6, 5, 6, 7, 6, 7, 8, 5, 7, 7, 8, 5, 5, 6, 6, 7, 5, 6, 5, 7, 5, 5, 6, 6, 

Let's compare these predictions to the actual raw values:

In [56]:
print(y_test.shape)

(898, 1)


It is very hard to actually understand if these are good or bad so let's do the following. 1) Calculate the mean squared error (mse), as well as 2) calculate the accuracy on correctly classifying the samples to the their class

In [57]:
mse = np.mean(y_test_predictions - y_test.squeeze())**2
print("MSE = ", mse)

MSE =  0.036686325960684725


In [58]:
accuracy = np.mean(y_test_predictions == y_test.squeeze())
print("Accuracy = ", accuracy)


Accuracy =  0.4053452115812918


Can we do the same with the train set?

In [59]:
y_train_predictions = []
for i in range(X_train.shape[0]):
  prediction = predict_sample_1nn(X_train, y_train, X_train[i, :])
  y_train_predictions.append(prediction)
print(y_train_predictions)

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 7, 5, 7, 6, 8, 6, 5, 8, 7, 8, 5, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 5, 5, 5, 6, 5, 5, 6, 6, 6, 6, 6, 7, 4, 5, 6, 5, 6, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 5, 7, 5, 8, 5, 6, 5, 5, 6, 8, 5, 7, 7, 5, 5, 6, 6, 5, 6, 5, 6, 6, 6, 5, 6, 6, 5, 7, 7, 7, 6, 6, 7, 4, 6, 5, 5, 5, 5, 5, 6, 5, 6, 6, 5, 6, 5, 5, 5, 5, 4, 6, 6, 5, 5, 5, 5, 5, 6, 6, 6, 5, 7, 7, 6, 5, 7, 5, 5, 5, 5, 6, 5, 7, 6, 5, 5, 6, 6, 6, 6, 6, 4, 7, 6, 7, 6, 6, 5, 6, 6, 6, 7, 8, 8, 7, 5, 5, 6, 5, 5, 6, 7, 5, 5, 6, 6, 4, 7, 5, 6, 4, 5, 4, 6, 6, 5, 5, 6, 5, 5, 6, 5, 8, 4, 6, 5, 6, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 6, 4, 5, 5, 4, 5, 6, 5, 7, 5, 6, 7, 5, 5, 5, 5, 5, 5, 6, 7, 6, 6, 5, 6, 6, 6, 5, 4, 6, 6, 6, 6, 6, 6, 6, 7, 6, 5, 5, 7, 6, 5, 6, 7, 7, 7, 5, 4, 3, 5, 3, 6, 8, 7, 7, 6, 4, 6, 5, 5, 6, 6, 5, 6, 5, 6, 6, 6, 5, 5, 5, 5, 6, 6, 5, 4, 7, 8, 8, 4, 5, 5, 5, 6, 7, 7, 7, 7, 6, 5, 7, 3, 6, 5, 7, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 7, 6, 7, 8, 6, 6, 5, 6, 6, 5, 7, 6, 7, 5, 6, 6, 5, 5, 6, 6, 6, 5, 8, 5, 6, 

In [60]:
mse = np.mean(y_train_predictions - y_train.squeeze())**2
print("MSE = ", mse)
accuracy = np.mean(y_train_predictions == y_train.squeeze())
print("Accuracy = ", accuracy)

MSE =  0.0
Accuracy =  1.0


Why is that?

Doing the same using scikit-learn:

In [72]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, accuracy_score

model = KNeighborsRegressor(n_neighbors=1)
model.fit(X_train, y_train)

In [73]:
y_test_predictions = model.predict(X_test)

print("Test mse: ", mean_squared_error(y_test, y_test_predictions))
print("Test accuracy: ", accuracy_score(y_test, y_test_predictions))

y_train_predictions = model.predict(X_train)

print("Test mse: ", mean_squared_error(y_train, y_train_predictions))
print("Test accuracy: ", accuracy_score(y_train, y_train_predictions))



Test mse:  1.133630289532294
Test accuracy:  0.4053452115812918
Test mse:  0.0
Test accuracy:  1.0


What about different k-s?


In [79]:
for k in [1, 3, 5, 7, 9, 11]:
  print("---------------------------")
  print("Evaluating for k = ", k)
  model = KNeighborsRegressor(n_neighbors=k)
  model.fit(X_train, y_train)
  y_test_predictions = model.predict(X_test)

  print("Test mse: ", mean_squared_error(y_test, y_test_predictions))
  y_test_predictions = y_test_predictions.round()
  print("Test accuracy: ", accuracy_score(y_test, y_test_predictions))
  y_train_predictions = model.predict(X_train)

  print("Test mse: ", mean_squared_error(y_train, y_train_predictions))
  y_train_predictions = y_train_predictions.round()
  print("Test accuracy: ", accuracy_score(y_train, y_train_predictions))


---------------------------
Evaluating for k =  1
Test mse:  1.133630289532294
Test accuracy:  0.4053452115812918
Test mse:  0.0
Test accuracy:  1.0
---------------------------
Evaluating for k =  3
Test mse:  0.7876763177431328
Test accuracy:  0.4209354120267261
Test mse:  0.3187499999999999
Test accuracy:  0.6715
---------------------------
Evaluating for k =  5
Test mse:  0.7103340757238308
Test accuracy:  0.4510022271714922
Test mse:  0.43013
Test accuracy:  0.59425
---------------------------
Evaluating for k =  7
Test mse:  0.6604472523976183
Test accuracy:  0.4910913140311804
Test mse:  0.48509183673469386
Test accuracy:  0.54575
---------------------------
Evaluating for k =  9
Test mse:  0.6335890456157718
Test accuracy:  0.5022271714922049
Test mse:  0.5297438271604937
Test accuracy:  0.5185
---------------------------
Evaluating for k =  11
Test mse:  0.6222459459956929
Test accuracy:  0.49888641425389757
Test mse:  0.5568822314049587
Test accuracy:  0.511


What is the effect of normalizing the data?


In [81]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
# This is very important!
X_test = scaler.transform(X_test)

for k in [1, 3, 5, 7, 9, 11]:
  print("---------------------------")
  print("Evaluating for k = ", k)
  model = KNeighborsRegressor(n_neighbors=k)
  model.fit(X_train, y_train)
  y_test_predictions = model.predict(X_test)

  print("Test mse: ", mean_squared_error(y_test, y_test_predictions))
  y_test_predictions = y_test_predictions.round()
  print("Test accuracy: ", accuracy_score(y_test, y_test_predictions))
  y_train_predictions = model.predict(X_train)

  print("Test mse: ", mean_squared_error(y_train, y_train_predictions))
  y_train_predictions = y_train_predictions.round()
  print("Test accuracy: ", accuracy_score(y_train, y_train_predictions))


---------------------------
Evaluating for k =  1
Test mse:  0.8151447661469933
Test accuracy:  0.46325167037861914
Test mse:  0.0
Test accuracy:  1.0
---------------------------
Evaluating for k =  3
Test mse:  0.5942835931700073
Test accuracy:  0.5055679287305123
Test mse:  0.2385833333333333
Test accuracy:  0.73875
---------------------------
Evaluating for k =  5
Test mse:  0.5549220489977729
Test accuracy:  0.5367483296213809
Test mse:  0.3303399999999999
Test accuracy:  0.65475
---------------------------
Evaluating for k =  7
Test mse:  0.5185218853688469
Test accuracy:  0.5400890868596881
Test mse:  0.38024489795918365
Test accuracy:  0.6235
---------------------------
Evaluating for k =  9
Test mse:  0.49133877753031424
Test accuracy:  0.5512249443207127
Test mse:  0.41006481481481477
Test accuracy:  0.604
---------------------------
Evaluating for k =  11
Test mse:  0.4831121500487768
Test accuracy:  0.5512249443207127
Test mse:  0.4294896694214876
Test accuracy:  0.59025


In [82]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# This is very important!
X_test = scaler.transform(X_test)

for k in [1, 3, 5, 7, 9, 11]:
  print("---------------------------")
  print("Evaluating for k = ", k)
  model = KNeighborsRegressor(n_neighbors=k)
  model.fit(X_train, y_train)
  y_test_predictions = model.predict(X_test)

  print("Test mse: ", mean_squared_error(y_test, y_test_predictions))
  y_test_predictions = y_test_predictions.round()
  print("Test accuracy: ", accuracy_score(y_test, y_test_predictions))
  y_train_predictions = model.predict(X_train)

  print("Test mse: ", mean_squared_error(y_train, y_train_predictions))
  y_train_predictions = y_train_predictions.round()
  print("Test accuracy: ", accuracy_score(y_train, y_train_predictions))

---------------------------
Evaluating for k =  1
Test mse:  0.8296213808463252
Test accuracy:  0.46325167037861914
Test mse:  0.0
Test accuracy:  1.0
---------------------------
Evaluating for k =  3
Test mse:  0.5812917594654788
Test accuracy:  0.5133630289532294
Test mse:  0.24341666666666664
Test accuracy:  0.74125
---------------------------
Evaluating for k =  5
Test mse:  0.5269933184855234
Test accuracy:  0.5144766146993318
Test mse:  0.34126999999999996
Test accuracy:  0.662
---------------------------
Evaluating for k =  7
Test mse:  0.48911413117585567
Test accuracy:  0.5434298440979956
Test mse:  0.3830612244897959
Test accuracy:  0.62725
---------------------------
Evaluating for k =  9
Test mse:  0.465602573620391
Test accuracy:  0.5456570155902004
Test mse:  0.4093302469135802
Test accuracy:  0.6015
---------------------------
Evaluating for k =  11
Test mse:  0.4573708332566401
Test accuracy:  0.5523385300668151
Test mse:  0.4259111570247933
Test accuracy:  0.60425
