### Predicting wheat species (0 = Kama, 1 = Rosa, 2 = Canadian)

### From area, perimeter, compactness, length, width, asymmetry, groove

In [5]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# ------------------------------------------------

def show_confusion(cm):
    """Print a square confusion matrix as an aligned text table.

    Each row is printed as ``actual <i>:`` followed by the counts for that
    actual class; a final line labelled ``predicted`` lists the column
    indices underneath the columns they refer to.

    Args:
        cm: square 2-D array-like of counts, indexed as cm[actual][predicted]
            (e.g. the result of sklearn.metrics.confusion_matrix). An empty
            matrix prints only the separator and the header label.

    Returns:
        None. The table is written to standard output.
    """
    cm = np.asarray(cm)
    dim = len(cm)

    # A cell must be wide enough for the largest count AND the largest column
    # index, otherwise the header drifts when there are more classes than
    # digits in the counts. `initial=0` keeps max() defined for an empty matrix.
    largest = max(int(cm.max(initial=0)), dim - 1)
    wid = len(str(largest)) + 1
    fmt = "%" + str(wid) + "d"

    # Row indices are right-aligned in at least 3 characters.
    idx_wid = max(3, len(str(dim - 1)))

    for i in range(dim):
        cells = "".join(fmt % cm[i][j] for j in range(dim))
        print("actual " + str(i).rjust(idx_wid) + ":" + cells)

    print("-----------------------")

    # Pad the header label to the same width as the "actual <i>:" row prefix
    # so every column index lines up with its column of counts.
    header = "predicted".ljust(len("actual ") + idx_wid + 1)
    print(header + "".join(fmt % j for j in range(dim)))

#--------------------------------------------------

def _load_seeds(path):
    """Load one comma-delimited seeds file into (features, labels).

    Columns 0-6 are read as float32 predictors and column 7 as the int64
    class label; text following a '#' is ignored as a comment.

    Args:
        path: path of the text file to read.

    Returns:
        (X, y) where X is always 2-D (rows x 7) and y is always 1-D, even
        when the file holds a single data row.
    """
    X = np.loadtxt(path, usecols=[0, 1, 2, 3, 4, 5, 6], delimiter=",",
                   dtype=np.float32, comments="#", ndmin=2)
    y = np.loadtxt(path, usecols=[7], delimiter=",",
                   dtype=np.int64, comments="#", ndmin=1)
    return X, y


def main(train_file="train_seeds.txt", test_file="test_seeds.txt", k=7):
    """Train and evaluate a k-NN classifier on the wheat seeds data.

    Loads the train and test files, fits a brute-force KNeighborsClassifier,
    then prints train/test accuracy, the test predictions, a confusion
    matrix and the mean squared error of the predicted labels.

    Args:
        train_file: path of the comma-delimited training file.
        test_file: path of the comma-delimited test file.
        k: number of neighbors used by the classifier.

    Returns:
        None. All results are printed to standard output.
    """
    # The metrics are only needed by this function, so import them at
    # function scope, before any work is done.
    from sklearn.metrics import confusion_matrix, mean_squared_error

    # 0. prepare
    print("\nBegin Wheat Seeds k-NN using scikit-learn ")
    np.set_printoptions(precision=4, suppress=True)
    np.random.seed(1)

    # 1. load data
    print("\nLoading train and test data ")
    train_X, train_y = _load_seeds(train_file)
    test_X, test_y = _load_seeds(test_file)

    print("\nTraining data: ")
    print(train_X[0:4])
    print("...\n")
    print(train_y[0:4])
    print("...")

    # 2. create and train model
    print("\nCreating kNN model, with k = " + str(k))
    model = KNeighborsClassifier(n_neighbors=k, algorithm="brute")
    model.fit(train_X, train_y)
    print("Done")

    # 3. evaluate model
    train_acc = model.score(train_X, train_y)
    test_acc = model.score(test_X, test_y)
    print("\nAccuracy on train data = %0.4f " % train_acc)
    print("Accuracy on test data = %0.4f " % test_acc)

    y_predicteds = model.predict(test_X)
    print("\nPredicted values: ", y_predicteds)
    print("Actual values :   ", test_y)
    cm = confusion_matrix(test_y, y_predicteds)

    print("\nConfusion matrix: \n")
    show_confusion(cm)

    # NOTE(review): mean_squared_error is a regression loss. Applied to class
    # labels it treats the label codes as numeric values, so the figure
    # depends on how the classes happen to be numbered. The accuracy and the
    # confusion matrix above are the classification metrics to rely on.
    ms = mean_squared_error(test_y, y_predicteds)
    print("-----------\n\nThe mean squared error in this model is: ", ms)
    print("--------------------------------------------------------------------------\n\n\n")
    

# Entry point: run the demo when this code is executed directly
# (as a script or as a notebook cell), but not when it is imported.
if __name__ == "__main__":
    main()


Begin Wheat Seeds k-NN usingscikit learn 

Loading train and test data 

Training data: 
[[0.1629 0.5021 0.5708 0.4865 0.4861 0.1893 0.3452]
 [0.1978 0.4463 0.6624 0.3688 0.5011 0.0329 0.2152]
 [0.2326 0.3471 0.8793 0.2207 0.5039 0.2515 0.1507]
 [0.2675 0.3161 0.7931 0.2393 0.5339 0.1942 0.1408]]
...

[1 1 1 1]
...

Creating kNN model, with k = 7
Done

Accuracy on train data = 0.9611 
Accuracy on test data = 0.7000 

Predicted values:  [3 3 3 3 1 1 1 1 1 3 2 2 2 2 2 1 2 1 1 2 3 1 3 3 3 3 3 3 3 3]
Actual values :    [1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3]

Confusion matrix: 

dim  3  mx  9  wid  2
%2d
actual   0: 5 0 5
actual   1: 3 7 0
actual   2: 1 0 9
-----------------------
predicted  0 1 2
-----------

The mean squared error in this model is:  0.9
--------------------------------------------------------------------------





  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [6]:
# Exploratory lookup: print the signature and docstring of
# mean_squared_error to check its parameters and what it computes.
from sklearn.metrics import mean_squared_error
help(mean_squared_error)

Help on function mean_squared_error in module sklearn.metrics._regression:

mean_squared_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average', squared=True)
    Mean squared error regression loss.
    
    Read more in the :ref:`User Guide <mean_squared_error>`.
    
    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.
    
    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.
    
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.
    
    multioutput : {'raw_values', 'uniform_average'} or array-like of shape             (n_outputs,), default='uniform_average'
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.
    
        'raw_values' :
            Returns a full set of errors in case of multioutput input.
    
