In [64]:
#%pip install scipy
#%pip install seaborn

import numpy as np
from typing import Tuple
from scipy.stats import mode
from sklearn.metrics import confusion_matrix
import seaborn as sns
csv_path = "iris.csv"

Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.3/293.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: seaborn
Successfully installed seaborn-0.12.2
Note: you may need to restart the kernel to use updated packages.


In [21]:
def load_csv(csv_path:str) -> Tuple[np.ndarray,np.ndarray]:
    np.random.seed(42)
    dataset = np.genfromtxt(csv_path,delimiter=',')
    np.random.shuffle(dataset)
    x,y = dataset[:,:-1],dataset[:,-1]
    return x,y

In [None]:
load_csv(csv_path)

In [38]:
x,y = load_csv(csv_path)

In [13]:
np.mean(x,axis=0),np.var(x, axis=0)

(array([nan, nan, nan, nan]), array([nan, nan, nan, nan]))

In [18]:
np.nanmean(x,axis=0),np.nanvar(x, axis=0)

(array([ 355.46503497, -280.09189189,    2.95      ,   21.74726027]),
 array([1.73561968e+07, 1.18405444e+07, 1.51049922e+04, 6.11729208e+04]))

In [25]:
x[np.isnan(x)] = 3.5

In [26]:
(x>10.0).sum(),(x<0.0).sum()

(4, 2)

In [28]:
x[np.where(np.logical_or(x>10.0,x<0.0))]

array([ -1111.,    100.,   1000.,  50000.,   3000., -42000.])

In [30]:
less_then = np.where(x<0.0)
higher_then = np.where(x>10.0)
less_then,higher_then

((array([  4, 140]), array([2, 1])),
 (array([14, 27, 28, 62]), array([1, 2, 0, 3])))

In [39]:
y = np.delete(y,np.where(x<0.0)[0],axis=0)
x = np.delete(x,np.where(x>10.0)[0],axis=0)
y = np.delete(y,np.where(x>10.0)[0],axis=0)
x = np.delete(x,np.where(x<0.0)[0],axis=0)
x.shape,y.shape

((144, 4), (148,))

In [None]:
#Train test split

In [55]:
def train_test_split(features:np.ndarray, labels:np.ndarray,test_split_ratio:float):
    test_size = int(len(features)*test_split_ratio)
    train_size = len(features)-test_size
    assert len(features) != test_size * train_size, "size mismatch!"
    x_train,y_train = features[:train_size,:],labels[:train_size]
    x_test,y_test = features[train_size:,:],labels[train_size:]
    return (x_train,y_train,x_test,y_test)

In [56]:
x_train,y_train,x_test,y_test = train_test_split(x,y,0.2)

In [58]:
def euclidean(points:np.ndarray,element_of_x:np.ndarray):
    return np.sqrt(np.sum((points-element_of_x)**2,axis=0))


In [48]:
def predict(x_train,y_train,x_test,k:int):
    labels_pred=[]
    for x_test_element in x_test:
        distances = euclidean(x_train,x_test_element)
        distances = sorted(zip(distances,y_train))
        labels_pred = mode(distances[:k,1],keepdims=False).mode
        labels_pred.append(labels_pred)
    return np.array(labels_pred,dtype=np.int64)

In [63]:
predict(x_train,y_train,x_test,k=3)

TypeError: list indices must be integers or slices, not tuple

In [None]:
def accuracy(y_test:np.ndarray,y_preds:np.ndarray)->float:
    true_positive = (y_test == y_preds).sum()
    return true_positive / len(y_test)
