# Implementation of some essential ML functions
- The objective of this notebook is not to provide efficient implementations, but to practice/learn 

In [16]:
from sklearn.datasets import load_boston

X, y = load_boston(return_X_y=True)

(X.shape, y.shape)

((506, 13), (506,))

In [17]:
import pandas as pd

df = pd.DataFrame(X, y).reset_index().rename(columns={"index": "target"})

In [18]:
df.shape

(506, 14)

## train_test_split

In [4]:
import numpy as np

def split_train_test(data, test_ratio, random_seed=42):
    np.random.seed(random_seed)
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [5]:
train, test = split_train_test(df, 0.2)

In [6]:
train.head(3)

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,9,10,11,12
75,21.4,0.09512,0.0,12.83,0.0,0.437,6.286,45.0,4.5026,5.0,398.0,18.7,383.23,8.94
477,12.0,15.0234,0.0,18.1,0.0,0.614,5.304,97.3,2.1007,24.0,666.0,20.2,349.48,24.91
15,19.9,0.62739,0.0,8.14,0.0,0.538,5.834,56.5,4.4986,4.0,307.0,21.0,395.62,8.47


In [7]:
test.head(3)

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,9,10,11,12
173,23.6,0.09178,0.0,4.05,0.0,0.51,6.416,84.1,2.6463,5.0,296.0,16.6,395.5,9.04
274,32.4,0.05644,40.0,6.41,1.0,0.447,6.758,32.9,4.0776,4.0,254.0,17.6,396.9,3.53
491,13.6,0.10574,0.0,27.74,0.0,0.609,5.983,98.8,1.8681,4.0,711.0,20.1,390.11,18.07


## train_test_split without numpy

In [3]:
import random

def train_test_split(data, split=0.2, random_seed=42):
    random.seed(random_seed)
    train = []
    train_size = (1-split) * len(data)
    data_copy = list(data)
    while len(train) < train_size:
        index = random.randrange(len(data_copy))
        train.append(data_copy.pop(index))
    return train, data_copy

In [4]:
data = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]
train, test = train_test_split(data)
print(train)
print(test)

[[2], [1], [7], [4], [5], [6], [3], [10]]
[[8], [9]]


## train_test_split without numpy without loop

In [10]:
import random

def train_test_split(data, split=0.2, random_seed=42):
    random.seed(random_seed)
    train = []
    shuffled_indexes = random.sample(list(data), len(data))
    test_size = int(split * len(data))
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]
    return train_indexes, test_indexes

In [15]:
data = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]
train, test = train_test_split(data)
print(train)
print(test)

[[5], [10], [7], [6], [9], [3], [4], [8]]
[[2], [1]]


## k-fold cross validation

In [21]:
import random

def kfoldcv(indexes, k=10, random_seed=42):
    size = len(indexes)
    subset_size = round(size / k)
    random.Random(random_seed).shuffle(indexes)
    subsets = [indexes[x:x+subset_size] for x in range(0, 
                                                       len(indexes), 
                                                       subset_size)]
    kfolds = []
    for i in range(k):
        test = subsets[i]
        train = []
        for subset in subsets:
            if subset != test:
                train.append(subset)
        kfolds.append((train,test))
        
    return kfolds

In [24]:
kfolds = kfoldcv(data)
kfolds

[([[[9]], [[3]], [[1]], [[7]], [[10]], [[2]], [[6]], [[8]], [[4]]], [[5]]),
 ([[[5]], [[3]], [[1]], [[7]], [[10]], [[2]], [[6]], [[8]], [[4]]], [[9]]),
 ([[[5]], [[9]], [[1]], [[7]], [[10]], [[2]], [[6]], [[8]], [[4]]], [[3]]),
 ([[[5]], [[9]], [[3]], [[7]], [[10]], [[2]], [[6]], [[8]], [[4]]], [[1]]),
 ([[[5]], [[9]], [[3]], [[1]], [[10]], [[2]], [[6]], [[8]], [[4]]], [[7]]),
 ([[[5]], [[9]], [[3]], [[1]], [[7]], [[2]], [[6]], [[8]], [[4]]], [[10]]),
 ([[[5]], [[9]], [[3]], [[1]], [[7]], [[10]], [[6]], [[8]], [[4]]], [[2]]),
 ([[[5]], [[9]], [[3]], [[1]], [[7]], [[10]], [[2]], [[8]], [[4]]], [[6]]),
 ([[[5]], [[9]], [[3]], [[1]], [[7]], [[10]], [[2]], [[6]], [[4]]], [[8]]),
 ([[[5]], [[9]], [[3]], [[1]], [[7]], [[10]], [[2]], [[6]], [[8]]], [[4]])]

## k-fold cv investigating

In [64]:
import random

def kfoldcv(indexes, k=3, random_seed=42):
    size = len(indexes)
    random.seed(random_seed)
    subset_size = int(size / k)
    shuffled_indexes = random.sample(list(indexes), size)
    subsets = [
        shuffled_indexes[x:x+subset_size] for x in range(0,
                                                         size, 
                                                         subset_size)]
    kfolds = []
    for i in range(k):
        test = subsets[i]
        train = []
        for subset in subsets:
            if subset != test:
                train.append(subset)
        kfolds.append((train, test))
    
    return kfolds

In [65]:
kfolds = kfoldcv(data)
kfolds

[([[[4], [2], [10]], [[8], [3], [1]], [[6]]], [[9], [5], [7]]),
 ([[[9], [5], [7]], [[8], [3], [1]], [[6]]], [[4], [2], [10]]),
 ([[[9], [5], [7]], [[4], [2], [10]], [[6]]], [[8], [3], [1]])]

When the division between size of the indexes and k is not perfect, the 
```python
subsets = [shuffled_indexes[x:x+subset_size] for x in range(0 , size, subset_size)]
```

isn't right. As you can see with k=3 and len(indexes)=10, 3 subsets with 3 elements and 1 subset with 1 element. So, we asked for 3 folds and got 4?!

In [121]:
size = len(data)
k = 3

In [122]:
size / k

3.3333333333333335

In [125]:
size // k

3

In [124]:
size % k

1

### If the difference between (size//k) and (size%k) is greater than 1 we should do it differently than the list comprehension above

From Sklearn implementation:
The first ``n_samples % n_splits`` folds have size
    ``n_samples // n_splits + 1``, other folds have size
    ``n_samples // n_splits``, where ``n_samples`` is the number of samples

In [114]:
import random

def kfoldcv(indexes, k=10, random_seed=42):
    random.seed(random_seed)
    size = len(indexes)
    # using sample to avoid shuffling the indexes inplace
    shuffled_indexes = random.sample(list(indexes), size)
    fold_sizes = [size // k for i in range(k)]
    for i in range(size%k):
        fold_sizes[i] += 1
    
    current = 0
    subsets = []
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        subsets.append(shuffled_indexes[start:stop])
        current = stop
        
    kfolds = []
    for i in range(k):
        test = subsets[i]
        train = []
        for subset in subsets:
            if subset != test:
                train.append(subset)
        kfolds.append((train, test))
    
    return kfolds

In [120]:
kfoldcv(data, k=3)

[([[[2], [10], [8]], [[3], [1], [6]]], [[9], [5], [7], [4]]),
 ([[[9], [5], [7], [4]], [[3], [1], [6]]], [[2], [10], [8]]),
 ([[[9], [5], [7], [4]], [[2], [10], [8]]], [[3], [1], [6]])]

In [117]:
kfoldcv(data, k=2)

[([[[10], [8], [3], [1], [6]]], [[9], [5], [7], [4], [2]]),
 ([[[9], [5], [7], [4], [2]]], [[10], [8], [3], [1], [6]])]

In [118]:
kfoldcv(data, k=4)

[([[[4], [2], [10]], [[8], [3]], [[1], [6]]], [[9], [5], [7]]),
 ([[[9], [5], [7]], [[8], [3]], [[1], [6]]], [[4], [2], [10]]),
 ([[[9], [5], [7]], [[4], [2], [10]], [[1], [6]]], [[8], [3]]),
 ([[[9], [5], [7]], [[4], [2], [10]], [[8], [3]]], [[1], [6]])]

In [119]:
kfoldcv(data, k=5)

[([[[7], [4]], [[2], [10]], [[8], [3]], [[1], [6]]], [[9], [5]]),
 ([[[9], [5]], [[2], [10]], [[8], [3]], [[1], [6]]], [[7], [4]]),
 ([[[9], [5]], [[7], [4]], [[8], [3]], [[1], [6]]], [[2], [10]]),
 ([[[9], [5]], [[7], [4]], [[2], [10]], [[1], [6]]], [[8], [3]]),
 ([[[9], [5]], [[7], [4]], [[2], [10]], [[8], [3]]], [[1], [6]])]

Well, it seems to work now