In [3]:
import random
from typing import TypeVar, List, Tuple
X = TypeVar("X")  # Generic type to represent a data point.

In [4]:
def split_data(data: List[X], percentage: float) -> Tuple[List[X], List[X]]:
    '''Randomly split data into fractions [percentage, 1 - percentage].

    Args:
        data: Items to split. They are copied into a new list first, so the
            caller's object is never shuffled in place.
        percentage: Fraction of the items, between 0 and 1 inclusive, that
            goes into the first returned list.

    Returns:
        A (first, second) tuple of lists. The first holds
        int(len(data) * percentage) items and the second holds the rest.

    Raises:
        ValueError: If percentage is not within [0, 1].
    '''
    # A negative or > 1 fraction would otherwise slice silently and return
    # partitions that do not match the requested proportions.
    if not 0 <= percentage <= 1:
        raise ValueError(
            "percentage must be between 0 and 1, got {!r}".format(percentage))
    # list() makes an independent copy for any iterable (tuples, ranges,
    # array views), whereas slicing only copies true lists.
    shuffled = list(data)
    random.shuffle(shuffled)
    cut = int(len(shuffled) * percentage)
    return shuffled[:cut], shuffled[cut:]

In [6]:
# Ten million consecutive integers, used as the input for the split check.
data = list(range(10000000))

In [7]:
# Put 75% of the shuffled items in train and the remaining 25% in test.
train, test = split_data(data, percentage=0.75)

In [9]:
# A 75/25 split of 10,000,000 items must produce exactly these sizes.
assert (len(train), len(test)) == (7500000, 2500000)

In [10]:
# Recombining and sorting both partitions must reproduce the original data.
assert data == sorted([*train, *test])

In [11]:
Y = TypeVar('Y')  # Generic type to represent output variables.

In [12]:
def train_test_split(xs: List[X],
                     ys: List[Y],
                     test_pct: float) -> Tuple[List[X], List[X], List[Y], List[Y]]:
    '''Split paired inputs and outputs into train and test sets.

    The positions 0..len(xs)-1 are shuffled and split once, and the same
    positions are then used to pick from both xs and ys, so every x stays
    paired with its own y.

    Args:
        xs: Input data points.
        ys: Output values; ys[i] belongs to xs[i].
        test_pct: Fraction of the pairs to place in the test set.

    Returns:
        A tuple (x_train, x_test, y_train, y_test).

    Raises:
        ValueError: If xs and ys do not have the same length.
    '''
    # Without this check a shorter ys fails later with an IndexError and a
    # longer ys silently loses its extra labels.
    if len(xs) != len(ys):
        raise ValueError(
            "xs and ys must have the same length, got {} and {}".format(
                len(xs), len(ys)))
    idxs = list(range(len(xs)))
    train_idxs, test_idxs = split_data(idxs, 1 - test_pct)
    return ([xs[i] for i in train_idxs],
            [xs[i] for i in test_idxs],
            [ys[i] for i in train_idxs],
            [ys[i] for i in test_idxs])

In [13]:
# Toy dataset in which every label is exactly twice its input.
xs = list(range(1000))
ys = [x * 2 for x in xs]
x_train, x_test, y_train, y_test = train_test_split(xs, ys, test_pct=0.25)

In [15]:
# Inputs and labels must have matching sizes within each partition.
assert [len(x_train), len(x_test)] == [len(y_train), len(y_test)]

In [17]:
# The split must keep each x paired with its own y (y is always 2 * x).
assert not any(y != 2 * x for x, y in zip(x_train, y_train))
assert not any(y != 2 * x for x, y in zip(x_test, y_test))

# Correctness

In [1]:
def accuracy(tp: int, fp: int, fn: int, tn: int) -> float:
    '''Return the fraction of all predictions that were correct.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tn: Number of true negatives.

    Returns:
        (tp + tn) divided by the sum of all four counts.

    Raises:
        ZeroDivisionError: If the four counts sum to zero.
    '''
    return (tp + tn) / (tp + fp + fn + tn)

In [5]:
# 981,140 correct predictions out of 1,000,000 in total.
assert accuracy(tp=70, fp=4930, fn=13930, tn=981070) == 0.98114

In [6]:
def precision(tp: int, fp: int, fn: int, tn: int) -> float:
    '''Return the fraction of positive predictions that were correct.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives (not used in the calculation).
        tn: Number of true negatives (not used in the calculation).

    Returns:
        tp divided by (tp + fp).

    Raises:
        ZeroDivisionError: If tp + fp is zero.
    '''
    predicted_positive = tp + fp
    return tp / predicted_positive

In [7]:
# 70 true positives out of 5,000 positive predictions.
assert precision(tp=70, fp=4930, fn=13930, tn=981070) == 0.014

In [8]:
def recall(tp: int, fp: int, fn: int, tn: int) -> float:
    '''Return the fraction of actual positives that were identified.

    Args:
        tp: Number of true positives.
        fp: Number of false positives (not used in the calculation).
        fn: Number of false negatives.
        tn: Number of true negatives (not used in the calculation).

    Returns:
        tp divided by (tp + fn).

    Raises:
        ZeroDivisionError: If tp + fn is zero.
    '''
    actual_positive = tp + fn
    return tp / actual_positive

# 70 true positives out of 14,000 actual positives.
assert recall(tp=70, fp=4930, fn=13930, tn=981070) == 0.005