# Data Preparation and Preprocessing

In [12]:
!pip install scikit-learn



In [13]:
import numpy as np
from random import randint
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler

In [14]:
train_labels = []
train_samples = []

# Experimental data.

An experimental drug has been tested on individuals from age ages 13 to 100 in a clinical trial.
The trial had 2100 patients. Half were under 65 years old, half were above 65 years old.
Around 95% of patients 65 or older experienced side effects.
Around 95% of patients under 65 experienced no side effects.

In [15]:
for i in range(50):
    # 5% of younger population who experiences side effects
    random_young = randint(13,64)
    train_samples.append(random_young)
    train_labels.append(1)
    
    # 5% of older population who did not experiences side effects
    random_old = randint(65,100)
    train_samples.append(random_old)
    train_labels.append(0)
    
for i in range(1000):
    # 95% of younger population who did not experience side effect
    random_young = randint(13,64)
    train_samples.append(random_young)
    train_labels.append(0)
    
    # 95% of old population who did experience a side effect
    random_old = randint(65,100)
    train_samples.append(random_old)
    train_labels.append(1)

In [16]:
for i in train_samples, train_labels:
    print(i)

[59, 69, 39, 99, 17, 86, 64, 88, 62, 97, 23, 70, 31, 69, 49, 93, 48, 77, 32, 100, 54, 66, 55, 71, 33, 88, 20, 90, 53, 73, 35, 80, 44, 99, 16, 72, 24, 81, 64, 84, 31, 67, 40, 91, 52, 80, 37, 88, 25, 76, 36, 93, 16, 97, 61, 94, 61, 94, 33, 81, 64, 84, 60, 67, 22, 68, 18, 73, 28, 79, 21, 85, 38, 88, 42, 92, 61, 92, 32, 93, 46, 90, 19, 92, 63, 99, 56, 89, 34, 75, 62, 98, 41, 99, 13, 95, 31, 80, 59, 89, 54, 98, 54, 81, 64, 75, 59, 67, 38, 77, 64, 99, 30, 83, 49, 68, 45, 65, 39, 85, 24, 68, 28, 92, 56, 79, 23, 86, 63, 92, 54, 81, 48, 66, 26, 96, 30, 84, 62, 94, 43, 76, 41, 91, 46, 82, 13, 92, 38, 77, 24, 76, 49, 88, 19, 99, 57, 100, 39, 76, 43, 94, 56, 97, 15, 81, 57, 66, 54, 72, 26, 85, 39, 97, 17, 85, 59, 68, 18, 93, 46, 98, 53, 92, 19, 87, 14, 97, 56, 92, 31, 75, 36, 72, 61, 92, 16, 75, 13, 99, 30, 82, 22, 87, 47, 98, 32, 100, 39, 78, 39, 81, 47, 94, 53, 91, 17, 76, 52, 70, 42, 66, 52, 80, 39, 77, 22, 68, 22, 79, 52, 69, 49, 90, 44, 99, 13, 69, 13, 85, 45, 65, 37, 91, 30, 68, 57, 84, 14, 

In [17]:
train_labels = np.array(train_labels)
train_samples = np.array(train_samples)
train_labels, train_samples = shuffle(train_labels, train_labels)

In [25]:
# Scaling the data between the range of 0 to 1 to normalize
scaler = MinMaxScaler(feature_range=(0,1))
# fit_transform dosent accepts 1d data so we transform it into 2d data
scaled_train_samples = scaler.fit_transform(train_samples.reshape(-1,1))

In [27]:
for i in scaled_train_samples:
    print(i)

[1.]
[1.]
[1.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[0.]
[1.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[1.]
[1.]
[1.]
[0.]
[1.]
[1.]
[1.]
[0.]
[0.]
[1.]
[1.]
[1.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[1.]
[1.]
[1.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[1.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[1.]
[1.]
[0.]
[0.]
[1.]
[1.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[1.]
[1.]
[0.]
[0.]
[1.]
[1.]
[0.]
[0.]
[1.]
[1.]
[1.]
[0.]
[1.]
[1.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[1.]
[0.]
[0.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[1.]
[1.]
[1.]
[0.]
[0.]
[0.]
[1.]
[0.]
[1.]
[1.]
[0.]
[0.]
[0.]
[0.]
[1.]
[0.]
[1.]
[1.]
[1.]
[0.]
[0.]
[0.]
[1.]
[1.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[0.]
[0.]
[0.]
[1.]
[1.]
[1.]
[1.]
[0.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
[1.]
[0.]
[0.]
[1.]
[1.]
[1.]
[1.]
[0.]
[0.]
[0.]
[0.]
[0.]
[1.]
[1.]
[0.]
[1.]
[1.]
[0.]
