In [14]:
import os
import random
from random import randint

import numpy
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

### Dataset creation
In the next cell we are going to create our dataset, based on the following characteristics:
* A new drug has been tested on patients aged between 18 and 90.
* The trial involves 2N participants (N is defined in the next cell).
* Half of the participants (N) were under 65; the other half (N) were 65 or older.
* 90% of the patients aged 65 or older experienced side effects.
* 90% of the patients under 65 experienced no side effects.

In [18]:
# Build the training set: one feature (patient age) and one binary label
# (1 = side effects, 0 = no side effects).

N = 1000   # Number of patients in EACH age group (under 65 / 65 or older): 2*N patients in total
SEED = 42  # Fixed seed so that re-running the notebook regenerates exactly the same dataset

# Seed the stdlib generator used by randint() below.
random.seed(SEED)

# 10% of each age group goes against the general trend of its group.
outlier_count = int(N*10/100)

train_samples = []
train_labels  = []

for _ in range(0, outlier_count):
    # 10% of the patients aged 65 or older have NO side effects.
    random_age_older = randint(65, 90)
    train_samples.append(random_age_older)
    train_labels.append(0)

    # 10% of the patients under 65 DO have side effects.
    random_age_younger = randint(18, 64)
    train_samples.append(random_age_younger)
    train_labels.append(1)

for _ in range(outlier_count, N):
    # 90% of the patients aged 65 or older DO have side effects.
    random_age_older = randint(65, 90)
    train_samples.append(random_age_older)
    train_labels.append(1)

    # 90% of the patients under 65 have NO side effects.
    random_age_younger = randint(18, 64)
    train_samples.append(random_age_younger)
    train_labels.append(0)

# Convert the lists to numpy arrays.
train_samples = numpy.array(train_samples)
train_labels  = numpy.array(train_labels)

# Shuffle samples and labels together (same permutation for both) so the
# network does not see the data in generation order. random_state keeps the
# permutation reproducible.
train_labels, train_samples = shuffle(train_labels, train_samples, random_state=SEED)

# Raw ages are not normalized. MinMaxScaler compresses them into [0, 1];
# it expects a 2-D input, hence the reshape to a single column.
scaler = MinMaxScaler(feature_range = (0, 1))
scaled_train_samples = scaler.fit_transform(train_samples.reshape(-1, 1))

# Sanity check: both arrays must hold 2*N entries.
print('Shapes:', train_samples.shape, train_labels.shape)

Shapes: (2000,) (2000,)


In [19]:
# With the dataset built, collect the scaled ages and their labels in a Pandas
# DataFrame so that it can be exported to CSV.

import pandas

data_frame = pandas.DataFrame(
    data={
        # The scaler returns a single-column 2-D array: flatten it to 1-D.
        'values': scaled_train_samples.reshape(2 * N),
        'labels': train_labels,
    },
)

In [22]:
# to_csv does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
os.makedirs('./dataset/train', exist_ok=True)
data_frame.to_csv('./dataset/train/train.csv')

In [24]:
# Build the test set the same way as the training set, only smaller.
#
# NOTE(review): the variables below keep their "train_*" names (and overwrite
# N and data_frame) so that any later cell relying on them keeps working, but
# from this point on they hold the TEST data.

N = 50          # Number of patients in EACH age group: 2*N = 100 test patients in total
TEST_SEED = 7   # Dedicated seed so the test split is reproducible

# Seed the stdlib generator used by randint() below.
random.seed(TEST_SEED)

# 10% of each age group goes against the general trend of its group.
outlier_count = int(N*10/100)

train_samples = []
train_labels  = []

for _ in range(0, outlier_count):
    # 10% of the patients aged 65 or older have NO side effects.
    random_age_older = randint(65, 90)
    train_samples.append(random_age_older)
    train_labels.append(0)

    # 10% of the patients under 65 DO have side effects.
    random_age_younger = randint(18, 64)
    train_samples.append(random_age_younger)
    train_labels.append(1)

for _ in range(outlier_count, N):
    # 90% of the patients aged 65 or older DO have side effects.
    random_age_older = randint(65, 90)
    train_samples.append(random_age_older)
    train_labels.append(1)

    # 90% of the patients under 65 have NO side effects.
    random_age_younger = randint(18, 64)
    train_samples.append(random_age_younger)
    train_labels.append(0)

# Convert the lists to numpy arrays.
train_samples = numpy.array(train_samples)
train_labels  = numpy.array(train_labels)

# Shuffle samples and labels together (same permutation for both).
train_labels, train_samples = shuffle(train_labels, train_samples, random_state=TEST_SEED)

# Scale the test ages with the scaler that was fitted on the training ages in
# the earlier cell. Fitting a fresh scaler here would derive min/max from the
# small test sample, so the same age could map to a different value in the
# test CSV than in the training CSV.
scaled_train_samples = scaler.transform(train_samples.reshape(-1, 1))

data_frame = pandas.DataFrame({'values': scaled_train_samples.reshape(N*2,), 'labels': train_labels})

# to_csv does not create missing folders, so make sure the target directory exists.
os.makedirs('./dataset/test', exist_ok=True)
data_frame.to_csv('./dataset/test/test.csv')

# Sanity check: both arrays must hold 2*N entries.
print('Shapes:', train_samples.shape, train_labels.shape)

Shapes: (100,) (100,)
