In [1]:
from random import randint
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler


# Dataset for Binary Classification

This dataset was created to build a binary classification model for predicting the likelihood of individuals experiencing side effects due to a certain treatment. The dataset is synthetically generated for educational purposes.

## Dataset Description

- **Data Type**: Synthetically generated
- **Problem Type**: Binary Classification
- **Classes**: Side Effect (1), No Side Effect (0)

## Features

The dataset contains one feature:

- **Age**: The age of individuals

## Labels

- **Side Effect (1)**: Individuals who experienced side effects.
- **No Side Effect (0)**: Individuals who did not experience side effects.

## Data Split

- Training Data: 50% younger individuals with side effects, 50% older individuals without side effects, and 95% younger and older individuals with no side effects.
- Testing Data: Custom data for prediction.

The goal is to train a model to predict whether an individual is likely to experience side effects based on their age.

## Sample Data

Here is a sample of the data:

| Age | Label |
|-----|-------|
| 30  |   0   |
| 54  |   1   |
| 100 |   0   |
| ... |  ...  |

## Data Scaling

The data was scaled to a range of [0, 1] to ensure that different features are on a similar scale.

The model used for classification is a neural network with multiple dense layers.

Please note that this dataset is for educational purposes and does not represent real-world data.

---

Feel free to modify this Markdown cell to add more details or explanations as needed. This will help anyone reading your notebook understand the problem and data you're working with.

In [2]:
train_samples = []
train_label = []


In [3]:
for i in range(50):
  # 5% of younger individuals who did expereince a side effect
  random_younger = randint(13, 64)
  train_samples.append(random_younger)
  train_label.append(1) # 1 means side effects

  # 5% of older individuals who did not experience a side effect
  random_older = randint(65, 100)
  train_samples.append(random_older)
  train_label.append(0) # 0 means no side effects

for i in range(1000):
  # 95% of younger individuals that did not experience any side effects
  random_younger = randint(13, 64)
  train_samples.append(random_younger)
  train_label.append(0)

  # 95% of older individuals that did experience side effects
  random_older = randint(65, 100)
  train_samples.append(random_older)
  train_label.append(1)



In [4]:
# convert our list to ndim array using numpy
train_samples = np.array(train_samples)
train_label = np.array(train_label)

# shuffle the data to add randomnes
train_label, train_samples = shuffle(train_label, train_samples)

In [5]:
# normalizing our data,  Normalization scales the data to a specific range,
# typically between 0 and 1, to ensure that different features are on a similar
# scale. This can improve the training process
scaler = MinMaxScaler(feature_range=(0,1))


# because the train_sample is a 1D numpy array of ages
# and we convert it into 2D array where each age is its own row, the -1 is a place
# holder that tells the numpy to calculate the number of rows automatically
scaled_train_samples = scaler.fit_transform(train_samples.reshape(-1, 1))
scaled_train_samples

array([[0.31034483],
       [0.88505747],
       [0.68965517],
       ...,
       [0.03448276],
       [0.59770115],
       [0.62068966]])

# **Building a simple tf.keras Sequential Model**

In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy

In [7]:
# instantiating the sequential class
model = Sequential([
    Dense(units=16, input_shape=(1,), activation='relu'), # 1st hidden layer
    Dense(units=32, activation='relu'), # second hidden layer
    Dense(units=2, activation='softmax') # output layer with a softmax
])

In [8]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                32        
                                                                 
 dense_1 (Dense)             (None, 32)                544       
                                                                 
 dense_2 (Dense)             (None, 2)                 66        
                                                                 
Total params: 642 (2.51 KB)
Trainable params: 642 (2.51 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
from keras.api._v2.keras import metrics
#compile our model
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy']
              )

In [10]:
from re import VERBOSE
# Fitting the with our data
model.fit(x=scaled_train_samples,
          y=train_label, batch_size=10,
          epochs=30,
          shuffle=True,
          verbose=2
          )

Epoch 1/30
210/210 - 7s - loss: 0.7016 - accuracy: 0.5005 - 7s/epoch - 32ms/step
Epoch 2/30
210/210 - 0s - loss: 0.6774 - accuracy: 0.6571 - 459ms/epoch - 2ms/step
Epoch 3/30
210/210 - 0s - loss: 0.6529 - accuracy: 0.7343 - 466ms/epoch - 2ms/step
Epoch 4/30
210/210 - 1s - loss: 0.6243 - accuracy: 0.7981 - 617ms/epoch - 3ms/step
Epoch 5/30
210/210 - 1s - loss: 0.5950 - accuracy: 0.8048 - 648ms/epoch - 3ms/step
Epoch 6/30
210/210 - 1s - loss: 0.5639 - accuracy: 0.8214 - 650ms/epoch - 3ms/step
Epoch 7/30
210/210 - 1s - loss: 0.5313 - accuracy: 0.8376 - 611ms/epoch - 3ms/step
Epoch 8/30
210/210 - 0s - loss: 0.4986 - accuracy: 0.8548 - 445ms/epoch - 2ms/step
Epoch 9/30
210/210 - 0s - loss: 0.4674 - accuracy: 0.8619 - 452ms/epoch - 2ms/step
Epoch 10/30
210/210 - 0s - loss: 0.4385 - accuracy: 0.8752 - 438ms/epoch - 2ms/step
Epoch 11/30
210/210 - 0s - loss: 0.4116 - accuracy: 0.8829 - 433ms/epoch - 2ms/step
Epoch 12/30
210/210 - 0s - loss: 0.3870 - accuracy: 0.8933 - 458ms/epoch - 2ms/step
Epo

<keras.src.callbacks.History at 0x796b37f85630>

In [11]:
print("completed")

completed


# **Making Predictions on new Data**

In [15]:
# Create new data (scaled in the same way as the training data)
new_data = [30, 54, 100, 40, 70, 99, 20, 14, 12]

# convert new data to numpy array
new_data = np.array(new_data)

# scale and transform our data
scaller = MinMaxScaler(feature_range=(0, 1))

scaled_test_data = scaller.fit_transform(new_data.reshape(-1, 1))

print("New Data:", scaled_test_data)

# Make predictions
predictions = model.predict(scaled_test_data)

# The predictions will be in the form of probabilities for each class
# In your case, it's binary classification (side effect or no side effect), so you'll get two probabilities for each input
# You can interpret the results based on the threshold (e.g., if probability > 0.5, classify as side effect)

# If you want to get the class label directly, you can use argmax
class_labels = np.argmax(predictions, axis=1)

# class_labels will contain the predicted labels (0 or 1) for your new data
print(class_labels)


New Data: [[0.20454545]
 [0.47727273]
 [1.        ]
 [0.31818182]
 [0.65909091]
 [0.98863636]
 [0.09090909]
 [0.02272727]
 [0.        ]]
[0 0 1 0 1 1 0 0 0]
