<a href="https://colab.research.google.com/github/nicolehao34/KnowledgeDistillation/blob/main/Binary_classfication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.metrics import accuracy_score
import tensorflow as tf
import sklearn as sk
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers

## set up distiller class

In [3]:
class Distiller(keras.Model):
    def __init__(self, student, teacher):
        super().__init__()
        self.teacher = teacher
        self.student = student

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """ Configure the distiller.

        Args:
            optimizer: Keras optimizer for the student weights
            metrics: Keras metrics for evaluation
            student_loss_fn: Loss function of difference between student
                predictions and ground-truth
            distillation_loss_fn: Loss function of difference between soft
                student predictions and soft teacher predictions
            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
            temperature: Temperature for softening probability distributions.
                Larger temperature gives softer distributions.
        """
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def train_step(self, data):
        # Unpack data
        x, y = data

        # Forward pass of teacher
        teacher_predictions = self.teacher(x, training=False)

        with tf.GradientTape() as tape:
            # Forward pass of student
            student_predictions = self.student(x, training=True)

            # Compute losses
            student_loss = self.student_loss_fn(y, student_predictions)

            # Compute scaled distillation loss from https://arxiv.org/abs/1503.02531
            # The magnitudes of the gradients produced by the soft targets scale
            # as 1/T^2, multiply them by T^2 when using both hard and soft targets.
            distillation_loss = (
                self.distillation_loss_fn(
                    tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
                    tf.nn.softmax(student_predictions / self.temperature, axis=1),
                )
                * self.temperature**2
            )

            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss




        # Compute gradients
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, student_predictions)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update(
            {"student_loss": student_loss, "distillation_loss": distillation_loss}
        )
        return results

    def test_step(self, data):
        # Unpack the data
        x, y = data

        # Compute predictions
        y_prediction = self.student(x, training=False)

        # Calculate the loss
        student_loss = self.student_loss_fn(y, y_prediction)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)

        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

## Prepare data

In [10]:
# Generate some random data
num_samples = 100 # the number of data points generated
input_dim = 5 # dimensionality of the input features for each data point

In [11]:
# Generate some random data

# ABADONED DATA GENERATION CODE
X_2 = np.random.rand(num_samples, input_dim)
y_2 = np.random.randint(2, size=num_samples)

num_classes = 2
y_2 = keras.utils.to_categorical(y_2, num_classes)

In [12]:
# Define the complicated function with two-dimensional output
def complicated_function(x):
    y1 = 2*x**2 + 3*x + 5
    y2 = -0.5*x**2 + 2*x + 3
    y3 = x**2 + 3*x + 8
    y4 = -5*x**3 + 2*x**2
    y5 = 5*x
    return np.column_stack((y1, y2, y3, y4, y5))

# Generate input data
input_data = np.random.uniform(low=-10, high=10, size=(100, 1))

# Apply the function to generate output data
output_data = complicated_function(input_data)

In [13]:
output_data.shape

(100, 5)

In [14]:
# # Define the complicated function with two-dimensional output
# def function(x):
#     y1 = 2*x**2 + 3*x + 5
#     y2 = -0.5*x**2 + 2*x + 3
#     return np.column_stack((y1, y2, y3))

# # Softmax function to convert values to probabilities
def softmax(x):
    exp_x = np.exp(x - np.max(x))
    return exp_x / np.sum(exp_x)

# Generate input data
# input_data = np.random.uniform(low=-10, high=10, size=(1000, 1))

# Apply the function to generate output data
# output_data = complicated_function(input_data)

# Add noise (optional)
# output_data += np.random.normal(loc=0, scale=5, size=output_data.shape)

# Apply softmax to each dimension separately
soft_labels = np.apply_along_axis(softmax, axis=1, arr=output_data)

In [15]:
soft_labels

array([[6.46126411e-002, 6.67311258e-003, 9.21801808e-001,
        6.84425137e-003, 6.81869954e-005],
       [1.00000000e+000, 3.01911793e-086, 8.74640881e-032,
        0.00000000e+000, 4.01988655e-060],
       [1.27703586e-001, 4.10897635e-004, 8.71092264e-001,
        1.39214983e-007, 7.93112842e-004],
       [1.00000000e+000, 1.95684797e-076, 5.95516078e-028,
        0.00000000e+000, 6.49091328e-053],
       [1.95790275e-009, 2.26671207e-012, 3.09432639e-009,
        9.99999995e-001, 3.36644255e-015],
       [1.00000000e+000, 4.35192958e-056, 5.03774676e-020,
        0.00000000e+000, 4.06870450e-038],
       [1.00000000e+000, 5.58119249e-107, 6.70031465e-040,
        0.00000000e+000, 1.82128506e-075],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        1.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        1.00000000e+000, 0.00000000e+000],
       [6.36201640e-002, 6.76010779e-003, 9.23574362e-001,
        5.97372197e-003

In [16]:
# Sum the first 3 values and the rest of the values in each list
# Redefine softlabels
soft_labels = np.column_stack([soft_labels[:, :3].sum(axis=1), soft_labels[:, 3:].sum(axis=1)])

print(soft_labels)

[[9.93087562e-001 6.91243836e-003]
 [1.00000000e+000 4.01988655e-060]
 [9.99206748e-001 7.93252057e-004]
 [1.00000000e+000 6.49091328e-053]
 [5.05449585e-009 9.99999995e-001]
 [1.00000000e+000 4.06870450e-038]
 [1.00000000e+000 1.82128506e-075]
 [0.00000000e+000 1.00000000e+000]
 [0.00000000e+000 1.00000000e+000]
 [9.93954634e-001 6.04536637e-003]
 [1.00000000e+000 6.05519060e-024]
 [9.66099493e-013 1.00000000e+000]
 [0.00000000e+000 1.00000000e+000]
 [1.00000000e+000 1.64753664e-069]
 [1.00000000e+000 1.51480393e-076]
 [0.00000000e+000 1.00000000e+000]
 [9.99999686e-001 3.13997438e-007]
 [0.00000000e+000 1.00000000e+000]
 [0.00000000e+000 1.00000000e+000]
 [9.99270114e-001 7.29886119e-004]
 [0.00000000e+000 1.00000000e+000]
 [1.00000000e+000 2.31260922e-075]
 [1.00000000e+000 3.13438906e-062]
 [9.99976200e-001 2.38004743e-005]
 [1.00000000e+000 1.83625583e-030]
 [0.00000000e+000 1.00000000e+000]
 [1.00000000e+000 3.21897077e-046]
 [1.00000000e+000 9.54603747e-036]
 [1.00000000e+000 1.

In [17]:
# Function to add Gaussian noise to soft labels
def add_gaussian_noise(labels, mean=0, std=0.3):
    noise = np.random.normal(mean, std, labels.shape)
    return labels + noise

In [18]:
# Add Gaussian noise to soft labels
soft_labels_with_noise = add_gaussian_noise(soft_labels)

In [19]:
soft_labels_with_noise

array([[ 1.78577452, -0.51442697],
       [ 1.55296347,  0.12749777],
       [ 0.80964907,  0.49396388],
       [ 0.78790737,  0.27955634],
       [ 0.255871  ,  1.29993924],
       [ 1.24711922,  0.4649027 ],
       [ 1.05500108,  0.03167393],
       [-0.09372482,  0.88268182],
       [-0.32366789,  1.39703012],
       [ 1.64466557,  0.13643602],
       [ 0.88575673, -0.22107004],
       [-0.09232642,  1.15415827],
       [-0.12043692,  0.7908363 ],
       [ 0.8856409 ,  0.30713253],
       [ 0.78898108,  0.47935553],
       [-0.19201483,  1.02431737],
       [ 1.02592025,  0.19901511],
       [-0.16403753,  1.09131365],
       [ 0.32519468,  0.96933316],
       [ 1.33050378, -0.13329899],
       [-0.1716108 ,  1.13902623],
       [ 1.27783878, -0.08835207],
       [ 0.69374852, -0.10065874],
       [ 1.15550455, -0.6466438 ],
       [ 0.93216995,  0.14748207],
       [-0.6505965 ,  1.47131184],
       [ 0.87667708, -0.06114799],
       [ 1.30046839, -0.07254091],
       [ 1.07503288,

In [20]:
# Convert soft labels to hard labels
def soft_to_hard(soft_labels):
    hard_labels = []
    for x in soft_labels:
      if x[0] >= 0.50:
        hard_labels.append([1,0])
      else:
        hard_labels.append([0,1])

    return hard_labels

In [21]:
# Convert soft labels with noise to hard labels
hard_labels = soft_to_hard(soft_labels_with_noise)

In [22]:
hard_labels

[[1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1]]

In [23]:
# Add Gaussian noise to the input data
# noise_factor = 0.1  # Adjust the noise factor to see how much noise there is
# noise = np.random.normal(0, noise_factor, X_2.shape)
# noisy_X_2 = X_2 + noise

In [24]:
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(output_data, hard_labels, test_size=0.33, random_state=42)

## Create teacher and student models

In [26]:
# Create teacher and student models
def create_teacher_model():
    model = keras.Sequential([
        # A fully connected (dense) layer with 64 units and ReLU (Rectified Linear Unit) activation function.
        # This layer has an input dimension of input_dim, which specifies the number of input features (in this case, it's set to 20).
        keras.layers.Dense(64, activation='relu', input_dim=input_dim),

        # Another fully connected layer with 32 units and ReLU activation function.
        keras.layers.Dense(32, activation='relu'),

        # Final layer, Softmax activation
        keras.layers.Dense(num_classes, activation='softmax')
    ])

    # The model is compiled with a binary cross-entropy loss function ('binary_crossentropy') and the Adam optimizer ('adam').
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_student_model():
    model = keras.Sequential([
        keras.layers.Dense(32, activation='relu', input_dim=input_dim), # no softmax layer here because we want to convert prediction to hard labels first

        keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

teacher_model = create_teacher_model()
student_model = create_student_model()

In [None]:
X_2_train

array([[ 4.30215878e+00,  2.38286318e+00,  7.21929914e+00,
         2.84976454e-01, -1.43926749e+00],
       [ 4.18341282e+01,  3.70949187e+00,  3.18268953e+01,
        -2.08541983e+02,  1.80327708e+01],
       [ 1.31603667e+02, -5.26900866e+01,  5.81895589e+01,
         3.49270408e+03, -4.37075817e+01],
       ...,
       [ 4.00514641e+00,  4.84703513e-01,  5.99493145e+00,
         7.09723744e+00, -5.02547252e+00],
       [ 4.83433735e+00,  2.88351388e+00,  7.83104043e+00,
         7.54037106e-03, -2.87094149e-01],
       [ 8.34433682e+00,  4.21233781e+00,  1.07894895e+01,
        -9.56780738e-01,  3.72440367e+00]])

## Train the teacher model

In [28]:
# ensure that X_2_train and y_2_train  are in the correct format
X_2_train = np.array(X_2_train)
y_2_train = np.array(y_2_train)
X_2_test = np.array(X_2_test)
y_2_test = np.array(y_2_test)

In [None]:
# Train teacher as usual
teacher_model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.BinaryAccuracy()],
)

# Train and evaluate teacher on data.
teacher_model.fit(X_2_train, y_2_train, epochs=500)
teacher_model.evaluate(X_2_test, y_2_test)

Epoch 1/500


  output, from_logits = _get_logits(


Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 7

[0.23511089384555817, 0.9575757384300232]

## Set a range of alpha values to test

In [None]:
student_losses = []
distillation_losses = []
student_accuracies = []

# Initialize and compile the distiller
distiller = Distiller(student=student_model, teacher=teacher_model)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.BinaryAccuracy()],
    student_loss_fn=keras.losses.BinaryCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    temperature=10,
)

# Set a range of alpha values to test
alpha_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Loop over different alpha values
for alpha in alpha_values:
    # Compile the distiller with the current alpha
    distiller.compile(alpha=alpha,
                      optimizer=keras.optimizers.Adam(),
                      metrics=[keras.metrics.BinaryAccuracy()],
                      student_loss_fn=keras.losses.BinaryCrossentropy(from_logits=True),
                      distillation_loss_fn=keras.losses.KLDivergence(),
                      temperature=10,)

    # Train the distiller
    distiller.fit(X_2_train, y_2_train, epochs=500)

    # Evaluate the distiller on the test dataset
    distiller_results = distiller.evaluate(X_2_test, y_2_test)

    # Print the results to inspect the structure
    # print("Distiller Results 1:", distiller_results[1])

    # Store results
    student_losses.append(distiller_results[0])
    student_accuracies.append(distiller_results[1])

    # Print results for the current alpha
    print(f"Alpha: {alpha}")


    print("Distiller Results:", distiller_results)
    print("=" * 50)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500

## Student model trained from scratch

In [None]:
# Clone student for later comparison
student_scratch = keras.models.clone_model(student_model)

# Train student as doen usually
student_scratch.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.BinaryAccuracy()],
)

# Train and evaluate student trained from scratch.
student_scratch.fit(X_2_train, y_2_train, epochs=500)
student_scratch.evaluate(X_2_test, y_2_test)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

[0.423084557056427, 0.9575757384300232]

## Generate plots

In [None]:
# Store results for plotting
alpha_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
student_scratch_losses = []
student_scratch_accuracies = []

# Loop over different alpha values
for alpha in alpha_values:
    # Compile the student model from scratch
    student_scratch.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=[keras.metrics.BinaryAccuracy()],
    )

    # Train the student model from scratch
    student_scratch.fit(X_2_train, y_2_train, epochs=500)

    # Evaluate the student model on the test dataset
    student_scratch_results = student_scratch.evaluate(X_2_test, y_2_test)

    # Print the results to inspect the structure
    print("Student Scratch Results:", student_scratch_results)

    # Adapt the code based on the actual structure of student_scratch_results
    student_scratch_loss = student_scratch_results[0]
    student_scratch_accuracy = student_scratch_results[1]  # Second element is the accuracy
    student_scratch_accuracies.append(student_scratch_accuracy)
    student_scratch_losses.append(student_scratch_loss)



Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [None]:
print(student_accuracies)

In [None]:
len(student_accuracies)

11

In [None]:
# Plotting accuracies vs. alphas
plt.figure(figsize=(10, 6))


# Accuracy plot
# plt.subplot(1, 1, 2)
plt.plot(alpha_values, student_accuracies, label="Student Accuracy", marker='o')
plt.plot(alpha_values, student_scratch_accuracies, label="Student Scratch Accuracy", marker='o')
plt.title('Student and Student Scratch Accuracies for Different Alpha Values, 500 Epochs', fontsize = 14)
plt.xlabel('Alpha', fontsize = 14)
plt.ylabel('Accuracy', fontsize = 14)
plt.legend()


# # Accuracy plot for student trained from scratch
# # plt.subplot(1, 2, 2)
# plt.plot(alpha_values, student_scratch_losses, label="Student Scratch Loss", marker='o', color='green')
# plt.plot(alpha_values, student_losses, label="Student Loss", marker='o', color='red')

# plt.title('Student and Student Scratch Combined Loss for Different Alpha Values, 10 Epochs', fontsize = 14)
# plt.xlabel('Alpha', fontsize = 14)
# plt.ylabel('Combined Loss', fontsize = 14)
# plt.legend()

# Save the figure
plt.savefig('new_500epochs_acc.png')

plt.tight_layout()
plt.show()



In [None]:
# Accuracy plot for student trained from scratch
# plt.subplot(1, 2, 2)

plt.figure(figsize=(10, 6))
plt.plot(alpha_values, student_scratch_losses, label="Student Scratch Loss", marker='o', color='green')
plt.plot(alpha_values, student_losses, label="Student Loss", marker='o', color='red')

plt.title('Student and Student Scratch Combined Loss for Different Alpha Values, 500 Epochs', fontsize = 14)
plt.xlabel('Alpha', fontsize = 14)
plt.ylabel('Combined Loss', fontsize = 14)
plt.legend()

# Save the figure
plt.savefig('new_500epochs_loss.png')

plt.tight_layout()
plt.show()

# Poor teacher model, experiments

In [29]:
# Train teacher as usual
teacher_model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.BinaryAccuracy()],
)

# Train and evaluate teacher on data.
teacher_model.fit(X_2_train, y_2_train, epochs=100)
teacher_model.evaluate(X_2_test, y_2_test)

Epoch 1/100


  output, from_logits = _get_logits(


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

[0.35021650791168213, 0.9696969985961914]

In [None]:
student_losses = []
distillation_losses = []
student_accuracies = []

# Initialize and compile the distiller
distiller = Distiller(student=student_model, teacher=teacher_model)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.BinaryAccuracy()],
    student_loss_fn=keras.losses.BinaryCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    temperature=10,
)

# Set a range of alpha values to test
alpha_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Loop over different alpha values
for alpha in alpha_values:
    # Compile the distiller with the current alpha
    distiller.compile(alpha=alpha,
                      optimizer=keras.optimizers.Adam(),
                      metrics=[keras.metrics.BinaryAccuracy()],
                      student_loss_fn=keras.losses.BinaryCrossentropy(from_logits=True),
                      distillation_loss_fn=keras.losses.KLDivergence(),
                      temperature=10,)

    # Train the distiller
    distiller.fit(X_2_train, y_2_train, epochs=500)

    # Evaluate the distiller on the test dataset
    distiller_results = distiller.evaluate(X_2_test, y_2_test)

    # Print the results to inspect the structure
    # print("Distiller Results 1:", distiller_results[1])

    # Store results
    student_losses.append(distiller_results[0])
    student_accuracies.append(distiller_results[1])

    # Print results for the current alpha
    print(f"Alpha: {alpha}")


    print("Distiller Results:", distiller_results)
    print("=" * 50)


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78



Alpha: 0.3
Distiller Results: [0.8484848737716675, 0.06175714358687401]
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
E



Alpha: 0.4
Distiller Results: [0.7575757503509521, 0.18680478632450104]
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
E

In [None]:
# Clone student for later comparison
student_scratch = keras.models.clone_model(student_model)

In [None]:
# Store results for plotting
alpha_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
student_scratch_losses = []
student_scratch_accuracies = []

# Loop over different alpha values
for alpha in alpha_values:
    # Compile the student model from scratch
    student_scratch.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=[keras.metrics.BinaryAccuracy()],
    )

    # Train the student model from scratch
    student_scratch.fit(X_2_train, y_2_train, epochs=500)

    # Evaluate the student model on the test dataset
    student_scratch_results = student_scratch.evaluate(X_2_test, y_2_test)

    # Print the results to inspect the structure
    print("Student Scratch Results:", student_scratch_results)

    # Adapt the code based on the actual structure of student_scratch_results
    student_scratch_loss = student_scratch_results[0]
    student_scratch_accuracy = student_scratch_results[1]  # Second element is the accuracy
    student_scratch_accuracies.append(student_scratch_accuracy)
    student_scratch_losses.append(student_scratch_loss)

In [None]:
# Plotting accuracies vs. alphas
plt.figure(figsize=(10, 6))


# Accuracy plot
# plt.subplot(1, 1, 2)
plt.plot(alpha_values, student_accuracies, label="Student Accuracy", marker='o')
plt.plot(alpha_values, student_scratch_accuracies, label="Student Scratch Accuracy", marker='o')
plt.title('T 100 Epochs, Student and Student Scratch Accuracies, 500 Epochs', fontsize = 14)
plt.xlabel('Alpha', fontsize = 14)
plt.ylabel('Accuracy', fontsize = 14)
plt.legend()


# # Accuracy plot for student trained from scratch
# # plt.subplot(1, 2, 2)
# plt.plot(alpha_values, student_scratch_losses, label="Student Scratch Loss", marker='o', color='green')
# plt.plot(alpha_values, student_losses, label="Student Loss", marker='o', color='red')

# plt.title('Student and Student Scratch Combined Loss for Different Alpha Values, 10 Epochs', fontsize = 14)
# plt.xlabel('Alpha', fontsize = 14)
# plt.ylabel('Combined Loss', fontsize = 14)
# plt.legend()

# Save the figure
plt.savefig('poorT100_S500epochs_acc.png')

plt.tight_layout()
plt.show()


In [None]:
# Accuracy plot for student trained from scratch
# plt.subplot(1, 2, 2)

plt.figure(figsize=(10, 6))
plt.plot(alpha_values, student_scratch_losses, label="Student Scratch Loss", marker='o', color='green')
plt.plot(alpha_values, student_losses, label="Student Loss", marker='o', color='red')

plt.title('T 100 Epochs, Student and Student Scratch Combined Loss for Different Alpha Values, 500 Epochs', fontsize = 14)
plt.xlabel('Alpha', fontsize = 14)
plt.ylabel('Combined Loss', fontsize = 14)
plt.legend()

# Save the figure
plt.savefig('poorT100_S500epochs_loss.png')

plt.tight_layout()
plt.show()